#!/usr/bin/env ruby # # Extract the corpus to a number of text files for indexing by Lucene and # Ferret benchmarks. # require 'fileutils' source_dir = ARGV[0] unless source_dir and File.directory?(source_dir) raise "Usage: ./extract_reuters.rb /path/to/expanded/archive/dir" end main_out_dir = 'corpus' FileUtils.mkdir_p(main_out_dir) unless File.directory?(main_out_dir) num_files = 0 # get a list of the sgm files Dir["#{source_dir}/**/*.sgm"].each do |file_name| puts "Processing :" + file_name path = File.join(main_out_dir, File.basename(file_name)) FileUtils.mkdir_p(File.join(main_out_dir, File.basename(file_name))) in_body = in_title = false body = nil title = nil File.readlines(file_name).each do |line| case line when /<REUTERS/ title = nil body = [] when %r{<TITLE>([^<]*)</TITLE>} title = $1 when %r{<TITLE>([^<]*)} in_title = true title = $1 when %r{([^<]*)</TITLE>} in_title = false title << $1 when /<BODY>(.*)/m body << $1 in_body = true when %r{(.*)</BODY>} in_body = false body << $1 File.open(File.join(path, "article%05d.txt" % num_files), "w") do |f| f.puts title f.puts "" f.puts body.join('') end num_files += 1 else body << line if in_body title << line if in_title end end end puts "Total articles extracted: #{num_files}"
