| | 1 | {{{ |
| | 2 | #!ruby |
#!/usr/bin/env ruby
#
# Extract the corpus to a number of text files for indexing by the Lucene and
# Ferret benchmarks.
#
# Usage: ./extract_reuters.rb /path/to/expanded/archive/dir
#
# Every *.sgm file found (recursively) under the given directory gets its own
# directory below ./corpus, named after the .sgm file. Each article that has a
# <BODY> is written there as articleNNNNN.txt containing the title, a blank
# line and the body text. The article counter is shared by all .sgm files, so
# file names are unique across the whole corpus. Articles without a <BODY>
# are skipped.
#
require 'fileutils'

# Writes one article to <out_dir>/article<index, zero-padded to 5 digits>.txt.
#
# out_dir    - directory the article file is created in (must exist).
# index      - Integer used to build the file name.
# title      - String title, or nil when the article had no <TITLE>.
# body_parts - Array of String fragments that are concatenated to the body.
def write_article(out_dir, index, title, body_parts)
  File.open(File.join(out_dir, format('article%05d.txt', index)), 'w') do |f|
    f.puts title
    f.puts ''
    f.puts body_parts.join
  end
end

source_dir = ARGV[0]
unless source_dir && File.directory?(source_dir)
  abort 'Usage: ./extract_reuters.rb /path/to/expanded/archive/dir'
end

main_out_dir = 'corpus'
FileUtils.mkdir_p(main_out_dir)

num_files = 0
# Sorted so that the article numbering does not depend on the order in which
# the file system happens to list the .sgm files.
Dir["#{source_dir}/**/*.sgm"].sort.each do |file_name|
  puts "Processing :" + file_name
  path = File.join(main_out_dir, File.basename(file_name))
  FileUtils.mkdir_p(path)
  in_body = in_title = false
  body = nil
  title = nil
  # Stream the file line by line instead of loading it whole, and read the
  # lines as raw bytes so that a stray non-UTF-8 byte in the archive cannot
  # make the regexp matches below raise. The bytes are written back unchanged.
  File.foreach(file_name, encoding: 'ASCII-8BIT') do |line|
    case line
    when /<REUTERS/
      # Start of a new article: forget everything about the previous one,
      # including a title/body that was never closed.
      title = nil
      body = []
      in_body = in_title = false
    when %r{<TITLE>([^<]*)</TITLE>}
      title = Regexp.last_match(1)
    when %r{<TITLE>([^<]*)}
      # Title continues on the following line(s).
      in_title = true
      title = Regexp.last_match(1)
    when %r{([^<]*)</TITLE>}
      in_title = false
      # to_s guards against a closing tag that had no opening <TITLE>.
      title = title.to_s + Regexp.last_match(1)
    when %r{<BODY>(.*)</BODY>}
      # Whole body on a single line. Must be tested before the open-only
      # pattern, which would otherwise swallow the closing tag and leave the
      # article unwritten.
      in_body = false
      (body ||= []) << Regexp.last_match(1)
      write_article(path, num_files, title, body)
      num_files += 1
    when /<BODY>(.*)/m
      # /m lets the capture keep the line's trailing newline.
      (body ||= []) << Regexp.last_match(1)
      in_body = true
    when %r{(.*)</BODY>}
      in_body = false
      (body ||= []) << Regexp.last_match(1)
      write_article(path, num_files, title, body)
      num_files += 1
    else
      body << line if in_body
      title << line if in_title
    end
  end
end

puts "Total articles extracted: #{num_files}"
| | 60 | }}} |