require 'optparse' require 'rubygems' require 'ferret' FL = Dir["corpus/**/*.txt"] include Ferret include Ferret::Index def init_writer(create) options = { :path => "ferret_index", :analyzer => Analysis::WhiteSpaceAnalyzer.new(), :merge_factor => 100, :use_compound_file => true, :max_buffer_memory => 0x10000000, :max_buffered_docs => 20_000 } if create options[:create] = true field_infos = FieldInfos.new() field_infos.add_field(:body, :store => :no, :term_vector => :no) unless @store options[:field_infos] = field_infos end IndexWriter.new(options) end def build_index(file_list, max_to_index, increment, store) writer = init_writer(true) docs_so_far = 0 file_list.each do |fn| File.open(fn) do |f| raise("Failed to read title") if (title = f.readline).nil? writer << {:title => title, :body => f.readlines} end docs_so_far += 1 break if (docs_so_far >= max_to_index) if (docs_so_far % increment == 0) writer.close() writer = init_writer(false) end end # finish index num_indexed = writer.doc_count() writer.optimize() writer.close() return num_indexed end @docs = FL.size @reps = 1 @inc = 0 @store = false opts = OptionParser.new do |opts| opts.banner = "Usage: f.rb [options]" opts.separator "" opts.separator "Specific options:" opts.on("-d", "--docs VAL", Integer) {|v| @docs = v} opts.on("-r", "--reps VAL", Integer) {|v| @reps = v} opts.on("-i", "--inc VAL", Integer) {|v| @reps = v} opts.on("-s", "--store") {|v| @store = true} end opts.parse(ARGV) @inc = @inc == 0 ? @docs + 1 : @inc puts "-" * 60 times = [] @reps.times do |i| t = Time.now num_indexed = build_index(FL, @docs, @inc, @store) t = Time.new - t times << t puts "#{i} Secs: %.2f Docs: #{num_indexed}" % t end times.sort! num_to_chop = @reps >> 2 num_kept = 0 mean_time = 0.0 trunc_mean_time = 0.0 @reps.times do |i| mean_time += times[i] next if (i < num_to_chop) || (i >= (@reps - num_to_chop)) trunc_mean_time += times[i] num_kept += 1 end mean_time /= @reps trunc_mean_time /= num_kept puts "-" * 60 puts "Mean %.2f secs" % mean_time puts "Truncated Mean (#{num_kept} kept, #{@reps - num_kept} discarded): " + "%.2f secs" % trunc_mean_time puts "-" * 60
