Changeset e37ef045516872af81e36339debad2296a2d7506
- Timestamp:
- 04/28/08 11:42:00 (8 months ago)
- Parents:
- afa46936e67e892eb5c8429bd0991a4630cbd71f, 8f4fd07de7bf767b30c219684608535cb9cb6f19
- Children:
- a2344576c2573c3b9a6fb40dfc614fd4cfe178f7
- git-committer:
- dave <dave@06fd6eb0-0002-0410-a719-e5602cce40bc> / 2008-04-28T01:42:00Z+0000
- Files:
-
- 2 added
- 13 modified
-
c/.rake/.gitignore (modified) (1 diff)
-
c/.rake/gcov.history (added)
-
c/.rake/gcov_history.plot (added)
-
c/.rake/gcov_results.erb (modified) (1 diff)
-
c/Rakefile (modified) (11 diffs)
-
c/include/internal.h (modified) (1 diff)
-
c/include/search.h (modified) (1 diff)
-
c/src/index.c (modified) (3 diffs)
-
c/src/q_fuzzy.c (modified) (5 diffs)
-
c/src/q_phrase.c (modified) (13 diffs)
-
c/test/test.c (modified) (3 diffs)
-
c/test/test_bitvector.c (modified) (13 diffs)
-
c/test/test_q_fuzzy.c (modified) (9 diffs)
-
c/test/test_search.c (modified) (13 diffs)
-
ruby/Rakefile (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
-
c/.rake/.gitignore
r31d0b3 r190f7a 3 3 copts 4 4 gcov_results.html 5 gcov_history.jpg 6 gcov_history.data -
c/.rake/gcov_results.erb
rb78e07 r190f7a 16 16 <h1>Ferret gcov Results</h1> 17 17 <table cellpadding="0" cellspacing="5"> 18 <% $gcov_results.each do |percent, line_cnt, filename| %> 18 <% gcov_sorted_keys.each do |fn| %> 19 19 <tr> 20 <th class="filename"><%= filename%></th> 20 <th class="filename"><%= fn %></th> 21 21 <td class="result"> 22 <div class="bar" style="width:<%= percent%>%;"> <div> 22 <div class="bar" style="width:<%=$gcov_results.files[fn].percent%>%;"> <div> 23 23 </td> 24 24 </tr> 25 25 <% end %> 26 26 </table> 27 <img src="gcov_history.jpg" alt="Overall gcov coverage results"/> 27 28 <script type="text/javascript"> 28 29 var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www."); -
c/Rakefile
r3d5c1b r2142bb 3 3 require 'rake' 4 4 require 'rake/clean' 5 require 'ostruct' 5 6 6 7 class String … … 54 55 sh script 55 56 end 57 58 ### 59 # Useful Constants 60 ### 61 62 DAY = 24 * 60 * 60 # seconds in day 56 63 57 64 ### … … 105 112 basename 106 113 end 107 $gcov_results = [] 114 115 $gcov_results = OpenStruct.new(:files => {}) 116 def gcov_sorted_keys 117 $gcov_results.files.keys.sort {|fn1, fn2| 118 res = $gcov_results.files[fn1].percent <=> 119 $gcov_results.files[fn2].percent 120 res = fn2 <=> fn1 if res == 0 121 res 122 }.reverse 123 end 108 124 109 125 ### … … 172 188 if copts != old_copts 173 189 File.open(copts_file, 'w') {|f| f.write(Marshal.dump(copts))} 174 Rake::Task[: clean].invoke190 Rake::Task[:scrub].invoke 175 191 end 176 192 end … … 190 206 '*.gcov', 191 207 'y.tab.c']) 192 193 208 CLOBBER.include([ '*.a', 194 209 '**/*.o', … … 200 215 GCOV_DIR]) 201 216 202 task :default => :testall 203 217 SCRUB = FileList[CLEAN, '**/*.o'] 218 desc "clean for rebuild" 219 task :scrub do 220 SCRUB.each {|fn| rm_r fn rescue nil} 221 end 222 223 ### 224 # Test Helpers 225 ### 204 226 def run_tests 205 227 check_coptions … … 215 237 end 216 238 239 ### 240 # gcov Helpers 241 ### 242 GCOV_HIST_FILE = ".rake/gcov.history" 243 def load_gcov_history 244 Marshal.load(File.read(GCOV_HIST_FILE)) 245 end 246 247 task :default => :testall 248 217 249 desc "Run all tests" 218 250 task :test do … … 229 261 task :gcov => :do_gcov do 230 262 puts "\ngcov Results\n=============\n" 231 $gcov_results.each do |percent, line_cnt, fn| 232 puts "%25s %6.2f%% (%4d)" % [fn, percent, line_cnt] 233 end 234 end 235 236 task :gcov2html => :do_gcov do 263 total_lines = 0 264 total_lines_covered = 0 265 gcov_sorted_keys.each do |fn| 266 res = $gcov_results.files[fn] 267 puts "%25s %6.2f%% (%4d)" % [fn, res.percent, res.line_count] 268 end 269 puts "Total Lines: %s" % $gcov_results.total_lines 270 puts "Total Lines covered: %s (%0.2f%%)" % 271 [$gcov_results.covered_lines, 
$gcov_results.percent] 272 end 273 274 desc "Generate .rake/gcov_results.html" 275 task :gcov2html => [:do_gcov, '.rake/gcov_history.jpg'] do 237 276 puts "Generating .rake/gcov_results.html..." 238 277 File.open('.rake/gcov_results.html', 'w') do |f| … … 241 280 end 242 281 282 file '.rake/gcov_history.data' => GCOV_HIST_FILE do |t| 283 history = load_gcov_history 284 start = history.start_date 285 File.open(t.name, 'w') do |f| 286 history.data.each do |d| 287 f.write "%s %.2f\n" % 288 [Time.at((d.day + start) * DAY).strftime("%Y-%m-%d"), d.percent] 289 end 290 end 291 end 292 293 file '.rake/gcov_history.jpg' => '.rake/gcov_history.data' do |t| 294 sh "gnuplot .rake/gcov_history.plot" 295 end 296 243 297 task :do_gcov do 244 298 CFLAGS << " -fprofile-arcs -ftest-coverage" 245 299 run_tests 246 gcov_result s = []300 gcov_result_map = {} 247 301 gcov_scanner = /File '([^']+)'\nLines executed:(\d+.\d+)% of (\d+)/ 302 total_lines = covered_lines = 0 248 303 OBJS.each do |obj| 249 304 bn = obj.pathmap('%n') … … 256 311 gcov_cmd = "gcov -o .gcov/ #{obj}" 257 312 details = IO.popen(gcov_cmd).read.scan(gcov_scanner) 258 details.each do |filename, percent, line_c nt|313 details.each do |filename, percent, line_count| 259 314 next if filename =~ /^\// 260 $gcov_results << [percent.to_f, line_cnt, filename] 261 end 262 end 263 $gcov_results = $gcov_results.sort_by{|res| res[0]}.reverse 315 line_count = line_count.to_i 316 percent = percent.to_f 317 $gcov_results.files[filename] = OpenStruct.new(:percent => percent, 318 :line_count => line_count) 319 total_lines += line_count 320 covered_lines += (0.01 * percent * line_count).to_i 321 end 322 end 323 $gcov_results.total_lines = total_lines 324 $gcov_results.covered_lines = covered_lines 325 $gcov_results.percent = percent = 100.0 * covered_lines /total_lines 326 history = load_gcov_history 327 today = Time.now.to_i/DAY - history.start_date 328 history.data.pop if history.data.last.day == today 329 history.data << 
OpenStruct.new(:day => today, 330 :total => total_lines, 331 :covered => covered_lines, 332 :percent => percent) 333 File.open(GCOV_HIST_FILE, 'w') {|f| f.write(Marshal.dump(history))} 264 334 end 265 335 … … 451 521 desc "Publish the gcoverage results" 452 522 task :gcov => :gcov2html do 453 sh %{scp .rake/gcov_results.html www@davebalmain.com:/var/www/ferret} 523 sh "scp .rake/gcov_results.html .rake/gcov_history.jpg " + 524 "www@davebalmain.com:/var/www/ferret" 454 525 end 455 526 end -
c/include/internal.h
ra04da2 r8f4fd0 738 738 #define phq_append_multi_term frt_phq_append_multi_term 739 739 #define phq_new frt_phq_new 740 #define phq_set_slop frt_phq_set_slop 740 741 #define pl_add_occ frt_pl_add_occ 741 742 #define pl_cmp frt_pl_cmp -
c/include/search.h
r950230 r8f4fd0 326 326 extern void frt_phq_add_term_abs(FrtQuery *self, const char *term, int position); 327 327 extern void frt_phq_append_multi_term(FrtQuery *self, const char *term); 328 extern void frt_phq_set_slop(FrtQuery *self, int slop); 328 329 329 330 /*************************************************************************** -
c/src/index.c
r0c11a5 rafa469 366 366 FieldInfo *fis_get_field(FieldInfos *fis, Symbol name) 367 367 { 368 return (FieldInfo *)h_get(fis->field_dict, name); 368 return (FieldInfo *)h_get(fis->field_dict, I(name)); 369 369 } 370 370 371 371 int fis_get_field_num(FieldInfos *fis, Symbol name) 372 372 { 373 FieldInfo *fi = (FieldInfo *)h_get(fis->field_dict, name); 373 FieldInfo *fi = (FieldInfo *)h_get(fis->field_dict, I(name)); 374 374 if (fi) { 375 375 return fi->number; … … 382 382 FieldInfo *fis_get_or_add_field(FieldInfos *fis, Symbol name) 383 383 { 384 FieldInfo *fi = (FieldInfo *)h_get(fis->field_dict, name); 384 FieldInfo *fi = (FieldInfo *)h_get(fis->field_dict, I(name)); 385 385 if (!fi) { 386 386 fi = (FieldInfo*)fi_new(name, fis->store, fis->index, fis->term_vector); … … 4548 4548 Symbol field) 4549 4549 { 4550 FieldInfo *fi = (FieldInfo *)h_get(ir->fis->field_dict, field); 4550 FieldInfo *fi = (FieldInfo *)h_get(ir->fis->field_dict, I(field)); 4551 4551 FieldsReader *fr; 4552 4552 -
c/src/q_fuzzy.c
r950230 r88f53b 8 8 * FuzzyStuff 9 9 * 10 * The main method here is the fuzq_score method which scores a term against 11 * another term. The other methods all act in support. 10 * The main method here is the fuzq_score_mn method which scores a term 11 * against another term. The other methods all act in support. 12 * 13 * To learn more about the fuzzy scoring algorithm see; 14 * 15 * http://en.wikipedia.org/wiki/Levenshtein_distance 12 16 * 13 17 ****************************************************************************/ 14 18 19 /** 20 * Calculate the maximum nomber of allowed edits (or maximum edit distance) 21 * for a word to be a match. 22 * 23 * Note that fuzq->text_len and m are both the lengths text *after* the prefix 24 * so `MIN(fuzq->text_len, m) + fuzq->pre_len)` actually gets the byte length 25 * of the shorter string out of the query string and the index term being 26 * compared. 27 */ 15 28 static INLINE int fuzq_calculate_max_distance(FuzzyQuery *fuzq, int m) 16 29 { … … 18 31 } 19 32 33 /** 34 * The max-distance formula gets used a lot - it needs to be calculated for 35 * every possible match in the index - so we cache the results for all 36 * lengths up to the TYPICAL_LONGEST_WORD limit. For words longer than this we 37 * calculate the value live. 38 */ 20 39 static void fuzq_initialize_max_distances(FuzzyQuery *fuzq) 21 40 { … … 26 45 } 27 46 47 /** 48 * Return the cached max-distance value if the word is within the 49 * TYPICAL_LONGEST_WORD limit. 50 */ 28 51 static INLINE int fuzq_get_max_distance(FuzzyQuery *fuzq, int m) 29 52 { … … 33 56 } 34 57 58 /** 59 * Calculate the similarity score for the +target+ against the query. 
60 * 61 * @params fuzq The Fuzzy Query 62 * @params target *the term to compare against minus the prefix 63 * @params m the string length of +target+ 64 * @params n the string length of the query string minus length of the prefix 65 */ 35 66 static INLINE float fuzq_score_mn(FuzzyQuery *fuzq, 36 67 const char *target, … … 173 204 } 174 205 175 if (te == NULL) { 176 if (prefix) free(prefix); 177 return q; 178 } 206 assert(NULL != te); 179 207 180 208 fuzq->scale_factor = (float)(1.0 / (1.0 - fuzq->min_sim)); -
c/src/q_phrase.c
r0c11a5 r8f4fd0 117 117 static bool pp_less_than(const PhPos *pp1, const PhPos *pp2) 118 118 { 119 /* docs will all be equal when this method is used */ 120 return pp1->position < pp2->position; 121 /* 122 if (PP(p)->doc == PP(p)->doc) { 123 return PP(p)->position < PP(p)->position; 119 if (pp1->position == pp2->position) { 120 return pp1->offset < pp2->offset; 124 121 } 125 122 else { 126 return PP(p)->doc < PP(p)->doc; 127 } 128 */ 123 return pp1->position < pp2->position; 124 } 129 125 } 130 126 … … 168 164 bool first_time : 1; 169 165 bool more : 1; 166 bool check_repeats : 1; 170 167 } PhraseScorer; 171 168 … … 235 232 return raw_score * sim_decode_norm( 236 233 self->similarity, 237 phsc->norms[ phsc->phrase_pos[phsc->pp_first_idx]->doc]);234 phsc->norms[self->doc]); 238 235 } 239 236 … … 279 276 phsc_skip_to(self, doc_num); 280 277 281 phrase_freq = (self->doc == doc_num) ? phsc->freq : (float)0.0;278 phrase_freq = (self->doc == doc_num) ? phsc->freq : 0.0f; 282 279 return expl_new(sim_tf(self->similarity, phrase_freq), 283 280 "tf(phrase_freq=%f)", phrase_freq); … … 295 292 } 296 293 297 static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum, 294 static Scorer *phsc_new(Weight *weight, 295 TermDocEnum **term_pos_enum, 298 296 PhrasePosition *positions, int pos_cnt, 299 Similarity *similarity, uchar *norms) 297 Similarity *similarity, 298 uchar *norms, 299 int slop) 300 300 { 301 301 int i; 302 302 Scorer *self = scorer_new(PhraseScorer, similarity); 303 HashSet *term_set = NULL; 304 303 305 304 306 PhSc(self)->weight = weight; … … 308 310 PhSc(self)->pp_first_idx = 0; 309 311 PhSc(self)->pp_cnt = pos_cnt; 310 PhSc(self)->slop = 0;312 PhSc(self)->slop = slop; 311 313 PhSc(self)->first_time = true; 312 314 PhSc(self)->more = true; 313 315 PhSc(self)->check_repeats = false; 316 317 if (slop) { 318 term_set = hs_new_str((free_ft)NULL); 319 } 314 320 for (i = 0; i < pos_cnt; i++) { 321 /* check for repeats */ 322 if (slop && 
!PhSc(self)->check_repeats) { 323 char **terms = positions[i].terms; 324 const int t_cnt = ary_size(terms); 325 int j; 326 for (j = 0; j < t_cnt; j++) { 327 if (hs_add(term_set, terms[j])) { 328 PhSc(self)->check_repeats = true; 329 goto repeat_check_done; 330 } 331 } 332 } 333 repeat_check_done: 315 334 PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos); 335 } 336 337 if (slop) { 338 hs_destroy(term_set); 316 339 } 317 340 … … 376 399 Similarity *similarity, uchar *norms) 377 400 { 378 Scorer *self = 379 phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms); 401 Scorer *self = phsc_new(weight, 402 term_pos_enum, 403 positions, 404 pp_cnt, 405 similarity, 406 norms, 407 0); 380 408 381 409 PhSc(self)->phrase_freq = &ephsc_phrase_freq; … … 386 414 * SloppyPhraseScorer 387 415 ***************************************************************************/ 416 417 static bool sphsc_check_repeats(PhPos *pp, 418 PhPos **positions, 419 const int p_cnt) 420 { 421 int j; 422 for (j = 0; j < p_cnt; j++) { 423 PhPos *ppj = positions[j]; 424 /* If offsets are equal, either we are at the current PhPos +pp+ or 425 * +pp+ and +ppj+ are supposed to match in the same position in which 426 * case we don't need to check. 
*/ 427 if (ppj->offset == pp->offset) { 428 continue; 429 } 430 /* the two phrase positions are matching on the same term 431 * which we want to avoid */ 432 if ((ppj->position + ppj->offset) == (pp->position + pp->offset)) { 433 if (!pp_next_position(pp)) { 434 /* We have no matches for this document */ 435 return false; 436 } 437 /* we changed the position so we need to start check again */ 438 j = -1; 439 } 440 } 441 return true; 442 } 388 443 389 444 static float sphsc_phrase_freq(Scorer *self) … … 396 451 int last_pos = 0, pos, next_pos, start, match_length, i; 397 452 bool done = false; 453 bool check_repeats = phsc->check_repeats; 398 454 float freq = 0.0; 399 455 400 456 for (i = 0; i < pp_cnt; i++) { 401 457 pp = phsc->phrase_pos[i]; 402 pp_first_position(pp); 458 /* we should always have at least one position or this functions 459 * shouldn't have been called. */ 460 assert(pp_first_position(pp)); 461 if (check_repeats && i > 0) { 462 if (!sphsc_check_repeats(pp, phsc->phrase_pos, i - 1)) { 463 goto return_freq; 464 } 465 } 403 466 if (pp->position > last_pos) { 404 467 last_pos = pp->position; … … 413 476 while (pos <= next_pos) { 414 477 start = pos; /* advance pp to min window */ 415 if (!pp_next_position(pp)) { 416 done = true; /* ran out of a positions for a term - done */ 478 if (!pp_next_position(pp) 479 || (check_repeats 480 && !sphsc_check_repeats(pp, phsc->phrase_pos, pp_cnt))) { 481 done = true; 417 482 break; 418 483 } … … 431 496 pq_push(pq, pp); /* restore pq */ 432 497 } while (!done); 498 499 return_freq: 433 500 434 501 pq_destroy(pq); … … 442 509 int slop, uchar *norms) 443 510 { 444 Scorer *self = 445 phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms); 446 447 PhSc(self)->slop = slop; 511 Scorer *self = phsc_new(weight, 512 term_pos_enum, 513 positions, 514 pp_cnt, 515 similarity, 516 norms, 517 slop); 518 448 519 PhSc(self)->phrase_freq = &sphsc_phrase_freq; 449 520 return self; … … 1125 1196 } 1126 1197 } 1198 1199 
void frt_phq_set_slop(FrtQuery *self, int slop) 1200 { 1201 PhQ(self)->slop = slop; 1202 } -
c/test/test.c
r03f3cf r8f4fd0 259 259 260 260 dptr = suite->head; 261 fprintf(stdout, "%-15s\t\tTotal\tFail\tFailed %%\n", "Failed Tests"); 261 fprintf(stdout, "%-24sTotal\tFail\tFailed %%\n", "Failed Tests"); 262 262 fprintf(stdout, "===================================================\n"); 263 263 while (dptr != NULL) { … … 265 265 double percent = 266 266 ((double) dptr->failed / (double) dptr->num_test); 267 fprintf(stdout, "%-15s\t\t%5d\t%4d\t%6.2f%%\n", dptr->name, 267 fprintf(stdout, "%-24s%5d\t%4d\t%6.2f%%\n", dptr->name, 268 268 dptr->num_test, dptr->failed, percent * 100); 269 269 } … … 456 456 return true; 457 457 } 458 fprintf(stderr, "diff = %g\n", diff); 458 459 459 460 tc->failed = true; -
c/test/test_bitvector.c
r48290f rcf8e43 147 147 } 148 148 149 #define test_bveq(_bv1, _bv2) \ 150 do { \ 151 BitVector *_not_bv1, *_not_bv2; \ 152 Assert(bv_eq(_bv1, _bv2), "BitVectors are equal"); \ 153 Assert(bv_eq(_bv2, _bv1), "BitVectors are equal"); \ 154 Assert(bv_eq(_bv1, _bv1), "bv_eq on self should work"); \ 155 Aiequal(bv_hash(_bv1), bv_hash(_bv2)); \ 156 /* test flipped bitvectors */ \ 157 _not_bv1 = bv_not(_bv1); _not_bv2 = bv_not(_bv2); \ 158 bv_set(_not_bv1, 1100); /* should make no difference */ \ 159 Assert(bv_eq(_not_bv1, _not_bv2), "BitVectors are equal"); \ 160 Assert(bv_eq(_not_bv2, _not_bv1), "BitVectors are equal"); \ 161 Assert(bv_eq(_not_bv1, _not_bv1), "bv_eq on self should work"); \ 162 Aiequal(bv_hash(_not_bv1), bv_hash(_not_bv2)); \ 163 bv_destroy(_not_bv1); bv_destroy(_not_bv2); \ 164 } while (0) 165 166 #define test_bvneq(_bv1, _bv2) \ 167 do { \ 168 BitVector *_not_bv1, *_not_bv2; \ 169 Assert(!bv_eq(_bv1, _bv2), "BitVectors are not equal"); \ 170 Assert(!bv_eq(_bv2, _bv1), "BitVectors are not equal"); \ 171 Assert(bv_hash(_bv1) != bv_hash(_bv2), "BitVectors not equal"); \ 172 /* test flipped bitvectors */ \
