Changeset 8f4fd07de7bf767b30c219684608535cb9cb6f19
- Timestamp:
- 04/28/08 11:06:13 (8 months ago)
- Author:
- David Balmain <dbalmain@…>
- Parents:
- 56424f8d1cfd0e927e5e8f9e0c4329531221a966
- Children:
- e37ef045516872af81e36339debad2296a2d7506
- git-committer:
- David Balmain <dbalmain@gmail.com> / 2008-04-28T11:06:13Z+1000
- Message:
-
Fixed repeated term bug in sloppy PhraseQuery
Repeated terms were breaking sloppy PhraseQueries. For example, the query
"one two one"~2 would incorrectly match a document containing only "one two". This has now been fixed.
- Location:
- c
- Files:
-
Legend:
- Unmodified
- Added
- Removed
-
|
ra04da2
|
r8f4fd0
|
|
| 738 | 738 | #define phq_append_multi_term frt_phq_append_multi_term |
| 739 | 739 | #define phq_new frt_phq_new |
| | 740 | #define phq_set_slop frt_phq_set_slop |
| 740 | 741 | #define pl_add_occ frt_pl_add_occ |
| 741 | 742 | #define pl_cmp frt_pl_cmp |
-
|
r950230
|
r8f4fd0
|
|
| 326 | 326 | extern void frt_phq_add_term_abs(FrtQuery *self, const char *term, int position); |
| 327 | 327 | extern void frt_phq_append_multi_term(FrtQuery *self, const char *term); |
| | 328 | extern void frt_phq_set_slop(FrtQuery *self, int slop); |
| 328 | 329 | |
| 329 | 330 | /*************************************************************************** |
-
|
r56424f
|
r8f4fd0
|
|
| 164 | 164 | bool first_time : 1; |
| 165 | 165 | bool more : 1; |
| | 166 | bool check_repeats : 1; |
| 166 | 167 | } PhraseScorer; |
| 167 | 168 | |
| … |
… |
|
| 291 | 292 | } |
| 292 | 293 | |
| 293 | | static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum, |
| | 294 | static Scorer *phsc_new(Weight *weight, |
| | 295 | TermDocEnum **term_pos_enum, |
| 294 | 296 | PhrasePosition *positions, int pos_cnt, |
| 295 | | Similarity *similarity, uchar *norms) |
| | 297 | Similarity *similarity, |
| | 298 | uchar *norms, |
| | 299 | int slop) |
| 296 | 300 | { |
| 297 | 301 | int i; |
| 298 | 302 | Scorer *self = scorer_new(PhraseScorer, similarity); |
| | 303 | HashSet *term_set = NULL; |
| | 304 | |
| 299 | 305 | |
| 300 | 306 | PhSc(self)->weight = weight; |
| … |
… |
|
| 304 | 310 | PhSc(self)->pp_first_idx = 0; |
| 305 | 311 | PhSc(self)->pp_cnt = pos_cnt; |
| 306 | | PhSc(self)->slop = 0; |
| | 312 | PhSc(self)->slop = slop; |
| 307 | 313 | PhSc(self)->first_time = true; |
| 308 | 314 | PhSc(self)->more = true; |
| 309 | | |
| | 315 | PhSc(self)->check_repeats = false; |
| | 316 | |
| | 317 | if (slop) { |
| | 318 | term_set = hs_new_str((free_ft)NULL); |
| | 319 | } |
| 310 | 320 | for (i = 0; i < pos_cnt; i++) { |
| | 321 | /* check for repeats */ |
| | 322 | if (slop && !PhSc(self)->check_repeats) { |
| | 323 | char **terms = positions[i].terms; |
| | 324 | const int t_cnt = ary_size(terms); |
| | 325 | int j; |
| | 326 | for (j = 0; j < t_cnt; j++) { |
| | 327 | if (hs_add(term_set, terms[j])) { |
| | 328 | PhSc(self)->check_repeats = true; |
| | 329 | goto repeat_check_done; |
| | 330 | } |
| | 331 | } |
| | 332 | } |
| | 333 | repeat_check_done: |
| 311 | 334 | PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos); |
| | 335 | } |
| | 336 | |
| | 337 | if (slop) { |
| | 338 | hs_destroy(term_set); |
| 312 | 339 | } |
| 313 | 340 | |
| … |
… |
|
| 372 | 399 | Similarity *similarity, uchar *norms) |
| 373 | 400 | { |
| 374 | | Scorer *self = |
| 375 | | phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms); |
| | 401 | Scorer *self = phsc_new(weight, |
| | 402 | term_pos_enum, |
| | 403 | positions, |
| | 404 | pp_cnt, |
| | 405 | similarity, |
| | 406 | norms, |
| | 407 | 0); |
| 376 | 408 | |
| 377 | 409 | PhSc(self)->phrase_freq = &ephsc_phrase_freq; |
| … |
… |
|
| 382 | 414 | * SloppyPhraseScorer |
| 383 | 415 | ***************************************************************************/ |
| | 416 | |
| | 417 | static bool sphsc_check_repeats(PhPos *pp, |
| | 418 | PhPos **positions, |
| | 419 | const int p_cnt) |
| | 420 | { |
| | 421 | int j; |
| | 422 | for (j = 0; j < p_cnt; j++) { |
| | 423 | PhPos *ppj = positions[j]; |
| | 424 | /* If offsets are equal, either we are at the current PhPos +pp+ or |
| | 425 | * +pp+ and +ppj+ are supposed to match in the same position in which |
| | 426 | * case we don't need to check. */ |
| | 427 | if (ppj->offset == pp->offset) { |
| | 428 | continue; |
| | 429 | } |
| | 430 | /* the two phrase positions are matching on the same term |
| | 431 | * which we want to avoid */ |
| | 432 | if ((ppj->position + ppj->offset) == (pp->position + pp->offset)) { |
| | 433 | if (!pp_next_position(pp)) { |
| | 434 | /* We have no matches for this document */ |
| | 435 | return false; |
| | 436 | } |
| | 437 | /* we changed the position so we need to start check again */ |
| | 438 | j = -1; |
| | 439 | } |
| | 440 | } |
| | 441 | return true; |
| | 442 | } |
| 384 | 443 | |
| 385 | 444 | static float sphsc_phrase_freq(Scorer *self) |
| … |
… |
|
| 392 | 451 | int last_pos = 0, pos, next_pos, start, match_length, i; |
| 393 | 452 | bool done = false; |
| | 453 | bool check_repeats = phsc->check_repeats; |
| 394 | 454 | float freq = 0.0; |
| 395 | 455 | |
| 396 | 456 | for (i = 0; i < pp_cnt; i++) { |
| 397 | 457 | pp = phsc->phrase_pos[i]; |
| 398 | | pp_first_position(pp); |
| | 458 | /* we should always have at least one position or this functions |
| | 459 | * shouldn't have been called. */ |
| | 460 | assert(pp_first_position(pp)); |
| | 461 | if (check_repeats && i > 0) { |
| | 462 | if (!sphsc_check_repeats(pp, phsc->phrase_pos, i - 1)) { |
| | 463 | goto return_freq; |
| | 464 | } |
| | 465 | } |
| 399 | 466 | if (pp->position > last_pos) { |
| 400 | 467 | last_pos = pp->position; |
| … |
… |
|
| 409 | 476 | while (pos <= next_pos) { |
| 410 | 477 | start = pos; /* advance pp to min window */ |
| 411 | | if (!pp_next_position(pp)) { |
| 412 | | done = true; /* ran out of a positions for a term - done */ |
| | 478 | if (!pp_next_position(pp) |
| | 479 | || (check_repeats |
| | 480 | && !sphsc_check_repeats(pp, phsc->phrase_pos, pp_cnt))) { |
| | 481 | done = true; |
| 413 | 482 | break; |
| 414 | 483 | } |
| … |
… |
|
| 427 | 496 | pq_push(pq, pp); /* restore pq */ |
| 428 | 497 | } while (!done); |
| | 498 | |
| | 499 | return_freq: |
| 429 | 500 | |
| 430 | 501 | pq_destroy(pq); |
| … |
… |
|
| 438 | 509 | int slop, uchar *norms) |
| 439 | 510 | { |
| 440 | | Scorer *self = |
| 441 | | phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms); |
| 442 | | |
| 443 | | PhSc(self)->slop = slop; |
| | 511 | Scorer *self = phsc_new(weight, |
| | 512 | term_pos_enum, |
| | 513 | positions, |
| | 514 | pp_cnt, |
| | 515 | similarity, |
| | 516 | norms, |
| | 517 | slop); |
| | 518 | |
| 444 | 519 | PhSc(self)->phrase_freq = &sphsc_phrase_freq; |
| 445 | 520 | return self; |
| … |
… |
|
| 1121 | 1196 | } |
| 1122 | 1197 | } |
| | 1198 | |
| | 1199 | void frt_phq_set_slop(FrtQuery *self, int slop) |
| | 1200 | { |
| | 1201 | PhQ(self)->slop = slop; |
| | 1202 | } |
-
|
r736446
|
r8f4fd0
|
|
| 456 | 456 | return true; |
| 457 | 457 | } |
| | 458 | fprintf(stderr, "diff = %g\n", diff); |
| 458 | 459 | |
| 459 | 460 | tc->failed = true; |
-
|
r56424f
|
r8f4fd0
|
|
| 164 | 164 | {"20051001", "word1 word2 the quick brown fox the quick brown fox", |
| 165 | 165 | "cat1/sub1", "0.954"}, |
| 166 | | {"20051002", "word1 word3", |
| | 166 | {"20051002", "word1 word3 one two one", |
| 167 | 167 | "cat1/sub1/subsub1", "908.123434"}, |
| 168 | | {"20051003", "word1 word3", |
| | 168 | {"20051003", "word1 word3 one two", |
| 169 | 169 | "cat1/sub2", "3999"}, |
| 170 | 170 | {"20051004", "word1 word2", |
| 171 | 171 | "cat1/sub2/subsub2", "+.3413"}, |
| 172 | | {"20051005", "word1", |
| | 172 | {"20051005", "word1 one two x x x x x one two", |
| 173 | 173 | "cat2/sub1", "-1.1298"}, |
| 174 | 174 | {"20051006", "word1 word3", |
| … |
… |
|
| 265 | 265 | TopDocs *top_docs |
| 266 | 266 | = searcher_search(searcher, query, 0, total_hits + 1, NULL, NULL, NULL); |
| 267 | | if (!Aiequal(total_hits, top_docs->total_hits)) { |
| | 267 | if (!tc->failed && !Aiequal(total_hits, top_docs->total_hits)) { |
| 268 | 268 | int i; |
| 269 | 269 | Tmsg_nf("\texpected;\n\t "); |
| … |
… |
|
| 297 | 297 | /* only check the explanation if we got the correct docs. Obviously we |
| 298 | 298 | * might want to remove this to visually check the explanations */ |
| 299 | | if (total_hits == top_docs->total_hits) { |
| | 299 | if (!tc->failed && total_hits == top_docs->total_hits) { |
| 300 | 300 | Explanation *e = searcher_explain(searcher, query, hit->doc); |
| 301 | 301 | if (! Afequal(hit->score, e->value)) { |
| … |
… |
|
| 314 | 314 | } |
| 315 | 315 | td_destroy(top_docs); |
| | 316 | |
| | 317 | /* test search_unscored method */ |
| 316 | 318 | qsort(num_array, total_hits, sizeof(int), &icmp_risky); |
| 317 | 319 | count = searcher_search_unscored(searcher, query, |
| … |
… |
|
| 525 | 527 | check_hits(tc, searcher, phq, "1", 1); |
| 526 | 528 | |
| 527 | | ((PhraseQuery *)phq)->slop = 4; |
| | 529 | phq_set_slop(phq, 4); |
| 528 | 530 | check_hits(tc, searcher, phq, "1, 16, 17", 17); |
| 529 | 531 | q_deref(phq); |
| … |
… |
|
| 536 | 538 | check_hits(tc, searcher, phq, "1, 11, 14", 14); |
| 537 | 539 | |
| 538 | | ((PhraseQuery *)phq)->slop = 1; |
| | 540 | phq_set_slop(phq, 1); |
| 539 | 541 | check_hits(tc, searcher, phq, "1, 11, 14, 16", 14); |
| 540 | 542 | |
| 541 | | ((PhraseQuery *)phq)->slop = 4; |
| | 543 | phq_set_slop(phq, 4); |
| 542 | 544 | check_hits(tc, searcher, phq, "1, 11, 14, 16, 17", 14); |
| 543 | 545 | phq_add_term(phq, "red", -1); |
| … |
… |
|
| 568 | 570 | q_deref(phq); |
| 569 | 571 | |
| | 572 | /* test repeating terms check */ |
| | 573 | phq = phq_new(field); |
| | 574 | phq_add_term(phq, "one", 0); |
| | 575 | phq_add_term(phq, "two", 1); |
| | 576 | phq_add_term(phq, "one", 1); |
| | 577 | check_hits(tc, searcher, phq, "2", 2); |
| | 578 | phq_set_slop(phq, 2); |
| | 579 | check_hits(tc, searcher, phq, "2", 2); |
| | 580 | q_deref(phq); |
| | 581 | |
| 570 | 582 | phq = phq_new(I("not a field")); |
| 571 | 583 | phq_add_term(phq, "the", 0); |
| … |
… |
|
| 603 | 615 | Assert(q_eq(q1, q2), "Queries should be equal"); |
| 604 | 616 | |
| 605 | | ((PhraseQuery *)q2)->slop = 5; |
| | 617 | phq_set_slop(q2, 5); |
| 606 | 618 | Assert(q_hash(q1) != q_hash(q2), "Queries should not be equal"); |
| 607 | 619 | Assert(!q_eq(q1, q2), "Queries should not be equal"); |
| … |
… |
|
| 657 | 669 | check_hits(tc, searcher, phq, "1, 8, 11, 14", -1); |
| 658 | 670 | |
| 659 | | ((PhraseQuery *)phq)->slop = 4; |
| | 671 | phq_set_slop(phq, 4); |
| 660 | 672 | check_hits(tc, searcher, phq, "1, 8, 11, 14, 16, 17", -1); |
| 661 | 673 | check_to_s(tc, phq, NULL, "field:\"quick|fast brown|red|hairy fox\"~4"); |
| … |
… |
|
| 674 | 686 | q_deref(phq); |
| 675 | 687 | |
| | 688 | /* test repeating terms check */ |
| | 689 | phq = phq_new(field); |
| | 690 | phq_add_term(phq, "WORD3", 0); |
| | 691 | phq_append_multi_term(phq, "x"); |
| | 692 | phq_add_term(phq, "one", 0); |
| | 693 | phq_add_term(phq, "two", 1); |
| | 694 | phq_add_term(phq, "one", 1); |
| | 695 | check_hits(tc, searcher, phq, "2", -1); |
| | 696 | check_to_s(tc, phq, NULL, "field:\"WORD3|x&one two one\""); |
| | 697 | |
| | 698 | phq_set_slop(phq, 4); |
| | 699 | check_hits(tc, searcher, phq, "2", -1); |
| | 700 | check_to_s(tc, phq, NULL, "field:\"WORD3|x&one two one\"~4"); |
| | 701 | q_deref(phq); |
| | 702 | |
| | 703 | /* test phrase query on non-existing field doesn't break anything */ |
| 676 | 704 | phq = phq_new(I("not a field")); |
| 677 | 705 | phq_add_term(phq, "the", 0); |
| … |
… |
|
| 717 | 745 | Assert(q_eq(q1, q2), "Queries should be equal"); |
| 718 | 746 | |
| 719 | | ((PhraseQuery *)q2)->slop = 5; |
| | 747 | phq_set_slop(q2, 5); |
| 720 | 748 | Assert(q_hash(q1) != q_hash(q2), "Queries should not be equal"); |
| 721 | 749 | Assert(!q_eq(q1, q2), "Queries should not be equal"); |