Changeset 8f4fd07de7bf767b30c219684608535cb9cb6f19

Show
Ignore:
Timestamp:
04/28/08 11:06:13 (8 months ago)
Author:
David Balmain <dbalmain@…>
Parents:
56424f8d1cfd0e927e5e8f9e0c4329531221a966
Children:
e37ef045516872af81e36339debad2296a2d7506
git-committer:
David Balmain <dbalmain@gmail.com> / 2008-04-28T11:06:13Z+1000
Message:

Fixed repeated term bug in sloppy PhraseQuery

Repeated terms were breaking sloppy PhraseQueries. For example, if you had the
query "one two one"~2 it would match "one two". This has now been fixed.

Location:
c
Files:
6 modified

Legend:

Unmodified
Added
Removed
  • c/include/internal.h

    ra04da2 r8f4fd0  
    738738#define phq_append_multi_term                   frt_phq_append_multi_term 
    739739#define phq_new                                 frt_phq_new 
     740#define phq_set_slop                            frt_phq_set_slop 
    740741#define pl_add_occ                              frt_pl_add_occ 
    741742#define pl_cmp                                  frt_pl_cmp 
  • c/include/search.h

    r950230 r8f4fd0  
    326326extern void frt_phq_add_term_abs(FrtQuery *self, const char *term, int position); 
    327327extern void frt_phq_append_multi_term(FrtQuery *self, const char *term); 
     328extern void frt_phq_set_slop(FrtQuery *self, int slop); 
    328329 
    329330/*************************************************************************** 
  • c/src/q_phrase.c

    r56424f r8f4fd0  
    164164    bool    first_time : 1; 
    165165    bool    more : 1; 
     166    bool    check_repeats : 1; 
    166167} PhraseScorer; 
    167168 
     
    291292} 
    292293 
    293 static Scorer *phsc_new(Weight *weight, TermDocEnum **term_pos_enum, 
     294static Scorer *phsc_new(Weight *weight, 
     295                        TermDocEnum **term_pos_enum, 
    294296                        PhrasePosition *positions, int pos_cnt, 
    295                         Similarity *similarity, uchar *norms) 
     297                        Similarity *similarity, 
     298                        uchar *norms, 
     299                        int slop) 
    296300{ 
    297301    int i; 
    298302    Scorer *self                = scorer_new(PhraseScorer, similarity); 
     303    HashSet *term_set           = NULL; 
     304 
    299305 
    300306    PhSc(self)->weight          = weight; 
     
    304310    PhSc(self)->pp_first_idx    = 0; 
    305311    PhSc(self)->pp_cnt          = pos_cnt; 
    306     PhSc(self)->slop            = 0; 
     312    PhSc(self)->slop            = slop; 
    307313    PhSc(self)->first_time      = true; 
    308314    PhSc(self)->more            = true; 
    309  
     315    PhSc(self)->check_repeats   = false; 
     316     
     317    if (slop) { 
     318        term_set = hs_new_str((free_ft)NULL); 
     319    } 
    310320    for (i = 0; i < pos_cnt; i++) { 
     321        /* check for repeats */ 
     322        if (slop && !PhSc(self)->check_repeats) { 
     323            char **terms = positions[i].terms; 
     324            const int t_cnt = ary_size(terms); 
     325            int j; 
     326            for (j = 0; j < t_cnt; j++) { 
     327                if (hs_add(term_set, terms[j])) { 
     328                    PhSc(self)->check_repeats = true; 
     329                    goto repeat_check_done; 
     330                } 
     331            } 
     332        } 
     333repeat_check_done: 
    311334        PhSc(self)->phrase_pos[i] = pp_new(term_pos_enum[i], positions[i].pos); 
     335    } 
     336 
     337    if (slop) { 
     338        hs_destroy(term_set); 
    312339    } 
    313340 
     
    372399                                       Similarity *similarity, uchar *norms) 
    373400{ 
    374     Scorer *self = 
    375         phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms); 
     401    Scorer *self = phsc_new(weight, 
     402                            term_pos_enum, 
     403                            positions, 
     404                            pp_cnt, 
     405                            similarity, 
     406                            norms, 
     407                            0); 
    376408 
    377409    PhSc(self)->phrase_freq = &ephsc_phrase_freq; 
     
    382414 * SloppyPhraseScorer 
    383415 ***************************************************************************/ 
     416 
     417static bool sphsc_check_repeats(PhPos *pp, 
     418                                PhPos **positions, 
     419                                const int p_cnt) 
     420{ 
     421    int j; 
     422    for (j = 0; j < p_cnt; j++) { 
     423        PhPos *ppj = positions[j]; 
     424        /* If offsets are equal, either we are at the current PhPos +pp+ or 
     425         * +pp+ and +ppj+ are supposed to match in the same position in which 
     426         * case we don't need to check. */ 
     427        if (ppj->offset == pp->offset) { 
     428            continue; 
     429        } 
     430        /* the two phrase positions are matching on the same term 
     431         * which we want to avoid */ 
     432        if ((ppj->position + ppj->offset) == (pp->position + pp->offset)) { 
     433            if (!pp_next_position(pp)) { 
     434                /* We have no matches for this document */ 
     435                return false; 
     436            } 
     437            /* we changed the position so we need to start check again */ 
     438            j = -1; 
     439        } 
     440    } 
     441    return true; 
     442} 
    384443 
    385444static float sphsc_phrase_freq(Scorer *self) 
     
    392451    int last_pos = 0, pos, next_pos, start, match_length, i; 
    393452    bool done = false; 
     453    bool check_repeats = phsc->check_repeats; 
    394454    float freq = 0.0; 
    395455 
    396456    for (i = 0; i < pp_cnt; i++) { 
    397457        pp = phsc->phrase_pos[i]; 
    398         pp_first_position(pp); 
     458        /* we should always have at least one position or this functions 
     459         * shouldn't have been called. */ 
     460        assert(pp_first_position(pp)); 
     461        if (check_repeats && i > 0) { 
     462            if (!sphsc_check_repeats(pp, phsc->phrase_pos, i - 1)) { 
     463                goto return_freq; 
     464            } 
     465        } 
    399466        if (pp->position > last_pos) { 
    400467            last_pos = pp->position; 
     
    409476        while (pos <= next_pos) { 
    410477            start = pos;        /* advance pp to min window */ 
    411             if (!pp_next_position(pp)) { 
    412                 done = true;    /* ran out of a positions for a term - done */ 
     478            if (!pp_next_position(pp) 
     479                || (check_repeats 
     480                    && !sphsc_check_repeats(pp, phsc->phrase_pos, pp_cnt))) { 
     481                done = true; 
    413482                break; 
    414483            } 
     
    427496        pq_push(pq, pp);        /* restore pq */ 
    428497    } while (!done); 
     498 
     499return_freq: 
    429500 
    430501    pq_destroy(pq); 
     
    438509                                        int slop, uchar *norms) 
    439510{ 
    440     Scorer *self = 
    441         phsc_new(weight, term_pos_enum, positions, pp_cnt, similarity, norms); 
    442  
    443     PhSc(self)->slop        = slop; 
     511    Scorer *self = phsc_new(weight, 
     512                            term_pos_enum, 
     513                            positions, 
     514                            pp_cnt, 
     515                            similarity, 
     516                            norms, 
     517                            slop); 
     518 
    444519    PhSc(self)->phrase_freq = &sphsc_phrase_freq; 
    445520    return self; 
     
    11211196    } 
    11221197} 
     1198 
     1199void frt_phq_set_slop(FrtQuery *self, int slop) 
     1200{ 
     1201    PhQ(self)->slop = slop; 
     1202} 
  • c/test/test.c

    r736446 r8f4fd0  
    456456        return true; 
    457457    } 
     458    fprintf(stderr, "diff = %g\n", diff); 
    458459 
    459460    tc->failed = true; 
  • c/test/test_search.c

    r56424f r8f4fd0  
    164164    {"20051001", "word1 word2 the quick brown fox the quick brown fox", 
    165165        "cat1/sub1",            "0.954"}, 
    166     {"20051002", "word1 word3", 
     166    {"20051002", "word1 word3 one two one", 
    167167        "cat1/sub1/subsub1",    "908.123434"}, 
    168     {"20051003", "word1 word3", 
     168    {"20051003", "word1 word3 one two", 
    169169        "cat1/sub2",            "3999"}, 
    170170    {"20051004", "word1 word2", 
    171171        "cat1/sub2/subsub2",    "+.3413"}, 
    172     {"20051005", "word1", 
     172    {"20051005", "word1 one two x x x x x one two", 
    173173        "cat2/sub1",            "-1.1298"}, 
    174174    {"20051006", "word1 word3", 
     
    265265    TopDocs *top_docs 
    266266        = searcher_search(searcher, query, 0, total_hits + 1, NULL, NULL, NULL); 
    267     if (!Aiequal(total_hits, top_docs->total_hits)) { 
     267    if (!tc->failed && !Aiequal(total_hits, top_docs->total_hits)) { 
    268268        int i; 
    269269        Tmsg_nf("\texpected;\n\t    "); 
     
    297297        /* only check the explanation if we got the correct docs. Obviously we 
    298298         * might want to remove this to visually check the explanations */ 
    299         if (total_hits == top_docs->total_hits) { 
     299        if (!tc->failed && total_hits == top_docs->total_hits) { 
    300300            Explanation *e = searcher_explain(searcher, query, hit->doc); 
    301301            if (! Afequal(hit->score, e->value)) { 
     
    314314    } 
    315315    td_destroy(top_docs); 
     316 
     317    /* test search_unscored method */ 
    316318    qsort(num_array, total_hits, sizeof(int), &icmp_risky); 
    317319    count = searcher_search_unscored(searcher, query, 
     
    525527    check_hits(tc, searcher, phq, "1", 1); 
    526528 
    527     ((PhraseQuery *)phq)->slop = 4; 
     529    phq_set_slop(phq, 4); 
    528530    check_hits(tc, searcher, phq, "1, 16, 17", 17); 
    529531    q_deref(phq); 
     
    536538    check_hits(tc, searcher, phq, "1, 11, 14", 14); 
    537539 
    538     ((PhraseQuery *)phq)->slop = 1; 
     540    phq_set_slop(phq, 1); 
    539541    check_hits(tc, searcher, phq, "1, 11, 14, 16", 14); 
    540542 
    541     ((PhraseQuery *)phq)->slop = 4; 
     543    phq_set_slop(phq, 4); 
    542544    check_hits(tc, searcher, phq, "1, 11, 14, 16, 17", 14); 
    543545    phq_add_term(phq, "red", -1); 
     
    568570    q_deref(phq); 
    569571 
     572    /* test repeating terms check */ 
     573    phq = phq_new(field); 
     574    phq_add_term(phq, "one", 0); 
     575    phq_add_term(phq, "two", 1); 
     576    phq_add_term(phq, "one", 1); 
     577    check_hits(tc, searcher, phq, "2", 2); 
     578    phq_set_slop(phq, 2); 
     579    check_hits(tc, searcher, phq, "2", 2); 
     580    q_deref(phq); 
     581 
    570582    phq = phq_new(I("not a field")); 
    571583    phq_add_term(phq, "the", 0); 
     
    603615    Assert(q_eq(q1, q2), "Queries should be equal"); 
    604616 
    605     ((PhraseQuery *)q2)->slop = 5; 
     617    phq_set_slop(q2, 5); 
    606618    Assert(q_hash(q1) != q_hash(q2), "Queries should not be equal"); 
    607619    Assert(!q_eq(q1, q2), "Queries should not be equal"); 
     
    657669    check_hits(tc, searcher, phq, "1, 8, 11, 14", -1); 
    658670 
    659     ((PhraseQuery *)phq)->slop = 4; 
     671    phq_set_slop(phq, 4); 
    660672    check_hits(tc, searcher, phq, "1, 8, 11, 14, 16, 17", -1); 
    661673    check_to_s(tc, phq, NULL, "field:\"quick|fast brown|red|hairy fox\"~4"); 
     
    674686    q_deref(phq); 
    675687 
     688    /* test repeating terms check */ 
     689    phq = phq_new(field); 
     690    phq_add_term(phq, "WORD3", 0); 
     691    phq_append_multi_term(phq, "x"); 
     692    phq_add_term(phq, "one", 0); 
     693    phq_add_term(phq, "two", 1); 
     694    phq_add_term(phq, "one", 1); 
     695    check_hits(tc, searcher, phq, "2", -1); 
     696    check_to_s(tc, phq, NULL, "field:\"WORD3|x&one two one\""); 
     697 
     698    phq_set_slop(phq, 4); 
     699    check_hits(tc, searcher, phq, "2", -1); 
     700    check_to_s(tc, phq, NULL, "field:\"WORD3|x&one two one\"~4"); 
     701    q_deref(phq); 
     702 
     703    /* test phrase query on non-existing field doesn't break anything */ 
    676704    phq = phq_new(I("not a field")); 
    677705    phq_add_term(phq, "the", 0); 
     
    717745    Assert(q_eq(q1, q2), "Queries should be equal"); 
    718746 
    719     ((PhraseQuery *)q2)->slop = 5; 
     747    phq_set_slop(q2, 5); 
    720748    Assert(q_hash(q1) != q_hash(q2), "Queries should not be equal"); 
    721749    Assert(!q_eq(q1, q2), "Queries should not be equal");