Changeset cafd8d281bdf982e2ac0b84d57dfbf9fed46603c

Show
Ignore:
Timestamp:
04/22/08 00:10:15 (9 months ago)
Author:
dave <dave@…>
Parents:
8c94483694f5774d39acb4072885476c45d9b571
Children:
c929863ac8839608b81f31d8c7877cb11792961e, 83fedc095fb017076709f408095674cd9fd05ecc, c93e31a2e50bb75c511e0b1553d7311ec0335cce
git-committer:
dave <dave@06fd6eb0-0002-0410-a719-e5602cce40bc> / 2008-04-21T14:10:15Z+0000
Message:

Started detailed documentation of the QueryParser

In writing the documentation I've spotted a pretty serious bug in the
QueryParser. Fields need to be pushed onto a stack as they are specified,
otherwise the parser is quite broken for multi-level, multi-field queries. I'll
fix this first thing tomorrow before finishing this documentation. Comments on
the documentation are welcome.

git-svn-id: svn+ssh://davebalmain.com/home/dave/repos/ferret/trunk@1031 06fd6eb0-0002-0410-a719-e5602cce40bc

Files:
4 modified

Legend:

Unmodified
Added
Removed
  • TODO

    r488aac rcafd8d  
    2020    useful for storing field names so that no objects need to strdup the 
    2121    field-names but can just store the symbol representative instead. 
     22    + this has been done but it can be improved using actual Symbol structs 
     23      instead of plain char* 
    2224  - Make threading optional at compile time 
    2325  - to_json should limit output to prevent memory overflow on large indexes. 
     
    3436  - Auto-loading of documents during search. ie actual documents get returned 
    3537    instead of document numbers. 
    36   - update benchmark suite to use getrusage.u 
    3738 
    3839* Ruby bindings 
     
    106107  + Working Query:  field1:value1 AND NOT field2:value2 
    107108  + Failing Query:    field1:value1 AND ( NOT field2:value2 ) 
    108  
     109* update benchmark suite to use getrusage 
  • c/TODO

    r48290f rcafd8d  
    44 
    55benchmarks 
    6 * string actions when length is known 
    76* standard tokenizer 
    8 * writevint 
  • c/src/q_parser.c

    r553474 rcafd8d  
    9999 
    100100/* Copy the first part of user declarations.  */ 
    101 #line 1 "src/q_parser.y" 
     101#line 3 "src/q_parser.y" 
    102102 
    103103#include <string.h> 
     
    149149#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED 
    150150typedef union YYSTYPE 
    151 #line 29 "src/q_parser.y" 
     151#line 31 "src/q_parser.y" 
    152152{ 
    153153    Query *query; 
     
    169169 
    170170/* Copy the second part of user declarations.  */ 
    171 #line 37 "src/q_parser.y" 
     171#line 39 "src/q_parser.y" 
    172172 
    173173static int yylex(YYSTYPE *lvalp, QParser *qp); 
     
    549549static const yytype_uint8 yyrline[] = 
    550550{ 
    551        0,   131,   131,   132,   134,   135,   136,   137,   139,   140, 
    552      141,   143,   144,   146,   147,   148,   149,   150,   151,   152, 
    553      154,   155,   156,   158,   160,   160,   162,   162,   162,   165, 
    554      166,   168,   169,   170,   171,   173,   174,   175,   176,   177, 
    555      179,   180,   181,   182,   183,   184,   185,   186,   187,   188, 
    556      189,   190 
     551       0,   133,   133,   134,   136,   137,   138,   139,   141,   142, 
     552     143,   145,   146,   148,   149,   150,   151,   152,   153,   154, 
     553     156,   157,   158,   160,   162,   162,   164,   164,   164,   167, 
     554     168,   170,   171,   172,   173,   175,   176,   177,   178,   179, 
     555     181,   182,   183,   184,   185,   186,   187,   188,   189,   190, 
     556     191,   192 
    557557}; 
    558558#endif 
     
    12131213    { 
    12141214      case 27: /* "bool_q" */ 
    1215 #line 126 "src/q_parser.y" 
     1215#line 128 "src/q_parser.y" 
    12161216        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12171217#line 1218 "src/q_parser.c" 
    12181218        break; 
    12191219      case 28: /* "bool_clss" */ 
    1220 #line 128 "src/q_parser.y" 
     1220#line 130 "src/q_parser.y" 
    12211221        { if ((yyvaluep->bclss) && qp->destruct) bca_destroy((yyvaluep->bclss)); }; 
    12221222#line 1223 "src/q_parser.c" 
    12231223        break; 
    12241224      case 29: /* "bool_cls" */ 
    1225 #line 127 "src/q_parser.y" 
     1225#line 129 "src/q_parser.y" 
    12261226        { if ((yyvaluep->bcls) && qp->destruct) bc_deref((yyvaluep->bcls)); }; 
    12271227#line 1228 "src/q_parser.c" 
    12281228        break; 
    12291229      case 30: /* "boosted_q" */ 
    1230 #line 126 "src/q_parser.y" 
     1230#line 128 "src/q_parser.y" 
    12311231        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12321232#line 1233 "src/q_parser.c" 
    12331233        break; 
    12341234      case 31: /* "q" */ 
    1235 #line 126 "src/q_parser.y" 
     1235#line 128 "src/q_parser.y" 
    12361236        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12371237#line 1238 "src/q_parser.c" 
    12381238        break; 
    12391239      case 32: /* "term_q" */ 
    1240 #line 126 "src/q_parser.y" 
     1240#line 128 "src/q_parser.y" 
    12411241        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12421242#line 1243 "src/q_parser.c" 
    12431243        break; 
    12441244      case 33: /* "wild_q" */ 
    1245 #line 126 "src/q_parser.y" 
     1245#line 128 "src/q_parser.y" 
    12461246        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12471247#line 1248 "src/q_parser.c" 
    12481248        break; 
    12491249      case 34: /* "field_q" */ 
    1250 #line 126 "src/q_parser.y" 
     1250#line 128 "src/q_parser.y" 
    12511251        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12521252#line 1253 "src/q_parser.c" 
    12531253        break; 
    12541254      case 39: /* "phrase_q" */ 
    1255 #line 126 "src/q_parser.y" 
     1255#line 128 "src/q_parser.y" 
    12561256        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12571257#line 1258 "src/q_parser.c" 
    12581258        break; 
    12591259      case 40: /* "ph_words" */ 
    1260 #line 129 "src/q_parser.y" 
     1260#line 131 "src/q_parser.y" 
    12611261        { if ((yyvaluep->phrase) && qp->destruct) ph_destroy((yyvaluep->phrase)); }; 
    12621262#line 1263 "src/q_parser.c" 
    12631263        break; 
    12641264      case 41: /* "range_q" */ 
    1265 #line 126 "src/q_parser.y" 
     1265#line 128 "src/q_parser.y" 
    12661266        { if ((yyvaluep->query) && qp->destruct) q_deref((yyvaluep->query)); }; 
    12671267#line 1268 "src/q_parser.c" 
     
    15741574    { 
    15751575        case 2: 
    1576 #line 131 "src/q_parser.y" 
     1576#line 133 "src/q_parser.y" 
    15771577    {   qp->result = (yyval.query) = NULL; } 
    15781578    break; 
    15791579 
    15801580  case 3: 
    1581 #line 132 "src/q_parser.y" 
     1581#line 134 "src/q_parser.y" 
    15821582    { T qp->result = (yyval.query) = get_bool_q((yyvsp[(1) - (1)].bclss)); E } 
    15831583    break; 
    15841584 
    15851585  case 4: 
    1586 #line 134 "src/q_parser.y" 
     1586#line 136 "src/q_parser.y" 
    15871587    { T (yyval.bclss) = first_cls((yyvsp[(1) - (1)].bcls)); E } 
    15881588    break; 
    15891589 
    15901590  case 5: 
    1591 #line 135 "src/q_parser.y" 
     1591#line 137 "src/q_parser.y" 
    15921592    { T (yyval.bclss) = add_and_cls((yyvsp[(1) - (3)].bclss), (yyvsp[(3) - (3)].bcls)); E } 
    15931593    break; 
    15941594 
    15951595  case 6: 
    1596 #line 136 "src/q_parser.y" 
     1596#line 138 "src/q_parser.y" 
    15971597    { T (yyval.bclss) = add_or_cls((yyvsp[(1) - (3)].bclss), (yyvsp[(3) - (3)].bcls)); E } 
    15981598    break; 
    15991599 
    16001600  case 7: 
    1601 #line 137 "src/q_parser.y" 
     1601#line 139 "src/q_parser.y" 
    16021602    { T (yyval.bclss) = add_default_cls(qp, (yyvsp[(1) - (2)].bclss), (yyvsp[(2) - (2)].bcls)); E } 
    16031603    break; 
    16041604 
    16051605  case 8: 
    1606 #line 139 "src/q_parser.y" 
     1606#line 141 "src/q_parser.y" 
    16071607    { T (yyval.bcls) = get_bool_cls((yyvsp[(2) - (2)].query), BC_MUST); E } 
    16081608    break; 
    16091609 
    16101610  case 9: 
    1611 #line 140 "src/q_parser.y" 
     1611#line 142 "src/q_parser.y" 
    16121612    { T (yyval.bcls) = get_bool_cls((yyvsp[(2) - (2)].query), BC_MUST_NOT); E } 
    16131613    break; 
    16141614 
    16151615  case 10: 
    1616 #line 141 "src/q_parser.y" 
     1616#line 143 "src/q_parser.y" 
    16171617    { T (yyval.bcls) = get_bool_cls((yyvsp[(1) - (1)].query), BC_SHOULD); E } 
    16181618    break; 
    16191619 
    16201620  case 12: 
    1621 #line 144 "src/q_parser.y" 
     1621#line 146 "src/q_parser.y" 
    16221622    { T if ((yyvsp[(1) - (3)].query)) sscanf((yyvsp[(3) - (3)].str),"%f",&((yyvsp[(1) - (3)].query)->boost));  (yyval.query)=(yyvsp[(1) - (3)].query); E } 
    16231623    break; 
    16241624 
    16251625  case 14: 
    1626 #line 147 "src/q_parser.y" 
     1626#line 149 "src/q_parser.y" 
    16271627    { T (yyval.query) = bq_new_max(true, qp->max_clauses); E } 
    16281628    break; 
    16291629 
    16301630  case 15: 
    1631 #line 148 "src/q_parser.y" 
     1631#line 150 "src/q_parser.y" 
    16321632    { T (yyval.query) = get_bool_q((yyvsp[(2) - (3)].bclss)); E } 
    16331633    break; 
    16341634 
    16351635  case 20: 
    1636 #line 154 "src/q_parser.y" 
     1636#line 156 "src/q_parser.y" 
    16371637    { FLDS((yyval.query), get_term_q(qp, field, (yyvsp[(1) - (1)].str))); Y} 
    16381638    break; 
    16391639 
    16401640  case 21: 
    1641 #line 155 "src/q_parser.y" 
     1641#line 157 "src/q_parser.y" 
    16421642    { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[(1) - (3)].str), (yyvsp[(3) - (3)].str))); Y} 
    16431643    break; 
    16441644 
    16451645  case 22: 
    1646 #line 156 "src/q_parser.y" 
     1646#line 158 "src/q_parser.y" 
    16471647    { FLDS((yyval.query), get_fuzzy_q(qp, field, (yyvsp[(1) - (2)].str), NULL)); Y} 
    16481648    break; 
    16491649 
    16501650  case 23: 
    1651 #line 158 "src/q_parser.y" 
     1651#line 160 "src/q_parser.y" 
    16521652    { FLDS((yyval.query), get_wild_q(qp, field, (yyvsp[(1) - (1)].str))); Y} 
    16531653    break; 
    16541654 
    16551655  case 24: 
    1656 #line 160 "src/q_parser.y" 
     1656#line 162 "src/q_parser.y" 
    16571657    { qp->fields = qp->def_fields; } 
    16581658    break; 
    16591659 
    16601660  case 25: 
    1661 #line 161 "src/q_parser.y" 
     1661#line 163 "src/q_parser.y" 
    16621662    { (yyval.query) = (yyvsp[(3) - (4)].query); } 
    16631663    break; 
    16641664 
    16651665  case 26: 
    1666 #line 162 "src/q_parser.y" 
     1666#line 164 "src/q_parser.y" 
    16671667    { qp->fields = qp->all_fields; } 
    16681668    break; 
    16691669 
    16701670  case 27: 
    1671 #line 162 "src/q_parser.y" 
     1671#line 164 "src/q_parser.y" 
    16721672    {qp->fields = qp->def_fields;} 
    16731673    break; 
    16741674 
    16751675  case 28: 
    1676 #line 163 "src/q_parser.y" 
     1676#line 165 "src/q_parser.y" 
    16771677    { (yyval.query) = (yyvsp[(4) - (5)].query); } 
    16781678    break; 
    16791679 
    16801680  case 29: 
    1681 #line 165 "src/q_parser.y" 
     1681#line 167 "src/q_parser.y" 
    16821682    { (yyval.hashset) = first_field(qp, (yyvsp[(1) - (1)].str)); } 
    16831683    break; 
    16841684 
    16851685  case 30: 
    1686 #line 166 "src/q_parser.y" 
     1686#line 168 "src/q_parser.y" 
    16871687    { (yyval.hashset) = add_field(qp, (yyvsp[(3) - (3)].str));} 
    16881688    break; 
    16891689 
    16901690  case 31: 
    1691 #line 168 "src/q_parser.y" 
     1691#line 170 "src/q_parser.y" 
    16921692    { (yyval.query) = get_phrase_q(qp, (yyvsp[(2) - (3)].phrase), NULL); } 
    16931693    break; 
    16941694 
    16951695  case 32: 
    1696 #line 169 "src/q_parser.y" 
     1696#line 171 "src/q_parser.y" 
    16971697    { (yyval.query) = get_phrase_q(qp, (yyvsp[(2) - (5)].phrase), (yyvsp[(5) - (5)].str)); } 
    16981698    break; 
    16991699 
    17001700  case 33: 
    1701 #line 170 "src/q_parser.y" 
     1701#line 172 "src/q_parser.y" 
    17021702    { (yyval.query) = NULL; } 
    17031703    break; 
    17041704 
    17051705  case 34: 
    1706 #line 171 "src/q_parser.y" 
     1706#line 173 "src/q_parser.y" 
    17071707    { (yyval.query) = NULL; (void)(yyvsp[(4) - (4)].str);} 
    17081708    break; 
    17091709 
    17101710  case 35: 
    1711 #line 173 "src/q_parser.y" 
     1711#line 175 "src/q_parser.y" 
    17121712    { (yyval.phrase) = ph_first_word((yyvsp[(1) - (1)].str)); } 
    17131713    break; 
    17141714 
    17151715  case 36: 
    1716 #line 174 "src/q_parser.y" 
     1716#line 176 "src/q_parser.y" 
    17171717    { (yyval.phrase) = ph_first_word(NULL); } 
    17181718    break; 
    17191719 
    17201720  case 37: 
    1721 #line 175 "src/q_parser.y" 
     1721#line 177 "src/q_parser.y" 
    17221722    { (yyval.phrase) = ph_add_word((yyvsp[(1) - (2)].phrase), (yyvsp[(2) - (2)].str)); } 
    17231723    break; 
    17241724 
    17251725  case 38: 
    1726 #line 176 "src/q_parser.y" 
     1726#line 178 "src/q_parser.y" 
    17271727    { (yyval.phrase) = ph_add_word((yyvsp[(1) - (3)].phrase), NULL); } 
    17281728    break; 
    17291729 
    17301730  case 39: 
    1731 #line 177 "src/q_parser.y" 
     1731#line 179 "src/q_parser.y" 
    17321732    { (yyval.phrase) = ph_add_multi_word((yyvsp[(1) - (3)].phrase), (yyvsp[(3) - (3)].str));  } 
    17331733    break; 
    17341734 
    17351735  case 40: 
    1736 #line 179 "src/q_parser.y" 
     1736#line 181 "src/q_parser.y" 
    17371737    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str),  (yyvsp[(3) - (4)].str),  true,  true)); Y} 
    17381738    break; 
    17391739 
    17401740  case 41: 
    1741 #line 180 "src/q_parser.y" 
     1741#line 182 "src/q_parser.y" 
    17421742    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str),  (yyvsp[(3) - (4)].str),  true,  false)); Y} 
    17431743    break; 
    17441744 
    17451745  case 42: 
    1746 #line 181 "src/q_parser.y" 
     1746#line 183 "src/q_parser.y" 
    17471747    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str),  (yyvsp[(3) - (4)].str),  false, true)); Y} 
    17481748    break; 
    17491749 
    17501750  case 43: 
    1751 #line 182 "src/q_parser.y" 
     1751#line 184 "src/q_parser.y" 
    17521752    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (4)].str),  (yyvsp[(3) - (4)].str),  false, false)); Y} 
    17531753    break; 
    17541754 
    17551755  case 44: 
    1756 #line 183 "src/q_parser.y" 
     1756#line 185 "src/q_parser.y" 
    17571757    { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(2) - (3)].str),  false, false)); Y} 
    17581758    break; 
    17591759 
    17601760  case 45: 
    1761 #line 184 "src/q_parser.y" 
     1761#line 186 "src/q_parser.y" 
    17621762    { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(2) - (3)].str),  false, true)); Y} 
    17631763    break; 
    17641764 
    17651765  case 46: 
    1766 #line 185 "src/q_parser.y" 
     1766#line 187 "src/q_parser.y" 
    17671767    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (3)].str),  NULL,true,  false)); Y} 
    17681768    break; 
    17691769 
    17701770  case 47: 
    1771 #line 186 "src/q_parser.y" 
     1771#line 188 "src/q_parser.y" 
    17721772    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (3)].str),  NULL,false, false)); Y} 
    17731773    break; 
    17741774 
    17751775  case 48: 
    1776 #line 187 "src/q_parser.y" 
     1776#line 189 "src/q_parser.y" 
    17771777    { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(2) - (2)].str),  false, false)); Y} 
    17781778    break; 
    17791779 
    17801780  case 49: 
    1781 #line 188 "src/q_parser.y" 
     1781#line 190 "src/q_parser.y" 
    17821782    { FLDS((yyval.query), get_r_q(qp, field, NULL,(yyvsp[(3) - (3)].str),  false, true)); Y} 
    17831783    break; 
    17841784 
    17851785  case 50: 
    1786 #line 189 "src/q_parser.y" 
     1786#line 191 "src/q_parser.y" 
    17871787    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(3) - (3)].str),  NULL,true,  false)); Y} 
    17881788    break; 
    17891789 
    17901790  case 51: 
    1791 #line 190 "src/q_parser.y" 
     1791#line 192 "src/q_parser.y" 
    17921792    { FLDS((yyval.query), get_r_q(qp, field, (yyvsp[(2) - (2)].str),  NULL,false, false)); Y} 
    17931793    break; 
     
    20092009 
    20102010 
    2011 #line 192 "src/q_parser.y" 
     2011#line 194 "src/q_parser.y" 
    20122012 
    20132013 
  • c/src/q_parser.y

    r553474 rcafd8d  
     1/***************************************************************************** 
     2 * QueryParser 
     3 * =========== 
     4 * 
     5 * Synopsis 
     6 * -------- 
     7 * 
     8 * === qp_parse 
     9 * 
     10 * The main QueryParser method is +qp_parse+. It gets called with the query 
     11 * string. The first thing it does is to clean the query string if 
     12 * ((QueryParser *)self)->clean_str is set to true. The cleaning is done with 
     13 * the qp_clean_str function. 
     14 *  
     15 * It then calls the yacc parser which will set self->result to the parsed 
     16 * query. If parsing fails in any way, self->result should be set to NULL, in 
     17 * which case qp_parse does one of two things, depending on the value of 
     18 * self->handle_parse_errors. If it is set to true, qp_parse attempts to do a 
     19 * very basic parsing of the query by ignoring all special characters and 
     20 * parsing the query as a plain boolean query. If it is set to false, qp_parse 
     21 * will raise a PARSE_ERROR. 
     22 *  
     23 * === The Lexer 
     24 * 
     25 * yylex is the lexing method called by the QueryParser. It breaks the query 
     26 * up into special characters ( "&:()[]{}!\"~^|<>=*?+-" ) and tokens (QWRD, 
     27 * WILD_STR, AND['AND', '&&'], OR['OR', '||'], REQ['REQ', '+'], NOT['NOT', 
     28 * '-', '~']). QWRD tokens are query word tokens which are made up of 
     29 * characters other than the special characters. They can also contain special 
     30 * characters when escaped with a backslash '\'. WILD_STR is the same as QWRD 
     31 * except that it may also contain '?' and '*' characters. 
     32 * 
     33 * === The Parser 
     34 * 
     35 * For a better understanding of how the query parser works, it is a good 
     36 * idea to study the Ferret Query Language (FQL) described below. Once you 
     37 * understand FQL the one tricky part that needs to be mentioned is how fields 
     38 * are handled. The QueryParser knows about two sets of fields, the default 
     39 * search fields and the set of all fields in the index. When no fields are 
     40 * specified then the default fields are used. The '*:' field specifier will 
     41 * search all fields contained in the all_fields set. Otherwise all fields 
     42 * specified in the field descriptor separated by '|' will be searched. For 
     43 * example 'title|content:' will search the title and content fields. When 
     44 * fields are specified like this, the parser will push the fields onto a 
     45 * stack and all queries modified by the field specifier will be applied to 
     46 * the fields on top of the stack. This is where the FLDS macro comes into 
     47 * play. It takes the current query building function in the parser and calls 
     48 * it for all fields on top of the stack. 
     49 *  
     50 * Ferret Query Language (FQL) 
     51 * =========================== 
     52 * 
     53 * FIXME to be continued... 
     54 *****************************************************************************/ 
    155%{ 
    256#include <string.h> 
     
    878932} 
    879933 
     934/***************************************************************************** 
     935 * qp_clean_str method which basically scans the query string and ensures that 
     936 * all open and close parentheses '()' and quotes '"' are balanced. It does 
     937 * this by inserting or appending extra parentheses or quotes which is not 
     938 * necessarily going to be exactly what the user wanted but it will help 
     939 * prevent the parser from failing so it's the best we can do at this stage. 
     940 * It also checks  
     941 *****************************************************************************/ 
    880942char *qp_clean_str(char *str) 
    881943{