Changeset b82c99a2260ab756fe85d7b61da340581dc3b461
- Timestamp:
- 06/14/08 13:24:55 (7 months ago)
- Author:
- David Balmain <dbalmain@…>
- Parents:
- 0c24555597e26ee38f38d8910d3f7a6612393ce1
- Children:
- 97564968f9aed076ddedd60a97b504b972e3f7fc
- git-committer:
- David Balmain <dbalmain@gmail.com> / 2008-06-14T13:24:55Z+1000
- Message:
-
Added UTF-8 standard analyzer. Back up to speed.
Added a UTF-8 standard analyzer which is on par with the legacy standard
analyzer. I'm still not happy with this solution, though. I think the best idea
right now is to write our own generator which can make use of C's standard
multibyte functions. This shouldn't be too hard, as we aren't using many of
Ragel's advanced features anyway.
- Location:
- c
- Files:
-
Legend:
- Unmodified
- Added
- Removed
-
|
r0c2455
|
rb82c99
|
|
| 46 | 46 | struct FrtTokenStream |
| 47 | 47 | { |
| 48 | | char *t; /* ptr used to scan text */ |
| 49 | | char *text; |
| 50 | | FrtToken *(*next)(FrtTokenStream *ts); |
| 51 | | FrtTokenStream *(*reset)(FrtTokenStream *ts, char *text); |
| 52 | | FrtTokenStream *(*clone_i)(FrtTokenStream *ts); |
| 53 | | void (*destroy_i)(FrtTokenStream *ts); |
| 54 | | int ref_cnt; |
| | 48 | char *t; /* ptr used to scan text */ |
| | 49 | char *text; |
| | 50 | FrtToken *(*next)(FrtTokenStream *ts); |
| | 51 | FrtTokenStream *(*reset)(FrtTokenStream *ts, char *text); |
| | 52 | FrtTokenStream *(*clone_i)(FrtTokenStream *ts); |
| | 53 | void (*destroy_i)(FrtTokenStream *ts); |
| | 54 | int ref_cnt; |
| 55 | 55 | }; |
| 56 | 56 | |
| … |
… |
|
| 68 | 68 | { |
| 69 | 69 | FrtCachedTokenStream super; |
| 70 | | mbstate_t state; |
| | 70 | mbstate_t state; |
| 71 | 71 | } FrtMultiByteTokenStream; |
| 72 | 72 | |
| | 73 | typedef enum |
| | 74 | { |
| | 75 | FRT_STT_ASCII, |
| | 76 | FRT_STT_MB, |
| | 77 | FRT_STT_UTF8 |
| | 78 | } FrtStandardTokenizerType; |
| | 79 | |
| 73 | 80 | typedef struct FrtStandardTokenizer |
| 74 | 81 | { |
| 75 | | FrtCachedTokenStream super; |
| 76 | | bool is_ascii; |
| | 82 | FrtCachedTokenStream super; |
| | 83 | FrtStandardTokenizerType type; |
| 77 | 84 | } FrtStandardTokenizer; |
| 78 | 85 | |
| … |
… |
|
| 141 | 148 | extern FrtTokenStream *frt_standard_tokenizer_new(); |
| 142 | 149 | extern FrtTokenStream *frt_mb_standard_tokenizer_new(); |
| | 150 | extern FrtTokenStream *frt_utf8_standard_tokenizer_new(); |
| 143 | 151 | |
| 144 | 152 | extern FrtTokenStream *frt_legacy_standard_tokenizer_new(); |
| … |
… |
|
| 212 | 220 | extern FrtAnalyzer *frt_standard_analyzer_new(bool lowercase); |
| 213 | 221 | extern FrtAnalyzer *frt_mb_standard_analyzer_new(bool lowercase); |
| | 222 | extern FrtAnalyzer *frt_utf8_standard_analyzer_new(bool lowercase); |
| 214 | 223 | |
| 215 | 224 | extern FrtAnalyzer *frt_standard_analyzer_new_with_words( |
| … |
… |
|
| 220 | 229 | const char **words, bool lowercase); |
| 221 | 230 | extern FrtAnalyzer *frt_mb_standard_analyzer_new_with_words_len( |
| | 231 | const char **words, int len, bool lowercase); |
| | 232 | extern FrtAnalyzer *frt_utf8_standard_analyzer_new_with_words( |
| | 233 | const char **words, bool lowercase); |
| | 234 | extern FrtAnalyzer *frt_utf8_standard_analyzer_new_with_words_len( |
| 222 | 235 | const char **words, int len, bool lowercase); |
| 223 | 236 | |
-
|
r0c2455
|
rb82c99
|
|
| 169 | 169 | #define STORE_YES FRT_STORE_YES |
| 170 | 170 | #define STRING_FIELD_INDEX_CLASS FRT_STRING_FIELD_INDEX_CLASS |
| | 171 | #define STT_ASCII FRT_STT_ASCII |
| | 172 | #define STT_MB FRT_STT_MB |
| | 173 | #define STT_UTF8 FRT_STT_UTF8 |
| 171 | 174 | #define TERM_QUERY FRT_TERM_QUERY |
| 172 | 175 | #define TERM_VECTOR_NO FRT_TERM_VECTOR_NO |
| … |
… |
|
| 306 | 309 | #define SpanTermQuery FrtSpanTermQuery |
| 307 | 310 | #define StandardTokenizer FrtStandardTokenizer |
| | 311 | #define StandardTokenizerType FrtStandardTokenizerType |
| 308 | 312 | #define State FrtState |
| 309 | 313 | #define StemFilter FrtStemFilter |
| … |
… |
|
| 911 | 915 | #define std_scan frt_std_scan |
| 912 | 916 | #define std_scan_mb frt_std_scan_mb |
| | 917 | #define std_scan_utf8 frt_std_scan_utf8 |
| 913 | 918 | #define stde_new frt_stde_new |
| 914 | 919 | #define ste_clone frt_ste_clone |
| … |
… |
|
| 985 | 990 | #define u64malloc frt_u64malloc |
| 986 | 991 | #define uchar frt_uchar |
| | 992 | #define utf8_standard_analyzer_new frt_utf8_standard_analyzer_new |
| | 993 | #define utf8_standard_analyzer_new_with_words frt_utf8_standard_analyzer_new_with_words |
| | 994 | #define utf8_standard_analyzer_new_with_words_len frt_utf8_standard_analyzer_new_with_words_len |
| | 995 | #define utf8_standard_tokenizer_new frt_utf8_standard_tokenizer_new |
| 987 | 996 | #define vexit frt_vexit |
| 988 | 997 | #define vstrfmt frt_vstrfmt |
-
|
r63d246
|
rb82c99
|
|
| 25 | 25 | int *token_length); |
| 26 | 26 | |
| | 27 | void frt_std_scan_utf8(const char *in, |
| | 28 | char *out, size_t out_size, |
| | 29 | const char **start, const char **end, |
| | 30 | int *token_length); |
| | 31 | |
| 27 | 32 | #ifdef __cplusplus |
| 28 | 33 | } // extern "C" |
-
|
r0c2455
|
rb82c99
|
|
| 568 | 568 | Token *tk = &(CTS(ts)->token); |
| 569 | 569 | |
| 570 | | if (std_tz->is_ascii) { |
| 571 | | frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1, |
| 572 | | &start, &end, &len); |
| 573 | | } |
| 574 | | else { |
| 575 | | frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1, |
| 576 | | &start, &end, &len); |
| | 570 | switch (std_tz->type) { |
| | 571 | case STT_ASCII: |
| | 572 | frt_std_scan(ts->t, tk->text, sizeof(tk->text) - 1, |
| | 573 | &start, &end, &len); |
| | 574 | break; |
| | 575 | case STT_MB: |
| | 576 | frt_std_scan_mb(ts->t, tk->text, sizeof(tk->text) - 1, |
| | 577 | &start, &end, &len); |
| | 578 | break; |
| | 579 | case STT_UTF8: |
| | 580 | frt_std_scan_utf8(ts->t, tk->text, sizeof(tk->text) - 1, |
| | 581 | &start, &end, &len); |
| | 582 | break; |
| 577 | 583 | } |
| 578 | 584 | |
| … |
… |
|
| 606 | 612 | { |
| 607 | 613 | TokenStream *ts = std_ts_new(); |
| 608 | | STDTS(ts)->is_ascii = true; |
| | 614 | STDTS(ts)->type = STT_ASCII; |
| 609 | 615 | return ts; |
| 610 | 616 | } |
| … |
… |
|
| 613 | 619 | { |
| 614 | 620 | TokenStream *ts = std_ts_new(); |
| 615 | | STDTS(ts)->is_ascii = false; |
| | 621 | STDTS(ts)->type = STT_MB; |
| | 622 | return ts; |
| | 623 | } |
| | 624 | |
| | 625 | TokenStream *utf8_standard_tokenizer_new() |
| | 626 | { |
| | 627 | TokenStream *ts = std_ts_new(); |
| | 628 | STDTS(ts)->type = STT_UTF8; |
| 616 | 629 | return ts; |
| 617 | 630 | } |
| … |
… |
|
| 1526 | 1539 | } |
| 1527 | 1540 | |
| | 1541 | Analyzer *utf8_standard_analyzer_new_with_words_len(const char **words, |
| | 1542 | int len, bool lowercase) |
| | 1543 | { |
| | 1544 | TokenStream *ts = utf8_standard_tokenizer_new(); |
| | 1545 | if (lowercase) { |
| | 1546 | ts = mb_lowercase_filter_new(ts); |
| | 1547 | } |
| | 1548 | ts = hyphen_filter_new(stop_filter_new_with_words_len(ts, words, len)); |
| | 1549 | return analyzer_new(ts, NULL, NULL); |
| | 1550 | } |
| | 1551 | |
| | 1552 | Analyzer *utf8_standard_analyzer_new_with_words(const char **words, |
| | 1553 | bool lowercase) |
| | 1554 | { |
| | 1555 | TokenStream *ts = utf8_standard_tokenizer_new(); |
| | 1556 | if (lowercase) { |
| | 1557 | ts = mb_lowercase_filter_new(ts); |
| | 1558 | } |
| | 1559 | ts = hyphen_filter_new(stop_filter_new_with_words(ts, words)); |
| | 1560 | return analyzer_new(ts, NULL, NULL); |
| | 1561 | } |
| | 1562 | |
| 1528 | 1563 | Analyzer *standard_analyzer_new(bool lowercase) |
| 1529 | 1564 | { |
| … |
… |
|
| 1536 | 1571 | return mb_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS, |
| 1537 | 1572 | lowercase); |
| | 1573 | } |
| | 1574 | |
| | 1575 | Analyzer *utf8_standard_analyzer_new(bool lowercase) |
| | 1576 | { |
| | 1577 | return utf8_standard_analyzer_new_with_words(FULL_ENGLISH_STOP_WORDS, |
| | 1578 | lowercase); |
| 1538 | 1579 | } |
| 1539 | 1580 | |
-
|
r018a91
|
rb82c99
|
|
| 1 | 1 | #ifndef FRT_SCANNER_H |
| 2 | 2 | #define FRT_SCANNER_H |
| | 3 | |
| | 4 | #ifdef __cplusplus |
| | 5 | extern "C" { |
| | 6 | #endif |
| 3 | 7 | |
| 4 | 8 | /* |
| … |
… |
|
| 21 | 25 | int *token_length); |
| 22 | 26 | |
| | 27 | void frt_std_scan_utf8(const char *in, |
| | 28 | char *out, size_t out_size, |
| | 29 | const char **start, const char **end, |
| | 30 | int *token_length); |
| | 31 | |
| | 32 | #ifdef __cplusplus |
| | 33 | } // extern "C" |
| | 34 | #endif |
| | 35 | |
| 23 | 36 | #endif /* FRT_SCANNER */ |
-
|
r457d8b
|
rb82c99
|
|
| 8 | 8 | |
| 9 | 9 | %%{ |
| 10 | | machine WChar; |
| | 10 | machine UTF8; |
| 11 | 11 | ualpha = |
| 12 | 12 | 0x41..0x5A #L& [26] LATIN CAPITAL LETTER A..LATIN CAPI... |