Class: Ferret::Analysis::RegExpAnalyzer
Summary
Using a RegExpAnalyzer is a simple way to create a custom analyzer. If implemented in Ruby it would look like this:
class RegExpAnalyzer
  # reg_exp:: the token matcher handed to every RegExpTokenizer
  # lower::   when true (the default) tokens are passed through a
  #           LowerCaseFilter
  def initialize(reg_exp, lower = true)
    @lower = lower
    @reg_exp = reg_exp
  end

  # Build the token stream for +str+. +field+ is accepted but not used.
  def token_stream(field, str)
    # The pattern lives in the instance variable set by #initialize; a bare
    # +reg_exp+ here would be an undefined local.
    if @lower
      return LowerCaseFilter.new(RegExpTokenizer.new(str, @reg_exp))
    else
      return RegExpTokenizer.new(str, @reg_exp)
    end
  end
end
Example
csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)
Public Class Methods
RegExpAnalyzer.new(reg_exp, lower = true) → analyzer
Create a new RegExpAnalyzer that creates tokenizers based on the given regular expression, lowercasing the tokens if required.
| reg_exp: | the token matcher for the tokenizer to use |
| lower: | set to false if you don't want to downcase the tokens |
/*
 *  call-seq:
 *     RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
 *
 *  Create a new RegExpAnalyzer which will create tokenizers based on the
 *  regular expression and lowercasing if required.
 *
 *  reg_exp:: the token matcher for the tokenizer to use
 *  lower::   set to false if you don't want to downcase the tokens
 */
static VALUE
frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    VALUE lower, rets, regex, proc;
    Analyzer *a;
    TokenStream *ts;

    /* "02&": no required arguments, up to two optional ones (regex, lower)
     * and an optional block captured in proc. Optional arguments that were
     * not supplied are set to Qnil by rb_scan_args. */
    rb_scan_args(argc, argv, "02&", &regex, &lower, &proc);

    /* Create the tokenizer (no text yet, hence Qnil) and wrap it in a Ruby
     * RegExpTokenizer object. */
    ts = rets_new(Qnil, regex, proc);
    rets = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark, &frt_rets_free, ts);
    object_add(ts, rets);

    /* Compared against Qfalse rather than tested for truthiness: an omitted
     * lower (Qnil) must still enable lowercasing, which is the documented
     * default. */
    if (lower != Qfalse) {
        rets = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter), rets);
        ts = DATA_PTR(rets);
    }
    /* NOTE(review): presumably takes an extra reference on the stream on
     * behalf of the analyzer created below -- confirm against REF(). */
    REF(ts);

    a = analyzer_new(ts, &re_analyzer_destroy_i, NULL);
    Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, a);
    object_add(a, self);
    return self;
}
Public Instance Methods
analyzer.token_stream(field_name, input) → token_stream
Create a new TokenStream to tokenize input. The TokenStream created may also depend on the field_name, although this parameter is typically ignored.
| field_name: | name of the field to be tokenized |
| input: | data from the field to be tokenized |
/*
 *  call-seq:
 *     analyzer.token_stream(field_name, input) -> token_stream
 *
 *  Create a new TokenStream to tokenize +input+. The TokenStream created may
 *  also depend on the +field_name+, although this parameter is typically
 *  ignored.
 *
 *  field_name:: name of the field to be tokenized
 *  input::      data from the field to be tokenized
 */
static VALUE
frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
{
    Analyzer *analyzer;
    TokenStream *stream;
    TokenStream *tokenizer;

    GET_A(analyzer, self);
    /* Coerce the input to a String before its C buffer is read below. */
    StringValue(rtext);
    stream = a_get_ts(analyzer, frt_field(rfield), rs2s(rtext));

    /* Make sure that there is no entry already */
    object_set(&stream->text, rtext);

    /* The stream is either the regexp tokenizer itself or a filter wrapping
     * it; in the latter case the tokenizer is the filter's sub-stream. */
    tokenizer = (stream->next == &rets_next)
        ? stream
        : ((TokenFilter*)stream)->sub_ts;

    /* Hand the Ruby string to the tokenizer and record it in object_space
     * under the tokenizer's address with the low bit set.
     * NOTE(review): presumably this keeps the string reachable for the GC
     * while the tokenizer uses it -- confirm. */
    RETS(tokenizer)->rtext = rtext;
    rb_hash_aset(object_space, ((VALUE)tokenizer)|1, rtext);

    return get_rb_token_stream(stream);
}