Class: Ferret::Analysis::RegExpAnalyzer

Summary

Using a RegExpAnalyzer is a simple way to create a custom analyzer. If implemented in Ruby, it would look like this:

  # Illustrative pure-Ruby equivalent of the analyzer.
  class RegExpAnalyzer
    # reg_exp:: the token matcher handed to each RegExpTokenizer
    # lower::   when true (the default) the tokens are wrapped in a
    #           LowerCaseFilter
    def initialize(reg_exp, lower = true)
      @lower = lower
      @reg_exp = reg_exp
    end

    # Build the token stream for +str+. +field+ is accepted but not used.
    def token_stream(field, str)
      # The matcher lives in the instance variable set by #initialize; a bare
      # +reg_exp+ here would be an undefined local and raise NameError.
      tokenizer = RegExpTokenizer.new(str, @reg_exp)
      @lower ? LowerCaseFilter.new(tokenizer) : tokenizer
    end
  end

Example

  csv_analyzer = RegExpAnalyzer.new(/[^,]+/, false)

Public Class Methods


RegExpAnalyzer.new(reg_exp, lower = true) → analyzer

Create a new RegExpAnalyzer which will create tokenizers based on the regular expression and lowercasing if required.

reg_exp: the token matcher for the tokenizer to use
lower: set to false if you don't want to downcase the tokens
/*
 *  call-seq:
 *     RegExpAnalyzer.new(reg_exp, lower = true) -> analyzer
 *
 *  Create a new RegExpAnalyzer whose tokenizers are driven by the given
 *  regular expression, with the tokens lowercased unless that is switched
 *  off.
 *
 *  reg_exp:: the token matcher for the tokenizer to use
 *  lower::   set to false if you don't want to downcase the tokens
 */
static VALUE
frt_re_analyzer_init(int argc, VALUE *argv, VALUE self)
{
    VALUE rregex, rlower, rproc;
    VALUE rstream;
    TokenStream *stream;
    Analyzer *analyzer;

    /* Zero required and up to two optional arguments, plus an optional
     * block which is captured in rproc. */
    rb_scan_args(argc, argv, "02&", &rregex, &rlower, &rproc);

    /* Build the regexp tokenizer, wrap it in a Ruby object with its own
     * mark/free callbacks, and record the pointer/object pair. */
    stream = rets_new(Qnil, rregex, rproc);
    rstream = Data_Wrap_Struct(cRegExpTokenizer, &frt_rets_mark,
                               &frt_rets_free, stream);
    object_add(stream, rstream);

    /* Only an explicit false disables lowercasing; an omitted argument
     * (nil) still gets the lowercase filter stacked on the tokenizer. */
    if (rlower != Qfalse) {
        rstream = frt_lowercase_filter_init(frt_data_alloc(cLowerCaseFilter),
                                            rstream);
        stream = DATA_PTR(rstream);
    }
    /* NOTE(review): presumably takes the reference that the analyzer holds
     * on the outermost stream -- confirm against the REF definition. */
    REF(stream);

    analyzer = analyzer_new(stream, &re_analyzer_destroy_i, NULL);
    Frt_Wrap_Struct(self, &frt_re_analyzer_mark, &frt_analyzer_free, analyzer);
    object_add(analyzer, self);
    return self;
}

Public Instance Methods


analyzer.token_stream(field_name, input) → token_stream

Create a new TokenStream to tokenize input. The TokenStream created may also depend on the field_name, although this parameter is typically ignored.

field_name: name of the field to be tokenized
input: data from the field to be tokenized
/*
 *  call-seq:
 *     analyzer.token_stream(field_name, input) -> token_stream
 *
 *  Create a new TokenStream to tokenize +input+. The TokenStream created may
 *  also depend on the +field_name+, although this parameter is typically
 *  ignored.
 *
 *  field_name:: name of the field to be tokenized
 *  input::      data from the field to be tokenized
 */
static VALUE
frt_re_analyzer_token_stream(VALUE self, VALUE rfield, VALUE rtext)
{
    Analyzer *analyzer;
    TokenStream *stream;
    TokenStream *re_stream;

    GET_A(analyzer, self);

    /* Coerce the input to a Ruby String before its C string is taken. */
    StringValue(rtext);

    stream = a_get_ts(analyzer, frt_field(rfield), rs2s(rtext));

    /* Make sure that there is no entry already */
    object_set(&stream->text, rtext);

    /* The regexp tokenizer is either the stream itself or, when a filter has
     * been stacked on top of it, that filter's sub-stream. */
    if (stream->next == &rets_next) {
        re_stream = stream;
    }
    else {
        re_stream = ((TokenFilter*)stream)->sub_ts;
    }

    /* Hand the Ruby string to the tokenizer and store it in object_space
     * under the tokenizer's address with the low bit set. */
    RETS(re_stream)->rtext = rtext;
    rb_hash_aset(object_space, ((VALUE)re_stream)|1, rtext);

    return get_rb_token_stream(stream);
}