Class: Ferret::Analysis::TokenStream

Summary

A TokenStream enumerates the sequence of tokens, either from fields of a document or from query text.

This is an abstract class. Concrete subclasses are:

Tokenizer: a TokenStream whose input is a string
TokenFilter: a TokenStream whose input is another TokenStream

Public Instance Methods


token_stream.next → token

Return the next token from the TokenStream or nil if there are no more tokens.

/*
 *  call-seq:
 *     token_stream.next -> token
 *
 *  Fetch the next token from the TokenStream. Returns nil once there are no
 *  more tokens.
 */
static VALUE
frt_ts_next(VALUE self)
{
    TokenStream *stream;
    Token *tk;

    GET_TS(stream, self);
    tk = stream->next(stream);

    /* Only a non-NULL token is converted to a Ruby value via get_token. */
    if (tk != NULL) {
        return get_token(tk);
    }

    /* NULL from the stream's next callback is reported to Ruby as nil. */
    return Qnil;
}

token_stream.text → text

Return the text that the TokenStream is tokenizing

/*
 *  call-seq:
 *     token_stream.text -> text
 *
 *  Return the text that the TokenStream is tokenizing, or nil if the stream
 *  has no text.
 */
static VALUE
frt_ts_get_text(VALUE self)
{
    TokenStream *stream;
    VALUE found;

    Data_Get_Struct(self, TokenStream, stream);

    /*
     * If object_get already yields a Ruby value for the address of the
     * stream's text field, hand that value back unchanged.
     *
     * NOTE(review): the value found here is returned without being compared
     * against the stream's current text -- confirm that reassigning the text
     * cannot leave an outdated string registered under this address.
     */
    found = object_get(&stream->text);
    if (found != Qnil) {
        return found;
    }

    /* The stream has no text at all, so there is nothing to wrap. */
    if (!stream->text) {
        return Qnil;
    }

    /*
     * Wrap the C string in a new Ruby string and pass it to object_set under
     * the same address that was used for the object_get lookup above.
     */
    found = rb_str_new2(stream->text);
    object_set(&stream->text, found);
    return found;
}

token_stream.text = text → text

Set the text attribute of the TokenStream to the text you wish to be tokenized. For example, you may do this:

    token_stream.text = File.read(file_name)
/*
 *  call-seq:
 *     token_stream.text = text -> text
 *
 *  Assign the text that the TokenStream should tokenize and return it. The
 *  argument must be a String or convertible to one. For example:
 *
 *      token_stream.text = File.read(file_name)
 */
static VALUE
frt_ts_set_text(VALUE self, VALUE rtext)
{
    TokenStream *stream;

    Data_Get_Struct(self, TokenStream, stream);

    /* Coerce the argument to a String before its contents are read. */
    StringValue(rtext);

    /* Hand the string data to the stream's reset callback. */
    stream->reset(stream, rs2s(rtext));

    /*
     * Store rtext in an instance variable of self so that the Ruby string is
     * not garbage collected (presumably the stream still refers to its data
     * after reset -- confirm against the reset implementations).
     */
    rb_ivar_set(self, id_text, rtext);

    return rtext;
}