Class: Ferret::Index::TermDocEnum

Summary

Use a TermDocEnum to iterate through the documents that contain a particular term. You can also iterate through the positions at which the term occurs in a document.

Example

  tde = index_reader.term_docs_for(:content, "fox")

  tde.each do |doc_id, freq|
    puts "fox appeared #{freq} times in document #{doc_id}:"
    positions = []
    tde.each_position {|pos| positions << pos}
    puts "  #{positions.join(', ')}"
  end

  # or you can do it like this;
  tde.seek(:title, "red")
  while tde.next?
    puts "red appeared #{tde.freq} times in document #{tde.doc}:"
    positions = []
    while pos = tde.next_position
      positions << pos
    end
    puts "  #{positions.join(', ')}"
  end

Public Instance Methods


term_doc_enum.doc → doc_id

Returns the current document number pointed to by the term_doc_enum.

/*
 *  call-seq:
 *     term_doc_enum.doc -> doc_id
 *
 *  Returns the number of the document that the +term_doc_enum+ is currently
 *  pointing at.
 */
static VALUE
frt_tde_doc(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    const int current_doc = enumerator->doc_num(enumerator);

    return INT2FIX(current_doc);
}

term_doc_enum.each {|doc_id, freq| do_something() } → doc_count

Iterate through the documents and document frequencies in the term_doc_enum.

NOTE: this method can only be called once after each seek. If you need to call +each+ again then you should call +seek+ again too.

/*
 *  call-seq:
 *     term_doc_enum.each {|doc_id, freq| do_something() } -> doc_count
 *
 *  Walks every remaining document in the +term_doc_enum+, yielding the
 *  document number together with the term's frequency in that document.
 *  Returns the number of documents that were yielded.
 *
 *  NOTE: this method can only be called once after each seek. If you need to
 *  call +#each+ again then you should call +#seek+ again too.
 */
static VALUE
frt_tde_each(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    int yielded = 0;
    /* A single two-slot array is allocated once and refilled for every
     * document, so the same Array object is passed to each yield. */
    VALUE pair = rb_ary_new2(2);

    RARRAY(pair)->len = 2;
    rb_mem_clear(RARRAY(pair)->ptr, 2);

    for (;;) {
        if (!enumerator->next(enumerator)) {
            break;
        }
        yielded++;
        RARRAY(pair)->ptr[0] = INT2FIX(enumerator->doc_num(enumerator));
        RARRAY(pair)->ptr[1] = INT2FIX(enumerator->freq(enumerator));
        rb_yield(pair);
    }

    return INT2FIX(yielded);
}

term_doc_enum.each_position {|pos| do_something } → term_doc_enum

Iterate through each of the positions occupied by the current term in the current document. This can only be called once per document. It can be used within the each method. For example, to print the term's documents and positions:

  tde.each do |doc_id, freq|
    puts "term appeared #{freq} times in document #{doc_id}:"
    positions = []
    tde.each_position {|pos| positions << pos}
    puts "  #{positions.join(', ')}"
  end
/*
 *  call-seq:
 *     term_doc_enum.each_position {|pos| do_something } -> term_doc_enum
 *
 *  Yields, one at a time, every position at which the current term occurs
 *  in the current document, then returns the +term_doc_enum+ itself. It can
 *  only be called once per document and may be used inside an +each+ block.
 *  For example, to print a term's documents and positions:
 *
 *    tde.each do |doc_id, freq|
 *      puts "term appeared #{freq} times in document #{doc_id}:"
 *      positions = []
 *      tde.each_position {|pos| positions << pos}
 *      puts "  #{positions.join(', ')}"
 *    end
 *
 *  Raises NotImplementedError when the enumerator has no position support.
 */
static VALUE
frt_tde_each_position(VALUE self)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    int position;

    /* No next_position callback means this enumerator cannot scan
     * positions; report how to obtain one that can. */
    if (!enumerator->next_position) {
        rb_raise(rb_eNotImpError, "to scan through positions you must create "
                 "the TermDocEnum with Index#term_positions method rather "
                 "than the Index#term_docs method");
    }

    /* A negative value from next_position marks the end of the positions. */
    for (position = enumerator->next_position(enumerator);
         position >= 0;
         position = enumerator->next_position(enumerator)) {
        rb_yield(INT2FIX(position));
    }

    return self;
}

term_doc_enum.freq → freq

Returns the frequency of the current document pointed to by the term_doc_enum.

/*
 *  call-seq:
 *     term_doc_enum.freq -> freq
 *
 *  Returns the frequency of the term in the document currently pointed to
 *  by the +term_doc_enum+.
 */
static VALUE
frt_tde_freq(VALUE self)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    return INT2FIX(tde->freq(tde));
}

term_doc_enum.next? → bool

Move forward to the next document in the enumeration. Returns true if there is another document or false otherwise.

/*
 *  call-seq:
 *     term_doc_enum.next? -> bool
 *
 *  Move forward to the next document in the enumeration. Returns +true+ if
 *  there is another document or +false+ otherwise.
 */
static VALUE
frt_tde_next(VALUE self)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    return tde->next(tde) ? Qtrue : Qfalse;
}

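term_doc_enum.next_position → pos

Move forward to the next position of the term in the current document. Returns the position, or nil if there are no more positions. Raises NotImplementedError if the term_doc_enum was not created with position support.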

/*
 *  call-seq:
 *     term_doc_enum.next_position -> pos
 *
 *  Move forward to the next position of the term in the current document.
 *  Returns the position, or +nil+ if there are no more positions.
 *
 *  Raises NotImplementedError if the +term_doc_enum+ was not created with
 *  position support.
 */
static VALUE
frt_tde_next_position(VALUE self)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    int pos;
    if (tde->next_position == NULL) {
        rb_raise(rb_eNotImpError, "to scan through positions you must create "
                 "the TermDocEnum with Index#term_positions method rather "
                 "than the Index#term_docs method");
    }
    /* A negative value signals that the positions are exhausted. */
    pos = tde->next_position(tde);
    return pos >= 0 ? INT2FIX(pos) : Qnil;
}

term_doc_enum.seek(field, term) → self

Seek the term term in the index for field. After you call this method you can call next or each to skip through the documents and positions of this particular term.

/*
 *  call-seq:
 *     term_doc_enum.seek(field, term) -> self
 *
 *  Positions the enumerator on +term+ within +field+. Once this has been
 *  called you can use next or each to step through the documents and
 *  positions of that term.
 *
 *  Raises ArgumentError if +field+ has no entry in the enumerator's
 *  field-number map.
 */
static VALUE
frt_tde_seek(VALUE self, VALUE rfield, VALUE rterm)
{
    TermDocEnum *enumerator = (TermDocEnum *)DATA_PTR(self);
    VALUE field_map = rb_ivar_get(self, id_fld_num_map);
    VALUE field_entry = rb_hash_aref(field_map, rfield);
    /* The term is converted before the field is validated, so a bad term
     * argument is reported ahead of an unknown field. */
    char *term_text = StringValuePtr(rterm);

    if (field_entry == Qnil) {
        rb_raise(rb_eArgError, "field %s doesn't exist in the index",
                 frt_field(rfield));
    }

    enumerator->seek(enumerator, FIX2INT(field_entry), term_text);
    return self;
}

term_doc_enum.seek_term_enum(term_enum) → self

Seek the current term in term_enum. You could just use the standard seek method like this;

  term_doc_enum.seek(field, term_enum.term)

However the seek_term_enum method saves an index lookup so should offer a large performance improvement.

/*
 *  call-seq:
 *     term_doc_enum.seek_term_enum(term_enum) -> self
 *
 *  Seek the current term in +term_enum+. You could just use the standard seek
 *  method like this (where +field+ is the field being enumerated);
 *
 *    term_doc_enum.seek(field, term_enum.term)
 *
 *  However the +seek_term_enum+ method saves an index lookup so should offer
 *  a large performance improvement.
 */
static VALUE
frt_tde_seek_te(VALUE self, VALUE rterm_enum)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    TermEnum *te = (TermEnum *)frt_rb_data_ptr(rterm_enum);
    tde->seek_te(tde, te);
    return self;
}

term_doc_enum.skip_to(target) → bool

Skip to the required document number target and return true if there is a document >= target.

/*
 *  call-seq:
 *     term_doc_enum.skip_to(target) -> bool
 *
 *  Skip to the required document number +target+ and return true if there is
 *  a document >= +target+.
 *
 *  +target+ must be an Integer that fits in a C int; anything else raises
 *  TypeError or RangeError.
 */
static VALUE
frt_tde_skip_to(VALUE self, VALUE rtarget)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    /* NUM2INT validates the argument. FIX2INT would silently reinterpret a
     * non-Fixnum VALUE (nil, a String, a Bignum) as a garbage integer. */
    int target = NUM2INT(rtarget);
    return tde->skip_to(tde, target) ? Qtrue : Qfalse;
}

term_doc_enum.to_json() → string

Returns a json representation of the term doc enum. It will also add the term positions if they are available. You can speed this up by having the method return arrays instead of objects, simply by passing an argument to the to_json method. For example;

  term_doc_enum.to_json() #=>
  # [
  #   {"document":1,"frequency":12},
  #   {"document":11,"frequency":1},
  #   {"document":29,"frequency":120},
  #   {"document":30,"frequency":3}
  # ]

  term_doc_enum.to_json(:fast) #=>
  # [
  #   [1,12],
  #   [11,1],
  #   [29,120],
  #   [30,3]
  # ]
/*
 *  call-seq:
 *     term_doc_enum.to_json() -> string
 *     term_doc_enum.to_json(:fast) -> string
 *
 *  Returns a json representation of the term doc enum. It will also add the
 *  term positions if they are available. You can speed this up by having the
 *  method return arrays instead of objects, simply by passing an argument to
 *  the to_json method (any argument selects the array form). For example;
 *
 *    term_doc_enum.to_json() #=>
 *    # [
 *    #   {"document":1,"frequency":12},
 *    #   {"document":11,"frequency":1},
 *    #   {"document":29,"frequency":120},
 *    #   {"document":30,"frequency":3}
 *    # ]
 *
 *    term_doc_enum.to_json(:fast) #=>
 *    # [
 *    #   [1,12],
 *    #   [11,1],
 *    #   [29,120],
 *    #   [30,3]
 *    # ]
 *
 *  This consumes the enumeration: it calls next until no documents remain.
 */
static VALUE
frt_tde_to_json(int argc, VALUE *argv, VALUE self)
{
    TermDocEnum *tde = (TermDocEnum *)DATA_PTR(self);
    VALUE rjson;
    char *json, *jp;
    size_t capa = 65536;
    const char *format;
    /* Closes each entry when positions are emitted: ']' for the array form,
     * '}' for the object form. */
    char close = (argc > 0) ? ']' : '}';
    bool do_positions = tde->next_position != NULL;
    jp = json = ALLOC_N(char, capa);
    *(jp++) = '[';

    if (do_positions) {
        /* Entry is left open; positions and the closers are appended below. */
        if (argc == 0) {
            format = "{\"document\":%d,\"frequency\":%d,\"positions\":[";
        }
        else {
            format = "[%d,%d,[";
        }
    }
    else {
        if (argc == 0) {
            format = "{\"document\":%d,\"frequency\":%d},";
        }
        else {
            format = "[%d,%d],";
        }
    }
    while (tde->next(tde)) {
        int doc_num = tde->doc_num(tde);
        int freq = tde->freq(tde);
        size_t used = (size_t)(jp - json);
        /* 100 chars should be enough room for an entry's fixed part, plus
         * 20 chars for each position that may be written. */
        size_t needed = used + 100 + (freq > 0 ? (size_t)freq : 0) * 20;

        if (needed > capa) {
            /* Grow to at least what this entry needs; a single doubling is
             * not enough when one document has very many positions. */
            capa = (needed > capa * 2) ? needed : capa * 2;
            REALLOC_N(json, char, capa);
            /* The buffer may have moved, so rebase the write cursor. */
            jp = json + used;
        }
        jp += sprintf(jp, format, doc_num, freq);
        if (do_positions) {
            int pos;
            while (0 <= (pos = tde->next_position(tde))) {
                jp += sprintf(jp, "%d,", pos);
            }
            /* Drop the trailing comma after the last position, if any. */
            if (*(jp - 1) == ',') jp--;
            *(jp++) = ']';
            *(jp++) = close;
            *(jp++) = ',';
        }
    }
    /* Drop the trailing comma after the last entry, if any. */
    if (*(jp - 1) == ',') jp--;
    *(jp++) = ']';
    *jp = '\0';

    rjson = rb_str_new(json, (long)(jp - json));
    /* Memory from ALLOC_N/REALLOC_N is released with xfree, not free. */
    xfree(json);
    return rjson;
}