Class: Ferret::Search::FuzzyQuery

Summary

FuzzyQuery uses the Levenshtein distance formula for measuring the similarity between two terms. For example, weak and week differ by one letter and are four characters long, so the similarity is 75% or 0.75. You can use this query to match terms that are very close to the search term.

Example

FuzzyQuery can be quite useful for finding documents that wouldn't normally be found because of typos.

  FuzzyQuery.new(:field, "google",
                 :min_similarity => 0.6,
                 :prefix_length => 2)
  # matches => "gogle", "goggle", "googol", "googel"

Public Class Methods


FuzzyQuery.default_min_similarity → number

Get the default value for +:min_similarity+

/*
 *  call-seq:
 *     FuzzyQuery.default_min_similarity -> number
 *
 *  Return the value currently stored as the class-wide default for
 *  +:min_similarity+.
 */
static VALUE
frt_fq_get_dms(VALUE self)
{
    /* The default is kept in a class variable on FuzzyQuery. */
    VALUE default_min_sim =
        rb_cvar_get(cFuzzyQuery, id_default_min_similarity);

    return default_min_sim;
}

FuzzyQuery.default_min_similarity = min_sim → min_sim

Set the default value for +:min_similarity+

/*
 *  call-seq:
 *     FuzzyQuery.default_min_similarity = min_sim -> min_sim
 *
 *  Set the default value for +:min_similarity+.
 *
 *  +min_sim+ must satisfy <tt>0.0 <= min_sim < 1.0</tt>. An ArgumentError is
 *  raised for any value outside that range (including NaN), in which case
 *  the previous default is left untouched.
 */
static VALUE
frt_fq_set_dms(VALUE self, VALUE val)
{
    double min_sim = NUM2DBL(val);

    /* NaN compares false against everything, so it would slip through both
     * range checks below; x != x is true only for NaN. */
    if (min_sim != min_sim) {
        rb_raise(rb_eArgError,
                 ":min_similarity must be a number >= 0.0 and < 1.0");
    }
    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    } else if (min_sim < 0.0) {
        /* 0.0 itself is accepted, so the bound reported is inclusive. */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }

    /* Update both the C-side default and the Ruby class variable so the two
     * copies stay in agreement. */
    qp_default_fuzzy_min_sim = (float)min_sim;
    rb_cvar_set(cFuzzyQuery, id_default_min_similarity, val, Qfalse);
    return val;
}

FuzzyQuery.default_prefix_length → number

Get the default value for +:prefix_length+

/*
 *  call-seq:
 *     FuzzyQuery.default_prefix_length -> number
 *
 *  Return the value currently stored as the class-wide default for
 *  +:prefix_length+.
 */
static VALUE
frt_fq_get_dpl(VALUE self)
{
    /* The default is kept in a class variable on FuzzyQuery. */
    VALUE default_pre_len =
        rb_cvar_get(cFuzzyQuery, id_default_prefix_length);

    return default_pre_len;
}

FuzzyQuery.default_prefix_length = prefix_length → prefix_length

Set the default value for +:prefix_length+

/*
 *  call-seq:
 *     FuzzyQuery.default_prefix_length = prefix_length -> prefix_length
 *
 *  Set the default value for +:prefix_length+.
 *
 *  +prefix_length+ must be an integer >= 0; a negative value raises
 *  ArgumentError. Values that cannot be converted to a C int are rejected by
 *  the numeric conversion itself.
 */
static VALUE
frt_fq_set_dpl(VALUE self, VALUE val)
{
    /* NUM2INT type- and range-checks the argument; FIX2INT assumes the
     * VALUE is already a Fixnum and is not a safe conversion for arbitrary
     * caller input. */
    int pre_len = NUM2INT(val);

    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }

    qp_default_fuzzy_pre_len = pre_len;
    /* Store the normalised Fixnum rather than the raw argument so that
     * readers of the class variable which use FIX2INT get a valid value. */
    rb_cvar_set(cFuzzyQuery, id_default_prefix_length, INT2FIX(pre_len),
                Qfalse);
    return val;
}

FuzzyQuery.new(field, term, options = {}) → fuzzy-query

Create a new FuzzyQuery that will match terms with a similarity of at least +:min_similarity+ to term. Similarity is scored using the Levenshtein edit distance formula. See en.wikipedia.org/wiki/Levenshtein_distance

If a +:prefix_length+ > 0 is specified, a common prefix of that length is also required.

You can also set +:max_terms+ to prevent memory overflow problems. By default it is set to 512.

Example

  FuzzyQuery.new(:content, "levenshtein",
                 :min_similarity => 0.8,
                 :prefix_length => 5,
                 :max_terms => 1024)
field:field to search
term:term to search for including its close matches
:min_similarity:Default: 0.5. minimum levenshtein distance score for a match
:prefix_length:Default: 0. minimum prefix_match before levenshtein distance is measured. This parameter is used to improve performance. With a +:prefix_length+ of 0, all terms in the index must be checked which can be quite a performance hit. By setting the prefix length to a larger number you minimize the number of terms that need to be checked. Even 1 will cut down the work by a factor of about 26 depending on your character set and the first letter.
:max_terms:Limits the number of terms that can be added to the query when it is expanded as a MultiTermQuery. This is not usually a problem with FuzzyQueries unless you set +:min_similarity+ to a very low value.
/*
 *  call-seq:
 *     FuzzyQuery.new(field, term, options = {}) -> fuzzy-query
 *
 *  Create a new FuzzyQuery that will match terms with a similarity of at
 *  least +:min_similarity+ to +term+. Similarity is scored using the
 *  Levenshtein edit distance formula. See
 *  http://en.wikipedia.org/wiki/Levenshtein_distance
 *
 *  If a +:prefix_length+ > 0 is specified, a common prefix of that length is
 *  also required.
 *
 *  You can also set +:max_terms+ to prevent memory overflow problems. By
 *  default it is set to 512.
 *
 *  An ArgumentError is raised unless <tt>0.0 <= :min_similarity < 1.0</tt>,
 *  <tt>:prefix_length >= 0</tt> and <tt>:max_terms >= 0</tt>.
 *
 *  == Example
 *
 *    FuzzyQuery.new(:content, "levenshtein",
 *                   :min_similarity => 0.8,
 *                   :prefix_length => 5,
 *                   :max_terms => 1024)
 *
 *  field::           field to search
 *  term::            term to search for including its close matches
 *  :min_similarity:: Default: 0.5. minimum levenshtein distance score for a
 *                    match
 *  :prefix_length::  Default: 0. minimum prefix_match before levenshtein
 *                    distance is measured. This parameter is used to improve
 *                    performance.  With a +:prefix_length+ of 0, all terms in
 *                    the index must be checked which can be quite a
 *                    performance hit.  By setting the prefix length to a
 *                    larger number you minimize the number of terms that need
 *                    to be checked.  Even 1 will cut down the work by a
 *                    factor of about 26 depending on your character set and
 *                    the first letter.
 *  :max_terms::      Limits the number of terms that can be added to the
 *                    query when it is expanded as a MultiTermQuery. This is
 *                    not usually a problem with FuzzyQueries unless you set
 *                    +:min_similarity+ to a very low value.
 */
static VALUE
frt_fq_init(int argc, VALUE *argv, VALUE self)
{
    Query *q;
    VALUE rfield, rterm, roptions;
    /* Start from the class-level defaults; NUM2INT is used instead of
     * FIX2INT so a non-Fixnum stored in a class variable is converted (or
     * rejected) rather than misread. */
    float min_sim =
        (float)NUM2DBL(rb_cvar_get(cFuzzyQuery, id_default_min_similarity));
    int pre_len =
        NUM2INT(rb_cvar_get(cFuzzyQuery, id_default_prefix_length));
    int max_terms =
        NUM2INT(rb_cvar_get(cMultiTermQuery, id_default_max_terms));

    /* Two required arguments, one optional options hash. */
    if (rb_scan_args(argc, argv, "21", &rfield, &rterm, &roptions) >= 3) {
        VALUE v;
        Check_Type(roptions, T_HASH);
        /* Each option overrides its default only when present and non-nil.
         * The values come straight from the caller, so they go through the
         * checking NUM2INT/NUM2DBL conversions. */
        if (Qnil != (v = rb_hash_aref(roptions, sym_prefix_length))) {
            pre_len = NUM2INT(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_min_similarity))) {
            min_sim = (float)NUM2DBL(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_max_terms))) {
            max_terms = NUM2INT(v);
        }
    }

    /* NaN compares false against everything, so it would slip through both
     * range checks below; x != x is true only for NaN. */
    if (min_sim != min_sim) {
        rb_raise(rb_eArgError,
                 ":min_similarity must be a number >= 0.0 and < 1.0");
    }
    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    } else if (min_sim < 0.0) {
        /* 0.0 itself is accepted, so the bound reported is inclusive. */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }
    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }
    if (max_terms < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :max_terms must be >= 0", max_terms);
    }

    /* Build the underlying query and attach it to the Ruby object. */
    q = fuzq_new_conf(frt_field(rfield), StringValuePtr(rterm),
                      min_sim, pre_len, max_terms);
    Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
    object_add(q, self);
    return self;
}

Public Instance Methods


FuzzyQuery.min_similarity → min_similarity

Get the +:min_similarity+ for the query.

/*
 *  call-seq:
 *     FuzzyQuery.min_similarity -> min_similarity
 *
 *  Return this query's +:min_similarity+ as a Float.
 */
static VALUE
frt_fq_min_sim(VALUE self)
{
    GET_Q();
    /* View the query obtained by GET_Q() as a FuzzyQuery to reach its
     * fuzzy-specific fields. */
    FuzzyQuery *fuzzy = (FuzzyQuery *)q;

    return rb_float_new((double)fuzzy->min_sim);
}

FuzzyQuery.prefix_length → prefix_length

Get the +:prefix_length+ for the query.

/*
 *  call-seq:
 *     FuzzyQuery.prefix_length -> prefix_length
 *
 *  Return this query's +:prefix_length+ as an Integer.
 */
static VALUE
frt_fq_pre_len(VALUE self)
{
    GET_Q();
    /* View the query obtained by GET_Q() as a FuzzyQuery to reach its
     * fuzzy-specific fields. */
    FuzzyQuery *fuzzy = (FuzzyQuery *)q;

    return INT2FIX(fuzzy->pre_len);
}