Class: Ferret::Search::FuzzyQuery
Summary
FuzzyQuery uses the Levenshtein distance formula for measuring the similarity between two terms. For example, weak and week differ by one letter and are four characters long, so the similarity is 75% or 0.75. You can use this query to match terms that are very close to the search term.
Example
FuzzyQuery can be quite useful for finding documents that wouldn't normally be found because of typos.
FuzzyQuery.new(:field, "google",
:min_similarity => 0.6,
:prefix_length => 2)
# matches => "gogle", "goggle", "googol", "googel"
Public Class Methods
FuzzyQuery.default_min_similarity → number
Get the default value for +:min_similarity+
/*
 *  call-seq:
 *     FuzzyQuery.default_min_similarity -> number
 *
 *  Return the class-level default that is used for +:min_similarity+ when a
 *  new FuzzyQuery is created without that option.
 */
static VALUE
frt_fq_get_dms(VALUE self)
{
    /* The default lives in a class variable on FuzzyQuery. */
    VALUE default_min_sim =
        rb_cvar_get(cFuzzyQuery, id_default_min_similarity);

    (void)self; /* receiver is not needed; the class is referenced directly */
    return default_min_sim;
}
FuzzyQuery.default_min_similarity = min_sim → min_sim
Set the default value for +:min_similarity+
/*
 *  call-seq:
 *     FuzzyQuery.default_min_similarity = min_sim -> min_sim
 *
 *  Set the default value for +:min_similarity+. The value must satisfy
 *  0.0 <= min_sim < 1.0, otherwise an ArgumentError is raised and the
 *  current default is left unchanged.
 */
static VALUE
frt_fq_set_dms(VALUE self, VALUE val)
{
    double min_sim = NUM2DBL(val);

    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    } else if (min_sim < 0.0) {
        /* 0.0 itself is accepted, so the lower bound is inclusive. */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }

    /* Update both the C-level global and the Ruby class variable that
     * backs the reader so the two stay in step. */
    qp_default_fuzzy_min_sim = (float)min_sim;
    rb_cvar_set(cFuzzyQuery, id_default_min_similarity, val, Qfalse);
    return val;
}
FuzzyQuery.default_prefix_length → number
Get the default value for +:prefix_length+
/*
 *  call-seq:
 *     FuzzyQuery.default_prefix_length -> number
 *
 *  Return the class-level default that is used for +:prefix_length+ when a
 *  new FuzzyQuery is created without that option.
 */
static VALUE
frt_fq_get_dpl(VALUE self)
{
    /* The default lives in a class variable on FuzzyQuery. */
    VALUE default_pre_len =
        rb_cvar_get(cFuzzyQuery, id_default_prefix_length);

    (void)self; /* receiver is not needed; the class is referenced directly */
    return default_pre_len;
}
FuzzyQuery.default_prefix_length = prefix_length → prefix_length
Set the default value for +:prefix_length+
/*
 *  call-seq:
 *     FuzzyQuery.default_prefix_length = prefix_length -> prefix_length
 *
 *  Set the default value for +:prefix_length+. The value must be a
 *  non-negative integer; a negative value raises an ArgumentError and the
 *  current default is left unchanged.
 */
static VALUE
frt_fq_set_dpl(VALUE self, VALUE val)
{
    /* NUM2INT type- and range-checks the argument. FIX2INT would blindly
     * reinterpret a non-Fixnum VALUE (nil, Float, String, ...) as an
     * integer and silently yield a garbage prefix length. */
    int pre_len = NUM2INT(val);

    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }

    qp_default_fuzzy_pre_len = pre_len;
    /* Store the normalized Fixnum so that code reading the class variable
     * with FIX2INT always sees a genuine Fixnum. */
    rb_cvar_set(cFuzzyQuery, id_default_prefix_length, INT2FIX(pre_len),
                Qfalse);
    return val;
}
FuzzyQuery.new(field, term, options = {}) → fuzzy-query
Create a new FuzzyQuery that will match terms with a similarity of at least +:min_similarity+ to term. Similarity is scored using the Levenshtein edit distance formula. See en.wikipedia.org/wiki/Levenshtein_distance
If a +:prefix_length+ > 0 is specified, a common prefix of that length is also required.
You can also set +:max_terms+ to prevent memory overflow problems. By default it is set to 512.
Example
FuzzyQuery.new(:content, "levenshtein",
:min_similarity => 0.8,
:prefix_length => 5,
:max_terms => 1024)
| field: | field to search |
| term: | term to search for, including its close matches |
| :min_similarity: | Default: 0.5. minimum levenshtein distance score for a match |
| :prefix_length: | Default: 0. minimum prefix_match before levenshtein distance is measured. This parameter is used to improve performance. With a +:prefix_length+ of 0, all terms in the index must be checked which can be quite a performance hit. By setting the prefix length to a larger number you minimize the number of terms that need to be checked. Even 1 will cut down the work by a factor of about 26 depending on your character set and the first letter. |
| :max_terms: | Limits the number of terms that can be added to the query when it is expanded as a MultiTermQuery. This is not usually a problem with FuzzyQueries unless you set +:min_similarity+ to a very low value. |
/*
 *  call-seq:
 *     FuzzyQuery.new(field, term, options = {}) -> fuzzy-query
 *
 *  Create a new FuzzyQuery that will match terms with a similarity of at
 *  least +:min_similarity+ to +term+. Similarity is scored using the
 *  Levenshtein edit distance formula. See
 *  http://en.wikipedia.org/wiki/Levenshtein_distance
 *
 *  If a +:prefix_length+ > 0 is specified, a common prefix of that length is
 *  also required.
 *
 *  You can also set +:max_terms+ to prevent memory overflow problems. By
 *  default it is set to 512.
 *
 *  An ArgumentError is raised unless 0.0 <= +:min_similarity+ < 1.0,
 *  +:prefix_length+ >= 0 and +:max_terms+ >= 0. +options+, when given, must
 *  be a Hash.
 *
 *  == Example
 *
 *    FuzzyQuery.new(:content, "levenshtein",
 *                   :min_similarity => 0.8,
 *                   :prefix_length => 5,
 *                   :max_terms => 1024)
 *
 *  field::           field to search
 *  term::            term to search for, including its close matches
 *  :min_similarity:: Default: 0.5. minimum levenshtein distance score for a
 *                    match
 *  :prefix_length::  Default: 0. minimum prefix_match before levenshtein
 *                    distance is measured. This parameter is used to improve
 *                    performance. With a +:prefix_length+ of 0, all terms in
 *                    the index must be checked which can be quite a
 *                    performance hit. By setting the prefix length to a
 *                    larger number you minimize the number of terms that need
 *                    to be checked. Even 1 will cut down the work by a
 *                    factor of about 26 depending on your character set and
 *                    the first letter.
 *  :max_terms::      Limits the number of terms that can be added to the
 *                    query when it is expanded as a MultiTermQuery. This is
 *                    not usually a problem with FuzzyQueries unless you set
 *                    +:min_similarity+ to a very low value.
 */
static VALUE
frt_fq_init(int argc, VALUE *argv, VALUE self)
{
    Query *q;
    VALUE rfield, rterm, roptions;
    /* Start from the class-level defaults; per-call options override them.
     * NUM2INT is used rather than FIX2INT so that a non-Fixnum value is
     * type-checked instead of being reinterpreted as a garbage integer. */
    float min_sim =
        (float)NUM2DBL(rb_cvar_get(cFuzzyQuery, id_default_min_similarity));
    int pre_len =
        NUM2INT(rb_cvar_get(cFuzzyQuery, id_default_prefix_length));
    int max_terms =
        NUM2INT(rb_cvar_get(cMultiTermQuery, id_default_max_terms));

    /* Two required arguments (field, term) and one optional options hash. */
    if (rb_scan_args(argc, argv, "21", &rfield, &rterm, &roptions) >= 3) {
        VALUE v;
        Check_Type(roptions, T_HASH);
        if (Qnil != (v = rb_hash_aref(roptions, sym_prefix_length))) {
            pre_len = NUM2INT(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_min_similarity))) {
            min_sim = (float)NUM2DBL(v);
        }
        if (Qnil != (v = rb_hash_aref(roptions, sym_max_terms))) {
            max_terms = NUM2INT(v);
        }
    }

    /* Validate the effective settings before building the query. */
    if (min_sim >= 1.0) {
        rb_raise(rb_eArgError,
                 "%f >= 1.0. :min_similarity must be < 1.0", min_sim);
    } else if (min_sim < 0.0) {
        /* 0.0 itself is accepted, so the lower bound is inclusive. */
        rb_raise(rb_eArgError,
                 "%f < 0.0. :min_similarity must be >= 0.0", min_sim);
    }
    if (pre_len < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :prefix_length must be >= 0", pre_len);
    }
    if (max_terms < 0) {
        rb_raise(rb_eArgError,
                 "%d < 0. :max_terms must be >= 0", max_terms);
    }

    q = fuzq_new_conf(frt_field(rfield), StringValuePtr(rterm),
                      min_sim, pre_len, max_terms);
    /* Attach the C query to the Ruby object (no mark function, freed via
     * frt_q_free) and register the pairing with object_add. */
    Frt_Wrap_Struct(self, NULL, &frt_q_free, q);
    object_add(q, self);
    return self;
}
Public Instance Methods
FuzzyQuery.min_similarity → min_similarity
Get the +:min_similarity+ for the query.
/*
 *  call-seq:
 *     FuzzyQuery.min_similarity -> min_similarity
 *
 *  Return the +:min_similarity+ this query was configured with, as a Float.
 */
static VALUE
frt_fq_min_sim(VALUE self)
{
    FuzzyQuery *fuzq;
    GET_Q(); /* presumably binds `q` to the query wrapped by self -- confirm */

    fuzq = (FuzzyQuery *)q;
    /* min_sim is widened to double before being boxed as a Ruby Float. */
    return rb_float_new((double)fuzq->min_sim);
}
FuzzyQuery.prefix_length → prefix_length
Get the +:prefix_length+ for the query.
/*
 *  call-seq:
 *     FuzzyQuery.prefix_length -> prefix_length
 *
 *  Return the +:prefix_length+ this query was configured with, as an
 *  Integer.
 */
static VALUE
frt_fq_pre_len(VALUE self)
{
    FuzzyQuery *fuzq;
    GET_Q(); /* presumably binds `q` to the query wrapped by self -- confirm */

    fuzq = (FuzzyQuery *)q;
    return INT2FIX(fuzq->pre_len);
}