Allow arbitrary query to be used for indexing fields
It would be nice if it were possible to allow arbitrary queries to be used for building the search index on each field. For example, when indexing people's names, I'd like to be able to use a query like this:
# Proposed usage: the value given for a field is a raw PostgreSQL expression
# that builds that field's contribution to the search index.
objects = pg.SearchManager(
fields={
# Index the name at weight 'A'; additionally index its unaccented form at
# weight 'B', but only when unaccenting actually changes the text.
"name": """
setweight(to_tsvector(name), 'A') ||
setweight(to_tsvector(case
when unaccent(name) = name then ''
else unaccent(name)
end), 'B')
""",
}
)
It looks like this would be straightforward to implement.
Would you consider such a patch?
In fact, here's the quick thing I've whipped together which seems to do a pretty decent job:
from django.db import connections
from djorm_pgfulltext.models import SearchManager as _SearchManager
from djorm_pgfulltext.fields import VectorField
__all__ = ["VectorField", "SearchManager"]
class tsvector(object):
    """Callable field definition that renders a weighted tsvector SQL fragment.

    An instance is called with a manager and a field name and returns a SQL
    string of the form ``setweight(to_tsvector(<config>, <column>), <weight>)``.
    """

    def __init__(self, weight=None):
        """Remember an optional weight that takes priority over later ones."""
        self.weight = weight

    def __call__(self, manager, field_name, weight=None, config=None, using=None):
        """Build the SQL fragment for ``field_name`` on ``manager``'s model.

        Missing values fall back to the manager: ``manager.default_weight``,
        ``manager.config`` and ``manager.db``. The weight given to the
        constructor wins over the ``weight`` argument.
        """
        chosen_weight = self.weight or weight or manager.default_weight
        chosen_config = config or manager.config
        db_alias = using or manager.db
        # Identifier quoting is delegated to the selected database backend.
        quote = connections[db_alias].ops.quote_name
        column_ref = self.get_fq_field_name(manager, field_name, quote)
        return self.get_tsvector(manager, column_ref, chosen_weight, chosen_config)

    def get_fq_field_name(self, manager, field_name, qn):
        """Return the quoted ``table.column`` reference for ``field_name``.

        ``qn`` is the quoting function applied to the table name and to the
        column name separately.
        """
        opts = manager.model._meta
        quoted_table = qn(opts.db_table)
        quoted_column = qn(opts.get_field(field_name).column)
        return "%s.%s" % (quoted_table, quoted_column)

    def get_tsvector(self, manager, fq_field_name, weight, config):
        """Render the weighted ``to_tsvector`` expression for one column.

        The column is wrapped in ``coalesce(..., '')``; ``manager`` is unused
        here and is available for subclasses that override this hook.
        """
        template = "setweight(to_tsvector('%s', coalesce(%s, '')), '%s')"
        return template % (config, fq_field_name, weight)
class unaccent_tsvector(tsvector):
    """Field definition that indexes both the raw and the unaccented text.

    The raw column text is indexed at the primary weight. The ``unaccent()``-ed
    text is indexed at a secondary weight, but only when unaccenting actually
    changes the value, so plain text is not indexed twice.

    The generated SQL calls ``unaccent()``, so that function must be available
    in the database (presumably via PostgreSQL's ``unaccent`` extension).

    Args:
        weight: primary weight; stored and resolved exactly as in ``tsvector``.
        weight2: weight for the unaccented text. When ``None`` it is derived
            from the primary weight by stepping one level down
            (A -> B, B -> C, C -> D, D -> D).
    """

    # Fallback secondary weight for each primary weight.
    _NEXT_LOWER_WEIGHT = {
        "A": "B",
        "B": "C",
        "C": "D",
        "D": "D",
    }

    def __init__(self, weight=None, weight2=None):
        super(unaccent_tsvector, self).__init__(weight)
        self.weight2 = weight2

    def get_tsvector(self, manager, fq_field_name, weight, config):
        """Render the two-part (raw + unaccented) tsvector expression.

        Args:
            manager: unused; kept for signature compatibility with the parent.
            fq_field_name: already-quoted ``table.column`` reference.
            weight: weight letter for the raw text.
            config: text search configuration passed to ``to_tsvector``.

        Returns:
            A SQL string concatenating two weighted tsvectors with ``||``.

        Raises:
            KeyError: if no ``weight2`` was configured and ``weight`` is not
                one of ``A``, ``B``, ``C`` or ``D``.
        """
        weight2 = self.weight2
        if weight2 is None:
            weight2 = self._NEXT_LOWER_WEIGHT[weight]
        # Both halves use the requested text search configuration, matching the
        # parent class. The coalesce() calls keep a NULL column from producing
        # a NULL tsvector, which would null out the whole `||` concatenation.
        return """
            setweight(to_tsvector('{config}', coalesce({field}, '')), '{w1}') ||
            setweight(to_tsvector('{config}', case
                when unaccent({field}) = {field} then ''
                else coalesce(unaccent({field}), '')
            end), '{w2}')
        """.format(
            config=config,
            field=fq_field_name,
            w1=weight,
            w2=weight2,
        )
class SearchManager(_SearchManager):
    """Search manager whose ``fields`` may be a dict of vector definitions.

    When ``fields`` is a dict, each key is a model field name and each value
    is either a callable (called as
    ``value(manager, field_name, config=..., using=...)`` and expected to
    return a SQL fragment) or ``True`` (use the manager's own
    ``_get_vector_for_field``). Any non-dict ``fields`` value is handled by
    the parent class unchanged.
    """

    def _get_search_vector(self, config, using, fields=None):
        """Return the SQL expression that builds the full search vector.

        ``fields`` defaults to ``self._fields``. Dict definitions are rendered
        by ``_get_search_vector_from_dict``; everything else is delegated to
        the parent implementation.
        """
        fields = self._fields if fields is None else fields
        if isinstance(fields, dict):
            return self._get_search_vector_from_dict(config, using, fields)
        return _SearchManager._get_search_vector(self, config, using, fields)

    def _get_search_vector_from_dict(self, config, using, fields):
        """Render one SQL fragment per dict entry and join them with ``||``.

        Raises:
            ValueError: if a value is neither callable nor ``True``.
        """
        vectors = []
        for field_name, definition in fields.items():
            if callable(definition):
                # Forward the requested config and database alias so they are
                # not silently replaced by the manager defaults.
                vector = definition(
                    self, field_name, config=config, using=using
                )
            elif definition is True:
                # NOTE(review): config/using are not forwarded here because the
                # parent's _get_vector_for_field signature is not visible in
                # this snippet -- confirm whether it accepts them.
                vector = self._get_vector_for_field(field_name)
            else:
                raise ValueError("Unexpected field definition: %r: %s"
                                 % (field_name, definition))
            vectors.append(vector)
        return " || ".join(vectors)
Then I can use it like this:
…
fields={
"name": unaccent_tsvector("A", "B"),
"description": tsvector("B"),
}
…
Hi @wolever! Thanks a lot for your contributions. Unfortunately, I don't have much time for this package. Can you make a pull request for this issue and #18?
Thanks!
Perhaps it can be even simpler and more general: allow fields to be a string, and if so, treat it as raw PostgreSQL to be used in place of the constructed-by-convention _get_search_vector fragment. Then any other appropriate customization (using locally available extensions like unaccent, or others) is also possible. I've done this in my branch: https://github.com/gojomo/djorm-ext-pgfulltext/commit/56e5b839a941823b0c79559313c23fcc601d98b1