djorm-ext-pgfulltext Allow arbitrary query to be used for indexing fields

It would be nice if it were possible to allow arbitrary queries to be used for building the search index on each field. For example, when indexing people's names, I'd like to be able to use a query like this:

    objects = pg.SearchManager(
        fields={
            "name": """
                setweight(to_tsvector(name), 'A') ||
                setweight(to_tsvector(case
                    when unaccent(name) = name then ''
                    else unaccent(name)
                end), 'B')
            """,
        }
    )

It looks like this would be straightforward to implement.

Would you consider such a patch?

Mar 24 '14 21:03 wolever

In fact, here's the quick thing I've whipped together which seems to do a pretty decent job:

from django.db import connections
from djorm_pgfulltext.models import SearchManager as _SearchManager
from djorm_pgfulltext.fields import VectorField

__all__ = ["VectorField", "SearchManager"]


class tsvector(object):
    """Callable field definition that renders a weighted tsvector fragment.

    An instance is called with the manager and a field name and returns the
    SQL text ``setweight(to_tsvector(<config>, coalesce(<column>, '')), <weight>)``
    for that field's column.
    """

    def __init__(self, weight=None):
        # Weight fixed at construction time; it wins over any weight passed
        # when the instance is called.
        self.weight = weight

    def __call__(self, manager, field_name, weight=None, config=None, using=None):
        """Build the SQL fragment for ``field_name`` on ``manager``'s model.

        Falsy ``weight``/``config``/``using`` values fall back to the
        manager's ``default_weight``, ``config`` and ``db`` respectively.
        """
        chosen_weight = self.weight or weight or manager.default_weight
        chosen_config = config or manager.config
        db_alias = using or manager.db
        quote = connections[db_alias].ops.quote_name
        column_ref = self.get_fq_field_name(manager, field_name, quote)
        return self.get_tsvector(manager, column_ref, chosen_weight, chosen_config)

    def get_fq_field_name(self, manager, field_name, qn):
        """Return ``<table>.<column>`` for the field, each part passed through ``qn``."""
        opts = manager.model._meta
        quoted_table = qn(opts.db_table)
        quoted_column = qn(opts.get_field(field_name).column)
        return ".".join((quoted_table, quoted_column))

    def get_tsvector(self, manager, fq_field_name, weight, config):
        """Render the weighted ``to_tsvector`` expression for one column."""
        template = "setweight(to_tsvector('%s', coalesce(%s, '')), '%s')"
        substitutions = (config, fq_field_name, weight)
        return template % substitutions


class unaccent_tsvector(tsvector):
    """Field definition that indexes both the raw and the unaccented text.

    The rendered fragment concatenates two weighted vectors: one built from
    the column as stored (weight ``weight``) and one built from
    ``unaccent(column)`` (weight ``weight2``).  The second vector is empty
    when unaccenting does not change the text, so plain values are not
    indexed twice.
    """

    # Weight used for the unaccented copy when ``weight2`` is not given:
    # one step below the primary weight, bottoming out at "D".
    _DEMOTED_WEIGHT = {
        "A": "B",
        "B": "C",
        "C": "D",
        "D": "D",
    }

    def __init__(self, weight=None, weight2=None):
        """Store the primary weight and the optional unaccented-copy weight."""
        super(unaccent_tsvector, self).__init__(weight)
        self.weight2 = weight2

    def get_tsvector(self, manager, fq_field_name, weight, config):
        """Render the combined accented + unaccented tsvector expression.

        Raises ``KeyError`` when ``weight2`` was not given and ``weight`` is
        not one of "A", "B", "C" or "D".
        """
        weight2 = self.weight2
        if weight2 is None:
            weight2 = self._DEMOTED_WEIGHT[weight]
        # Mirror the base class: a NULL column must become '' because a NULL
        # tsvector would turn the whole ``||`` concatenation into NULL.
        field = "coalesce(%s, '')" % fq_field_name
        # Pass the text-search configuration explicitly, as the base class
        # does, instead of relying on the database's default configuration.
        return """
            setweight(to_tsvector('{config}', {field}), '{w1}') ||
            setweight(to_tsvector('{config}', case
                when unaccent({field}) = {field} then ''
                else unaccent({field})
            end), '{w2}')
        """.format(
            config=config,
            field=field,
            w1=weight,
            w2=weight2,
        )


class SearchManager(_SearchManager):
    """Search manager whose ``fields`` may be a dict of per-field definitions.

    Each dict value is either ``True`` (use the manager's own
    ``_get_vector_for_field``) or a callable invoked as
    ``callable(manager, field_name, config=config, using=using)`` that
    returns the SQL fragment for that field.  Any non-dict ``fields`` value
    is handed to the parent implementation unchanged.
    """

    def _get_search_vector(self, config, using, fields=None):
        """Return the SQL search-vector expression for ``fields``.

        ``fields`` defaults to ``self._fields`` when it is ``None``.
        """
        fields = self._fields if fields is None else fields
        if isinstance(fields, dict):
            return self._get_search_vector_from_dict(config, using, fields)
        return _SearchManager._get_search_vector(self, config, using, fields)

    def _get_search_vector_from_dict(self, config, using, fields):
        """Join the per-field fragments of a ``fields`` dict with ``||``.

        Raises ``ValueError`` for a value that is neither callable nor
        ``True``.
        """
        result = []
        for field_name, vector_raw in fields.items():
            if callable(vector_raw):
                # Forward the requested configuration and database alias;
                # otherwise the callable silently falls back to the
                # manager's defaults and ignores what the caller asked for.
                vector = vector_raw(
                    self, field_name, config=config, using=using
                )
            elif vector_raw is True:
                # NOTE(review): config/using are not forwarded here because
                # the signature of _get_vector_for_field is not visible in
                # this file -- confirm whether it accepts them.
                vector = self._get_vector_for_field(field_name)
            else:
                raise ValueError("Unexpected field definition: %r: %s"
                                 % (field_name, vector_raw))
            result.append(vector)
        return " || ".join(result)

Then I can use it like this:

…
    fields={
        "name": unaccent_tsvector("A", "B"),
        "description": tsvector("B"),
    }
…

Mar 24 '14 22:03 wolever

Hi @wolever! Thanks a lot for your contributions. Unfortunately I don't have much time for this package. Can you make a pull request for this issue and #18?

Thanks!

Apr 18 '14 11:04 niwinz

Perhaps it can be even more simple and general: allow fields to be a string, and if so, treat it as raw postgresql to be used in place of the constructed-by-convention _get_search_vector fragment. Then, any other appropriate customization (using locally-available extensions like unaccent or others) is also possible. I've done this in my branch: https://github.com/gojomo/djorm-ext-pgfulltext/commit/56e5b839a941823b0c79559313c23fcc601d98b1

Sep 16 '14 01:09 gojomo