Skip to content

Commit

Permalink
Merge pull request #723 from projectcaluma/document-query-filter
Browse files Browse the repository at this point in the history
feat(search): add filters to replace removed document search
  • Loading branch information
winged authored Dec 10, 2024
2 parents 741e14e + b9126f0 commit 27319bb
Show file tree
Hide file tree
Showing 10 changed files with 139 additions and 40 deletions.
2 changes: 1 addition & 1 deletion alexandria/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def mock_celery(mocker):
)
mocker.patch(
"alexandria.core.tasks.set_content_vector.delay",
side_effect=lambda id: tasks.set_content_vector(id),
side_effect=lambda id, b=False: tasks.set_content_vector(id, b),
)
mocker.patch(
"alexandria.core.tasks.create_thumbnail.delay",
Expand Down
33 changes: 28 additions & 5 deletions alexandria/core/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,17 @@

from django.conf import settings
from django.contrib.postgres.search import SearchHeadline, SearchQuery, SearchRank
from django.db.models import Exists, F, FloatField, OuterRef, Q, TextField, Value
from django.db.models import (
Exists,
F,
FloatField,
OuterRef,
Q,
TextField,
Value,
)
from django.db.models.fields.json import KeyTextTransform
from django.db.models.functions import Cast
from django.db.models.functions import Cast, Concat
from django_filters import (
BaseCSVFilter,
BaseInFilter,
Expand Down Expand Up @@ -114,7 +122,7 @@ def filter(self, qs, value):
synonyms = models.Tag.objects.filter(
Q(id=tag) | Q(tag_synonym_group__tags__id=tag)
)
qs = qs.filter(tags__in=synonyms)
qs = qs.filter(**{f"{self.field_name}__in": synonyms})
return qs


Expand Down Expand Up @@ -153,7 +161,6 @@ class DocumentFilterSet(FilterSet):
active_group = ActiveGroupFilter()
tags = TagsFilter()
marks = CharInFilter()
category = CategoriesFilter()
categories = CategoriesFilter(field_name="category")
# exclude_children is applied in CategoriesFilter, this is needed for DjangoFilterBackend
exclude_children = BooleanFilter(field_name="title", method=lambda qs, __, ___: qs)
Expand All @@ -169,6 +176,10 @@ class FileFilterSet(FilterSet):
active_group = ActiveGroupFilter()
files = BaseCSVFilter(field_name="pk", lookup_expr="in")
only_newest = BooleanFilter(method="filter_only_newest")
tags = TagsFilter(field_name="document__tags")
categories = CategoriesFilter(field_name="document__category")
# exclude_children is applied in CategoriesFilter, this is needed for DjangoFilterBackend
exclude_children = BooleanFilter(field_name="name", method=lambda qs, __, ___: qs)

def filter_only_newest(self, qs, name, value):
if value:
Expand Down Expand Up @@ -242,7 +253,19 @@ def search_files(self, queryset, name, value):

queryset = queryset.annotate(
search_rank=SearchRank(F("content_vector"), search_query),
search_context=SearchHeadline(F("content_text"), search_query),
# SearchHeadline is a very expensive operation, evaluate usage if performance is an issue
search_context=SearchHeadline(
Concat(
F("document__title"),
Value(" "),
F("document__description"),
Value(" "),
F("name"),
Value(" "),
F("content_text"),
),
search_query,
),
).filter(content_vector=search_query)

# Can't do the default ordering in the viewset, as this is an annotated
Expand Down
2 changes: 1 addition & 1 deletion alexandria/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class Document(UUIDModel):
marks = models.ManyToManyField(Mark, blank=True, related_name="documents")
date = models.DateField(blank=True, null=True)

def get_latest_original(self):
def get_latest_original(self) -> "File":
if not self.files.count():
raise ObjectDoesNotExist("Document has no files")
return self.files.filter(variant=File.Variant.ORIGINAL).latest("created_at")
Expand Down
45 changes: 29 additions & 16 deletions alexandria/core/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,35 +19,48 @@


@shared_task(soft_time_limit=301)
def set_content_vector(file_pk: str):
def set_content_vector(file_pk: str, document_update: bool = False):
file = File.objects.get(pk=file_pk)
file.content.file.file.seek(0)

# tika has an internal time limit of 300s, set the request limit to match that
# different values should be set in tika as well
# https://github.com/CogStack/tika-service/blob/master/README.md#tika-parsers-configuration
parsed_content = tika.parser.from_buffer(
file.content.file.file, requestOptions={"timeout": 300}
if document_update:
parsed_content = file.content_text
else:
file.content.file.file.seek(0)
# tika has an internal time limit of 300s, set the request limit to match that
# different values should be set in tika as well
# https://github.com/CogStack/tika-service/blob/master/README.md#tika-parsers-configuration
parsed_content = tika.parser.from_buffer(
file.content.file.file, requestOptions={"timeout": 300}
)["content"]

file_name = str(Path(file.name).stem)
file_name_vector = SearchVector(Value(file_name), weight="D")
document_name_vector = SearchVector(Value(file.document.title), weight="A")
document_desc_vector = SearchVector(
Value(file.document.description or ""), weight="B"
)

name = str(Path(file.name).stem)
name_vector = SearchVector(Value(name), weight="A")
base_vector = file_name_vector + document_name_vector + document_desc_vector

if not parsed_content["content"]:
if not parsed_content:
# Update only content_vector and content_text to avoid race conditions
File.objects.filter(pk=file.pk).update(
content_vector=name_vector, content_text=name
content_vector=base_vector, content_text=""
)
return

    # use part of content for language detection, because metadata is not reliable
language = tika.language.from_buffer(parsed_content["content"][:1000])
if document_update:
language = file.language
else:
        # use part of content for language detection, because metadata is not reliable
language = tika.language.from_buffer(parsed_content[:1000])

config = settings.ALEXANDRIA_ISO_639_TO_PSQL_SEARCH_CONFIG.get(language, "simple")
text_content = parsed_content["content"].strip()
content_vector = name_vector + SearchVector(
text_content = parsed_content.strip()
content_vector = base_vector + SearchVector(
Value(text_content),
config=config,
weight="B",
weight="C",
)

    # Update only needed fields, to avoid race conditions
Expand Down
Loading

0 comments on commit 27319bb

Please sign in to comment.