From 4e676a63b8a6b12fc243da9df200e86d8837c769 Mon Sep 17 00:00:00 2001 From: Anthony Bernabeu <64135631+brnaba-aws@users.noreply.github.com> Date: Wed, 5 Jun 2024 15:33:54 +0200 Subject: [PATCH] community[minor]: Added filter search for LanceDB (#22461) - [ ] **community**: "vectorstore: added filtering support for LanceDB vector store" - [ ] **This PR adds filtering capabilities to LanceDB**: - **Description:** LanceDB supports applying a filter when searching the vector store. Filters are written as SQL expressions, as described in the LanceDB documentation. - **Issue:** #18235 - **Dependencies:** No - [ ] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ --- .../vectorstores/lancedb.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/vectorstores/lancedb.py b/libs/community/langchain_community/vectorstores/lancedb.py index bb28165be24be..555ba836aa73b 100644 --- a/libs/community/langchain_community/vectorstores/lancedb.py +++ b/libs/community/langchain_community/vectorstores/lancedb.py @@ -113,7 +113,7 @@ def add_texts( Args: texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. - ids: Optional list of ids to associate w ith the texts. + ids: Optional list of ids to associate with the texts. Returns: List of ids of the added texts. @@ -218,14 +218,42 @@ def similarity_search( Args: query: String to query the vectorstore with. k: Number of documents to return. 
+ filter (Optional[Dict]): Optional filter arguments + sql_filter(Optional[string]): SQL filter to apply to the query. + prefilter(Optional[bool]): Whether to apply the filter prior + to the vector search. + Raises: + ValueError: If the specified table is not found in the database. Returns: List of documents most similar to the query. + + Examples: + + .. code-block:: python + + # Retrieve documents with filtering based on a metadata file_type + vector_store.as_retriever(search_kwargs={"k": 4, "filter":{ + 'sql_filter':"file_type='notice'", + 'prefilter': True + } + }) + + # Retrieve documents with filtering on a specific file name + vector_store.as_retriever(search_kwargs={"k": 4, "filter":{ + 'sql_filter':"source='my-file.txt'", + 'prefilter': True + } + }) """ embedding = self._embedding.embed_query(query) # type: ignore tbl = self.get_table(name) + filters = kwargs.pop("filter", {}) + sql_filter = filters.pop("sql_filter", None) + prefilter = filters.pop("prefilter", False) docs = ( tbl.search(embedding, vector_column_name=self._vector_key) + .where(sql_filter, prefilter=prefilter) .limit(k) .to_arrow() )