From b7cb0284e74e5d08cbf830794446bdfcdf79efaf Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Mon, 11 Sep 2023 09:45:25 +0000
Subject: [PATCH 01/13] WIP fuzzy matching strategy

---
 .../deanonymizer_matching_strategies.py       |  32 +++
 .../data_anonymizer/presidio.py               |   3 +-
 .../data_anonymizer/test.ipynb                | 200 ++++++++++++++++++
 3 files changed, 234 insertions(+), 1 deletion(-)
 create mode 100644 libs/experimental/langchain_experimental/data_anonymizer/test.ipynb

diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
index e5d9e8581b6dc..17c2d25a2dfef 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
@@ -15,3 +15,35 @@ def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType)
         for anonymized, original in deanonymizer_mapping[entity_type].items():
             text = text.replace(anonymized, original)
     return text
+
+
+def generate_ngrams(words_list, n):
+    """Generate n-grams from a list of words"""
+    return [" ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))]
+
+
+def fuzzy_matching_strategy(
+    text: str, deanonymizer_mapping: MappingDataType, fuzzy_threshold: int = 80
+) -> str:
+    from fuzzywuzzy import fuzz
+
+    for entity_type in deanonymizer_mapping:
+        for anonymized, original in deanonymizer_mapping[entity_type].items():
+            # Split the anonymized entity and text into words
+            anonymized_words = anonymized.split()
+            text_words = text.split()
+
+            # Generate text segments of the same length as the anonymized entity
+            segments = generate_ngrams(text_words, len(anonymized_words))
+
+            # Iterate over each segment
+            for i, segment in enumerate(segments):
+                # Fuzzy match the segment with the anonymized entity
+                if fuzz.ratio(anonymized.lower(), segment.lower()) > fuzzy_threshold:
+                    # Replace the words in the original text
+                    text_words[i : i + len(anonymized_words)] = original.split()
+
+            # Join the words back into text
+            text = " ".join(text_words)
+
+    return text
diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
index b2be1dc5a1c0d..a62fbe5580bd6 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
@@ -17,6 +17,7 @@
 )
 from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
     default_matching_strategy,
+    fuzzy_matching_strategy,
 )
 from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
     get_pseudoanonymizer_mapping,
@@ -289,7 +290,7 @@ def _deanonymize(
         text_to_deanonymize: str,
         deanonymizer_matching_strategy: Callable[
             [str, MappingDataType], str
-        ] = default_matching_strategy,
+        ] = fuzzy_matching_strategy,
     ) -> str:
         """Deanonymize text.
         Each anonymized entity is replaced with its original value.
diff --git a/libs/experimental/langchain_experimental/data_anonymizer/test.ipynb b/libs/experimental/langchain_experimental/data_anonymizer/test.ipynb
new file mode 100644
index 0000000000000..a872dad5f993e
--- /dev/null
+++ b/libs/experimental/langchain_experimental/data_anonymizer/test.ipynb
@@ -0,0 +1,200 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/vscode/langchain-py-env/lib/python3.11/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
+      "  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'My name is Maria Lynch, call me at 7344131647 or email me at wdavis@example.net. By the way, my card number is: 213186379402654 and my name is Maria Lynch'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
+    "\n",
+    "anonymizer = PresidioReversibleAnonymizer(\n",
+    "    analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
+    "    # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
+    "    # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
+    "    faker_seed=42,\n",
+    ")\n",
+    "\n",
+    "anonymizer.anonymize(\n",
+    "    \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
+    "    \"By the way, my card number is: 4916 0387 9536 0861 \"\n",
+    "    \"and my name is Slim Shady\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n",
+       " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
+       " 'EMAIL_ADDRESS': {'wdavis@example.net': 'real.slim.shady@gmail.com'},\n",
+       " 'CREDIT_CARD': {'213186379402654': '4916 0387 9536 0861'}}"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "anonymizer.deanonymizer_mapping"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Are you Slim Shady I found your card with number 4916 0387 9536 0861'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "anonymizer.deanonymize(\n",
+    "    \"Are you Mari lync? I found your card with number 213186379402654\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Are you Slim Shady I found your card with number 2131 8637 9402 654'"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "anonymizer.deanonymize(\n",
+    "    \"Are you Mari lync? I found your card with number 2131 8637 9402 654\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Are you Slim Shady I found your card with number 2131 4916 0387 9536 0861'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "anonymizer.deanonymize(\n",
+    "    \"Are you Mari lync? I found your card with number 2131 86379402654\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "89"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from fuzzywuzzy import fuzz\n",
+    "\n",
+    "fuzz.partial_ratio(\"Marie Lynch\", \"Mari Lync\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "100"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from fuzzywuzzy import fuzz\n",
+    "\n",
+    "fuzz.partial_ratio(\"Slim Shady\", \"a\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "langchain-py-env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 83f996f3303f55df1bd582a2cfafb1044f0d206c Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Wed, 13 Sep 2023 14:44:39 +0000
Subject: [PATCH 02/13] Add more strategies

---
 .../deanonymizer_matching_strategies.py       | 62 +++++++++++++++----
 1 file changed, 50 insertions(+), 12 deletions(-)

diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
index 17c2d25a2dfef..935e4f13a0964 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
@@ -1,9 +1,10 @@
-from langchain_experimental.data_anonymizer.presidio import MappingDataType
+import re
+from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
 
 
-def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
+def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
     """
-    Default matching strategy for deanonymization.
+    Exact matching strategy for deanonymization.
     It replaces all the anonymized entities with the original ones.
 
     Args:
@@ -17,33 +18,70 @@ def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType)
     return text
 
 
-def generate_ngrams(words_list, n):
-    """Generate n-grams from a list of words"""
-    return [" ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))]
+def case_insensitive_matching_strategy(
+    text: str, deanonymizer_mapping: MappingDataType
+) -> str:
+    """
+    Case insensitive matching strategy for deanonymization.
+    It replaces all the anonymized entities with the original ones irrespective of their letter case.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+    """
+
+    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
+    for entity_type in deanonymizer_mapping:
+        for anonymized, original in deanonymizer_mapping[entity_type].items():
+            # Match the key literally (escaped) and insert the value verbatim
+            text = re.sub(re.escape(anonymized), lambda _: original, text, flags=re.I)
+    return text
 
 
-def fuzzy_matching_strategy(
+def ngram_fuzzy_matching_strategy(
     text: str, deanonymizer_mapping: MappingDataType, fuzzy_threshold: int = 80
 ) -> str:
-    from fuzzywuzzy import fuzz
+    """
+    N-gram fuzzy matching strategy for deanonymization.
+    It replaces all the anonymized entities with the original ones.
+    It uses fuzzy matching to find the position of the anonymized entity in the text.
+    It generates n-grams of the same length as the anonymized entity from the text and
+    uses fuzzy matching to find the position of the anonymized entity in the text.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+        fuzzy_threshold: fuzzy matching threshold
+    """
+
+    def generate_ngrams(words_list, n):
+        """Generate n-grams from a list of words"""
+        return [
+            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
+        ]
+
+    try:
+        from fuzzywuzzy import fuzz
+
+    except ImportError as e:
+        raise ImportError(
+            "Could not import fuzzywuzzy, please install with "
+            "`pip install fuzzywuzzy`."
+        ) from e
 
     for entity_type in deanonymizer_mapping:
         for anonymized, original in deanonymizer_mapping[entity_type].items():
-            # Split the anonymized entity and text into words
             anonymized_words = anonymized.split()
             text_words = text.split()
 
             # Generate text segments of the same length as the anonymized entity
             segments = generate_ngrams(text_words, len(anonymized_words))
 
-            # Iterate over each segment
             for i, segment in enumerate(segments):
                 # Fuzzy match the segment with the anonymized entity
                 if fuzz.ratio(anonymized.lower(), segment.lower()) > fuzzy_threshold:
-                    # Replace the words in the original text
                     text_words[i : i + len(anonymized_words)] = original.split()
 
-            # Join the words back into text
             text = " ".join(text_words)
 
     return text

From 2d7f426299e62ede10ac61601a2a1bf411daabcd Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Wed, 13 Sep 2023 14:45:21 +0000
Subject: [PATCH 03/13] Adjust anonymizer to strategies

---
 .../data_anonymizer/base.py                   | 25 ++++++++++++++++---
 .../data_anonymizer/presidio.py               |  7 ++----
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/libs/experimental/langchain_experimental/data_anonymizer/base.py b/libs/experimental/langchain_experimental/data_anonymizer/base.py
index 292d2a2a0f690..6cca949c0d623 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/base.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py
@@ -1,5 +1,12 @@
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Callable, Optional
+
+from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
+from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
+    exact_matching_strategy,
+)
+
+DEFAULT_DEANONYMIZER_MATCHING_STRATEGY = exact_matching_strategy
 
 
 class AnonymizerBase(ABC):
@@ -23,10 +30,20 @@ class ReversibleAnonymizerBase(AnonymizerBase):
     Base abstract class for reversible anonymizers.
     """
 
-    def deanonymize(self, text: str) -> str:
+    def deanonymize(
+        self,
+        text_to_deanonymize: str,
+        deanonymizer_matching_strategy: Callable[
+            [str, MappingDataType], str
+        ] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
+    ) -> str:
         """Deanonymize text"""
-        return self._deanonymize(text)
+        return self._deanonymize(text_to_deanonymize, deanonymizer_matching_strategy)
 
     @abstractmethod
-    def _deanonymize(self, text: str) -> str:
+    def _deanonymize(
+        self,
+        text_to_deanonymize: str,
+        deanonymizer_matching_strategy: Callable[[str, MappingDataType], str],
+    ) -> str:
         """Abstract method to deanonymize text"""
diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
index a62fbe5580bd6..75083d2aa767a 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
@@ -8,6 +8,7 @@
 import yaml
 
 from langchain_experimental.data_anonymizer.base import (
+    DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
     AnonymizerBase,
     ReversibleAnonymizerBase,
 )
@@ -15,10 +16,6 @@
     DeanonymizerMapping,
     MappingDataType,
 )
-from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
-    default_matching_strategy,
-    fuzzy_matching_strategy,
-)
 from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
     get_pseudoanonymizer_mapping,
 )
@@ -290,7 +287,7 @@ def _deanonymize(
         text_to_deanonymize: str,
         deanonymizer_matching_strategy: Callable[
             [str, MappingDataType], str
-        ] = fuzzy_matching_strategy,
+        ] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
     ) -> str:
         """Deanonymize text.
         Each anonymized entity is replaced with its original value.

From d59b5e692b3634a9959cb1f95b0f9f27deee0736 Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Wed, 13 Sep 2023 14:45:37 +0000
Subject: [PATCH 04/13] Docs update

---
 .../reversible.ipynb                          | 89 ++++++++++++++++---
 1 file changed, 79 insertions(+), 10 deletions(-)

diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb
index de5655ba1e9d5..af519946e941d 100644
--- a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb
+++ b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb
@@ -273,7 +273,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
@@ -285,7 +285,7 @@
        " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -317,7 +317,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -337,7 +337,7 @@
        "  '3537672423884966': '4001 9192 5753 7193'}}"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -361,7 +361,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -380,7 +380,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -389,7 +389,7 @@
        "{}"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -402,7 +402,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -415,7 +415,7 @@
        "  '3537672423884966': '4001 9192 5753 7193'}}"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -426,6 +426,75 @@
     "anonymizer.deanonymizer_mapping"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Custom deanonymization strategy\n",
+    "\n",
+    "The default deanonymisation strategy is to exactly match the substring in the text with the mapping entry. Due to the indeterminism of LLMs, it may be that the model will change the format of the private data slightly or make a typo, for example:\n",
+    "- *Keanu Reeves* -> *Kaenu Reeves*\n",
+    "- *John Doe* -> *John*\n",
+    "- *Main St, New York* -> *New York*\n",
+    "\n",
+    "It is therefore worth considering appropriate prompt engineering (have the model return PII in unchanged format) or trying to implement your replacing strategy. For example, you can use fuzzy matching in combination with ngrams. This will solve problems with typos and minor changes in the text. Some implementations of the swapping strategy can be found in the file `deanonymizer_matching_strategies.py`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "maria lynch\n",
+      "Slim Shady\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
+    "    ngram_fuzzy_matching_strategy,\n",
+    "    case_insensitive_matching_strategy,\n",
+    ")\n",
+    "\n",
+    "# Original name: Maria Lynch\n",
+    "print(anonymizer.deanonymize(\"maria lynch\"))\n",
+    "print(\n",
+    "    anonymizer.deanonymize(\n",
+    "        \"maria lynch\", deanonymizer_matching_strategy=case_insensitive_matching_strategy\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Call Marie Lync at 734-413-1647\n",
+      "Call Slim Shady at 313-666-7440\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Original name: Maria Lynch\n",
+    "# Original phone number: 7344131647 (without dashes)\n",
+    "print(anonymizer.deanonymize(\"Call Marie Lync at 734-413-1647\"))\n",
+    "print(\n",
+    "    anonymizer.deanonymize(\n",
+    "        \"Call Marie Lync at 734-413-1647\",\n",
+    "        deanonymizer_matching_strategy=ngram_fuzzy_matching_strategy,\n",
+    "    )\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -453,7 +522,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,

From 0dece6f1a2f79b6fbe47abb2b459d5bc50b5975a Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Wed, 13 Sep 2023 14:47:26 +0000
Subject: [PATCH 05/13] Lint

---
 .../data_anonymizer/deanonymizer_matching_strategies.py    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
index 935e4f13a0964..75794214ea966 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
@@ -1,4 +1,6 @@
 import re
+from typing import List
+
 from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
 
 
@@ -23,7 +25,8 @@ def case_insensitive_matching_strategy(
 ) -> str:
     """
     Case insensitive matching strategy for deanonymization.
-    It replaces all the anonymized entities with the original ones irrespective of their letter case.
+    It replaces all the anonymized entities with the original ones
+        irrespective of their letter case.
 
     Args:
         text: text to deanonymize
@@ -54,7 +57,7 @@ def ngram_fuzzy_matching_strategy(
         fuzzy_threshold: fuzzy matching threshold
     """
 
-    def generate_ngrams(words_list, n):
+    def generate_ngrams(words_list: List[str], n: int) -> list:
         """Generate n-grams from a list of words"""
         return [
             " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))

From 509c6749a3bf8b2bfe6d3ae7c48363d6c9fcf920 Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Wed, 13 Sep 2023 14:50:03 +0000
Subject: [PATCH 06/13] Remove temporary file

---
 .../data_anonymizer/test.ipynb                | 200 ------------------
 1 file changed, 200 deletions(-)
 delete mode 100644 libs/experimental/langchain_experimental/data_anonymizer/test.ipynb

diff --git a/libs/experimental/langchain_experimental/data_anonymizer/test.ipynb b/libs/experimental/langchain_experimental/data_anonymizer/test.ipynb
deleted file mode 100644
index a872dad5f993e..0000000000000
--- a/libs/experimental/langchain_experimental/data_anonymizer/test.ipynb
+++ /dev/null
@@ -1,200 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/vscode/langchain-py-env/lib/python3.11/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
-      "  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'My name is Maria Lynch, call me at 7344131647 or email me at wdavis@example.net. By the way, my card number is: 213186379402654 and my name is Maria Lynch'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
-    "\n",
-    "anonymizer = PresidioReversibleAnonymizer(\n",
-    "    analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
-    "    # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
-    "    # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
-    "    faker_seed=42,\n",
-    ")\n",
-    "\n",
-    "anonymizer.anonymize(\n",
-    "    \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
-    "    \"By the way, my card number is: 4916 0387 9536 0861 \"\n",
-    "    \"and my name is Slim Shady\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n",
-       " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
-       " 'EMAIL_ADDRESS': {'wdavis@example.net': 'real.slim.shady@gmail.com'},\n",
-       " 'CREDIT_CARD': {'213186379402654': '4916 0387 9536 0861'}}"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "anonymizer.deanonymizer_mapping"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Are you Slim Shady I found your card with number 4916 0387 9536 0861'"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "anonymizer.deanonymize(\n",
-    "    \"Are you Mari lync? I found your card with number 213186379402654\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Are you Slim Shady I found your card with number 2131 8637 9402 654'"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "anonymizer.deanonymize(\n",
-    "    \"Are you Mari lync? I found your card with number 2131 8637 9402 654\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Are you Slim Shady I found your card with number 2131 4916 0387 9536 0861'"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "anonymizer.deanonymize(\n",
-    "    \"Are you Mari lync? I found your card with number 2131 86379402654\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "89"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from fuzzywuzzy import fuzz\n",
-    "\n",
-    "fuzz.partial_ratio(\"Marie Lynch\", \"Mari Lync\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "100"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from fuzzywuzzy import fuzz\n",
-    "\n",
-    "fuzz.partial_ratio(\"Slim Shady\", \"a\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "langchain-py-env",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.4"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

From 15e1ef65b5bb47d818bd177835fe3b6f6cd4cb8f Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Thu, 14 Sep 2023 19:56:19 +0000
Subject: [PATCH 07/13] Add better fuzzy matching method

---
 .../reversible.ipynb                          |  75 +++++++++--
 .../deanonymizer_matching_strategies.py       | 123 ++++++++++++++++--
 2 files changed, 174 insertions(+), 24 deletions(-)

diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb
index af519946e941d..beb59db42744a 100644
--- a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb
+++ b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb
@@ -432,17 +432,17 @@
    "source": [
     "### Custom deanonymization strategy\n",
     "\n",
-    "The default deanonymisation strategy is to exactly match the substring in the text with the mapping entry. Due to the indeterminism of LLMs, it may be that the model will change the format of the private data slightly or make a typo, for example:\n",
+    "The default deanonymization strategy is to exactly match the substring in the text with the mapping entry. Due to the indeterminism of LLMs, it may be that the model will change the format of the private data slightly or make a typo, for example:\n",
     "- *Keanu Reeves* -> *Kaenu Reeves*\n",
-    "- *John Doe* -> *John*\n",
+    "- *John F. Kennedy* -> *John Kennedy*\n",
     "- *Main St, New York* -> *New York*\n",
     "\n",
-    "It is therefore worth considering appropriate prompt engineering (have the model return PII in unchanged format) or trying to implement your replacing strategy. For example, you can use fuzzy matching in combination with ngrams. This will solve problems with typos and minor changes in the text. Some implementations of the swapping strategy can be found in the file `deanonymizer_matching_strategies.py`."
+    "It is therefore worth considering appropriate prompt engineering (have the model return PII in unchanged format) or implementing your own replacement strategy. For example, you can use fuzzy matching - this will solve problems with typos and minor changes in the text. Some implementations of the replacement strategy can be found in the file `deanonymizer_matching_strategies.py`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -456,7 +456,6 @@
    ],
    "source": [
     "from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
-    "    ngram_fuzzy_matching_strategy,\n",
     "    case_insensitive_matching_strategy,\n",
     ")\n",
     "\n",
@@ -471,30 +470,86 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Call Marie Lync at 734-413-1647\n",
+      "Call Maria K. Lynch at 734-413-1647\n",
       "Call Slim Shady at 313-666-7440\n"
      ]
     }
    ],
    "source": [
+    "from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
+    "    fuzzy_matching_strategy,\n",
+    ")\n",
+    "\n",
     "# Original name: Maria Lynch\n",
     "# Original phone number: 7344131647 (without dashes)\n",
-    "print(anonymizer.deanonymize(\"Call Marie Lync at 734-413-1647\"))\n",
+    "print(anonymizer.deanonymize(\"Call Maria K. Lynch at 734-413-1647\"))\n",
+    "print(\n",
+    "    anonymizer.deanonymize(\n",
+    "        \"Call Maria K. Lynch at 734-413-1647\",\n",
+    "        deanonymizer_matching_strategy=fuzzy_matching_strategy,\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It seems that the combined method works best:\n",
+    "- first apply the exact match strategy\n",
+    "- then match the rest using the fuzzy strategy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Are you Slim Shady? I found your card with number 4916 0387 9536 0861.\n",
+      "Is this your phone number: 313-666-7440?\n",
+      "Is this your email address: wdavis@example.net\n"
+     ]
+    }
+   ],
+   "source": [
+    "from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
+    "    combined_exact_fuzzy_matching_strategy,\n",
+    ")\n",
+    "\n",
+    "# Changed some values for fuzzy match showcase:\n",
+    "# - \"Maria Lynch\" -> \"Maria F. Lynch\"\n",
+    "# - \"7344131647\" -> \"734-413-1647\"\n",
+    "# - credit card number written with spaces: \"4838 6379 40262\"\n",
     "print(\n",
     "    anonymizer.deanonymize(\n",
-    "        \"Call Marie Lync at 734-413-1647\",\n",
-    "        deanonymizer_matching_strategy=ngram_fuzzy_matching_strategy,\n",
+    "        (\n",
+    "            \"Are you Maria F. Lynch? I found your card with number 4838 6379 40262.\\n\"\n",
+    "            \"Is this your phone number: 734-413-1647?\\n\"\n",
+    "            \"Is this your email address: wdavis@example.net\"\n",
+    "        ),\n",
+    "        deanonymizer_matching_strategy=combined_exact_fuzzy_matching_strategy,\n",
     "    )\n",
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Of course, there is no perfect method and it is worth experimenting and finding the one best suited to your use case."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
index 75794214ea966..6bb8c9b8d749d 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
@@ -31,6 +31,10 @@ def case_insensitive_matching_strategy(
     Args:
         text: text to deanonymize
         deanonymizer_mapping: mapping between anonymized entities and original ones
+
+    Examples of matching:
+        keanu reeves -> Keanu Reeves
+        JOHN F. KENNEDY -> John F. Kennedy
     """
 
     # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
@@ -41,8 +45,78 @@ def case_insensitive_matching_strategy(
     return text
 
 
+def fuzzy_matching_strategy(
+    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
+) -> str:
+    """
+    Fuzzy matching strategy for deanonymization.
+    It uses fuzzy matching to find the position of the anonymized entity in the text.
+    It replaces all the anonymized entities with the original ones.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
+            text segment to consider it a match
+
+    Examples of matching:
+        Kaenu Reves -> Keanu Reeves
+        John F. Kennedy -> John Kennedy
+    """
+
+    try:
+        from fuzzysearch import find_near_matches
+    except ImportError as e:
+        raise ImportError(
+            "Could not import fuzzysearch, please install with "
+            "`pip install fuzzysearch`."
+        ) from e
+
+    for entity_type in deanonymizer_mapping:
+        for anonymized, original in deanonymizer_mapping[entity_type].items():
+            matches = find_near_matches(anonymized, text, max_l_dist=max_l_dist)
+            new_text = ""
+            last_end = 0
+            for m in matches:
+                # add the text that isn't part of a match
+                new_text += text[last_end : m.start]
+                # add the replacement text
+                new_text += original
+                last_end = m.end
+            # add the remaining text that wasn't part of a match
+            new_text += text[last_end:]
+            text = new_text
+
+    return text
+
+
+def combined_exact_fuzzy_matching_strategy(
+    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
+) -> str:
+    """
+    RECOMMENDED STRATEGY.
+    Combined exact and fuzzy matching strategy for deanonymization.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
+            text segment to consider it a match
+
+    Examples of matching:
+        Kaenu Reves -> Keanu Reeves
+        John F. Kennedy -> John Kennedy
+    """
+    text = exact_matching_strategy(text, deanonymizer_mapping)
+    text = fuzzy_matching_strategy(text, deanonymizer_mapping, max_l_dist)
+    return text
+
+
 def ngram_fuzzy_matching_strategy(
-    text: str, deanonymizer_mapping: MappingDataType, fuzzy_threshold: int = 80
+    text: str,
+    deanonymizer_mapping: MappingDataType,
+    fuzzy_threshold: int = 85,
+    use_variable_length: bool = True,
 ) -> str:
     """
     N-gram fuzzy matching strategy for deanonymization.
@@ -55,6 +129,7 @@ def ngram_fuzzy_matching_strategy(
         text: text to deanonymize
         deanonymizer_mapping: mapping between anonymized entities and original ones
         fuzzy_threshold: fuzzy matching threshold
+        use_variable_length: whether to use (n-1, n, n+1)-grams or just n-grams
     """
 
     def generate_ngrams(words_list: List[str], n: int) -> list:
@@ -65,26 +140,46 @@ def generate_ngrams(words_list: List[str], n: int) -> list:
 
     try:
         from fuzzywuzzy import fuzz
-
     except ImportError as e:
         raise ImportError(
             "Could not import fuzzywuzzy, please install with "
             "`pip install fuzzywuzzy`."
         ) from e
 
+    text_words = text.split()
+    replacements = []
+    matched_indices = []
+
     for entity_type in deanonymizer_mapping:
         for anonymized, original in deanonymizer_mapping[entity_type].items():
             anonymized_words = anonymized.split()
-            text_words = text.split()
-
-            # Generate text segments of the same length as the anonymized entity
-            segments = generate_ngrams(text_words, len(anonymized_words))
-
-            for i, segment in enumerate(segments):
-                # Fuzzy match the segment with the anonymized entity
-                if fuzz.ratio(anonymized.lower(), segment.lower()) > fuzzy_threshold:
-                    text_words[i : i + len(anonymized_words)] = original.split()
 
-            text = " ".join(text_words)
-
-    return text
+            if use_variable_length:
+                gram_lengths = [
+                    len(anonymized_words) - 1,
+                    len(anonymized_words),
+                    len(anonymized_words) + 1,
+                ]
+            else:
+                gram_lengths = [len(anonymized_words)]
+            for n in gram_lengths:
+                if n > 0:  # Take only positive values
+                    segments = generate_ngrams(text_words, n)
+                    for i, segment in enumerate(segments):
+                        if (
+                            fuzz.ratio(anonymized.lower(), segment.lower())
+                            > fuzzy_threshold
+                            and i not in matched_indices
+                        ):
+                            replacements.append((i, n, original))
+                            # Add the matched segment indices to the list
+                            matched_indices.extend(range(i, i + n))
+
+    # Sort replacements by index in reverse order
+    replacements.sort(key=lambda x: x[0], reverse=True)
+
+    # Apply replacements in reverse order to not affect subsequent indices
+    for start, length, replacement in replacements:
+        text_words[start : start + length] = replacement.split()
+
+    return " ".join(text_words)

From 0a432399fbe3d8b1f85cfd8b7d34c4fbde2ed66d Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Thu, 14 Sep 2023 19:56:33 +0000
Subject: [PATCH 08/13] Add tests

---
 .../tests/unit_tests/test_data_anonymizer.py  | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
index 138b60eca89e0..95d5d4b598319 100644
--- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
@@ -82,3 +82,76 @@ def test_add_recognizer_operator() -> None:
     anonymizer.add_operators(custom_operator)
     anonymized_text = anonymizer.anonymize(text)
     assert anonymized_text == "Dear Jane Doe was here."
+
+
+@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
+def test_exact_matching_strategy() -> None:
+    """
+    Test exact matching strategy for deanonymization.
+    """
+    from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
+        exact_matching_strategy,
+    )
+
+    deanonymizer_mapping = {
+        "PERSON": {"Maria Lynch": "Slim Shady"},
+        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
+        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
+        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
+    }
+
+    text = (
+        "Are you Maria Lynch? I found your card with number 213186379402654. "
+        "Is this your phone number: 7344131647? "
+        "Is this your email address: wdavis@example.net"
+    )
+
+    deanonymized_text = exact_matching_strategy(text, deanonymizer_mapping)
+
+    for original_value in [
+        "Slim Shady",
+        "313-666-7440",
+        "real.slim.shady@gmail.com",
+        "4916 0387 9536 0861",
+    ]:
+        assert original_value in deanonymized_text
+
+
+@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
+def test_best_matching_strategy() -> None:
+    """
+    Test exact matching strategy for deanonymization.
+    """
+    from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
+        combined_exact_fuzzy_matching_strategy,
+    )
+
+    deanonymizer_mapping = {
+        "PERSON": {"Maria Lynch": "Slim Shady"},
+        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
+        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
+        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
+    }
+
+    # Changed some values:
+    # - "Maria Lynch" -> "Maria K. Lynch"
+    # - "7344131647" -> "734-413-1647"
+    # - "213186379402654" -> "2131 8637 9402 654"
+    # - "wdavis@example.net" -> the same to test exact match
+    text = (
+        "Are you Maria K. Lynch? I found your card with number 2131 8637 9402 654. "
+        "Is this your phone number: 734-413-1647?"
+        "Is this your email address: wdavis@example.net"
+    )
+
+    deanonymized_text = combined_exact_fuzzy_matching_strategy(
+        text, deanonymizer_mapping
+    )
+
+    for original_value in [
+        "Slim Shady",
+        "313-666-7440",
+        "real.slim.shady@gmail.com",
+        "4916 0387 9536 0861",
+    ]:
+        assert original_value in deanonymized_text

From 575efc573d9317a448ff68e94875fdc2bca700eb Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Thu, 14 Sep 2023 19:59:34 +0000
Subject: [PATCH 09/13] Lint

---
 .../tests/unit_tests/test_data_anonymizer.py         | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
index 95d5d4b598319..052d23865940f 100644
--- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
@@ -89,8 +89,8 @@ def test_exact_matching_strategy() -> None:
     """
     Test exact matching strategy for deanonymization.
     """
-    from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
-        exact_matching_strategy,
+    from langchain_experimental.data_anonymizer import (
+        deanonymizer_matching_strategies as dms,
     )
 
     deanonymizer_mapping = {
@@ -106,7 +106,7 @@ def test_exact_matching_strategy() -> None:
         "Is this your email address: wdavis@example.net"
     )
 
-    deanonymized_text = exact_matching_strategy(text, deanonymizer_mapping)
+    deanonymized_text = dms.exact_matching_strategy(text, deanonymizer_mapping)
 
     for original_value in [
         "Slim Shady",
@@ -122,8 +122,8 @@ def test_best_matching_strategy() -> None:
     """
     Test exact matching strategy for deanonymization.
     """
-    from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
-        combined_exact_fuzzy_matching_strategy,
+    from langchain_experimental.data_anonymizer import (
+        deanonymizer_matching_strategies as dms,
     )
 
     deanonymizer_mapping = {
@@ -144,7 +144,7 @@ def test_best_matching_strategy() -> None:
         "Is this your email address: wdavis@example.net"
     )
 
-    deanonymized_text = combined_exact_fuzzy_matching_strategy(
+    deanonymized_text = dms.combined_exact_fuzzy_matching_strategy(
         text, deanonymizer_mapping
     )
 

From 319c1ed0cd6a0cbe547b4300fa68d5b35eb15600 Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Thu, 14 Sep 2023 20:04:37 +0000
Subject: [PATCH 10/13] Lint 2

---
 .../data_anonymizer/deanonymizer_matching_strategies.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
index 6bb8c9b8d749d..da43f95c97901 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
@@ -148,7 +148,7 @@ def generate_ngrams(words_list: List[str], n: int) -> list:
 
     text_words = text.split()
     replacements = []
-    matched_indices = []
+    matched_indices: List[int] = []
 
     for entity_type in deanonymizer_mapping:
         for anonymized, original in deanonymizer_mapping[entity_type].items():

From 8adc212bed6d00b83b34a65c5f0610673b2d5cc5 Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Mon, 9 Oct 2023 15:13:17 +0000
Subject: [PATCH 11/13] Exact match strategy for anonymization

---
 .../langchain_experimental/data_anonymizer/presidio.py     | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
index 6fda5c96f18e4..0b4012cdb7287 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
@@ -16,6 +16,9 @@
     MappingDataType,
     create_anonymizer_mapping,
 )
+from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
+    exact_matching_strategy,
+)
 from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
     get_pseudoanonymizer_mapping,
 )
@@ -188,7 +191,7 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
             filtered_analyzer_results,
             anonymizer_results,
         )
-        return default_matching_strategy(text, anonymizer_mapping)
+        return exact_matching_strategy(text, anonymizer_mapping)
 
 
 class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
@@ -280,7 +283,7 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str:
         )
         self._deanonymizer_mapping.update(new_deanonymizer_mapping)
 
-        return default_matching_strategy(text, self.anonymizer_mapping)
+        return exact_matching_strategy(text, self.anonymizer_mapping)
 
     def _deanonymize(
         self,

From 20c29b76beaed6cbce9217d662c3af6d10aa455f Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Mon, 9 Oct 2023 15:13:55 +0000
Subject: [PATCH 12/13] Remove unnecessary fixtures from tests

---
 .../tests/unit_tests/test_data_anonymizer.py             | 9 ---------
 .../tests/unit_tests/test_reversible_data_anonymizer.py  | 9 ---------
 2 files changed, 18 deletions(-)

diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
index c600dae923ec3..60fd16e082b0f 100644
--- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
@@ -2,8 +2,6 @@
 
 import pytest
 
-from . import is_libcublas_available
-
 
 @pytest.fixture(scope="module", autouse=True)
 def check_spacy_model() -> Iterator[None]:
@@ -14,13 +12,6 @@ def check_spacy_model() -> Iterator[None]:
     yield
 
 
-@pytest.fixture(scope="module", autouse=True)
-def check_libcublas() -> Iterator[None]:
-    if not is_libcublas_available():
-        pytest.skip(reason="libcublas.so is not available")
-    yield
-
-
 @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
 @pytest.mark.parametrize(
     "analyzed_fields,should_contain",
diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
index 0a30afa054b12..8ef2dcf68ca3f 100644
--- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
@@ -3,8 +3,6 @@
 
 import pytest
 
-from . import is_libcublas_available
-
 
 @pytest.fixture(scope="module", autouse=True)
 def check_spacy_model() -> Iterator[None]:
@@ -15,13 +13,6 @@ def check_spacy_model() -> Iterator[None]:
     yield
 
 
-@pytest.fixture(scope="module", autouse=True)
-def check_libcublas() -> Iterator[None]:
-    if not is_libcublas_available():
-        pytest.skip(reason="libcublas.so is not available")
-    yield
-
-
 @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
 @pytest.mark.parametrize(
     "analyzed_fields,should_contain",

From 33e827c08b73eecde7fdfc78a32e1e0bb43945e2 Mon Sep 17 00:00:00 2001
From: maks-operlejn-ds <maksymilian.operlejn@deepsense.ai>
Date: Mon, 9 Oct 2023 15:34:16 +0000
Subject: [PATCH 13/13] Revert "Remove unnecessary fixtures from tests"

This reverts commit 20c29b76beaed6cbce9217d662c3af6d10aa455f.
---
 .../tests/unit_tests/test_data_anonymizer.py             | 9 +++++++++
 .../tests/unit_tests/test_reversible_data_anonymizer.py  | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
index 60fd16e082b0f..c600dae923ec3 100644
--- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
@@ -2,6 +2,8 @@
 
 import pytest
 
+from . import is_libcublas_available
+
 
 @pytest.fixture(scope="module", autouse=True)
 def check_spacy_model() -> Iterator[None]:
@@ -12,6 +14,13 @@ def check_spacy_model() -> Iterator[None]:
     yield
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_libcublas() -> Iterator[None]:
+    if not is_libcublas_available():
+        pytest.skip(reason="libcublas.so is not available")
+    yield
+
+
 @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
 @pytest.mark.parametrize(
     "analyzed_fields,should_contain",
diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
index 8ef2dcf68ca3f..0a30afa054b12 100644
--- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
@@ -3,6 +3,8 @@
 
 import pytest
 
+from . import is_libcublas_available
+
 
 @pytest.fixture(scope="module", autouse=True)
 def check_spacy_model() -> Iterator[None]:
@@ -13,6 +15,13 @@ def check_spacy_model() -> Iterator[None]:
     yield
 
 
+@pytest.fixture(scope="module", autouse=True)
+def check_libcublas() -> Iterator[None]:
+    if not is_libcublas_available():
+        pytest.skip(reason="libcublas.so is not available")
+    yield
+
+
 @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
 @pytest.mark.parametrize(
     "analyzed_fields,should_contain",