Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Anonymization matching strategy #18

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 1,
"metadata": {},
"outputs": [
{
Expand All @@ -285,7 +285,7 @@
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}"
]
},
"execution_count": 8,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -317,7 +317,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand All @@ -337,7 +337,7 @@
" '3537672423884966': '4001 9192 5753 7193'}}"
]
},
"execution_count": 9,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -361,7 +361,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -380,7 +380,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -389,7 +389,7 @@
"{}"
]
},
"execution_count": 11,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -402,7 +402,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -415,7 +415,7 @@
" '3537672423884966': '4001 9192 5753 7193'}}"
]
},
"execution_count": 12,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -426,6 +426,75 @@
"anonymizer.deanonymizer_mapping"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Custom deanonymization strategy\n",
"\n",
"The default deanonymization strategy is to exactly match the substring in the text with the mapping entry. Due to the non-determinism of LLMs, the model may slightly change the format of the private data or make a typo, for example:\n",
maks-operlejn-ds marked this conversation as resolved.
Show resolved Hide resolved
"- *Keanu Reeves* -> *Kaenu Reeves*\n",
"- *John Doe* -> *John*\n",
"- *Main St, New York* -> *New York*\n",
"\n",
"It is therefore worth considering appropriate prompt engineering (have the model return PII in an unchanged format) or trying to implement your own replacement strategy. For example, you can use fuzzy matching combined with n-grams. This will solve problems with typos and minor changes in the text. Some implementations of such replacement strategies can be found in the file `deanonymizer_matching_strategies.py`."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"maria lynch\n",
"Slim Shady\n"
]
}
],
"source": [
"from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
" ngram_fuzzy_matching_strategy,\n",
" case_insensitive_matching_strategy,\n",
")\n",
"\n",
"# Original name: Maria Lynch\n",
"print(anonymizer.deanonymize(\"maria lynch\"))\n",
"print(\n",
" anonymizer.deanonymize(\n",
" \"maria lynch\", deanonymizer_matching_strategy=case_insensitive_matching_strategy\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Call Marie Lync at 734-413-1647\n",
"Call Slim Shady at 313-666-7440\n"
]
}
],
"source": [
"# Original name: Maria Lynch\n",
"# Original phone number: 7344131647 (without dashes)\n",
"print(anonymizer.deanonymize(\"Call Marie Lync at 734-413-1647\"))\n",
"print(\n",
" anonymizer.deanonymize(\n",
" \"Call Marie Lync at 734-413-1647\",\n",
" deanonymizer_matching_strategy=ngram_fuzzy_matching_strategy,\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -453,7 +522,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.11.4"
}
},
"nbformat": 4,
Expand Down
25 changes: 21 additions & 4 deletions libs/experimental/langchain_experimental/data_anonymizer/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
from abc import ABC, abstractmethod
from typing import Optional
from typing import Callable, Optional

from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
exact_matching_strategy,
)

DEFAULT_DEANONYMIZER_MATCHING_STRATEGY = exact_matching_strategy


class AnonymizerBase(ABC):
Expand All @@ -23,10 +30,20 @@ class ReversibleAnonymizerBase(AnonymizerBase):
Base abstract class for reversible anonymizers.
"""

def deanonymize(self, text: str) -> str:
def deanonymize(
self,
text_to_deanonymize: str,
deanonymizer_matching_strategy: Callable[
[str, MappingDataType], str
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
) -> str:
"""Deanonymize text"""
return self._deanonymize(text)
return self._deanonymize(text_to_deanonymize, deanonymizer_matching_strategy)

@abstractmethod
def _deanonymize(self, text: str) -> str:
def _deanonymize(
self,
text_to_deanonymize: str,
deanonymizer_matching_strategy: Callable[[str, MappingDataType], str],
) -> str:
"""Abstract method to deanonymize text"""
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from langchain_experimental.data_anonymizer.presidio import MappingDataType
import re
from typing import List

from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType

def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:

def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
"""
Default matching strategy for deanonymization.
Exact matching strategy for deanonymization.
It replaces all the anonymized entities with the original ones.

Args:
Expand All @@ -15,3 +18,73 @@ def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType)
for anonymized, original in deanonymizer_mapping[entity_type].items():
text = text.replace(anonymized, original)
return text


def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType
) -> str:
    """
    Case insensitive matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones
    irrespective of their letter case.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
    """

    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # re.escape: anonymized values (emails, phone numbers, card numbers)
            # often contain regex metacharacters ('.', '(', '+'); without escaping
            # they would be interpreted as a pattern and could match the wrong
            # text or raise re.error. The callable replacement keeps any
            # backslashes in `original` literal instead of being treated as
            # group references.
            text = re.sub(
                re.escape(anonymized),
                lambda _match, _repl=original: _repl,
                text,
                flags=re.IGNORECASE,
            )
    return text


def ngram_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, fuzzy_threshold: int = 80
) -> str:
    """
    N-gram fuzzy matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.
    For every anonymized entity it slides a window of the same word length
    over the text and fuzzy-matches each window against the entity, so it
    tolerates typos and small format changes (e.g. "Marie Lync" ~ "Maria Lynch").

    NOTE: the text is re-split and re-joined on whitespace, so runs of spaces
    and newlines in the input are collapsed to single spaces.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        fuzzy_threshold: fuzzy matching threshold (0-100); a window is replaced
            when its fuzz.ratio similarity strictly exceeds this value
    """
    try:
        from fuzzywuzzy import fuzz

    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            anonymized_lower = anonymized.lower()
            window = len(anonymized.split())
            # An empty/whitespace-only anonymized value has no words to match;
            # skip it to avoid a zero-width window.
            if window == 0:
                continue

            text_words = text.split()
            result_words: List[str] = []
            i = 0
            # Scan left to right; on a match, emit the original value and jump
            # past the matched window. Building a fresh list (instead of
            # splicing replacements into text_words while indexing segments
            # precomputed from the old list) keeps indices valid even when
            # `original` has a different word count than `anonymized`, and
            # avoids re-matching text that was just replaced.
            while i < len(text_words):
                if i + window <= len(text_words):
                    segment = " ".join(text_words[i : i + window])
                    if fuzz.ratio(anonymized_lower, segment.lower()) > fuzzy_threshold:
                        result_words.extend(original.split())
                        i += window
                        continue
                result_words.append(text_words[i])
                i += 1
            text = " ".join(result_words)

    return text
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,14 @@
import yaml

from langchain_experimental.data_anonymizer.base import (
DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
AnonymizerBase,
ReversibleAnonymizerBase,
)
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
DeanonymizerMapping,
MappingDataType,
)
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
default_matching_strategy,
)
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
get_pseudoanonymizer_mapping,
)
Expand Down Expand Up @@ -289,7 +287,7 @@ def _deanonymize(
text_to_deanonymize: str,
deanonymizer_matching_strategy: Callable[
[str, MappingDataType], str
] = default_matching_strategy,
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
) -> str:
"""Deanonymize text.
Each anonymized entity is replaced with its original value.
Expand Down