diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb index 2502a45092244..617809d489d85 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb @@ -53,7 +53,7 @@ { "data": { "text/plain": [ - "'My name is Laura Ruiz, call me at +1-412-982-8374x13414 or email me at javierwatkins@example.net'" + "'My name is James Martinez, call me at (576)928-1972x679 or email me at lisa44@example.com'" ] }, "execution_count": 2, @@ -114,11 +114,11 @@ "text": [ "Dear Sir/Madam,\n", "\n", - "We regret to inform you that Richard Fields has recently misplaced his wallet, which contains a sum of cash and his credit card bearing the number 30479847307774. \n", + "We regret to inform you that Mr. Dennis Cooper has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 3588895295514977. 
\n", "\n", - "Should you happen to come across it, we kindly request that you contact us immediately at 6439182672 or via email at frank45@example.com.\n", + "Should you happen to come across the aforementioned wallet, kindly contact us immediately at (428)451-3494x4110 or send an email to perryluke@example.com.\n", "\n", - "Thank you for your attention to this matter.\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", "\n", "Yours faithfully,\n", "\n", @@ -159,7 +159,7 @@ { "data": { "text/plain": [ - "'My name is Adrian Fleming, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + "'My name is Shannon Steele, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" ] }, "execution_count": 6, @@ -190,7 +190,7 @@ { "data": { "text/plain": [ - "'My name is Justin Miller, call me at 761-824-1889 or email me at real.slim.shady@gmail.com'" + "'My name is Wesley Flores, call me at (498)576-9526 or email me at real.slim.shady@gmail.com'" ] }, "execution_count": 7, @@ -225,7 +225,7 @@ { "data": { "text/plain": [ - "'My name is Dr. Jennifer Baker, call me at (508)839-9329x232 or email me at ehamilton@example.com'" + "'My name is Carla Fisher, call me at 001-683-324-0721x0644 or email me at krausejeremy@example.com'" ] }, "execution_count": 8, @@ -256,7 +256,7 @@ { "data": { "text/plain": [ - "'My polish phone number is NRGN41434238921378'" + "'My polish phone number is QESQ21234635370499'" ] }, "execution_count": 9, @@ -361,7 +361,7 @@ { "data": { "text/plain": [ - "'511 622 683'" + "'665 631 080'" ] }, "execution_count": 13, @@ -422,7 +422,7 @@ { "data": { "text/plain": [ - "'My polish phone number is +48 734 630 977'" + "'My polish phone number is 538 521 657'" ] }, "execution_count": 16, @@ -438,8 +438,80 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Future works\n", - "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. 
Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object." + "## Important considerations\n", + "\n", + "### Anonymizer detection rates\n", + "\n", + "**The level of anonymization and the precision of detection are only as good as the quality of the recognizers implemented.**\n", + "\n", + "Texts from different sources and in different languages have varying characteristics, so it is necessary to test the detection precision and iteratively add recognizers and operators to achieve better and better results.\n", + "\n", + "Microsoft Presidio gives a lot of freedom to refine anonymization. The library's author has provided his [recommendations and a step-by-step guide for improving detection rates](https://github.com/microsoft/presidio/discussions/767#discussion-3567223)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Instance anonymization\n", + "\n", + "`PresidioAnonymizer` has no built-in memory. Therefore, two occurrences of the same entity in subsequent texts will be replaced with two different fake values:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Robert Morales. Hi Robert Morales!\n", + "My name is Kelly Mccoy. Hi Kelly Mccoy!\n" + ] + } + ], + "source": [ + "print(anonymizer.anonymize(\"My name is John Doe. Hi John Doe!\"))\n", + "print(anonymizer.anonymize(\"My name is John Doe. 
Hi John Doe!\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To preserve previous anonymization results, use `PresidioReversibleAnonymizer`, which has built-in memory:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Ashley Cervantes. Hi Ashley Cervantes!\n", + "My name is Ashley Cervantes. Hi Ashley Cervantes!\n" + ] + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer_with_memory = PresidioReversibleAnonymizer()\n", + "\n", + "print(anonymizer_with_memory.anonymize(\"My name is John Doe. Hi John Doe!\"))\n", + "print(anonymizer_with_memory.anonymize(\"My name is John Doe. Hi John Doe!\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can learn more about `PresidioReversibleAnonymizer` in the next section." ] } ], @@ -459,7 +531,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb index de5655ba1e9d5..a61f7894d3ec2 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb @@ -185,14 +185,13 @@ "text": [ "Dear Sir/Madam,\n", "\n", - "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n", + "We regret to inform you that Monique Turner has recently misplaced his wallet, which contains a sum of cash and his credit card with the number 213152056829866. 
\n", "\n", - "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", + "If you happen to come across this wallet, kindly contact us at (770)908-7734x2835 or send an email to barbara25@example.net.\n", "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", - "\n", - "Yours faithfully,\n", + "Thank you for your cooperation.\n", "\n", + "Sincerely,\n", "[Your Name]\n" ] } @@ -232,14 +231,13 @@ "text": [ "Dear Sir/Madam,\n", "\n", - "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n", - "\n", - "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", + "We regret to inform you that Slim Shady has recently misplaced his wallet, which contains a sum of cash and his credit card with the number 4916 0387 9536 0861. 
\n", "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", + "If you happen to come across this wallet, kindly contact us at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", "\n", - "Yours faithfully,\n", + "Thank you for your cooperation.\n", "\n", + "Sincerely,\n", "[Your Name]\n" ] } @@ -356,13 +354,57 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can save the mapping itself to a file for future use: " + "Thanks to the built-in memory, entities that have already been detected and anonymised will take the same form in subsequent processed texts, so no duplicates will exist in the mapping:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My VISA card number is 3537672423884966 and my name is William Bowman.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"My VISA card number is 4001 9192 5753 7193 and my name is John Doe.\"\n", + " )\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can save the mapping itself to a file for future use: " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, "outputs": [], "source": [ "# We can save the deanonymizer mapping as a JSON or YAML file\n", @@ -380,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -389,7 
+431,7 @@ "{}" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -402,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -415,7 +457,7 @@ " '3537672423884966': '4001 9192 5753 7193'}}" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -432,7 +474,6 @@ "source": [ "## Future works\n", "\n", - "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs." 
] } @@ -453,7 +494,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index 2ee03eb208040..9db586c2848c3 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -1,10 +1,26 @@ +import re from collections import defaultdict from dataclasses import dataclass, field -from typing import Dict +from typing import Dict, List + +from presidio_analyzer import RecognizerResult +from presidio_anonymizer.entities import EngineResult MappingDataType = Dict[str, Dict[str, str]] +def format_duplicated_operator(operator_name: str, count: int) -> str: + """Format the operator name with the count""" + + clean_operator_name = re.sub(r"[<>]", "", operator_name) + clean_operator_name = re.sub(r"_\d+$", "", clean_operator_name) + + if operator_name.startswith("<") and operator_name.endswith(">"): + return f"<{clean_operator_name}_{count}>" + else: + return f"{clean_operator_name}_{count}" + + @dataclass class DeanonymizerMapping: mapping: MappingDataType = field( @@ -17,5 +33,107 @@ def data(self) -> MappingDataType: return {k: dict(v) for k, v in self.mapping.items()} def update(self, new_mapping: MappingDataType) -> None: + """Update the deanonymizer mapping with new values + Duplicated values will not be added + If there are multiple entities of the same type, the mapping will + include a count to differentiate them. For example, if there are + two names in the input text, the mapping will include NAME_1 and NAME_2. 
+ """ + seen_values = set() + for entity_type, values in new_mapping.items(): - self.mapping[entity_type].update(values) + count = len(self.mapping[entity_type]) + 1 + + for key, value in values.items(): + if ( + value not in seen_values + and value not in self.mapping[entity_type].values() + ): + new_key = ( + format_duplicated_operator(key, count) + if key in self.mapping[entity_type] + else key + ) + + self.mapping[entity_type][new_key] = value + seen_values.add(value) + count += 1 + + +def create_anonymizer_mapping( + original_text: str, + analyzer_results: List[RecognizerResult], + anonymizer_results: EngineResult, + is_reversed: bool = False, +) -> MappingDataType: + """Creates or updates the mapping used to anonymize and/or deanonymize text. + + This method exploits the results returned by the + analysis and anonymization processes. + + If is_reversed is False, it constructs a mapping from each original + entity to its anonymized value. + + If is_reversed is True, it constructs a mapping from each + anonymized entity back to its original text value. + + If there are multiple entities of the same type, the mapping will + include a count to differentiate them. For example, if there are + two names in the input text, the mapping will include <PERSON> and <PERSON_2>. + + Example of mapping (is_reversed=True): + { + "PERSON": { + "<anonymized>": "<original>", + "John Doe": "Slim Shady" + }, + "PHONE_NUMBER": { + "111-111-1111": "555-555-5555" + } + ... + } + """ + # We are able to zip and loop through both lists because we expect + # them to return corresponding entities for each identified piece + # of analyzable data from our input. + + # We sort them by their 'start' attribute because it allows us to + # match corresponding entities by their position in the input text. 
+ analyzer_results.sort(key=lambda d: d.start) + anonymizer_results.items.sort(key=lambda d: d.start) + + mapping: MappingDataType = defaultdict(dict) + count: dict = defaultdict(int) + + for analyzed, anonymized in zip(analyzer_results, anonymizer_results.items): + original_value = original_text[analyzed.start : analyzed.end] + entity_type = anonymized.entity_type + + if is_reversed: + cond = original_value in mapping[entity_type].values() + else: + cond = original_value in mapping[entity_type] + + if cond: + continue + + if ( + anonymized.text in mapping[entity_type].values() + or anonymized.text in mapping[entity_type] + ): + anonymized_value = format_duplicated_operator( + anonymized.text, count[entity_type] + 2 + ) + count[entity_type] += 1 + else: + anonymized_value = anonymized.text + + mapping_key, mapping_value = ( + (anonymized_value, original_value) + if is_reversed + else (original_value, anonymized_value) + ) + + mapping[entity_type][mapping_key] = mapping_value + + return mapping diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index b2be1dc5a1c0d..6f102fdd62b55 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -from collections import defaultdict from pathlib import Path from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union @@ -14,6 +13,7 @@ from langchain_experimental.data_anonymizer.deanonymizer_mapping import ( DeanonymizerMapping, MappingDataType, + create_anonymizer_mapping, ) from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import ( default_matching_strategy, @@ -43,8 +43,7 @@ ) from e if TYPE_CHECKING: - from presidio_analyzer import EntityRecognizer, RecognizerResult - from presidio_anonymizer.entities import EngineResult + from 
presidio_analyzer import EntityRecognizer # Configuring Anonymizer for multiple languages # Detailed description and examples can be found here: @@ -69,6 +68,7 @@ def __init__( analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, + add_default_faker_operators: bool = True, faker_seed: Optional[int] = None, ): """ @@ -93,10 +93,9 @@ def __init__( if analyzed_fields is not None else list(get_pseudoanonymizer_mapping().keys()) ) - self.operators = ( - operators - if operators is not None - else { + + if add_default_faker_operators: + self.operators = { field: OperatorConfig( operator_name="custom", params={"lambda": faker_function} ) @@ -104,7 +103,11 @@ def __init__( faker_seed ).items() } - ) + else: + self.operators = {} + + if operators: + self.add_operators(operators) provider = NlpEngineProvider(nlp_configuration=languages_config) nlp_engine = provider.create_engine() @@ -140,6 +143,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: Each PII entity is replaced with a fake value. Each time fake values will be different, as they are generated randomly. + PresidioAnonymizer has no built-in memory - + so it will not remember the effects of anonymizing previous texts. + >>> anonymizer = PresidioAnonymizer() + >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") + 'My name is Noah Rhodes. Hi Noah Rhodes!' + >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") + 'My name is Brett Russell. Hi Brett Russell!' + Args: text: text to anonymize language: language to use for analysis of PII @@ -156,17 +167,30 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: "Change your language configuration file to add more languages." 
) - results = self._analyzer.analyze( + analyzer_results = self._analyzer.analyze( text, entities=self.analyzed_fields, language=language, ) - return self._anonymizer.anonymize( + filtered_analyzer_results = ( + self._anonymizer._remove_conflicts_and_get_text_manipulation_data( + analyzer_results + ) + ) + + anonymizer_results = self._anonymizer.anonymize( text, - analyzer_results=results, + analyzer_results=analyzer_results, operators=self.operators, - ).text + ) + + anonymizer_mapping = create_anonymizer_mapping( + text, + filtered_analyzer_results, + anonymizer_results, + ) + return default_matching_strategy(text, anonymizer_mapping) class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase): @@ -175,9 +199,16 @@ def __init__( analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, + add_default_faker_operators: bool = True, faker_seed: Optional[int] = None, ): - super().__init__(analyzed_fields, operators, languages_config, faker_seed) + super().__init__( + analyzed_fields, + operators, + languages_config, + add_default_faker_operators, + faker_seed, + ) self._deanonymizer_mapping = DeanonymizerMapping() @property @@ -185,57 +216,14 @@ def deanonymizer_mapping(self) -> MappingDataType: """Return the deanonymizer mapping""" return self._deanonymizer_mapping.data - def _update_deanonymizer_mapping( - self, - original_text: str, - analyzer_results: List[RecognizerResult], - anonymizer_results: EngineResult, - ) -> None: - """Creates or updates the mapping used to de-anonymize text. - - This method exploits the results returned by the - analysis and anonymization processes. - - It constructs a mapping from each anonymized entity - back to its original text value. - - Mapping will be stored as "deanonymizer_mapping" property. 
- - Example of "deanonymizer_mapping": - { - "PERSON": { - "": "", - "John Doe": "Slim Shady" - }, - "PHONE_NUMBER": { - "111-111-1111": "555-555-5555" - } - ... + @property + def anonymizer_mapping(self) -> MappingDataType: + """Return the anonymizer mapping + This is just the reverse version of the deanonymizer mapping.""" + return { + key: {v: k for k, v in inner_dict.items()} + for key, inner_dict in self.deanonymizer_mapping.items() } - """ - - # We are able to zip and loop through both lists because we expect - # them to return corresponding entities for each identified piece - # of analyzable data from our input. - - # We sort them by their 'start' attribute because it allows us to - # match corresponding entities by their position in the input text. - analyzer_results = sorted(analyzer_results, key=lambda d: d.start) - anonymizer_results.items = sorted( - anonymizer_results.items, key=lambda d: d.start - ) - - new_deanonymizer_mapping: MappingDataType = defaultdict(dict) - - for analyzed_entity, anonymized_entity in zip( - analyzer_results, anonymizer_results.items - ): - original_value = original_text[analyzed_entity.start : analyzed_entity.end] - new_deanonymizer_mapping[anonymized_entity.entity_type][ - anonymized_entity.text - ] = original_value - - self._deanonymizer_mapping.update(new_deanonymizer_mapping) def _anonymize(self, text: str, language: Optional[str] = None) -> str: """Anonymize text. @@ -244,6 +232,14 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: At the same time, we will create a mapping from each anonymized entity back to its original text value. + Thanks to the built-in memory, all previously anonymised entities + will be remembered and replaced by the same fake values: + >>> anonymizer = PresidioReversibleAnonymizer() + >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") + 'My name is Noah Rhodes. Hi Noah Rhodes!' + >>> anonymizer.anonymize("My name is John Doe. 
Hi John Doe!") + 'My name is Noah Rhodes. Hi Noah Rhodes!' + Args: text: text to anonymize language: language to use for analysis of PII @@ -278,11 +274,15 @@ def _anonymize(self, text: str, language: Optional[str] = None) -> str: operators=self.operators, ) - self._update_deanonymizer_mapping( - text, filtered_analyzer_results, anonymizer_results + new_deanonymizer_mapping = create_anonymizer_mapping( + text, + filtered_analyzer_results, + anonymizer_results, + is_reversed=True, ) + self._deanonymizer_mapping.update(new_deanonymizer_mapping) - return anonymizer_results.text + return default_matching_strategy(text, self.anonymizer_mapping) def _deanonymize( self, diff --git a/libs/experimental/tests/unit_tests/test_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_data_anonymizer.py index 138b60eca89e0..bf12a87395847 100644 --- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py @@ -39,6 +39,23 @@ def test_anonymize_multiple() -> None: assert phrase not in anonymized_text +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_check_instances() -> None: + """Test anonymizing multiple items in a sentence""" + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + text = ( + "This is John Smith. John Smith works in a bakery." 
"John Smith is a good guy" + ) + anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text.count("Connie Lawrence") == 3 + + # New name should be generated + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text.count("Connie Lawrence") == 0 + + @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") def test_anonymize_with_custom_operator() -> None: """Test anonymize a name with a custom operator""" @@ -46,13 +63,13 @@ def test_anonymize_with_custom_operator() -> None: from langchain_experimental.data_anonymizer import PresidioAnonymizer - custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": ""})} + custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "NAME"})} anonymizer = PresidioAnonymizer(operators=custom_operator) text = "Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == " was here." + assert anonymized_text == "NAME was here." @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") @@ -82,3 +99,21 @@ def test_add_recognizer_operator() -> None: anonymizer.add_operators(custom_operator) anonymized_text = anonymizer.anonymize(text) assert anonymized_text == "Dear Jane Doe was here." + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_non_faker_values() -> None: + """Test anonymizing multiple items in a sentence without faker values""" + from langchain_experimental.data_anonymizer import PresidioAnonymizer + + text = ( + "My name is John Smith. Your name is Adam Smith. Her name is Jane Smith." + "Our names are: John Smith, Adam Smith, Jane Smith." + ) + expected_result = ( + "My name is . Your name is . Her name is ." + "Our names are: , , ." 
+ ) + anonymizer = PresidioAnonymizer(add_default_faker_operators=False) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == expected_result diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py index 9484a0e9dca06..8ef2dcf68ca3f 100644 --- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -40,6 +40,32 @@ def test_anonymize_multiple() -> None: assert phrase not in anonymized_text +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_check_instances() -> None: + """Test anonymizing multiple items in a sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = ( + "This is John Smith. John Smith works in a bakery." "John Smith is a good guy" + ) + anonymizer = PresidioReversibleAnonymizer(["PERSON"], faker_seed=42) + anonymized_text = anonymizer.anonymize(text) + persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys()) + assert len(persons) == 1 + + anonymized_name = persons[0] + assert anonymized_text.count(anonymized_name) == 3 + + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text.count(anonymized_name) == 3 + assert anonymizer.deanonymizer_mapping["PERSON"][anonymized_name] == "John Smith" + + text = "This is Jane Smith" + anonymized_text = anonymizer.anonymize(text) + persons = list(anonymizer.deanonymizer_mapping["PERSON"].keys()) + assert len(persons) == 2 + + @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") def test_anonymize_with_custom_operator() -> None: """Test anonymize a name with a custom operator""" @@ -47,13 +73,13 @@ def test_anonymize_with_custom_operator() -> None: from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer - custom_operator = {"PERSON": 
OperatorConfig("replace", {"new_value": ""})} + custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "NAME"})} anonymizer = PresidioReversibleAnonymizer(operators=custom_operator) text = "Jane Doe was here." anonymized_text = anonymizer.anonymize(text) - assert anonymized_text == " was here." + assert anonymized_text == "NAME was here." @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") @@ -79,6 +105,8 @@ def test_add_recognizer_operator() -> None: assert anonymized_text == " Jane Doe was here." # anonymizing with custom recognizer and operator + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[]) + anonymizer.add_recognizer(custom_recognizer) custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} anonymizer.add_operators(custom_operator) anonymized_text = anonymizer.anonymize(text) @@ -152,3 +180,21 @@ def test_save_load_deanonymizer_mapping() -> None: finally: os.remove("test_file.json") + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_non_faker_values() -> None: + """Test anonymizing multiple items in a sentence without faker values""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = ( + "My name is John Smith. Your name is Adam Smith. Her name is Jane Smith." + "Our names are: John Smith, Adam Smith, Jane Smith." + ) + expected_result = ( + "My name is <PERSON>. Your name is <PERSON_2>. Her name is <PERSON_3>." + "Our names are: <PERSON>, <PERSON_2>, <PERSON_3>." + ) + anonymizer = PresidioReversibleAnonymizer(add_default_faker_operators=False) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == expected_result