From 818fe901ff9104c9488061a7da60d979b7afa78e Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Mon, 26 Jun 2023 16:39:16 +0530 Subject: [PATCH 01/23] IN_PAN pattern recognizer Added India PAN (Permanent Account Number) recognizer --- .../predefined_recognizers/__init__.py | 2 + .../in_pan_recognizer.py | 67 +++++++++++++++++++ .../recognizer_registry.py | 1 + .../tests/test_in_pan_recognizer.py | 42 ++++++++++++ 4 files changed, 112 insertions(+) create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py create mode 100644 presidio-analyzer/tests/test_in_pan_recognizer.py diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index 5ba0db9de..030e3d6cf 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -32,6 +32,7 @@ from .it_vat_code import ItVatCodeRecognizer from .it_identity_card_recognizer import ItIdentityCardRecognizer from .it_passport_recognizer import ItPassportRecognizer +from .in_pan_recognizer import InPanRecognizer NLP_RECOGNIZERS = { "spacy": SpacyRecognizer, @@ -71,4 +72,5 @@ "ItVatCodeRecognizer", "ItIdentityCardRecognizer", "ItPassportRecognizer", + "InPanRecognizer" ] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py new file mode 100644 index 000000000..b68797757 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py @@ -0,0 +1,67 @@ +from typing import Optional, List, Tuple + +from presidio_analyzer import Pattern, PatternRecognizer + + +class InPanRecognizer(PatternRecognizer): + """ + Recognizes Indian Permanent Account Number ("PAN"). 
+ + The Permanent Account Number (PAN) is a ten digit alpha-numeric code + with the last digit being a check digit calculated using a + modified modulus 10 calculation. + This recognizer identifies PAN using regex and context words. + Reference: https://en.wikipedia.org/wiki/Permanent_account_number, + https://incometaxindia.gov.in/Forms/tps/1.Permanent%20Account%20Number%20(PAN).pdf + + :param patterns: List of patterns to be used by this recognizer + :param context: List of context words to increase confidence in detection + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + :param replacement_pairs: List of tuples with potential replacement values + for different strings to be used during pattern matching. + This can allow a greater variety in input, for example by removing dashes or spaces. + """ + + PATTERNS = [ + Pattern( + "PAN (Medium)", + r"\b\[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}\b", + 0.6, + ), + Pattern( + "PAN (High)", + r"\b\[A-Za-z]{3}(A|a|B|b|C|c|F|f|G|g|H|h|J|j|L|l|P|p|T|t){1}[0-9]{4}[A-Za-z]{1}\b", + 0.85, + ), + Pattern( + "PAN (Low)", + r"\b\[A-Za-z][0-9]{4}[A-Za-z]\b", + 0.15, + ), + ] + + CONTEXT = [ + "permanent account number", + "pan", + ] + + def __init__( + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + supported_language: str = "en", + supported_entity: str = "IN_PAN", + replacement_pairs: Optional[List[Tuple[str, str]]] = None, + ): + self.replacement_pairs = ( + replacement_pairs if replacement_pairs else [("-", ""), (" ", "")] + ) + patterns = patterns if patterns else self.PATTERNS + context = context if context else self.CONTEXT + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + context=context, + supported_language=supported_language, + ) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py 
b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 393be6ed2..9325f77a2 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -40,6 +40,7 @@ TransformersRecognizer, ItPassportRecognizer, ItIdentityCardRecognizer, + InPanRecognizer, ) logger = logging.getLogger("presidio-analyzer") diff --git a/presidio-analyzer/tests/test_in_pan_recognizer.py b/presidio-analyzer/tests/test_in_pan_recognizer.py new file mode 100644 index 000000000..6460e0bfc --- /dev/null +++ b/presidio-analyzer/tests/test_in_pan_recognizer.py @@ -0,0 +1,42 @@ +import pytest + +from tests import assert_result +from presidio_analyzer.predefined_recognizers import InPanRecognizer + + +@pytest.fixture(scope="module") +def recognizer(): + return InPanRecognizer() + + +@pytest.fixture(scope="module") +def entities(): + return ["IN_PAN"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_position, expected_score", + [ + # fmt: off + ("ABCPD1234Z", 1, (0, 9), 0.8), ("AAAA1111R", 0, (), (),) + # fmt: on + ], +) +def test_when_sgfins_in_text_then_all_sg_fins_found( + text, + expected_len, + expected_position, + expected_score, + recognizer, + entities, +): + results = recognizer.analyze(text, entities) + assert len(results) == expected_len + if results: + assert_result( + results[0], + entities[0], + expected_position[0], + expected_position[1], + expected_score, + ) From 87a1aae3aa6abaca3a84d2b4ca87570bcd807fbd Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Tue, 27 Jun 2023 18:52:02 +0530 Subject: [PATCH 02/23] refined IN_PAN regex refined the regex for better recognition and enhanced the test cases accordingly --- .../predefined_recognizers/in_pan_recognizer.py | 16 ++++++++-------- .../tests/test_in_pan_recognizer.py | 9 +++++++-- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git 
a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py index b68797757..83c82c1fb 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py @@ -24,20 +24,20 @@ class InPanRecognizer(PatternRecognizer): """ PATTERNS = [ - Pattern( - "PAN (Medium)", - r"\b\[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}\b", - 0.6, - ), Pattern( "PAN (High)", - r"\b\[A-Za-z]{3}(A|a|B|b|C|c|F|f|G|g|H|h|J|j|L|l|P|p|T|t){1}[0-9]{4}[A-Za-z]{1}\b", + r"\b([A-Za-z]{3}[AaBbCcFfGgHhJjLlPpTt]{1}[A-Za-z]{1}[0-9]{4}[A-Za-z]{1})\b", 0.85, ), + Pattern( + "PAN (Medium)", + r"\b([A-Za-z]{5}[0-9]{4}[A-Za-z]{1})\b", + 0.6, + ), Pattern( "PAN (Low)", - r"\b\[A-Za-z][0-9]{4}[A-Za-z]\b", - 0.15, + r"\b((?=.*?[a-zA-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b", + 0.05, ), ] diff --git a/presidio-analyzer/tests/test_in_pan_recognizer.py b/presidio-analyzer/tests/test_in_pan_recognizer.py index 6460e0bfc..9a7edeaed 100644 --- a/presidio-analyzer/tests/test_in_pan_recognizer.py +++ b/presidio-analyzer/tests/test_in_pan_recognizer.py @@ -18,11 +18,14 @@ def entities(): "text, expected_len, expected_position, expected_score", [ # fmt: off - ("ABCPD1234Z", 1, (0, 9), 0.8), ("AAAA1111R", 0, (), (),) + ("AAASA1111R", 1, (0,10), 0.6) , + ("ABCPD1234Z", 1, (0, 10), 0.85), + ("ABCND1234Z", 1, (0, 10), 0.6), + ("A1111DFSFS", 1, (0,10),0.05), # fmt: on ], ) -def test_when_sgfins_in_text_then_all_sg_fins_found( +def test_when_pan_in_text_then_all_pans_found( text, expected_len, expected_position, @@ -31,6 +34,8 @@ def test_when_sgfins_in_text_then_all_sg_fins_found( entities, ): results = recognizer.analyze(text, entities) + print(results) + assert len(results) == expected_len if results: assert_result( From 8756c9345cea712b86a0a76b1a7c4c0311fc2fab Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Wed, 28 Jun 2023 
13:56:52 +0530 Subject: [PATCH 03/23] Update recognizer_registry.py Fixed lint error that was missed earlier. --- .../presidio_analyzer/recognizer_registry/recognizer_registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 9325f77a2..43ea8b244 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -88,6 +88,7 @@ def load_predefined_recognizers( AuAcnRecognizer, AuTfnRecognizer, AuMedicareRecognizer, + InPanRecognizer, ], "es": [EsNifRecognizer], "it": [ From 2f85d5d2c144d2b82abede606d8cfe6ff3568e8e Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Wed, 28 Jun 2023 20:24:44 +0530 Subject: [PATCH 04/23] Fixed Lint errors Added test cases , verification and context data --- .../tests/data/context_sentences_tests.txt | 13 +++++++++++++ presidio-analyzer/tests/test_context_support.py | 8 +++++--- presidio-analyzer/tests/test_recognizer_registry.py | 4 ++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/presidio-analyzer/tests/data/context_sentences_tests.txt b/presidio-analyzer/tests/data/context_sentences_tests.txt index 75c553bfc..f19282f74 100644 --- a/presidio-analyzer/tests/data/context_sentences_tests.txt +++ b/presidio-analyzer/tests/data/context_sentences_tests.txt @@ -94,3 +94,16 @@ Special NRIC numbers e.g. S0000001I that are numerically significant have been i # Verify SG NRIC/FIN mixed case (e.g. 
lower case ) FIN my fin is g3300299L + +#Verify IN PAN in adjacent context words +IN_PAN +my pan is DJPMS1234Z amongst so many other things + +#Verify IN PAN context words +IN_PAN +Typical tax filing identifier is known as PAN in India also known as permanent account number + + +#Verify IN PAN mixed case +IN_PAN +my PAN number is DJPMS1234Z \ No newline at end of file diff --git a/presidio-analyzer/tests/test_context_support.py b/presidio-analyzer/tests/test_context_support.py index 0269514e8..124da5d21 100644 --- a/presidio-analyzer/tests/test_context_support.py +++ b/presidio-analyzer/tests/test_context_support.py @@ -15,6 +15,7 @@ IpRecognizer, UsSsnRecognizer, SgFinRecognizer, + InPanRecognizer, ) from presidio_analyzer.nlp_engine import NlpArtifacts from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer @@ -32,6 +33,7 @@ def recognizers_map(): "US_BANK_NUMBER": UsBankRecognizer(), "US_PASSPORT": UsPassportRecognizer(), "FIN": SgFinRecognizer(), + "IN_PAN": InPanRecognizer(), } return rec_map @@ -70,9 +72,9 @@ def dataset(recognizers_map): raise ValueError(f"bad entity type {entity_type}") test_items.append((item, recognizer, [entity_type])) - # Currently we have 28 sentences, this is a sanity check - if not len(test_items) == 28: - raise ValueError(f"expected 28 context sentences but found {len(test_items)}") + # Currently we have 31 sentences, this is a sanity check + if not len(test_items) == 31: + raise ValueError(f"expected 31 context sentences but found {len(test_items)}") yield test_items diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py index cb1f77286..848c33d13 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -54,8 +54,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi registry = mock_recognizer_registry registry.load_predefined_recognizers() 
recognizers = registry.get_recognizers(language="en", all_fields=True) - # 1 custom recognizer in english + 21 predefined - assert len(recognizers) == 1 + 21 + # 1 custom recognizer in english + 22 predefined + assert len(recognizers) == 1 + 22 def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry): From b0d1ce82cba515fe470a15bcfa43daab1420abdf Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Tue, 4 Jul 2023 21:11:51 +0530 Subject: [PATCH 05/23] Added more test cases in test_in_pan_recognizer.py Added negative test cases per review comments. --- presidio-analyzer/tests/test_in_pan_recognizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/presidio-analyzer/tests/test_in_pan_recognizer.py b/presidio-analyzer/tests/test_in_pan_recognizer.py index 9a7edeaed..9eb925ede 100644 --- a/presidio-analyzer/tests/test_in_pan_recognizer.py +++ b/presidio-analyzer/tests/test_in_pan_recognizer.py @@ -22,6 +22,8 @@ def entities(): ("ABCPD1234Z", 1, (0, 10), 0.85), ("ABCND1234Z", 1, (0, 10), 0.6), ("A1111DFSFS", 1, (0,10),0.05), + ("ABCD1234",0,(),(),), + ("My PAN number is ABBPM4567S with a lot of text beyond it", 1, (17,27),.85), # fmt: on ], ) From 88c6c1f6cb0fb5966f6cde45a141b95da67d3b57 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Tue, 16 Jan 2024 01:09:21 +0530 Subject: [PATCH 06/23] added IN_AADHAAR recognizer --- docs/supported_entities.md | 6 ++ .../predefined_recognizers/__init__.py | 4 +- .../in_aadhaar_recognizer.py | 74 +++++++++++++++++++ .../presidio_analyzer/recognizer_registry.py | 2 + .../tests/test_in_aadhaar_recognizer.py | 47 ++++++++++++ 5 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py create mode 100644 presidio-analyzer/tests/test_in_aadhaar_recognizer.py diff --git a/docs/supported_entities.md b/docs/supported_entities.md index aee8641cb..293920ccf 100644 --- a/docs/supported_entities.md +++ 
b/docs/supported_entities.md @@ -78,6 +78,12 @@ For more information, refer to the [adding new recognizers documentation](analyz |AU_TFN| The tax file number (TFN) is a unique identifier issued by the Australian Taxation Office to each taxpaying entity | Pattern match, context, and checksum | |AU_MEDICARE| Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system| Pattern match, context, and checksum | +### India +| FieldType | Description |Detection Method| +|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--- | +| IN_PAN | The Indian Permanent Account Number (PAN) is a unique 10 character alphanumeric identifier issued to all business and individual entities registered as Tax Payers. | Pattern match, context | +| IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum | + ## Adding a custom PII entity See [this documentation](analyzer/adding_recognizers.md) for instructions on how to add a new Recognizer for a new type of PII entity. 
diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index a87b4aea0..2720a5ccb 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -35,6 +35,7 @@ from .in_pan_recognizer import InPanRecognizer from .pl_pesel_recognizer import PlPeselRecognizer from .azure_ai_language import AzureAILanguageRecognizer +from .in_aadhaar_recognizer import InAadhaarRecognizer NLP_RECOGNIZERS = { "spacy": SpacyRecognizer, @@ -76,5 +77,6 @@ "ItPassportRecognizer", "InPanRecognizer", "PlPeselRecognizer", - "AzureAILanguageRecognizer" + "AzureAILanguageRecognizer", + "InAadhaarRecognizer" ] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py new file mode 100644 index 000000000..25d035db9 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py @@ -0,0 +1,74 @@ +from stdnum import verhoeff +from presidio_analyzer import Pattern, PatternRecognizer +from typing import Optional, List, Tuple + + +class InAadhaarRecognizer(PatternRecognizer): + """ + Recognizes Indian UIDAI Person Identification Number ("AADHAAR"). + + Reference: https://en.wikipedia.org/wiki/Aadhaar + A 12 digit unique number that is issued to each individual by Government of India + :param patterns: List of patterns to be used by this recognizer + :param context: List of context words to increase confidence in detection + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + :param replacement_pairs: List of tuples with potential replacement values + for different strings to be used during pattern matching. 
+ This can allow a greater variety in input, for example by removing dashes or spaces. + """ + + PATTERNS = [ + Pattern( + "AADHAAR (Low)", + r"\b[0-9]{12}\b", + 0.05, + ), + ] + + CONTEXT = [ + "aadhaar", + "uidai", + ] + + def __init__( + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + supported_language: str = "en", + supported_entity: str = "IN_AADHAAR", + replacement_pairs: Optional[List[Tuple[str, str]]] = None, + ): + self.replacement_pairs = ( + replacement_pairs if replacement_pairs + else [("-", ""), (" ", ""), (":", "")] + ) + patterns = patterns if patterns else self.PATTERNS + context = context if context else self.CONTEXT + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + context=context, + supported_language=supported_language, + ) + + def validate_result(self, pattern_text: str) -> bool: + """Determine absolute value based on calculation.""" + sanitized_value = self.__sanitize_value(pattern_text, self.replacement_pairs) + return self.__check_aadhaar(sanitized_value) + + @staticmethod + def __check_aadhaar(sanitized_value: str) -> bool: + is_valid_aadhaar: bool = False + if len(sanitized_value) == 12 and \ + sanitized_value.isnumeric() is True and \ + int(sanitized_value[0]) >= 2 and \ + verhoeff.is_valid(number=int(sanitized_value)) is True: + is_valid_aadhaar = True + return is_valid_aadhaar + + @staticmethod + def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: + for search_string, replacement_string in replacement_pairs: + text = text.replace(search_string, replacement_string) + return text diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry.py index 7a83da178..79d346020 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry.py @@ -45,6 +45,7 @@ ItIdentityCardRecognizer, InPanRecognizer, 
PlPeselRecognizer, + InAadhaarRecognizer, ) logger = logging.getLogger("presidio-analyzer") @@ -101,6 +102,7 @@ def load_predefined_recognizers( AuTfnRecognizer, AuMedicareRecognizer, InPanRecognizer, + InAadhaarRecognizer, ], "es": [EsNifRecognizer], "it": [ diff --git a/presidio-analyzer/tests/test_in_aadhaar_recognizer.py b/presidio-analyzer/tests/test_in_aadhaar_recognizer.py new file mode 100644 index 000000000..678b4c624 --- /dev/null +++ b/presidio-analyzer/tests/test_in_aadhaar_recognizer.py @@ -0,0 +1,47 @@ +import pytest + +from tests import assert_result +from presidio_analyzer.predefined_recognizers import InAadhaarRecognizer + + +@pytest.fixture(scope="module") +def recognizer(): + return InAadhaarRecognizer() + + +@pytest.fixture(scope="module") +def entities(): + return ["IN_AADHAAR"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_position, expected_score", + [ + # fmt: off + ("123456789012", 0, (0,12), 0), + ("312345678909", 1, (0, 12), 1), + ("399876543211", 1, (0, 12), 1), + ("My Aadhaar number is 400123456787 with a lot of text beyond it", 1, (21,33), 1), + # fmt: on + ], +) +def test_when_aadhaar_in_text_then_all_aadhaars_found( + text, + expected_len, + expected_position, + expected_score, + recognizer, + entities, +): + results = recognizer.analyze(text, entities) + print(results) + + assert len(results) == expected_len + if results: + assert_result( + results[0], + entities[0], + expected_position[0], + expected_position[1], + expected_score, + ) From 2d01bd0ab44b3275cb160ea19af1b1164f16bcef Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Wed, 17 Jan 2024 00:54:03 +0530 Subject: [PATCH 07/23] Update in_aadhaar_recognizer.py linted code --- .../in_aadhaar_recognizer.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py 
index 25d035db9..3e7cb2d9e 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py @@ -32,15 +32,16 @@ class InAadhaarRecognizer(PatternRecognizer): ] def __init__( - self, - patterns: Optional[List[Pattern]] = None, - context: Optional[List[str]] = None, - supported_language: str = "en", - supported_entity: str = "IN_AADHAAR", - replacement_pairs: Optional[List[Tuple[str, str]]] = None, + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + supported_language: str = "en", + supported_entity: str = "IN_AADHAAR", + replacement_pairs: Optional[List[Tuple[str, str]]] = None, ): self.replacement_pairs = ( - replacement_pairs if replacement_pairs + replacement_pairs + if replacement_pairs else [("-", ""), (" ", ""), (":", "")] ) patterns = patterns if patterns else self.PATTERNS @@ -60,10 +61,12 @@ def validate_result(self, pattern_text: str) -> bool: @staticmethod def __check_aadhaar(sanitized_value: str) -> bool: is_valid_aadhaar: bool = False - if len(sanitized_value) == 12 and \ - sanitized_value.isnumeric() is True and \ - int(sanitized_value[0]) >= 2 and \ - verhoeff.is_valid(number=int(sanitized_value)) is True: + if ( + len(sanitized_value) == 12 + and sanitized_value.isnumeric() is True + and int(sanitized_value[0]) >= 2 + and verhoeff.is_valid(number=int(sanitized_value)) is True + ): is_valid_aadhaar = True return is_valid_aadhaar From 2434bb5b1b07a56d50ddf16d1b841b4395c2ac73 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Wed, 17 Jan 2024 19:59:10 +0530 Subject: [PATCH 08/23] Update in_aadhaar_recognizer.py update pattern recognizer value per suggestion in review --- .../predefined_recognizers/in_aadhaar_recognizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py 
b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py index 3e7cb2d9e..34c5ed23f 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py @@ -20,9 +20,9 @@ class InAadhaarRecognizer(PatternRecognizer): PATTERNS = [ Pattern( - "AADHAAR (Low)", + "AADHAAR (Very Weak)", r"\b[0-9]{12}\b", - 0.05, + 0.01, ), ] From b6db593f2b4942f018c97206ad78e45bbac672a9 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Tue, 23 Jan 2024 23:57:11 +0530 Subject: [PATCH 09/23] added utility function class added PresidioAnalyzerUtils class with generic functions. removed usage of stdnum --- .../presidio_analyzer/__init__.py | 3 +- .../presidio_analyzer/analyzer_utils.py | 75 +++++++++++++++++++ .../in_aadhaar_recognizer.py | 18 ++--- 3 files changed, 84 insertions(+), 12 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/analyzer_utils.py diff --git a/presidio-analyzer/presidio_analyzer/__init__.py b/presidio-analyzer/presidio_analyzer/__init__.py index eb5050d9d..84713a3ce 100644 --- a/presidio-analyzer/presidio_analyzer/__init__.py +++ b/presidio-analyzer/presidio_analyzer/__init__.py @@ -16,7 +16,7 @@ from presidio_analyzer.analyzer_request import AnalyzerRequest from presidio_analyzer.context_aware_enhancers import ContextAwareEnhancer from presidio_analyzer.context_aware_enhancers import LemmaContextAwareEnhancer - +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils # Define default loggers behavior @@ -49,4 +49,5 @@ "ContextAwareEnhancer", "LemmaContextAwareEnhancer", "BatchAnalyzerEngine", + "PresidioAnalyzerUtils", ] diff --git a/presidio-analyzer/presidio_analyzer/analyzer_utils.py b/presidio-analyzer/presidio_analyzer/analyzer_utils.py new file mode 100644 index 000000000..b6909c5c4 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/analyzer_utils.py @@ -0,0 +1,75 @@ +from typing 
import List, Tuple + + +class PresidioAnalyzerUtils: + """ + Utility functions for Presidio Analyzer. + + The class provides a bundle of utility functions that help centralizing the logic + for reusability and maintainability + """ + + @staticmethod + def is_palindrome(text: str, case_insensitive: bool = False): + """ + Validate if input text is a true palindrome. + + :param text: input text string to check for palindrome + :param case_insensitive: optional flag to check palindrome with no case + :return: True / False + """ + palindrome_text = text + if case_insensitive: + palindrome_text = palindrome_text.replace(" ", "").lower() + return palindrome_text == palindrome_text[::-1] + + @staticmethod + def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: + """ + Cleanse the input string of the replacement pairs specified as argument. + + :param text: input string + :param replacement_pairs: pairs of what has to be replaced with which value + :return: cleansed string + """ + for search_string, replacement_string in replacement_pairs: + text = text.replace(search_string, replacement_string) + return text + + @staticmethod + def is_verhoeff_number(input_number: int): + """ + Check if the input number is a true verhoeff number. 
+ + :param input_number: + :return: + """ + __d__ = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 2, 3, 4, 0, 6, 7, 8, 9, 5], + [2, 3, 4, 0, 1, 7, 8, 9, 5, 6], + [3, 4, 0, 1, 2, 8, 9, 5, 6, 7], + [4, 0, 1, 2, 3, 9, 5, 6, 7, 8], + [5, 9, 8, 7, 6, 0, 4, 3, 2, 1], + [6, 5, 9, 8, 7, 1, 0, 4, 3, 2], + [7, 6, 5, 9, 8, 2, 1, 0, 4, 3], + [8, 7, 6, 5, 9, 3, 2, 1, 0, 4], + [9, 8, 7, 6, 5, 4, 3, 2, 1, 0], + ] + __p__ = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 5, 7, 6, 2, 8, 3, 0, 9, 4], + [5, 8, 0, 3, 7, 9, 6, 1, 4, 2], + [8, 9, 1, 6, 0, 4, 3, 5, 2, 7], + [9, 4, 5, 3, 1, 2, 6, 8, 7, 0], + [4, 2, 8, 6, 5, 7, 3, 9, 0, 1], + [2, 7, 9, 3, 8, 0, 6, 4, 1, 5], + [7, 0, 4, 6, 9, 1, 3, 2, 5, 8], + ] + __inv__ = [0, 4, 3, 2, 1, 5, 6, 7, 8, 9] + + c = 0 + inverted_number = list(map(int, reversed(str(input_number)))) + for i in range(len(inverted_number)): + c = __d__[c][__p__[i % 8][inverted_number[i]]] + return __inv__[c] == 0 diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py index 34c5ed23f..f5e4e8e1c 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_aadhaar_recognizer.py @@ -1,6 +1,6 @@ -from stdnum import verhoeff from presidio_analyzer import Pattern, PatternRecognizer from typing import Optional, List, Tuple +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils class InAadhaarRecognizer(PatternRecognizer): @@ -31,6 +31,8 @@ class InAadhaarRecognizer(PatternRecognizer): "uidai", ] + utils = None + def __init__( self, patterns: Optional[List[Pattern]] = None, @@ -55,23 +57,17 @@ def __init__( def validate_result(self, pattern_text: str) -> bool: """Determine absolute value based on calculation.""" - sanitized_value = self.__sanitize_value(pattern_text, self.replacement_pairs) + sanitized_value = 
Utils.sanitize_value(pattern_text, self.replacement_pairs) return self.__check_aadhaar(sanitized_value) - @staticmethod - def __check_aadhaar(sanitized_value: str) -> bool: + def __check_aadhaar(self, sanitized_value: str) -> bool: is_valid_aadhaar: bool = False if ( len(sanitized_value) == 12 and sanitized_value.isnumeric() is True and int(sanitized_value[0]) >= 2 - and verhoeff.is_valid(number=int(sanitized_value)) is True + and Utils.is_verhoeff_number(int(sanitized_value)) is True + and Utils.is_palindrome(sanitized_value) is False ): is_valid_aadhaar = True return is_valid_aadhaar - - @staticmethod - def __sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: - for search_string, replacement_string in replacement_pairs: - text = text.replace(search_string, replacement_string) - return text From fd28708c175a46a6a587e2ad1b3bf3545b0ae8c6 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Mon, 29 Jan 2024 00:22:34 +0530 Subject: [PATCH 10/23] Create test_analyzer_utils.py added test cases for analyzer_utils.py in prescribed format --- .../tests/test_analyzer_utils.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 presidio-analyzer/tests/test_analyzer_utils.py diff --git a/presidio-analyzer/tests/test_analyzer_utils.py b/presidio-analyzer/tests/test_analyzer_utils.py new file mode 100644 index 000000000..30b709da4 --- /dev/null +++ b/presidio-analyzer/tests/test_analyzer_utils.py @@ -0,0 +1,64 @@ +from presidio_analyzer import PresidioAnalyzerUtils +import pytest + +palindrome_test_set = [ + ["abMA", False, False], + ["abCba", False, True], + ["ABBA", False, True], + ["aBba", True, True], +] + + +sanitizer_test_set = [ + [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"], + ["def", "", "def"], +] + +verhoeff_test_set = [ + [312345678909, True], + [400123456787, True], + [123456789012, False], +] + + +@pytest.mark.parametrize( + "input_text,case_sensitive, expected_output", palindrome_test_set +) 
+def test_is_palindrome(input_text, case_sensitive, expected_output): + """ + Test if input is a true palindrome as defined in base class. + + :param input_text: input text to validate + :param case_sensitive: flag to calculate palindrome with no case + :param expected_output: calculated output + :return: True/False + """ + assert ( + PresidioAnalyzerUtils.is_palindrome(input_text, case_sensitive) + == expected_output + ) + + +@pytest.mark.parametrize("input_text, params, expected_output", sanitizer_test_set) +def test_sanitize_value(input_text, params, expected_output): + """ + Test to assert sanitize_value functionality from base class. + + :param input_text: input string + :param params: List of tuples, indicating what has to be sanitized with which + :param expected_output: sanitized value + :return: True/False + """ + assert PresidioAnalyzerUtils.sanitize_value(input_text, params) == expected_output + + +@pytest.mark.parametrize("input_number, is_verhoeff", verhoeff_test_set) +def test_is_verhoeff(input_number, is_verhoeff): + """ + Test to assert verhoeff number validation based on checksum from base class. 
+ + :param input_number: input integer + :param is_verhoeff: expected flag + :return: True/False + """ + assert PresidioAnalyzerUtils.is_verhoeff_number(input_number) == is_verhoeff From f0c9737e3359fe12609778dce450e021c308d0ad Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Mon, 29 Jan 2024 20:29:49 +0530 Subject: [PATCH 11/23] Update test_recognizer_registry.py added to the count of predefined recognizers --- presidio-analyzer/tests/test_recognizer_registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py index 9e2bf398f..57d5cebaa 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -57,8 +57,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi registry = mock_recognizer_registry registry.load_predefined_recognizers() recognizers = registry.get_recognizers(language="en", all_fields=True) - # 1 custom recognizer in english + 22 predefined - assert len(recognizers) == 1 + 22 + # 1 custom recognizer in english + 23 predefined + assert len(recognizers) == 1 + 23 def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry): From 57b229453aed26b4a413ac41f0073c7a03eef9fa Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Fri, 9 Feb 2024 22:36:46 +0530 Subject: [PATCH 12/23] added predefined recognizer : IN_VEHICLE_REGISTRATION Added India specific predefined pattern recognizer for vehicle registration number --- docs/supported_entities.md | 1 + .../predefined_recognizers/__init__.py | 4 +- .../in_vehicle_registration_recognizer.py | 1582 +++++++++++++++++ .../presidio_analyzer/recognizer_registry.py | 2 + ...test_in_vehicle_registration_recognizer.py | 52 + .../tests/test_recognizer_registry.py | 11 +- 6 files changed, 1644 insertions(+), 8 deletions(-) create mode 100644 
presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py create mode 100644 presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py diff --git a/docs/supported_entities.md b/docs/supported_entities.md index 293920ccf..52bf94c45 100644 --- a/docs/supported_entities.md +++ b/docs/supported_entities.md @@ -83,6 +83,7 @@ For more information, refer to the [adding new recognizers documentation](analyz |------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--- | | IN_PAN | The Indian Permanent Account Number (PAN) is a unique 12 character alphanumeric identifier issued to all business and individual entities registered as Tax Payers. | Pattern match, context | | IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum | +| IN_VEHICLE_REGISTRATION | Indian government issued transport (govt, personal, diplomatic, defence) vehicle registration number | Pattern match, context, and checksum | ## Adding a custom PII entity diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index 2720a5ccb..80d68b8b4 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -36,6 +36,7 @@ from .pl_pesel_recognizer import PlPeselRecognizer from .azure_ai_language import AzureAILanguageRecognizer from .in_aadhaar_recognizer import InAadhaarRecognizer +from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer NLP_RECOGNIZERS = { "spacy": SpacyRecognizer, @@ -78,5 +79,6 @@ "InPanRecognizer", "PlPeselRecognizer", "AzureAILanguageRecognizer", - "InAadhaarRecognizer" + "InAadhaarRecognizer", + "InVehicleRegistrationRecognizer", ] diff 
--git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py new file mode 100644 index 000000000..5d29b02df --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py @@ -0,0 +1,1582 @@ +from presidio_analyzer import Pattern, PatternRecognizer +from typing import Optional, List, Tuple +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils + + +class InVehicleRegistrationRecognizer(PatternRecognizer): + """ + Recognizes Indian Vehicle Registration Number issued by RTO. + + Reference(s): + https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_India + https://en.wikipedia.org/wiki/Regional_Transport_Office + https://en.wikipedia.org/wiki/List_of_Regional_Transport_Office_districts_in_India + + The registration scheme changed over time with multiple formats + in play over the years + India has multiple active patterns for registration plates issued to different + vehicle categories + + :param patterns: List of patterns to be used by this recognizer + :param context: List of context words to increase confidence in detection + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + :param replacement_pairs: List of tuples with potential replacement values + for different strings to be used during pattern matching. + This can allow a greater variety in input e.g. 
by removing dashes or spaces + """ + + PATTERNS = [ + Pattern( + "India Vehicle Registration (Very Weak)", + r"\b[A-Z]{1}(?!0000)[0-9]{4}\b", + 0.01, + ), + Pattern( + "India Vehicle Registration (Very Weak)", + r"\b[A-Z]{2}(?!0000)\d{4}\b", + 0.01, + ), + Pattern( + "India Vehicle Registration (Very Weak)", + r"\b(I)(?!00000)\d{5}\b", + 0.01, + ), + Pattern( + "India Vehicle Registration (Weak)", + r"\b[A-Z]{3}(?!0000)\d{4}\b", + 0.2, + ), + Pattern( + "India Vehicle Registration (Medium)", + r"\b\d{1,3}(CD|CC|UN)[1-9]{1}[0-9]{1,3}\b", + 0.40, + ), + Pattern( + "India Vehicle Registration", + r"\b[A-Z]{2}\d{1,2}[A-Z]{1,2}(?!0000)\d{4}\b", + 0.85, + ), + Pattern( + "India Vehicle Registration", + r"\b[2-9]{1}[1-9]{1}(BH)(?!0000)\d{4}[A-HJ-NP-Z]{2}\b", + 0.85, + ), + Pattern( + "India Vehicle Registration", + r"\b(?!00)\d{2}(A|B|C|D|E|F|H|K|P|R|X)\d{6}[A-Z]{1}\b", + 0.85, + ), + ] + + CONTEXT = ["RTO", "vehicle", "plate", "registration"] + + def __init__( + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + supported_language: str = "en", + supported_entity: str = "IN_VEHICLE_REGISTRATION", + replacement_pairs: Optional[List[Tuple[str, str]]] = None, + ): + self.replacement_pairs = ( + replacement_pairs + if replacement_pairs + else [("-", ""), (" ", ""), (":", "")] + ) + patterns = patterns if patterns else self.PATTERNS + context = context if context else self.CONTEXT + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + context=context, + supported_language=supported_language, + ) + + def validate_result(self, pattern_text: str) -> bool: + """Determine absolute value based on calculation.""" + sanitized_value = Utils.sanitize_value(pattern_text, self.replacement_pairs) + # print('Sanitized value:' + sanitized_value) + return self.__check_vehicle_registration(sanitized_value) + + def __check_vehicle_registration(self, sanitized_value: str) -> bool: + # print('check function called') + 
is_valid_registration = None + # logic here + an_list = ["01"] + ap_list = ["39", "40"] + ar_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "19", + "20", + "22", + ] + as_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + ] + br_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "19", + "21", + "22", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "37", + "38", + "39", + "43", + "44", + "45", + "46", + "50", + "51", + "52", + "53", + "55", + "56", + ] + cg_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + ] + ch_list = ["01", "02", "03", "04"] + dd_list = ["01", "02", "03"] + dn_list = ["09"] # old list + dl_list = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"] + ga_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + ] + gj_list = [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + ] + hp_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + 
"32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + ] + hr_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + ] + jh_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + ] + jk_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + ] + ka_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + 
"33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + ] + kl_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + ] + la_list = ["01", "02"] + ld_list = ["01", "02", "03", "04", "05", "06", "07", "08", "09"] + mh_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + ] + ml_list = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + mn_list = ["01", "02", "03", "04", "05", "06", "07"] + mp_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", 
+ "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + ] + mz_list = ["01", "02", "03", "04", "05", "06", "07", "08"] + nl_list = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + od_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + ] + or_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + ] # old list + pb_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + ] + py_list = ["01", "02", "03", "04", "05"] + rj_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + 
"15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + ] + sk_list = ["01", "02", "03", "04", "05", "06", "07", "08"] + tn_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + ] + tr_list = ["01", "02", "03", "04", "05", "06", "07", "08"] + ts_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + ] + uk_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + ] + up_list = [ + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", 
+ "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + ] + wb_list = [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + ] + + state_rto_districtcode_map = { + "AN": an_list, + "AP": ap_list, + "AR": ar_list, + "AS": as_list, + "BR": br_list, + "CG": cg_list, + "CH": ch_list, + "DD": dd_list, + "DN": dn_list, + "DL": dl_list, + "GA": ga_list, + "GJ": gj_list, + "HP": hp_list, + "HR": hr_list, + "JH": jh_list, + "JK": jk_list, + "KA": ka_list, + "KL": kl_list, + "LA": la_list, + "LD": ld_list, + "MH": mh_list, + "ML": ml_list, + "MN": mn_list, + "MP": mp_list, + "MZ": mz_list, + "NL": nl_list, + "OD": od_list, + "OR": or_list, + "PB": pb_list, + "PY": py_list, + "RJ": rj_list, + "SK": sk_list, + "TN": tn_list, + "TR": tr_list, + "TS": ts_list, + "UK": uk_list, + "UP": up_list, + "WB": wb_list, + } + + union_territories = ["AN", "CH", "DH", "DL", "JK", "LA", 
"LD", "PY"] + states = [ + "AP", + "AR", + "AS", + "BR", + "CG", + "GA", + "GJ", + "HR", + "HP", + "JH", + "KA", + "KL", + "MP", + "MH", + "MN", + "ML", + "MZ", + "NL", + "OD", + "PB", + "RJ", + "SK", + "TN", + "TS", + "TR", + "UP", + "UK", + "WB", + "UT", + ] + old_union_territories = ["CT", "DN"] + old_states = ["UL", "OR", "UA"] + non_standard_state_or_ut = ["DD"] + + foreign_mission_codes = [ + 84, + 85, + 89, + 93, + 94, + 95, + 97, + 98, + 99, + 102, + 104, + 105, + 106, + 109, + 111, + 112, + 113, + 117, + 119, + 120, + 121, + 122, + 123, + 125, + 126, + 128, + 133, + 134, + 135, + 137, + 141, + 145, + 147, + 149, + 152, + 153, + 155, + 156, + 157, + 159, + 160, + ] + + # armed_forces_vehicle_types = [ + # 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'K', 'P', 'R', 'X'] + + two_factor_registration_prefix = [] + two_factor_registration_prefix.extend(union_territories) + two_factor_registration_prefix.extend(states) + two_factor_registration_prefix.extend(old_states) + two_factor_registration_prefix.extend(old_union_territories) + two_factor_registration_prefix.extend(non_standard_state_or_ut) + first_two_char = sanitized_value[:2].upper() + dist_code: str = "" + diplomatic_vehicle_codes = ["CC", "CD", "UN"] + + match first_two_char: + case first_two_char if first_two_char in two_factor_registration_prefix: + if sanitized_value[2].isdigit(): + if sanitized_value[3].isdigit(): + dist_code = sanitized_value[2:4] + else: + dist_code = sanitized_value[2:3] + + if dist_code and dist_code in state_rto_districtcode_map.get( + first_two_char, "" + ): + is_valid_registration = True + case _: + for diplomatic_vehicle_code in diplomatic_vehicle_codes: + if diplomatic_vehicle_code in sanitized_value: + vehicle_prefix = sanitized_value.partition( + diplomatic_vehicle_code + )[0] + if vehicle_prefix.isnumeric() and ( + 1 <= int(vehicle_prefix) <= 80 + or int(vehicle_prefix) in foreign_mission_codes + ): + is_valid_registration = True + + return is_valid_registration diff --git 
a/presidio-analyzer/presidio_analyzer/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry.py index 79d346020..a47f34dbf 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry.py @@ -46,6 +46,7 @@ InPanRecognizer, PlPeselRecognizer, InAadhaarRecognizer, + InVehicleRegistrationRecognizer, ) logger = logging.getLogger("presidio-analyzer") @@ -103,6 +104,7 @@ def load_predefined_recognizers( AuMedicareRecognizer, InPanRecognizer, InAadhaarRecognizer, + InVehicleRegistrationRecognizer, ], "es": [EsNifRecognizer], "it": [ diff --git a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py new file mode 100644 index 000000000..1701d6a67 --- /dev/null +++ b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py @@ -0,0 +1,52 @@ +import pytest + +from tests import assert_result +from presidio_analyzer.predefined_recognizers import InVehicleRegistrationRecognizer + + +@pytest.fixture(scope="module") +def recognizer(): + return InVehicleRegistrationRecognizer() + + +@pytest.fixture(scope="module") +def entities(): + return ["IN_VEHICLE_REGISTRATION"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_position, expected_score", + [ + # fmt: off + ("KA53ME3456", 1, (0, 10), 1), + ("KA99ME3456", 1, (0, 10), 0.85), + ("MN2412", 1, (0, 6), 0.01), + ("MCX1243", 1, (0, 7), 0.2), + ("I15432", 1, (0, 6), 0.01), + ("ABNE123456", 0, (), (),), + ("My Bike's registration number is OD02BA2341 with a lot of text beyond", + 1, (33, 43), 1), + # fmt: on + ], +) +def test_when_regn_in_text_then_all_regns_found( + text, + expected_len, + expected_position, + expected_score, + recognizer, + entities, +): + results = recognizer.analyze(text, entities) + print("Results") + print(results) + + assert len(results) == expected_len + if results: + assert_result( + results[0], + entities[0], + 
expected_position[0], + expected_position[1], + expected_score, + ) diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py index 57d5cebaa..24f03d98a 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -8,7 +8,7 @@ PatternRecognizer, EntityRecognizer, Pattern, - AnalyzerEngine + AnalyzerEngine, ) from presidio_analyzer.predefined_recognizers import SpacyRecognizer @@ -57,8 +57,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi registry = mock_recognizer_registry registry.load_predefined_recognizers() recognizers = registry.get_recognizers(language="en", all_fields=True) - # 1 custom recognizer in english + 23 predefined - assert len(recognizers) == 1 + 23 + # 1 custom recognizer in english + 24 predefined + assert len(recognizers) == 1 + 24 def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry): @@ -229,9 +229,6 @@ def test_recognizer_removed_and_returned_entities_are_correct(): assert "DATE_TIME" in supported_entities assert "PERSON" not in supported_entities - analyzer = AnalyzerEngine( - registry=registry, - supported_languages='en' - ) + analyzer = AnalyzerEngine(registry=registry, supported_languages="en") analyzer.analyze("My name is David", language="en") From 365be216c06eddca67cdf7dcc3da30e1c97a3742 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Tue, 13 Feb 2024 00:15:01 +0530 Subject: [PATCH 13/23] review comments incorporated reinstated python 3.9 compatibility, reorganized code --- .../presidio_analyzer/analyzer_utils.py | 214 ++- .../in_vehicle_registration_recognizer.py | 1533 +---------------- .../tests/test_analyzer_utils.py | 60 + ...test_in_vehicle_registration_recognizer.py | 2 +- 4 files changed, 339 insertions(+), 1470 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/analyzer_utils.py 
b/presidio-analyzer/presidio_analyzer/analyzer_utils.py index b6909c5c4..1ab146952 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_utils.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_utils.py @@ -5,8 +5,8 @@ class PresidioAnalyzerUtils: """ Utility functions for Presidio Analyzer. - The class provides a bundle of utility functions that help centralizing the logic - for reusability and maintainability + The class provides a bundle of utility functions that help centralize the + logic for re-usability and maintainability """ @staticmethod @@ -73,3 +73,213 @@ def is_verhoeff_number(input_number: int): for i in range(len(inverted_number)): c = __d__[c][__p__[i % 8][inverted_number[i]]] return __inv__[c] == 0 + + # fmt: off + in_vehicle_foreign_mission_codes = [ + 84, 85, 89, 93, 94, 95, 97, 98, 99, 102, 104, 105, 106, 109, 111, 112, + 113, 117, 119, 120, 121, 122, 123, 125, 126, 128, 133, 134, 135, 137, + 141, 145, 147, 149, 152, 153, 155, 156, 157, 159, 160 + ] + + in_vehicle_armed_forces_codes = [ + 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'K', 'P', 'R', 'X' + ] + + in_vehicle_diplomatic_codes = ["CC", "CD", "UN"] + in_vehicle_dist_an = ["01"] + in_vehicle_dist_ap = ["39", "40"] + in_vehicle_dist_ar = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "19", "20", "22" + ] + in_vehicle_dist_as = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34" + ] + in_vehicle_dist_br = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "19", + "21", "22", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", + "34", "37", "38", "39", "43", "44", "45", "46", "50", "51", "52", "53", + "55", "56" + ] + in_vehicle_dist_cg = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18",
"19", "20", "21", "22", "23", "24", + "25", "26", "27", "28", "29", "30" + ] + in_vehicle_dist_ch = ["01", "02", "03", "04"] + in_vehicle_dist_dd = ["01", "02", "03"] + in_vehicle_dist_dn = ["09"] # old list + in_vehicle_dist_dl = [ + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"] + in_vehicle_dist_ga = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] + in_vehicle_dist_gj = [ + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", + "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39" + ] + in_vehicle_dist_hp = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_hr = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_jh = [ + "01", "02", "03", "04", "05", "06", 
"07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24" + ] + in_vehicle_dist_jk = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22" + ] + in_vehicle_dist_ka = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" + ] + in_vehicle_dist_kl = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_la = ["01", "02"] + in_vehicle_dist_ld = ["01", "02", "03", "04", "05", "06", "07", "08", "09"] + in_vehicle_dist_mh = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51" + ] + in_vehicle_dist_ml = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + in_vehicle_dist_mn = ["01", "02", "03", "04", "05", "06", "07"] + in_vehicle_dist_mp = [ + "01", "02", 
"03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" + ] + in_vehicle_dist_mz = ["01", "02", "03", "04", "05", "06", "07", "08"] + in_vehicle_dist_nl = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + in_vehicle_dist_od = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35" + ] + in_vehicle_dist_or = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31" + ] # old list + in_vehicle_dist_pb = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_py = ["01", "02", "03", "04", "05"] + in_vehicle_dist_rj = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", 
"41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58" + ] + in_vehicle_dist_sk = ["01", "02", "03", "04", "05", "06", "07", "08"] + in_vehicle_dist_tn = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_tr = ["01", "02", "03", "04", "05", "06", "07", "08"] + in_vehicle_dist_ts = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38" + ] + in_vehicle_dist_uk = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20" + ] + in_vehicle_dist_up = [ + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", + "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", + "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", + "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", + "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", + "72", "73", "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", + "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", + "96" + ] + in_vehicle_dist_wb = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", 
"20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98" + ] + + in_union_territories = ["AN", "CH", "DH", "DL", "JK", "LA", "LD", "PY"] + in_old_union_territories = ["CT", "DN"] + in_states = [ + "AP", "AR", "AS", "BR", "CG", "GA", "GJ", "HR", "HP", "JH", "KA", "KL", + "MP", "MH", "MN", "ML", "MZ", "NL", "OD", "PB", "RJ", "SK", "TN", "TS", + "TR", "UP", "UK", "WB", "UT" + ] + in_old_states = ["UL", "OR", "UA"] + in_non_standard_state_or_ut = ["DD"] + # fmt: on + + def list_length(self): + """ + Unimplemented functon with primary job of running content length test case. + + :return: None + """ + pass diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py index 5d29b02df..4e0538063 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py @@ -55,7 +55,7 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): Pattern( "India Vehicle Registration", r"\b[A-Z]{2}\d{1,2}[A-Z]{1,2}(?!0000)\d{4}\b", - 0.85, + 0.50, ), Pattern( "India Vehicle Registration", @@ -103,1480 +103,79 @@ def __check_vehicle_registration(self, sanitized_value: str) -> bool: # print('check function called') is_valid_registration = None # logic here - an_list = ["01"] - ap_list = ["39", "40"] - ar_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", 
- "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "19", - "20", - "22", - ] - as_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - ] - br_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "19", - "21", - "22", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "37", - "38", - "39", - "43", - "44", - "45", - "46", - "50", - "51", - "52", - "53", - "55", - "56", - ] - cg_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - ] - ch_list = ["01", "02", "03", "04"] - dd_list = ["01", "02", "03"] - dn_list = ["09"] # old list - dl_list = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"] - ga_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - ] - gj_list = [ - "1", - "2", - "3", - "4", - "5", - "6", - "7", - "8", - "9", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "21", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - ] - hp_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", 
- "54", - "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - "72", - "73", - "74", - "75", - "76", - "77", - "78", - "79", - "80", - "81", - "82", - "83", - "84", - "85", - "86", - "87", - "88", - "89", - "90", - "91", - "92", - "93", - "94", - "95", - "96", - "97", - "98", - "99", - ] - hr_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", - "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - "72", - "73", - "74", - "75", - "76", - "77", - "78", - "79", - "80", - "81", - "82", - "83", - "84", - "85", - "86", - "87", - "88", - "89", - "90", - "91", - "92", - "93", - "94", - "95", - "96", - "97", - "98", - "99", - ] - jh_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - ] - jk_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - ] - ka_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", 
- "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - ] - kl_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", - "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - "72", - "73", - "74", - "75", - "76", - "77", - "78", - "79", - "80", - "81", - "82", - "83", - "84", - "85", - "86", - "87", - "88", - "89", - "90", - "91", - "92", - "93", - "94", - "95", - "96", - "97", - "98", - "99", - ] - la_list = ["01", "02"] - ld_list = ["01", "02", "03", "04", "05", "06", "07", "08", "09"] - mh_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - ] - ml_list = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] - mn_list = ["01", "02", "03", "04", "05", "06", "07"] - mp_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - 
"52", - "53", - "54", - "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - ] - mz_list = ["01", "02", "03", "04", "05", "06", "07", "08"] - nl_list = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] - od_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - ] - or_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - ] # old list - pb_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", - "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - "72", - "73", - "74", - "75", - "76", - "77", - "78", - "79", - "80", - "81", - "82", - "83", - "84", - "85", - "86", - "87", - "88", - "89", - "90", - "91", - "92", - "93", - "94", - "95", - "96", - "97", - "98", - "99", - ] - py_list = ["01", "02", "03", "04", "05"] - rj_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - 
"37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", - "55", - "56", - "57", - "58", - ] - sk_list = ["01", "02", "03", "04", "05", "06", "07", "08"] - tn_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", - "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - "72", - "73", - "74", - "75", - "76", - "77", - "78", - "79", - "80", - "81", - "82", - "83", - "84", - "85", - "86", - "87", - "88", - "89", - "90", - "91", - "92", - "93", - "94", - "95", - "96", - "97", - "98", - "99", - ] - tr_list = ["01", "02", "03", "04", "05", "06", "07", "08"] - ts_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - ] - uk_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - ] - up_list = [ - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", - "55", - "56", - "57", - "58", - "59", - "60", 
- "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - "72", - "73", - "74", - "75", - "76", - "77", - "78", - "79", - "80", - "81", - "82", - "83", - "84", - "85", - "86", - "87", - "88", - "89", - "90", - "91", - "92", - "93", - "94", - "95", - "96", - ] - wb_list = [ - "01", - "02", - "03", - "04", - "05", - "06", - "07", - "08", - "09", - "10", - "11", - "12", - "13", - "14", - "15", - "16", - "17", - "18", - "19", - "20", - "22", - "23", - "24", - "25", - "26", - "27", - "28", - "29", - "30", - "31", - "32", - "33", - "34", - "35", - "36", - "37", - "38", - "39", - "40", - "41", - "42", - "43", - "44", - "45", - "46", - "47", - "48", - "49", - "50", - "51", - "52", - "53", - "54", - "55", - "56", - "57", - "58", - "59", - "60", - "61", - "62", - "63", - "64", - "65", - "66", - "67", - "68", - "69", - "70", - "71", - "72", - "73", - "74", - "75", - "76", - "77", - "78", - "79", - "80", - "81", - "82", - "83", - "84", - "85", - "86", - "87", - "88", - "89", - "90", - "91", - "92", - "93", - "94", - "95", - "96", - "97", - "98", - ] - state_rto_districtcode_map = { - "AN": an_list, - "AP": ap_list, - "AR": ar_list, - "AS": as_list, - "BR": br_list, - "CG": cg_list, - "CH": ch_list, - "DD": dd_list, - "DN": dn_list, - "DL": dl_list, - "GA": ga_list, - "GJ": gj_list, - "HP": hp_list, - "HR": hr_list, - "JH": jh_list, - "JK": jk_list, - "KA": ka_list, - "KL": kl_list, - "LA": la_list, - "LD": ld_list, - "MH": mh_list, - "ML": ml_list, - "MN": mn_list, - "MP": mp_list, - "MZ": mz_list, - "NL": nl_list, - "OD": od_list, - "OR": or_list, - "PB": pb_list, - "PY": py_list, - "RJ": rj_list, - "SK": sk_list, - "TN": tn_list, - "TR": tr_list, - "TS": ts_list, - "UK": uk_list, - "UP": up_list, - "WB": wb_list, + "AN": Utils.in_vehicle_dist_an, + "AP": Utils.in_vehicle_dist_ap, + "AR": Utils.in_vehicle_dist_ar, + "AS": Utils.in_vehicle_dist_as, + "BR": Utils.in_vehicle_dist_br, + "CG": Utils.in_vehicle_dist_cg, + "CH": 
Utils.in_vehicle_dist_ch, + "DD": Utils.in_vehicle_dist_dd, + "DN": Utils.in_vehicle_dist_dn, + "DL": Utils.in_vehicle_dist_dl, + "GA": Utils.in_vehicle_dist_ga, + "GJ": Utils.in_vehicle_dist_gj, + "HP": Utils.in_vehicle_dist_hp, + "HR": Utils.in_vehicle_dist_hr, + "JH": Utils.in_vehicle_dist_jh, + "JK": Utils.in_vehicle_dist_jk, + "KA": Utils.in_vehicle_dist_ka, + "KL": Utils.in_vehicle_dist_kl, + "LA": Utils.in_vehicle_dist_la, + "LD": Utils.in_vehicle_dist_ld, + "MH": Utils.in_vehicle_dist_mh, + "ML": Utils.in_vehicle_dist_ml, + "MN": Utils.in_vehicle_dist_mn, + "MP": Utils.in_vehicle_dist_mp, + "MZ": Utils.in_vehicle_dist_mz, + "NL": Utils.in_vehicle_dist_nl, + "OD": Utils.in_vehicle_dist_od, + "OR": Utils.in_vehicle_dist_or, + "PB": Utils.in_vehicle_dist_pb, + "PY": Utils.in_vehicle_dist_py, + "RJ": Utils.in_vehicle_dist_rj, + "SK": Utils.in_vehicle_dist_sk, + "TN": Utils.in_vehicle_dist_tn, + "TR": Utils.in_vehicle_dist_tr, + "TS": Utils.in_vehicle_dist_ts, + "UK": Utils.in_vehicle_dist_uk, + "UP": Utils.in_vehicle_dist_up, + "WB": Utils.in_vehicle_dist_wb, } - - union_territories = ["AN", "CH", "DH", "DL", "JK", "LA", "LD", "PY"] - states = [ - "AP", - "AR", - "AS", - "BR", - "CG", - "GA", - "GJ", - "HR", - "HP", - "JH", - "KA", - "KL", - "MP", - "MH", - "MN", - "ML", - "MZ", - "NL", - "OD", - "PB", - "RJ", - "SK", - "TN", - "TS", - "TR", - "UP", - "UK", - "WB", - "UT", - ] - old_union_territories = ["CT", "DN"] - old_states = ["UL", "OR", "UA"] - non_standard_state_or_ut = ["DD"] - - foreign_mission_codes = [ - 84, - 85, - 89, - 93, - 94, - 95, - 97, - 98, - 99, - 102, - 104, - 105, - 106, - 109, - 111, - 112, - 113, - 117, - 119, - 120, - 121, - 122, - 123, - 125, - 126, - 128, - 133, - 134, - 135, - 137, - 141, - 145, - 147, - 149, - 152, - 153, - 155, - 156, - 157, - 159, - 160, - ] - - # armed_forces_vehicle_types = [ - # 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'K', 'P', 'R', 'X'] - two_factor_registration_prefix = [] - 
two_factor_registration_prefix.extend(union_territories) - two_factor_registration_prefix.extend(states) - two_factor_registration_prefix.extend(old_states) - two_factor_registration_prefix.extend(old_union_territories) - two_factor_registration_prefix.extend(non_standard_state_or_ut) + two_factor_registration_prefix.extend(Utils.in_union_territories) + two_factor_registration_prefix.extend(Utils.in_states) + two_factor_registration_prefix.extend(Utils.in_old_states) + two_factor_registration_prefix.extend(Utils.in_old_union_territories) + two_factor_registration_prefix.extend(Utils.in_non_standard_state_or_ut) first_two_char = sanitized_value[:2].upper() dist_code: str = "" - diplomatic_vehicle_codes = ["CC", "CD", "UN"] - match first_two_char: - case first_two_char if first_two_char in two_factor_registration_prefix: - if sanitized_value[2].isdigit(): - if sanitized_value[3].isdigit(): - dist_code = sanitized_value[2:4] - else: - dist_code = sanitized_value[2:3] + if first_two_char in two_factor_registration_prefix: + if sanitized_value[2].isdigit(): + if sanitized_value[3].isdigit(): + dist_code = sanitized_value[2:4] + else: + dist_code = sanitized_value[2:3] + + registration_digits = sanitized_value[-4:] + if registration_digits.isnumeric(): + if 0 < int(registration_digits) <= 9999: + if dist_code and dist_code in state_rto_districtcode_map.get( + first_two_char, "" + ): + is_valid_registration = True - if dist_code and dist_code in state_rto_districtcode_map.get( - first_two_char, "" + for diplomatic_vehicle_code in Utils.in_vehicle_diplomatic_codes: + if diplomatic_vehicle_code in sanitized_value: + vehicle_prefix = sanitized_value.partition(diplomatic_vehicle_code)[ + 0 + ] + if vehicle_prefix.isnumeric() and ( + 1 <= int(vehicle_prefix) <= 80 + or int(vehicle_prefix) in Utils.in_vehicle_foreign_mission_codes ): is_valid_registration = True - case _: - for diplomatic_vehicle_code in diplomatic_vehicle_codes: - if diplomatic_vehicle_code in sanitized_value: 
- vehicle_prefix = sanitized_value.partition( - diplomatic_vehicle_code - )[0] - if vehicle_prefix.isnumeric() and ( - 1 <= int(vehicle_prefix) <= 80 - or int(vehicle_prefix) in foreign_mission_codes - ): - is_valid_registration = True return is_valid_registration diff --git a/presidio-analyzer/tests/test_analyzer_utils.py b/presidio-analyzer/tests/test_analyzer_utils.py index 30b709da4..5deec0e5c 100644 --- a/presidio-analyzer/tests/test_analyzer_utils.py +++ b/presidio-analyzer/tests/test_analyzer_utils.py @@ -6,8 +6,17 @@ ["abCba", False, True], ["ABBA", False, True], ["aBba", True, True], + ["NotAPalindrome", True, False], ] +in_vehicle_metadata_test_set = [ + ["in_non_standard_state_or_ut", 1], + ["in_old_states", 3], + ["in_states", 29], + ["in_union_territories", 8], + ["in_old_union_territories", 2], + ["in_vehicle_dist_wb", 70], +] sanitizer_test_set = [ [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"], @@ -62,3 +71,54 @@ def test_is_verhoeff(input_number, is_verhoeff): :return: True/False """ assert PresidioAnalyzerUtils.is_verhoeff_number(input_number) == is_verhoeff + + +def test_list_length(): + """ + Tests for static counts of each metadata lists defined + :return: True/False + """ + assert len(PresidioAnalyzerUtils.in_old_states) == 3 + assert len(PresidioAnalyzerUtils.in_non_standard_state_or_ut) == 1 + assert len(PresidioAnalyzerUtils.in_states) == 29 + assert len(PresidioAnalyzerUtils.in_old_union_territories) == 2 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_wb) == 97 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_up) == 85 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_uk) == 20 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ts) == 37 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_tr) == 8 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_tn) == 98 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_sk) == 8 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_rj) == 57 + assert 
len(PresidioAnalyzerUtils.in_vehicle_dist_py) == 5 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_pb) == 98 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_or) == 30 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_od) == 34 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_nl) == 10 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_mz) == 8 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_mp) == 70 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_mn) == 7 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ml) == 10 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_mh) == 50 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ld) == 9 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_la) == 2 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_kl) == 98 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ka) == 70 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_jh) == 23 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_hr) == 98 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_hp) == 98 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_gj) == 39 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ga) == 12 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_dl) == 13 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_dn) == 1 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_dd) == 3 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ch) == 4 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_cg) == 30 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_br) == 38 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_as) == 33 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ar) == 20 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_ap) == 2 + assert len(PresidioAnalyzerUtils.in_vehicle_dist_an) == 1 + assert len(PresidioAnalyzerUtils.in_vehicle_diplomatic_codes) == 3 + assert len(PresidioAnalyzerUtils.in_vehicle_armed_forces_codes) == 11 + assert len(PresidioAnalyzerUtils.in_vehicle_foreign_mission_codes) == 41 diff --git 
a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py index 1701d6a67..9d007a995 100644 --- a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py @@ -19,7 +19,7 @@ def entities(): [ # fmt: off ("KA53ME3456", 1, (0, 10), 1), - ("KA99ME3456", 1, (0, 10), 0.85), + ("KA99ME3456", 1, (0, 10), 0.50), ("MN2412", 1, (0, 6), 0.01), ("MCX1243", 1, (0, 7), 0.2), ("I15432", 1, (0, 6), 0.01), From bc059ce600436ec966303df753fe55a6369922a6 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Thu, 15 Feb 2024 23:05:11 +0530 Subject: [PATCH 14/23] review comments incorporated Logic reverted from analyzer_utils to recognizer classfile --- .../presidio_analyzer/analyzer_utils.py | 210 ------------ .../in_vehicle_registration_recognizer.py | 302 +++++++++++++++--- .../tests/test_analyzer_utils.py | 51 --- ...test_in_vehicle_registration_recognizer.py | 51 +++ 4 files changed, 308 insertions(+), 306 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/analyzer_utils.py b/presidio-analyzer/presidio_analyzer/analyzer_utils.py index 1ab146952..74710bd8b 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_utils.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_utils.py @@ -73,213 +73,3 @@ def is_verhoeff_number(input_number: int): for i in range(len(inverted_number)): c = __d__[c][__p__[i % 8][inverted_number[i]]] return __inv__[c] == 0 - - # fmt: off - in_vehicle_foreign_mission_codes = [ - 84, 85, 89, 93, 94, 95, 97, 98, 99, 102, 104, 105, 106, 109, 111, 112, - 113, 117, 119, 120, 121, 122, 123, 125, 126, 128, 133, 134, 135, 137, - 141, 145, 147, 149, 152, 153, 155, 156, 157, 159, 160 - ] - - in_vehicle_armed_forces_codes = [ - 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'K', 'P', 'R', 'X' - ] - - in_vehicle_diplomatic_codes = ["CC", "CD", "UN"] - in_vehicle_dist_an = ["01"] - in_vehicle_dist_ap = ["39", "40"] 
- in_vehicle_dist_ar = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "19", "20", "22" - ] - in_vehicle_dist_as = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34" - ] - in_vehicle_dist_br = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "19", - "21", "22", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", - "34", "37", "38", "39", "43", "44", "45", "46", "50", "51", "52", "53", - "55", "56" - ] - in_vehicle_dist_cg = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", - "25", "26", "27", "28", "29", "30" - ] - in_vehicle_dist_ch = ["01", "02", "03", "04"] - in_vehicle_dist_dd = ["01", "02", "03"] - in_vehicle_dist_dn = ["09"] # old list - in_vehicle_dist_dl = [ - "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"] - in_vehicle_dist_ga = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] - in_vehicle_dist_gj = [ - "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", - "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39" - ] - in_vehicle_dist_hp = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", - "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", - "86", 
"87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", - "98", "99" - ] - in_vehicle_dist_hr = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", - "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", - "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", - "98", "99" - ] - in_vehicle_dist_jh = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24" - ] - in_vehicle_dist_jk = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22" - ] - in_vehicle_dist_ka = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" - ] - in_vehicle_dist_kl = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", - "74", "75", "76", "77", 
"78", "79", "80", "81", "82", "83", "84", "85", - "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", - "98", "99" - ] - in_vehicle_dist_la = ["01", "02"] - in_vehicle_dist_ld = ["01", "02", "03", "04", "05", "06", "07", "08", "09"] - in_vehicle_dist_mh = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51" - ] - in_vehicle_dist_ml = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] - in_vehicle_dist_mn = ["01", "02", "03", "04", "05", "06", "07"] - in_vehicle_dist_mp = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" - ] - in_vehicle_dist_mz = ["01", "02", "03", "04", "05", "06", "07", "08"] - in_vehicle_dist_nl = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] - in_vehicle_dist_od = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35" - ] - in_vehicle_dist_or = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31" - ] # old list - in_vehicle_dist_pb = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - 
"26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", - "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", - "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", - "98", "99" - ] - in_vehicle_dist_py = ["01", "02", "03", "04", "05"] - in_vehicle_dist_rj = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58" - ] - in_vehicle_dist_sk = ["01", "02", "03", "04", "05", "06", "07", "08"] - in_vehicle_dist_tn = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", - "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", - "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", - "98", "99" - ] - in_vehicle_dist_tr = ["01", "02", "03", "04", "05", "06", "07", "08"] - in_vehicle_dist_ts = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38" - ] - in_vehicle_dist_uk = [ - "01", "02", "03", "04", "05", "06", "07", 
"08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20" - ] - in_vehicle_dist_up = [ - "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", - "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", - "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", - "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", - "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", - "72", "73", "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", - "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", - "96" - ] - in_vehicle_dist_wb = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", - "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", - "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", - "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", - "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", - "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", - "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", - "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", - "98" - ] - - in_union_territories = ["AN", "CH", "DH", "DL", "JK", "LA", "LD", "PY"] - in_old_union_territories = ["CT", "DN"] - in_states = [ - "AP", "AR", "AS", "BR", "CG", "GA", "GJ", "HR", "HP", "JH", "KA", "KL", - "MP", "MH", "MN", "ML", "MZ", "NL", "OD", "PB", "RJ", "SK", "TN", "TS", - "TR", "UP", "UK", "WB", "UT" - ] - in_old_states = ["UL", "OR", "UA"] - in_non_standard_state_or_ut = ["DD"] - # fmt: on - - def list_length(self): - """ - Unimplemented functon with primary job of running content length test case. 
- - :return: None - """ - pass diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py index 4e0538063..c8fdd0846 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py @@ -71,6 +71,210 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): CONTEXT = ["RTO", "vehicle", "plate", "registration"] + # fmt: off + in_vehicle_foreign_mission_codes = [ + 84, 85, 89, 93, 94, 95, 97, 98, 99, 102, 104, 105, 106, 109, 111, 112, + 113, 117, 119, 120, 121, 122, 123, 125, 126, 128, 133, 134, 135, 137, + 141, 145, 147, 149, 152, 153, 155, 156, 157, 159, 160 + ] + + in_vehicle_armed_forces_codes = [ + 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'K', 'P', 'R', 'X' + ] + + in_vehicle_diplomatic_codes = ["CC", "CD", "UN"] + in_vehicle_dist_an = ["01"] + in_vehicle_dist_ap = ["39", "40"] + in_vehicle_dist_ar = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "19", "20", "22" + ] + in_vehicle_dist_as = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34" + ] + in_vehicle_dist_br = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "19", + "21", "22", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", + "34", "37", "38", "39", "43", "44", "45", "46", "50", "51", "52", "53", + "55", "56" + ] + in_vehicle_dist_cg = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", + "25", "26", "27", "28", "29", "30" + ] + in_vehicle_dist_ch = ["01", "02", "03", "04"] + 
in_vehicle_dist_dd = ["01", "02", "03"] + in_vehicle_dist_dn = ["09"] # old list + in_vehicle_dist_dl = [ + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"] + in_vehicle_dist_ga = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] + in_vehicle_dist_gj = [ + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", + "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39" + ] + in_vehicle_dist_hp = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_hr = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_jh = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24" + ] + in_vehicle_dist_jk = [ 
+ "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22" + ] + in_vehicle_dist_ka = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" + ] + in_vehicle_dist_kl = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_la = ["01", "02"] + in_vehicle_dist_ld = ["01", "02", "03", "04", "05", "06", "07", "08", "09"] + in_vehicle_dist_mh = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51" + ] + in_vehicle_dist_ml = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + in_vehicle_dist_mn = ["01", "02", "03", "04", "05", "06", "07"] + in_vehicle_dist_mp = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", 
"25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" + ] + in_vehicle_dist_mz = ["01", "02", "03", "04", "05", "06", "07", "08"] + in_vehicle_dist_nl = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] + in_vehicle_dist_od = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35" + ] + in_vehicle_dist_or = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31" + ] # old list + in_vehicle_dist_pb = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_py = ["01", "02", "03", "04", "05"] + in_vehicle_dist_rj = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58" + ] + 
in_vehicle_dist_sk = ["01", "02", "03", "04", "05", "06", "07", "08"] + in_vehicle_dist_tn = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99" + ] + in_vehicle_dist_tr = ["01", "02", "03", "04", "05", "06", "07", "08"] + in_vehicle_dist_ts = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38" + ] + in_vehicle_dist_uk = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20" + ] + in_vehicle_dist_up = [ + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", + "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", + "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", + "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", + "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", + "72", "73", "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", + "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", + "96" + ] + in_vehicle_dist_wb = [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", + "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", + "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", + "38", 
"39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", + "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", + "62", "63", "64", "65", "66", "67", "68", "69", "70", "71", "72", "73", + "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", + "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", + "98" + ] + + in_union_territories = ["AN", "CH", "DH", "DL", "JK", "LA", "LD", "PY"] + in_old_union_territories = ["CT", "DN"] + in_states = [ + "AP", "AR", "AS", "BR", "CG", "GA", "GJ", "HR", "HP", "JH", "KA", "KL", + "MP", "MH", "MN", "ML", "MZ", "NL", "OD", "PB", "RJ", "SK", "TN", "TS", + "TR", "UP", "UK", "WB", "UT" + ] + in_old_states = ["UL", "OR", "UA"] + in_non_standard_state_or_ut = ["DD"] + + # fmt: on + def __init__( self, patterns: Optional[List[Pattern]] = None, @@ -104,51 +308,51 @@ def __check_vehicle_registration(self, sanitized_value: str) -> bool: is_valid_registration = None # logic here state_rto_districtcode_map = { - "AN": Utils.in_vehicle_dist_an, - "AP": Utils.in_vehicle_dist_ap, - "AR": Utils.in_vehicle_dist_ar, - "AS": Utils.in_vehicle_dist_as, - "BR": Utils.in_vehicle_dist_br, - "CG": Utils.in_vehicle_dist_cg, - "CH": Utils.in_vehicle_dist_ch, - "DD": Utils.in_vehicle_dist_dd, - "DN": Utils.in_vehicle_dist_dn, - "DL": Utils.in_vehicle_dist_dl, - "GA": Utils.in_vehicle_dist_ga, - "GJ": Utils.in_vehicle_dist_gj, - "HP": Utils.in_vehicle_dist_hp, - "HR": Utils.in_vehicle_dist_hr, - "JH": Utils.in_vehicle_dist_jh, - "JK": Utils.in_vehicle_dist_jk, - "KA": Utils.in_vehicle_dist_ka, - "KL": Utils.in_vehicle_dist_kl, - "LA": Utils.in_vehicle_dist_la, - "LD": Utils.in_vehicle_dist_ld, - "MH": Utils.in_vehicle_dist_mh, - "ML": Utils.in_vehicle_dist_ml, - "MN": Utils.in_vehicle_dist_mn, - "MP": Utils.in_vehicle_dist_mp, - "MZ": Utils.in_vehicle_dist_mz, - "NL": Utils.in_vehicle_dist_nl, - "OD": Utils.in_vehicle_dist_od, - "OR": Utils.in_vehicle_dist_or, - "PB": Utils.in_vehicle_dist_pb, - 
"PY": Utils.in_vehicle_dist_py, - "RJ": Utils.in_vehicle_dist_rj, - "SK": Utils.in_vehicle_dist_sk, - "TN": Utils.in_vehicle_dist_tn, - "TR": Utils.in_vehicle_dist_tr, - "TS": Utils.in_vehicle_dist_ts, - "UK": Utils.in_vehicle_dist_uk, - "UP": Utils.in_vehicle_dist_up, - "WB": Utils.in_vehicle_dist_wb, + "AN": self.in_vehicle_dist_an, + "AP": self.in_vehicle_dist_ap, + "AR": self.in_vehicle_dist_ar, + "AS": self.in_vehicle_dist_as, + "BR": self.in_vehicle_dist_br, + "CG": self.in_vehicle_dist_cg, + "CH": self.in_vehicle_dist_ch, + "DD": self.in_vehicle_dist_dd, + "DN": self.in_vehicle_dist_dn, + "DL": self.in_vehicle_dist_dl, + "GA": self.in_vehicle_dist_ga, + "GJ": self.in_vehicle_dist_gj, + "HP": self.in_vehicle_dist_hp, + "HR": self.in_vehicle_dist_hr, + "JH": self.in_vehicle_dist_jh, + "JK": self.in_vehicle_dist_jk, + "KA": self.in_vehicle_dist_ka, + "KL": self.in_vehicle_dist_kl, + "LA": self.in_vehicle_dist_la, + "LD": self.in_vehicle_dist_ld, + "MH": self.in_vehicle_dist_mh, + "ML": self.in_vehicle_dist_ml, + "MN": self.in_vehicle_dist_mn, + "MP": self.in_vehicle_dist_mp, + "MZ": self.in_vehicle_dist_mz, + "NL": self.in_vehicle_dist_nl, + "OD": self.in_vehicle_dist_od, + "OR": self.in_vehicle_dist_or, + "PB": self.in_vehicle_dist_pb, + "PY": self.in_vehicle_dist_py, + "RJ": self.in_vehicle_dist_rj, + "SK": self.in_vehicle_dist_sk, + "TN": self.in_vehicle_dist_tn, + "TR": self.in_vehicle_dist_tr, + "TS": self.in_vehicle_dist_ts, + "UK": self.in_vehicle_dist_uk, + "UP": self.in_vehicle_dist_up, + "WB": self.in_vehicle_dist_wb, } two_factor_registration_prefix = [] - two_factor_registration_prefix.extend(Utils.in_union_territories) - two_factor_registration_prefix.extend(Utils.in_states) - two_factor_registration_prefix.extend(Utils.in_old_states) - two_factor_registration_prefix.extend(Utils.in_old_union_territories) - two_factor_registration_prefix.extend(Utils.in_non_standard_state_or_ut) + two_factor_registration_prefix.extend(self.in_union_territories) + 
two_factor_registration_prefix.extend(self.in_states) + two_factor_registration_prefix.extend(self.in_old_states) + two_factor_registration_prefix.extend(self.in_old_union_territories) + two_factor_registration_prefix.extend(self.in_non_standard_state_or_ut) first_two_char = sanitized_value[:2].upper() dist_code: str = "" @@ -167,15 +371,23 @@ def __check_vehicle_registration(self, sanitized_value: str) -> bool: ): is_valid_registration = True - for diplomatic_vehicle_code in Utils.in_vehicle_diplomatic_codes: + for diplomatic_vehicle_code in self.in_vehicle_diplomatic_codes: if diplomatic_vehicle_code in sanitized_value: vehicle_prefix = sanitized_value.partition(diplomatic_vehicle_code)[ 0 ] if vehicle_prefix.isnumeric() and ( 1 <= int(vehicle_prefix) <= 80 - or int(vehicle_prefix) in Utils.in_vehicle_foreign_mission_codes + or int(vehicle_prefix) in self.in_vehicle_foreign_mission_codes ): is_valid_registration = True return is_valid_registration + + def list_length(self): + """ + Unimplemented functon with primary job of running content length test case. 
+ + :return: None + """ + pass diff --git a/presidio-analyzer/tests/test_analyzer_utils.py b/presidio-analyzer/tests/test_analyzer_utils.py index 5deec0e5c..a89d138fa 100644 --- a/presidio-analyzer/tests/test_analyzer_utils.py +++ b/presidio-analyzer/tests/test_analyzer_utils.py @@ -71,54 +71,3 @@ def test_is_verhoeff(input_number, is_verhoeff): :return: True/False """ assert PresidioAnalyzerUtils.is_verhoeff_number(input_number) == is_verhoeff - - -def test_list_length(): - """ - Tests for static counts of each metadata lists defined - :return: True/False - """ - assert len(PresidioAnalyzerUtils.in_old_states) == 3 - assert len(PresidioAnalyzerUtils.in_non_standard_state_or_ut) == 1 - assert len(PresidioAnalyzerUtils.in_states) == 29 - assert len(PresidioAnalyzerUtils.in_old_union_territories) == 2 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_wb) == 97 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_up) == 85 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_uk) == 20 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ts) == 37 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_tr) == 8 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_tn) == 98 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_sk) == 8 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_rj) == 57 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_py) == 5 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_pb) == 98 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_or) == 30 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_od) == 34 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_nl) == 10 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_mz) == 8 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_mp) == 70 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_mn) == 7 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ml) == 10 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_mh) == 50 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ld) == 9 - assert 
len(PresidioAnalyzerUtils.in_vehicle_dist_la) == 2 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_kl) == 98 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ka) == 70 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_jh) == 23 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_hr) == 98 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_hp) == 98 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_gj) == 39 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ga) == 12 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_dl) == 13 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_dn) == 1 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_dd) == 3 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ch) == 4 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_cg) == 30 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_br) == 38 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_as) == 33 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ar) == 20 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_ap) == 2 - assert len(PresidioAnalyzerUtils.in_vehicle_dist_an) == 1 - assert len(PresidioAnalyzerUtils.in_vehicle_diplomatic_codes) == 3 - assert len(PresidioAnalyzerUtils.in_vehicle_armed_forces_codes) == 11 - assert len(PresidioAnalyzerUtils.in_vehicle_foreign_mission_codes) == 41 diff --git a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py index 9d007a995..669119772 100644 --- a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py @@ -50,3 +50,54 @@ def test_when_regn_in_text_then_all_regns_found( expected_position[1], expected_score, ) + + +def test_list_length(): + """ + Tests for static counts of each metadata lists defined + :return: True/False + """ + assert len(InVehicleRegistrationRecognizer.in_old_states) == 3 + assert len(InVehicleRegistrationRecognizer.in_non_standard_state_or_ut) == 1 
+ assert len(InVehicleRegistrationRecognizer.in_states) == 29 + assert len(InVehicleRegistrationRecognizer.in_old_union_territories) == 2 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_wb) == 97 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_up) == 85 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_uk) == 20 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_ts) == 37 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_tr) == 8 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_tn) == 98 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_sk) == 8 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_rj) == 57 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_py) == 5 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_pb) == 98 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_or) == 30 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_od) == 34 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_nl) == 10 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_mz) == 8 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_mp) == 70 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_mn) == 7 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_ml) == 10 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_mh) == 50 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_ld) == 9 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_la) == 2 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_kl) == 98 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_ka) == 70 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_jh) == 23 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_hr) == 98 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_hp) == 98 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_gj) == 39 + assert 
len(InVehicleRegistrationRecognizer.in_vehicle_dist_ga) == 12 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_dl) == 13 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_dn) == 1 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_dd) == 3 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_ch) == 4 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_cg) == 30 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_br) == 38 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_as) == 33 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_ar) == 20 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_ap) == 2 + assert len(InVehicleRegistrationRecognizer.in_vehicle_dist_an) == 1 + assert len(InVehicleRegistrationRecognizer.in_vehicle_diplomatic_codes) == 3 + assert len(InVehicleRegistrationRecognizer.in_vehicle_armed_forces_codes) == 11 + assert len(InVehicleRegistrationRecognizer.in_vehicle_foreign_mission_codes) == 41 From 1ffbb8b367a6b1b84c1cbdcec0d931ff4e0f2e40 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Thu, 15 Feb 2024 23:16:58 +0530 Subject: [PATCH 15/23] added null/min vehicle number size added min size check to avoid failures per review comment --- .../in_vehicle_registration_recognizer.py | 144 +++++++++--------- ...test_in_vehicle_registration_recognizer.py | 2 - 2 files changed, 73 insertions(+), 73 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py index c8fdd0846..37a309e59 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py @@ -305,83 +305,85 @@ def validate_result(self, pattern_text: str) -> bool: def __check_vehicle_registration(self, 
sanitized_value: str) -> bool: # print('check function called') - is_valid_registration = None + is_valid_registration = None # deliberately not typecasted or set to bool False # logic here - state_rto_districtcode_map = { - "AN": self.in_vehicle_dist_an, - "AP": self.in_vehicle_dist_ap, - "AR": self.in_vehicle_dist_ar, - "AS": self.in_vehicle_dist_as, - "BR": self.in_vehicle_dist_br, - "CG": self.in_vehicle_dist_cg, - "CH": self.in_vehicle_dist_ch, - "DD": self.in_vehicle_dist_dd, - "DN": self.in_vehicle_dist_dn, - "DL": self.in_vehicle_dist_dl, - "GA": self.in_vehicle_dist_ga, - "GJ": self.in_vehicle_dist_gj, - "HP": self.in_vehicle_dist_hp, - "HR": self.in_vehicle_dist_hr, - "JH": self.in_vehicle_dist_jh, - "JK": self.in_vehicle_dist_jk, - "KA": self.in_vehicle_dist_ka, - "KL": self.in_vehicle_dist_kl, - "LA": self.in_vehicle_dist_la, - "LD": self.in_vehicle_dist_ld, - "MH": self.in_vehicle_dist_mh, - "ML": self.in_vehicle_dist_ml, - "MN": self.in_vehicle_dist_mn, - "MP": self.in_vehicle_dist_mp, - "MZ": self.in_vehicle_dist_mz, - "NL": self.in_vehicle_dist_nl, - "OD": self.in_vehicle_dist_od, - "OR": self.in_vehicle_dist_or, - "PB": self.in_vehicle_dist_pb, - "PY": self.in_vehicle_dist_py, - "RJ": self.in_vehicle_dist_rj, - "SK": self.in_vehicle_dist_sk, - "TN": self.in_vehicle_dist_tn, - "TR": self.in_vehicle_dist_tr, - "TS": self.in_vehicle_dist_ts, - "UK": self.in_vehicle_dist_uk, - "UP": self.in_vehicle_dist_up, - "WB": self.in_vehicle_dist_wb, - } - two_factor_registration_prefix = [] - two_factor_registration_prefix.extend(self.in_union_territories) - two_factor_registration_prefix.extend(self.in_states) - two_factor_registration_prefix.extend(self.in_old_states) - two_factor_registration_prefix.extend(self.in_old_union_territories) - two_factor_registration_prefix.extend(self.in_non_standard_state_or_ut) - first_two_char = sanitized_value[:2].upper() - dist_code: str = "" + if len(sanitized_value) >= 8: + state_rto_district_map = { + "AN": 
self.in_vehicle_dist_an, + "AP": self.in_vehicle_dist_ap, + "AR": self.in_vehicle_dist_ar, + "AS": self.in_vehicle_dist_as, + "BR": self.in_vehicle_dist_br, + "CG": self.in_vehicle_dist_cg, + "CH": self.in_vehicle_dist_ch, + "DD": self.in_vehicle_dist_dd, + "DN": self.in_vehicle_dist_dn, + "DL": self.in_vehicle_dist_dl, + "GA": self.in_vehicle_dist_ga, + "GJ": self.in_vehicle_dist_gj, + "HP": self.in_vehicle_dist_hp, + "HR": self.in_vehicle_dist_hr, + "JH": self.in_vehicle_dist_jh, + "JK": self.in_vehicle_dist_jk, + "KA": self.in_vehicle_dist_ka, + "KL": self.in_vehicle_dist_kl, + "LA": self.in_vehicle_dist_la, + "LD": self.in_vehicle_dist_ld, + "MH": self.in_vehicle_dist_mh, + "ML": self.in_vehicle_dist_ml, + "MN": self.in_vehicle_dist_mn, + "MP": self.in_vehicle_dist_mp, + "MZ": self.in_vehicle_dist_mz, + "NL": self.in_vehicle_dist_nl, + "OD": self.in_vehicle_dist_od, + "OR": self.in_vehicle_dist_or, + "PB": self.in_vehicle_dist_pb, + "PY": self.in_vehicle_dist_py, + "RJ": self.in_vehicle_dist_rj, + "SK": self.in_vehicle_dist_sk, + "TN": self.in_vehicle_dist_tn, + "TR": self.in_vehicle_dist_tr, + "TS": self.in_vehicle_dist_ts, + "UK": self.in_vehicle_dist_uk, + "UP": self.in_vehicle_dist_up, + "WB": self.in_vehicle_dist_wb, + } + two_factor_registration_prefix = [] + two_factor_registration_prefix.extend(self.in_union_territories) + two_factor_registration_prefix.extend(self.in_states) + two_factor_registration_prefix.extend(self.in_old_states) + two_factor_registration_prefix.extend(self.in_old_union_territories) + two_factor_registration_prefix.extend(self.in_non_standard_state_or_ut) + first_two_char = sanitized_value[:2].upper() + dist_code: str = "" - if first_two_char in two_factor_registration_prefix: - if sanitized_value[2].isdigit(): - if sanitized_value[3].isdigit(): - dist_code = sanitized_value[2:4] - else: - dist_code = sanitized_value[2:3] + if first_two_char in two_factor_registration_prefix: + if sanitized_value[2].isdigit(): + if 
sanitized_value[3].isdigit(): + dist_code = sanitized_value[2:4] + else: + dist_code = sanitized_value[2:3] - registration_digits = sanitized_value[-4:] - if registration_digits.isnumeric(): - if 0 < int(registration_digits) <= 9999: - if dist_code and dist_code in state_rto_districtcode_map.get( - first_two_char, "" + registration_digits = sanitized_value[-4:] + if registration_digits.isnumeric(): + if 0 < int(registration_digits) <= 9999: + if dist_code and dist_code in state_rto_district_map.get( + first_two_char, "" + ): + is_valid_registration = True + + for diplomatic_vehicle_code in self.in_vehicle_diplomatic_codes: + if diplomatic_vehicle_code in sanitized_value: + vehicle_prefix = sanitized_value.partition( + diplomatic_vehicle_code + )[0] + if vehicle_prefix.isnumeric() and ( + 1 <= int(vehicle_prefix) <= 80 + or int(vehicle_prefix) + in self.in_vehicle_foreign_mission_codes ): is_valid_registration = True - for diplomatic_vehicle_code in self.in_vehicle_diplomatic_codes: - if diplomatic_vehicle_code in sanitized_value: - vehicle_prefix = sanitized_value.partition(diplomatic_vehicle_code)[ - 0 - ] - if vehicle_prefix.isnumeric() and ( - 1 <= int(vehicle_prefix) <= 80 - or int(vehicle_prefix) in self.in_vehicle_foreign_mission_codes - ): - is_valid_registration = True - return is_valid_registration def list_length(self): diff --git a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py index 669119772..6e30e41c0 100644 --- a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py @@ -38,8 +38,6 @@ def test_when_regn_in_text_then_all_regns_found( entities, ): results = recognizer.analyze(text, entities) - print("Results") - print(results) assert len(results) == expected_len if results: From 2a4708b073f80d7c369a6fa9a0cd8a096c9fbbe9 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Sun, 18 Feb 
2024 23:51:16 +0530 Subject: [PATCH 16/23] incorporated review comments --- .../in_vehicle_registration_recognizer.py | 264 +++++++++--------- .../tests/test_analyzer_utils.py | 9 - ...test_in_vehicle_registration_recognizer.py | 1 + 3 files changed, 137 insertions(+), 137 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py index 37a309e59..bfa335525 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_vehicle_registration_recognizer.py @@ -45,7 +45,7 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): Pattern( "India Vehicle Registration (Weak)", r"\b[A-Z]{3}(?!0000)\d{4}\b", - 0.2, + 0.20, ), Pattern( "India Vehicle Registration (Medium)", @@ -54,7 +54,12 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): ), Pattern( "India Vehicle Registration", - r"\b[A-Z]{2}\d{1,2}[A-Z]{1,2}(?!0000)\d{4}\b", + r"\b[A-Z]{2}\d{1}[A-Z]{1,3}(?!0000)\d{4}\b", + 0.50, + ), + Pattern( + "India Vehicle Registration", + r"\b[A-Z]{2}\d{2}[A-Z]{1,2}(?!0000)\d{4}\b", 0.50, ), Pattern( @@ -72,53 +77,51 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): CONTEXT = ["RTO", "vehicle", "plate", "registration"] # fmt: off - in_vehicle_foreign_mission_codes = [ + in_vehicle_foreign_mission_codes = { 84, 85, 89, 93, 94, 95, 97, 98, 99, 102, 104, 105, 106, 109, 111, 112, 113, 117, 119, 120, 121, 122, 123, 125, 126, 128, 133, 134, 135, 137, 141, 145, 147, 149, 152, 153, 155, 156, 157, 159, 160 - ] - - in_vehicle_armed_forces_codes = [ - 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'K', 'P', 'R', 'X' - ] + } - in_vehicle_diplomatic_codes = ["CC", "CD", "UN"] - in_vehicle_dist_an = ["01"] - in_vehicle_dist_ap = ["39", "40"] - in_vehicle_dist_ar = [ + in_vehicle_armed_forces_codes = { + 'A', 
'B', 'C', 'D', 'E', 'F', 'H', 'K', 'P', 'R', 'X'} + in_vehicle_diplomatic_codes = {"CC", "CD", "UN"} + in_vehicle_dist_an = {"01"} + in_vehicle_dist_ap = {"39", "40"} + in_vehicle_dist_ar = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "19", "20", "22" - ] - in_vehicle_dist_as = [ + } + in_vehicle_dist_as = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34" - ] - in_vehicle_dist_br = [ + } + in_vehicle_dist_br = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "19", "21", "22", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "37", "38", "39", "43", "44", "45", "46", "50", "51", "52", "53", "55", "56" - ] - in_vehicle_dist_cg = [ + } + in_vehicle_dist_cg = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30" - ] - in_vehicle_dist_ch = ["01", "02", "03", "04"] - in_vehicle_dist_dd = ["01", "02", "03"] - in_vehicle_dist_dn = ["09"] # old list - in_vehicle_dist_dl = [ - "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"] - in_vehicle_dist_ga = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"] - in_vehicle_dist_gj = [ + } + in_vehicle_dist_ch = {"01", "02", "03", "04"} + in_vehicle_dist_dd = {"01", "02", "03"} + in_vehicle_dist_dn = {"09"} # old list + in_vehicle_dist_dl = { + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13"} + in_vehicle_dist_ga = { + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"} + in_vehicle_dist_gj = { "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", 
"36", "37", "38", "39" - ] - in_vehicle_dist_hp = [ + } + in_vehicle_dist_hp = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", @@ -128,8 +131,8 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98", "99" - ] - in_vehicle_dist_hr = [ + } + in_vehicle_dist_hr = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", @@ -139,24 +142,24 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98", "99" - ] - in_vehicle_dist_jh = [ + } + in_vehicle_dist_jh = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24" - ] - in_vehicle_dist_jk = [ + } + in_vehicle_dist_jk = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22" - ] - in_vehicle_dist_ka = [ + } + in_vehicle_dist_ka = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" - ] - in_vehicle_dist_kl = [ + } + in_vehicle_dist_kl = { "01", "02", "03", "04", "05", "06", "07", "08", 
"09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", @@ -166,40 +169,40 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98", "99" - ] - in_vehicle_dist_la = ["01", "02"] - in_vehicle_dist_ld = ["01", "02", "03", "04", "05", "06", "07", "08", "09"] - in_vehicle_dist_mh = [ + } + in_vehicle_dist_la = {"01", "02"} + in_vehicle_dist_ld = {"01", "02", "03", "04", "05", "06", "07", "08", "09"} + in_vehicle_dist_mh = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "50", "51" - ] - in_vehicle_dist_ml = [ - "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] - in_vehicle_dist_mn = ["01", "02", "03", "04", "05", "06", "07"] - in_vehicle_dist_mp = [ + } + in_vehicle_dist_ml = { + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10"} + in_vehicle_dist_mn = {"01", "02", "03", "04", "05", "06", "07"} + in_vehicle_dist_mp = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", "58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68", "69", "70", "71" - ] - in_vehicle_dist_mz = ["01", "02", "03", "04", "05", "06", "07", "08"] - in_vehicle_dist_nl = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10"] - in_vehicle_dist_od = [ + } + in_vehicle_dist_mz = {"01", "02", 
"03", "04", "05", "06", "07", "08"} + in_vehicle_dist_nl = {"01", "02", "03", "04", "05", "06", "07", "08", "09", "10"} + in_vehicle_dist_od = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35" - ] - in_vehicle_dist_or = [ + } + in_vehicle_dist_or = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31" - ] # old list - in_vehicle_dist_pb = [ + } # old list + in_vehicle_dist_pb = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", @@ -209,17 +212,17 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98", "99" - ] - in_vehicle_dist_py = ["01", "02", "03", "04", "05"] - in_vehicle_dist_rj = [ + } + in_vehicle_dist_py = {"01", "02", "03", "04", "05"} + in_vehicle_dist_rj = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", "58" - ] - in_vehicle_dist_sk = ["01", "02", "03", "04", "05", "06", "07", "08"] - in_vehicle_dist_tn = [ + } + in_vehicle_dist_sk = {"01", "02", "03", "04", "05", "06", "07", "08"} + in_vehicle_dist_tn = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", 
"31", "32", "33", "34", "35", "36", "37", @@ -229,19 +232,19 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98", "99" - ] - in_vehicle_dist_tr = ["01", "02", "03", "04", "05", "06", "07", "08"] - in_vehicle_dist_ts = [ + } + in_vehicle_dist_tr = {"01", "02", "03", "04", "05", "06", "07", "08"} + in_vehicle_dist_ts = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38" - ] - in_vehicle_dist_uk = [ + } + in_vehicle_dist_uk = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20" - ] - in_vehicle_dist_up = [ + } + in_vehicle_dist_up = { "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47", @@ -250,8 +253,8 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): "72", "73", "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96" - ] - in_vehicle_dist_wb = [ + } + in_vehicle_dist_wb = { "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", @@ -261,18 +264,64 @@ class InVehicleRegistrationRecognizer(PatternRecognizer): "74", "75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96", "97", "98" - ] - - in_union_territories = ["AN", "CH", "DH", "DL", "JK", "LA", "LD", "PY"] - in_old_union_territories = ["CT", 
"DN"] - in_states = [ + } + in_union_territories = {"AN", "CH", "DH", "DL", "JK", "LA", "LD", "PY"} + in_old_union_territories = {"CT", "DN"} + in_states = { "AP", "AR", "AS", "BR", "CG", "GA", "GJ", "HR", "HP", "JH", "KA", "KL", "MP", "MH", "MN", "ML", "MZ", "NL", "OD", "PB", "RJ", "SK", "TN", "TS", "TR", "UP", "UK", "WB", "UT" - ] - in_old_states = ["UL", "OR", "UA"] - in_non_standard_state_or_ut = ["DD"] + } + in_old_states = {"UL", "OR", "UA"} + in_non_standard_state_or_ut = {"DD"} + + state_rto_district_map = { + "AN": in_vehicle_dist_an, + "AP": in_vehicle_dist_ap, + "AR": in_vehicle_dist_ar, + "AS": in_vehicle_dist_as, + "BR": in_vehicle_dist_br, + "CG": in_vehicle_dist_cg, + "CH": in_vehicle_dist_ch, + "DD": in_vehicle_dist_dd, + "DN": in_vehicle_dist_dn, + "DL": in_vehicle_dist_dl, + "GA": in_vehicle_dist_ga, + "GJ": in_vehicle_dist_gj, + "HP": in_vehicle_dist_hp, + "HR": in_vehicle_dist_hr, + "JH": in_vehicle_dist_jh, + "JK": in_vehicle_dist_jk, + "KA": in_vehicle_dist_ka, + "KL": in_vehicle_dist_kl, + "LA": in_vehicle_dist_la, + "LD": in_vehicle_dist_ld, + "MH": in_vehicle_dist_mh, + "ML": in_vehicle_dist_ml, + "MN": in_vehicle_dist_mn, + "MP": in_vehicle_dist_mp, + "MZ": in_vehicle_dist_mz, + "NL": in_vehicle_dist_nl, + "OD": in_vehicle_dist_od, + "OR": in_vehicle_dist_or, + "PB": in_vehicle_dist_pb, + "PY": in_vehicle_dist_py, + "RJ": in_vehicle_dist_rj, + "SK": in_vehicle_dist_sk, + "TN": in_vehicle_dist_tn, + "TR": in_vehicle_dist_tr, + "TS": in_vehicle_dist_ts, + "UK": in_vehicle_dist_uk, + "UP": in_vehicle_dist_up, + "WB": in_vehicle_dist_wb, + } + two_factor_registration_prefix = set() + two_factor_registration_prefix |= in_union_territories + two_factor_registration_prefix |= in_states + two_factor_registration_prefix |= in_old_states + two_factor_registration_prefix |= in_old_union_territories + two_factor_registration_prefix |= in_non_standard_state_or_ut # fmt: on def __init__( @@ -307,57 +356,14 @@ def __check_vehicle_registration(self, 
sanitized_value: str) -> bool: # print('check function called') is_valid_registration = None # deliberately not typecasted or set to bool False # logic here + # print(sanitized_value) if len(sanitized_value) >= 8: - state_rto_district_map = { - "AN": self.in_vehicle_dist_an, - "AP": self.in_vehicle_dist_ap, - "AR": self.in_vehicle_dist_ar, - "AS": self.in_vehicle_dist_as, - "BR": self.in_vehicle_dist_br, - "CG": self.in_vehicle_dist_cg, - "CH": self.in_vehicle_dist_ch, - "DD": self.in_vehicle_dist_dd, - "DN": self.in_vehicle_dist_dn, - "DL": self.in_vehicle_dist_dl, - "GA": self.in_vehicle_dist_ga, - "GJ": self.in_vehicle_dist_gj, - "HP": self.in_vehicle_dist_hp, - "HR": self.in_vehicle_dist_hr, - "JH": self.in_vehicle_dist_jh, - "JK": self.in_vehicle_dist_jk, - "KA": self.in_vehicle_dist_ka, - "KL": self.in_vehicle_dist_kl, - "LA": self.in_vehicle_dist_la, - "LD": self.in_vehicle_dist_ld, - "MH": self.in_vehicle_dist_mh, - "ML": self.in_vehicle_dist_ml, - "MN": self.in_vehicle_dist_mn, - "MP": self.in_vehicle_dist_mp, - "MZ": self.in_vehicle_dist_mz, - "NL": self.in_vehicle_dist_nl, - "OD": self.in_vehicle_dist_od, - "OR": self.in_vehicle_dist_or, - "PB": self.in_vehicle_dist_pb, - "PY": self.in_vehicle_dist_py, - "RJ": self.in_vehicle_dist_rj, - "SK": self.in_vehicle_dist_sk, - "TN": self.in_vehicle_dist_tn, - "TR": self.in_vehicle_dist_tr, - "TS": self.in_vehicle_dist_ts, - "UK": self.in_vehicle_dist_uk, - "UP": self.in_vehicle_dist_up, - "WB": self.in_vehicle_dist_wb, - } - two_factor_registration_prefix = [] - two_factor_registration_prefix.extend(self.in_union_territories) - two_factor_registration_prefix.extend(self.in_states) - two_factor_registration_prefix.extend(self.in_old_states) - two_factor_registration_prefix.extend(self.in_old_union_territories) - two_factor_registration_prefix.extend(self.in_non_standard_state_or_ut) first_two_char = sanitized_value[:2].upper() dist_code: str = "" + # print(first_two_char) - if first_two_char in 
two_factor_registration_prefix: + if first_two_char in self.two_factor_registration_prefix: + # print("Got into processing loop") if sanitized_value[2].isdigit(): if sanitized_value[3].isdigit(): dist_code = sanitized_value[2:4] @@ -367,8 +373,10 @@ def __check_vehicle_registration(self, sanitized_value: str) -> bool: registration_digits = sanitized_value[-4:] if registration_digits.isnumeric(): if 0 < int(registration_digits) <= 9999: - if dist_code and dist_code in state_rto_district_map.get( - first_two_char, "" + if ( + dist_code + and dist_code + in self.state_rto_district_map.get(first_two_char, "") ): is_valid_registration = True diff --git a/presidio-analyzer/tests/test_analyzer_utils.py b/presidio-analyzer/tests/test_analyzer_utils.py index a89d138fa..751736dc9 100644 --- a/presidio-analyzer/tests/test_analyzer_utils.py +++ b/presidio-analyzer/tests/test_analyzer_utils.py @@ -9,15 +9,6 @@ ["NotAPalindrome", True, False], ] -in_vehicle_metadata_test_set = [ - ["in_non_standard_state_or_ut", 1], - ["in_old_states", 3], - ["in_states", 29], - ["in_union_territories", 8], - ["in_old_union_territories", 2], - ["in_vehicle_dist_wb", 70], -] - sanitizer_test_set = [ [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"], ["def", "", "def"], diff --git a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py index 6e30e41c0..aed04891d 100644 --- a/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py +++ b/presidio-analyzer/tests/test_in_vehicle_registration_recognizer.py @@ -23,6 +23,7 @@ def entities(): ("MN2412", 1, (0, 6), 0.01), ("MCX1243", 1, (0, 7), 0.2), ("I15432", 1, (0, 6), 0.01), + ("DL3CJI0001", 1, (0, 10), 1), ("ABNE123456", 0, (), (),), ("My Bike's registration number is OD02BA2341 with a lot of text beyond", 1, (33, 43), 1), From 424174d47f52528277cfff7c6577052e0d4b7f52 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Sun, 25 Feb 2024 
01:57:19 +0530 Subject: [PATCH 17/23] added two predefined recognizers : ISIN, CFI Two english language predefine recognizers added viz. ISIN , CFI --- docs/supported_entities.md | 30 +-- .../predefined_recognizers/__init__.py | 4 + .../predefined_recognizers/cfi_recognizer.py | 204 ++++++++++++++++++ .../in_pan_recognizer.py | 6 +- .../predefined_recognizers/isin_recognizer.py | 60 ++++++ .../presidio_analyzer/recognizer_registry.py | 4 + .../tests/test_cfi_recognizer.py | 53 +++++ .../tests/test_isin_recognizer.py | 61 ++++++ .../tests/test_recognizer_registry.py | 4 +- 9 files changed, 407 insertions(+), 19 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py create mode 100644 presidio-analyzer/tests/test_cfi_recognizer.py create mode 100644 presidio-analyzer/tests/test_isin_recognizer.py diff --git a/docs/supported_entities.md b/docs/supported_entities.md index 52bf94c45..94d113511 100644 --- a/docs/supported_entities.md +++ b/docs/supported_entities.md @@ -10,20 +10,22 @@ For more information, refer to the [adding new recognizers documentation](analyz ### Global -|Entity Type | Description | Detection Method | -| --- | --- | --- | -|CREDIT_CARD |A credit card number is between 12 to 19 digits. |Pattern match and checksum| -|CRYPTO|A Crypto wallet number. 
Currently only Bitcoin address is supported|Pattern match, context and checksum| -|DATE_TIME|Absolute or relative dates or periods or times smaller than a day.|Pattern match and context| -|EMAIL_ADDRESS|An email address identifies an email box to which email messages are delivered|Pattern match, context and RFC-822 validation| -|IBAN_CODE|The International Bank Account Number (IBAN) is an internationally agreed system of identifying bank accounts across national borders to facilitate the communication and processing of cross border transactions with a reduced risk of transcription errors.|Pattern match, context and checksum| -|IP_ADDRESS|An Internet Protocol (IP) address (either IPv4 or IPv6).|Pattern match, context and checksum| -|NRP|A person’s Nationality, religious or political group.|Custom logic and context| -|LOCATION|Name of politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains|Custom logic and context| -|PERSON|A full person name, which can include first names, middle names or initials, and last names.|Custom logic and context| -|PHONE_NUMBER|A telephone number|Custom logic, pattern match and context| -|MEDICAL_LICENSE|Common medical license numbers.|Pattern match, context and checksum| -|URL|A URL (Uniform Resource Locator), unique identifier used to locate a resource on the Internet|Pattern match, context and top level url validation| +| Entity Type | Description | Detection Method | +|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------| +| CFI | CFI (Classification of Financial Instruments) is a six letter code to classify a financial instrument as per ISO 10962 | Pattern match, context | +| CREDIT_CARD | A 
credit card number is between 12 to 19 digits. | Pattern match and checksum | +| CRYPTO | A Crypto wallet number. Currently only Bitcoin address is supported | Pattern match, context and checksum | +| DATE_TIME | Absolute or relative dates or periods or times smaller than a day. | Pattern match and context | +| EMAIL_ADDRESS | An email address identifies an email box to which email messages are delivered | Pattern match, context and RFC-822 validation | +| IBAN_CODE | The International Bank Account Number (IBAN) is an internationally agreed system of identifying bank accounts across national borders to facilitate the communication and processing of cross border transactions with a reduced risk of transcription errors. | Pattern match, context and checksum | +| IP_ADDRESS | An Internet Protocol (IP) address (either IPv4 or IPv6). | Pattern match, context and checksum | +| ISIN | An ISIN ( International Securities Identification Number), 12 character unique identifier used to recognize a security as per ISO 6166 | Pattern match, context | +| NRP | A person’s Nationality, religious or political group. | Custom logic and context | +| LOCATION | Name of politically or geographically defined location (cities, provinces, countries, international regions, bodies of water, mountains | Custom logic and context | +| PERSON | A full person name, which can include first names, middle names or initials, and last names. | Custom logic and context | +| PHONE_NUMBER | A telephone number | Custom logic, pattern match and context | +| MEDICAL_LICENSE | Common medical license numbers. 
| Pattern match, context and checksum | +| URL | A URL (Uniform Resource Locator), unique identifier used to locate a resource on the Internet | Pattern match, context and top level url validation | ### USA diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index 80d68b8b4..35c9b3752 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -37,6 +37,8 @@ from .azure_ai_language import AzureAILanguageRecognizer from .in_aadhaar_recognizer import InAadhaarRecognizer from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer +from .isin_recognizer import IsinRecognizer +from .cfi_recognizer import CfiRecognizer NLP_RECOGNIZERS = { "spacy": SpacyRecognizer, @@ -81,4 +83,6 @@ "AzureAILanguageRecognizer", "InAadhaarRecognizer", "InVehicleRegistrationRecognizer", + "IsinRecognizer", + "CfiRecognizer", ] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py new file mode 100644 index 000000000..975b868e5 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py @@ -0,0 +1,204 @@ +import re +from typing import List, Optional, Tuple +from presidio_analyzer import Pattern, PatternRecognizer +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils + + +class CfiRecognizer(PatternRecognizer): + """ + Recognize Classification of Financial Codes (CFI codes) using regex. + + Ref: 1. https://en.wikipedia.org/wiki/ISO_10962 . + 2. 
https://bit.ly/ISO10962-2021 + + :param patterns: List of patterns to be used by this recognizer + :param context: List of context words to increase confidence in detection + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + """ + + PATTERNS = [ + Pattern( + "CFI (Weak)", + r"\b^[A-Z]{6}$\b", + 0.05, + ), + Pattern( + "CFI (Medium)", + r"\b^[ECDROFSHIJKLTM][A-Z]{5}$\b", + 0.10, + ), + Pattern( + "CFI (Strong)", + r"\b^(ES|EP|EC|EF|EL|ED|EY|EM|CI|CH|CB|CE|CS|CF|CP|CM|DB|DC|DW|DT|DS|DE|DG" + r"|DA|DN|DD|DM|DY|RA|RS|RP|RW|RF|RD|RM|OC|OP|OM|FF|FC|SR|ST|SE|SC|SF|SM|HR" + r"|HT|HE|HC|HF|HM|IF|IT|JE|JF|JC|JR|JT|KR|KT|KE|KC|KF|KY|KM|LL" + r"|LR|LS|TC|TT|TR|TI|TB|TD|TM|MC|MM)[A-Z]{4}$\b", + 0.50, + ), + ] + + equity_regex_patterns = [ + r"\b^(ES)[VNRE]{1}[TU]{1}[FOP]{1}[BRNM]{1}$\b", + r"\b^(E)[PF]{1}[VNRE]{1}[RETGACN]{1}[FCPQANUD]{1}[BRNM]{1}$\b", + r"\b^(ED)[SPCFLM]{1}[RNBDX]{1}[FCPQANUD]{1}[BRNM]{1}$\b", + r"\b^(EY)[ABCDEM]{1}[DYM]{1}[FVEM]{1}[BSDGTCINM]{1}$\b", + r"\b^(E)[RV]{1}[VNRE]{1}[RETGACN]{1}[FCPQANUD]{1}[BRNM]{1}$\b", + r"\b^(EU)[CO]{1}[IGM]{1}[RSMCD]{1}[BRNZAM]{1}$\b", + r"\b^(EMXXX)[BRNM]{1}$\b", + ] + + debt_regex_patterns = [ + r"\b(D)[BCWT]{1}[FZVCK]{1}[TGSUPNOQJC]{1}[FGCDABTLPQRE]{1}[BRNM]{1}\b", + r"\b(DS)[ABCDM]{1}[FDYM]{1}[FVM]{1}[BSDTCIM]{1}\b", + r"\b(DE)[ABCDEM]{1}[FDYM]{1}[RSCTM]{1}[BSDTCIM]{1}\b", + r"\b(DG)[FZV]{1}[TGSUPNOQJC]{1}[FGCDABTLPQRE]{1}[BRNM]{1}\b", + r"\b(D)[AN]{1}[FZV]{1}[TGSUPNOQJC]{1}[FGCDABTLPQRE]{1}[BRNM]{1}\b", + r"\b(DD)[BCWTYGQNM]{1}[FZVC]{1}[TGSUPNOQJC]{1}[FGCDABTLPQRE]{1}\b", + r"\b(DY)[FZVK]{1}[TGSUPNOQJC]{1}(X)[BRNM]{1}\b", + r"\b(DM)[BPM]{1}(XX)[BRNM]{1}\b", + ] + + collective_investment_regex_patterns = [ + r"\b(C)[PI]{1}[OCM]{1}[IGJ]{1}[RBEVLCDFKM]{1}[SQUY]{1}\b", + r"\b(CMXXX)[SQUY]{1}\b", + r"\b(CH)[DRSEANLM]{1}(XXX)\b", + r"\b(CB)[OCM]{1}[IGJ]{1}(X)[SQUY]\b", + r"\b(CE)[OCM]{1}[IGJ]{1}[RBEVLCDFKM]{1}[SU]{1}\b", + 
r"\b(CS)[OCM]{1}[BGLM]{1}[RBM]{1}[SU]{1}\b", + r"\b(CF)[OCM]{1}[IGJ]{1}[IHBEPM]{1}[SQUY]{1}\b", + ] + + rights_regex_patterns = [ + r"\b(RAXXX)[BRNM]{1}\b", + r"\b(R)[PS]{1}[SPCFBIM]{1}(XX)[BRNM]{1}\b", + r"\b(RW)[BSDTCIM]{1}[TNC]{1}[CPB]{1}[AEBM]{1}\b", + r"\b(RF)[BSDTCIM]{1}[TNM]{1}[CPM]{1}[AEBM]{1}\b", + r"\b(RD)[ASPWM]{1}(XX)[BRNM]{1}\b", + r"\b(RMXXXX)\b", + ] + + listed_options_regex_patterns = [ + r"\b(O)[CP]{1}[AEB]{1}[BSDTCIOFWNM]{1}[PCNE]{1}[SN]{1}\b", + r"\b(OMXXXX)\b", + ] + + futures_regex_patterns = [ + r"\b(FF)[BSDCIOFWNVM]{1}[PCN]{1}[SN]{1}(X)\b", + r"\b(FC)[EAISNPHM]{1}[PCN]{1}[SN]{1}(X)\b", + ] + + swaps_regex_patterns = [ + r"\b(SR)[ACDGHZM]{1}[CDIY]{1}[SC]{1}[CP]{1}\b", + r"\b(ST)[JKANGPSTIQM]{1}[PDVLTCM]{1}(X)[CPE]{1}\b", + r"\b(SE)[SIBM]{1}[PDVLTCM]{1}(X)[CPE]{1}\b", + r"\b(SC)[UVIBM]{1}[CTM]{1}[CSL]{1}[CPA]{1}\b", + r"\b(SF)[ACM]{1}(XX)[PN]{1}\b", + r"\b(SM)[PM]{1}(XX)[CP]{1}\b", + ] + + non_listed_regex_patterns = [ + r"\b(HR)[ACDGHORFM]{1}[ABCDEFGHI]{1}[VADBGLPM]{1}[CPE]{1}\b", + r"\b(HT)[JKANGPSTIQORFWM][ABCDEFGHI]{1}[VADBGLPM]{1}[CPE]{1}\b", + r"\b(HE)[SIBORFM]{1}[ABCDEFGHI]{1}[VADBGLPM]{1}[CPE]{1}\b", + r"\b(HC)[UVIWM]{1}[ABCDEFGHI]{1}[VADBGLPM]{1}[CPE]{1}\b", + r"\b(HF)[RFTVM]{1}[ABCDEFGHI]{1}[VADBGLPM]{1}[CPEN{1}\b", + r"\b(HM)[PM]{1}[ABCDEFGHI]{1}[VADBGLPM]{1}[CPENA]{1}\b", + ] + + spot_regex_patterns = [ + r"\b(IFXXXP)\b", + r"\b(IT)[AJKNPSTM]{1}(XXX)\b", + ] + + forwards_regex_patterns = [ + r"\b(JE)[SIBOF]{1}(X)[CSF]{1}[CP]{1}\b", + r"\b(JF)[TROF]{1}(X)[CSF]{1}[PCN]{1}\b", + r"\b(JC)[AIBCDGO]{1}(X)[SF]{1}[PCN]{1}\b", + r"\b(JR)[IOM]{1}(X)[SF]{1}[PCN]{1}\b", + r"\b(JT)[ABGIJKNPSTM]{1}(X)[CF]{1}[PCN]{1}\b", + ] + + strategies_regex_patterns = [ + r"\b(K)[RTECFYM]{1}(XXXX)\b", + ] + + financing_regex_patterns = [ + r"\b(LL)[ABJKNPSTM]{1}(XX)[PCN]{1}\b", + r"\b(LR)[GSC]{1}[FNOT]{1}(X)[DHT]{1}\b", + r"\b(LS)[CGPTELDWKM]{1}[NOT]{1}(X)[DHFT]{1}\b", + ] + reference_instruments_regex_patterns = [ + r"\b(TC)[NCLM]{1}(XXX)\b", + 
r"\b(TT)[EAISNPHM]{1}(XXX)\b", + r"\b(TR)[NVFRM]{1}[DWNQSAM]{1}(XX)\b", + r"\b(TI)[EDFRTCM]{1}[PCEFM]{1}[PNGM]{1}(X)\b", + r"\b(TB)[EDFITCM]{1}(XXX)\b", + r"\b(TD)[SPCFLKM](XXX)\b", + r"\b(TMXXXX)\b", + ] + miscellaneous_regex_patterns = [ + r"\b(MC)[SBHAWUM]{1}[TU]{1}(X)[BRNM]\b", + r"\b(MM)[RIETNPSM]{1}(XXX)\b", + ] + + regex_options = { + "E": equity_regex_patterns, + "C": collective_investment_regex_patterns, + "D": debt_regex_patterns, + "R": rights_regex_patterns, + "O": listed_options_regex_patterns, + "F": futures_regex_patterns, + "S": swaps_regex_patterns, + "H": non_listed_regex_patterns, + "I": spot_regex_patterns, + "J": forwards_regex_patterns, + "K": strategies_regex_patterns, + "L": financing_regex_patterns, + "T": reference_instruments_regex_patterns, + "M": miscellaneous_regex_patterns, + } + + CONTEXT = ["CFI", "CFI_CODE"] + + def __init__( + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + replacement_pairs: Optional[List[Tuple[str, str]]] = None, + supported_language: str = "en", + supported_entity: str = "CFI_CODE", + ): + self.replacement_pairs = ( + replacement_pairs + if replacement_pairs + else [("-", ""), (" ", ""), (":", "")] + ) + + patterns = patterns if patterns else self.PATTERNS + context = context if context else self.CONTEXT + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + context=context, + supported_language=supported_language, + ) + + def validate_result(self, pattern_text: str) -> bool: + """Determine absolute value based on calculation.""" + sanitized_value = Utils.sanitize_value(pattern_text, self.replacement_pairs) + return self.__check_cfi(sanitized_value) + + def __check_cfi(self, sanitized_value: str) -> bool: + is_valid_cfi = None + if sanitized_value and len(sanitized_value) == 6 and sanitized_value.isalpha(): + applicable_regex_patterns = self.regex_options.get(sanitized_value[0], None) + if applicable_regex_patterns and 
len(applicable_regex_patterns) > 0: + pattern = re.compile( + "|".join(applicable_regex_patterns), + flags=re.DOTALL | re.IGNORECASE | re.MULTILINE, + ) + groups = re.match(pattern, sanitized_value) + if groups: + is_valid_cfi = True + return is_valid_cfi diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py index 83c82c1fb..210ceb01e 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py @@ -26,17 +26,17 @@ class InPanRecognizer(PatternRecognizer): PATTERNS = [ Pattern( "PAN (High)", - r"\b([A-Za-z]{3}[AaBbCcFfGgHhJjLlPpTt]{1}[A-Za-z]{1}[0-9]{4}[A-Za-z]{1})\b", + r"\b([A-Z]{3}[ABCFGHJLPT]{1}[A-Za-z]{1}(?!0000)\d{4}[A-Z]{1})\b", 0.85, ), Pattern( "PAN (Medium)", - r"\b([A-Za-z]{5}[0-9]{4}[A-Za-z]{1})\b", + r"\b([A-Z]{5}(?!0000)\d{4}[A-Z]{1})\b", 0.6, ), Pattern( "PAN (Low)", - r"\b((?=.*?[a-zA-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b", + r"\b((?=.*?[A-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b", 0.05, ), ] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py new file mode 100644 index 000000000..a858fbf22 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py @@ -0,0 +1,60 @@ +from typing import List, Optional, Tuple +from presidio_analyzer import Pattern, PatternRecognizer +import pycountry + + +class IsinRecognizer(PatternRecognizer): + """ + Recognize ISIN codes using regex. 
+ + Ref: https://en.wikipedia.org/wiki/International_Securities_Identification_Number + + :param patterns: List of patterns to be used by this recognizer + :param context: List of context words to increase confidence in detection + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + """ + + iso2a_countryname = "" + countries = pycountry.countries + for country in countries: + iso2a_countryname += country.alpha_2 + "|" + pattern = "^" + "(" + iso2a_countryname.rstrip("|") + ")" + "[A-Z0-9]{9}[0-9]{1}$" + + PATTERNS = [ + Pattern( + "ISIN (Medium)", + r"\b[A-Z]{2}[A-Z0-9]{9}\d{1}\b", + 0.5, + ), + Pattern( + "ISIN (Strong)", + pattern, + 0.85, + ), + ] + + CONTEXT = ["ISIN", "ISIN_CODE"] + + def __init__( + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + supported_language: str = "en", + supported_entity: str = "ISIN_CODE", + replacement_pairs: Optional[List[Tuple[str, str]]] = None, + ): + self.replacement_pairs = ( + replacement_pairs + if replacement_pairs + else [("-", ""), (" ", ""), (":", "")] + ) + + patterns = patterns if patterns else self.PATTERNS + context = context if context else self.CONTEXT + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + context=context, + supported_language=supported_language, + ) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry.py index a47f34dbf..dac337bf8 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry.py @@ -47,6 +47,8 @@ PlPeselRecognizer, InAadhaarRecognizer, InVehicleRegistrationRecognizer, + IsinRecognizer, + CfiRecognizer, ) logger = logging.getLogger("presidio-analyzer") @@ -105,6 +107,8 @@ def load_predefined_recognizers( InPanRecognizer, InAadhaarRecognizer, InVehicleRegistrationRecognizer, + IsinRecognizer, + 
CfiRecognizer, ], "es": [EsNifRecognizer], "it": [ diff --git a/presidio-analyzer/tests/test_cfi_recognizer.py b/presidio-analyzer/tests/test_cfi_recognizer.py new file mode 100644 index 000000000..ea0559eef --- /dev/null +++ b/presidio-analyzer/tests/test_cfi_recognizer.py @@ -0,0 +1,53 @@ +import pytest + +from tests import assert_result +from presidio_analyzer.predefined_recognizers import CfiRecognizer + + +@pytest.fixture(scope="module") +def recognizer(): + return CfiRecognizer() + + +@pytest.fixture(scope="module") +def entities(): + return ["CFI_CODE"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_position, expected_score", + [ + # fmt: off + ("ESVUFA", 1, (0, 6), 0.5), + ("JFTXFP", 1, (0, 6), 1.0), + ("JFTXFN", 1, (0, 6), 1.0), + ("ESVUFR", 1, (0, 6), 1.0), + ("ABNE123456", 0, (), (),), + ("OCXFXS", 1, (0, 6), 0.5), + ("DZXBAC", 1, (0, 6), 0.10), + ("ABCDEF", 1, (0, 6), 0.05), + ("MMRXXX", 1, (0, 6), 1.0), + ("TBEXXX", 1, (0, 6), 1.0), + ("NABHRT", 1, (0, 6), 0.05), + # fmt: on + ], +) +def test_when_cfi_in_text_then_all_cfis_found( + text, + expected_len, + expected_position, + expected_score, + recognizer, + entities, +): + results = recognizer.analyze(text, entities) + + assert len(results) == expected_len + if results: + assert_result( + results[0], + entities[0], + expected_position[0], + expected_position[1], + expected_score, + ) diff --git a/presidio-analyzer/tests/test_isin_recognizer.py b/presidio-analyzer/tests/test_isin_recognizer.py new file mode 100644 index 000000000..235fe7242 --- /dev/null +++ b/presidio-analyzer/tests/test_isin_recognizer.py @@ -0,0 +1,61 @@ +import pytest + +from tests import assert_result +from presidio_analyzer.predefined_recognizers import IsinRecognizer + + +@pytest.fixture(scope="module") +def recognizer(): + return IsinRecognizer() + + +@pytest.fixture(scope="module") +def entities(): + return ["ISIN_CODE"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_position, expected_score", + 
[ + # fmt: off + ("IL0011762056", 1, (0, 12), 0.85), + ("ZZ12345ABCD1", 1, (0, 12), 0.50), + ("US0378331005", 1, (0, 12), 0.85), + ("KR7000830000", 1, (0, 12), 0.85), + ("IL0006290147", 1, (0, 12), 0.85), + ("JP3967200001", 1, (0, 12), 0.85), + ("ARDEUT110061", 1, (0, 12), 0.85), + ("BRBOEIBDR003", 1, (0, 12), 0.85), + ("KYG017171003", 1, (0, 12), 0.85), + ("SG1T75931496", 1, (0, 12), 0.85), + ("GB00B16PRC61", 1, (0, 12), 0.85), + ("DE0007236101", 1, (0, 12), 0.85), + ("XS1636274265", 1, (0, 12), 0.50), # exception to XS as a country code + ("INF740KA1BM0", 1, (0, 12), 0.85), + ("INE732I01013", 1, (0, 12), 0.85), + ("ABNE123456", 0, (), (),), + ("My Listed Company's stock trades with ISIN number SA14TG012N13 with a lot of " + "text beyond the actual value", + 1, (50, 62), 0.50), + # fmt: on + ], +) +def test_when_regn_in_text_then_all_regns_found( + text, + expected_len, + expected_position, + expected_score, + recognizer, + entities, +): + results = recognizer.analyze(text, entities) + + assert len(results) == expected_len + if results: + assert_result( + results[0], + entities[0], + expected_position[0], + expected_position[1], + expected_score, + ) diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py index 24f03d98a..46ac00ac5 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -57,8 +57,8 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi registry = mock_recognizer_registry registry.load_predefined_recognizers() recognizers = registry.get_recognizers(language="en", all_fields=True) - # 1 custom recognizer in english + 24 predefined - assert len(recognizers) == 1 + 24 + # 1 custom recognizer in english + 26 predefined + assert len(recognizers) == 1 + 26 def test_when_get_recognizers_then_return_all_fields(mock_recognizer_registry): From b0767aad1ece4dca2515a69c39309c20821b09f7 Mon Sep 17 
00:00:00 2001 From: Devopam Mittra Date: Mon, 4 Mar 2024 23:04:12 +0530 Subject: [PATCH 18/23] added three predefined recognizers, improvements 1. Improved IN_PAN regex 2. Utility function for LUHN ModN validation 3. New recognizers : IN_GSTIN, CFI_CODE, ISIN_CODE --- docs/supported_entities.md | 9 +- .../presidio_analyzer/analyzer_utils.py | 22 ++- .../predefined_recognizers/__init__.py | 2 + .../in_gstin_recognizer.py | 148 ++++++++++++++++++ .../in_pan_recognizer.py | 4 +- presidio-analyzer/setup.py | 1 + .../tests/test_analyzer_utils.py | 16 ++ .../tests/test_in_gstin_recognizer.py | 48 ++++++ .../tests/test_recognizer_registry.py | 2 +- 9 files changed, 244 insertions(+), 8 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py create mode 100644 presidio-analyzer/tests/test_in_gstin_recognizer.py diff --git a/docs/supported_entities.md b/docs/supported_entities.md index 94d113511..6246188cd 100644 --- a/docs/supported_entities.md +++ b/docs/supported_entities.md @@ -81,11 +81,12 @@ For more information, refer to the [adding new recognizers documentation](analyz |AU_MEDICARE| Medicare number is a unique identifier issued by Australian Government that enables the cardholder to receive a rebates of medical expenses under Australia's Medicare system| Pattern match, context, and checksum | ### India -| FieldType | Description |Detection Method| -|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--- | -| IN_PAN | The Indian Permanent Account Number (PAN) is a unique 12 character alphanumeric identifier issued to all business and individual entities registered as Tax Payers. 
| Pattern match, context | -| IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum | +| FieldType | Description |Detection Method| +|-------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|--- | +| IN_PAN | The Indian Permanent Account Number (PAN) is a unique 12 character alphanumeric identifier issued to all business and individual entities registered as Tax Payers. | Pattern match, context | +| IN_AADHAAR | Indian government issued unique 12 digit individual identity number | Pattern match, context, and checksum | | IN_VEHICLE_REGISTRATION | Indian government issued transport (govt, personal, diplomatic, defence) vehicle registration number | Pattern match, context, and checksum | +| IN_GSTIN | Indian government issued unique goods and services tax identification number | Pattern match, context, and checksum | ## Adding a custom PII entity diff --git a/presidio-analyzer/presidio_analyzer/analyzer_utils.py b/presidio-analyzer/presidio_analyzer/analyzer_utils.py index 74710bd8b..744b34926 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_utils.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_utils.py @@ -36,13 +36,33 @@ def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: text = text.replace(search_string, replacement_string) return text + @staticmethod + def get_luhn_mod_n(input_str: str, alphabet="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"): + """ + Check if the given input number has a valid last checksum as per LUHN algorithm. 
+ + https://en.wikipedia.org/wiki/Luhn_mod_N_algorithm + :param alphabet: input alpha-numeric list of characters to determine mod 'N' + :param input_str: the alpha numeric string to be checked for LUHN algorithm + :return: True/False + """ + if len(alphabet) == 0: + return False + + charset = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + n = len(charset) + luhn_input = tuple(alphabet.index(i) for i in reversed(str(input_str))) + return ( + sum(luhn_input[::2]) + sum(sum(divmod(i * 2, n)) for i in luhn_input[1::2]) + ) % n == 0 + @staticmethod def is_verhoeff_number(input_number: int): """ Check if the input number is a true verhoeff number. :param input_number: - :return: + :return: Bool """ __d__ = [ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index 35c9b3752..b860d50a4 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -39,6 +39,7 @@ from .in_vehicle_registration_recognizer import InVehicleRegistrationRecognizer from .isin_recognizer import IsinRecognizer from .cfi_recognizer import CfiRecognizer +from .in_gstin_recognizer import InGstinRecognizer NLP_RECOGNIZERS = { "spacy": SpacyRecognizer, @@ -85,4 +86,5 @@ "InVehicleRegistrationRecognizer", "IsinRecognizer", "CfiRecognizer", + "InGstinRecognizer", ] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py new file mode 100644 index 000000000..1a1bb9057 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py @@ -0,0 +1,148 @@ +from typing import Optional, List, Tuple +from presidio_analyzer import Pattern, PatternRecognizer +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as 
Utils +import pycountry + + +class InGstinRecognizer(PatternRecognizer): + """ + Recognizes Indian Goods and Services Tax Identification Number ("GSTIN"). + + The GSTIN is a fifteen character alpha-numeric code + with the last digit being a check digit calculated using a + modified modulus 36 LUHN calculation. + This recognizer identifies GSTIN using regex, context words and calculated value. + Reference: https://en.wikipedia.org/wiki/Goods_and_Services_Tax_(India), + http://idtc-icai.s3.amazonaws.com/download/knowledgeShare18-19/Structure-of-GSTIN.pdf + + :param patterns: List of patterns to be used by this recognizer + :param context: List of context words to increase confidence in detection + :param supported_language: Language this recognizer supports + :param supported_entity: The entity this recognizer can detect + :param replacement_pairs: List of tuples with potential replacement values + for different strings to be used during pattern matching. + This can allow a greater variety in input, for example by removing dashes or spaces. 
+ """ + + gstin_country_codes_iso3a = "" + countries = pycountry.countries + for country in countries: + gstin_country_codes_iso3a += country.alpha_3 + "|" + pattern1 = ( + "[0-9]{4}" + + "(" + + gstin_country_codes_iso3a.rstrip("|") + + ")" + + "(?!00000)[0-9]{5}[A-Z]{2}[A-Z0-9]{1}" + ) + + pattern2 = ( + "[0-9]{2}[A-Z]{3}[ABCFGHJLPT]{1}[A-Z]{1}(?!0000)[0-9]{4}" + + "[A-Z]{1}[1-9A-Z]{1}(Z)[0-9A-Z]{1}" + ) + + PATTERNS = [ + Pattern( + "GSTIN (High)", + pattern2, + 0.85, + ), # Regular registration pattern + Pattern( + "GSTIN (Low)", + r"\b([0-9]{2}[A-Z]{5}(?!0000)[0-9]{4}[A-Z]{1}[0-9A-Z]{2})\b", + 0.2, + ), + Pattern("GSTIN (Medium)", pattern1, 0.6), # NRTP pattern + Pattern( + "GSTIN (Very Low)", + r"\b((?=.*?[A-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b", + 0.05, + ), + ] + + CONTEXT = [ + "GSTIN", + "GST", + ] + + _in_state_tin_codes = { + "01": "Jammu and Kashmir", + "02": "Himachal Pradesh", + "03": "Punjab", + "04": "Chandigarh", + "05": "Uttarakhand", + "06": "Haryana", + "07": "Delhi", + "08": "Rajasthan", + "09": "Uttar Pradesh", + "10": "Bihar", + "11": "Sikkim", + "12": "Arunachal Pradesh", + "13": "Nagaland", + "14": "Manipur", + "15": "Mizoram", + "16": "Tripura", + "17": "Meghalaya", + "18": "Assam", + "19": "West Bengal", + "20": "Jharkhand", + "21": "Orissa", + "22": "Chattisgarh", + "23": "Madhya Pradesh", + "24": "Gujarat", + "25": "Daman and Diu", + "26": "Dadar and Nagar Haveli", + "27": "Maharashtra", + "28": "Andhra Pradesh", + "29": "Karnataka", + "30": "Goa", + "31": "Lakshadweep", + "32": "Kerala", + "33": "Tamil Nadu", + "34": "Puducherry", + "35": "Anadaman and Nicobar Islands", + "36": "Telangana", + "37": "Andhra Pradesh (New)", + } + + def __init__( + self, + patterns: Optional[List[Pattern]] = None, + context: Optional[List[str]] = None, + supported_language: str = "en", + supported_entity: str = "IN_GSTIN", + replacement_pairs: Optional[List[Tuple[str, str]]] = None, + ): + self.replacement_pairs = ( + replacement_pairs if 
replacement_pairs else [("-", ""), (" ", "")] + ) + patterns = patterns if patterns else self.PATTERNS + context = context if context else self.CONTEXT + super().__init__( + supported_entity=supported_entity, + patterns=patterns, + context=context, + supported_language=supported_language, + ) + + def validate_result(self, pattern_text: str) -> bool: + """Determine absolute value based on calculation.""" + sanitized_value = Utils.sanitize_value(pattern_text, self.replacement_pairs) + return self.__check_gstin(sanitized_value) + + def __check_gstin(self, sanitized_value: str) -> bool: + is_valid_gstin = None # deliberately set to None and not typecast either + if sanitized_value and len(sanitized_value) == 15 and sanitized_value.isalnum(): + if sanitized_value[0:2] not in self._in_state_tin_codes: + pass # NRTP pattern detection only. As rules are not published yet + else: + if sanitized_value[13] != "Z" or sanitized_value[12] == "0": + is_valid_gstin = False + elif Utils.get_luhn_mod_n(sanitized_value): + is_valid_gstin = True + else: + is_valid_gstin = False + else: + is_valid_gstin = False + + return is_valid_gstin diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py index 210ceb01e..8961bbca8 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_pan_recognizer.py @@ -26,12 +26,12 @@ class InPanRecognizer(PatternRecognizer): PATTERNS = [ Pattern( "PAN (High)", - r"\b([A-Z]{3}[ABCFGHJLPT]{1}[A-Za-z]{1}(?!0000)\d{4}[A-Z]{1})\b", + r"\b([A-Z]{3}[ABCFGHJLPT]{1}[A-Z]{1}(?!0000)[0-9]{4}[A-Z]{1})\b", 0.85, ), Pattern( "PAN (Medium)", - r"\b([A-Z]{5}(?!0000)\d{4}[A-Z]{1})\b", + r"\b([A-Z]{5}(?!0000)[0-9]{4}[A-Z]{1})\b", 0.6, ), Pattern( diff --git a/presidio-analyzer/setup.py b/presidio-analyzer/setup.py index a1326e40a..d06ca7abb 100644 --- 
a/presidio-analyzer/setup.py +++ b/presidio-analyzer/setup.py @@ -39,6 +39,7 @@ "tldextract", "pyyaml", "phonenumbers>=8.12,<9.0.0", + "pycountry>=23.12.11", ], extras_require={ "transformers": ["spacy_huggingface_pipelines"], diff --git a/presidio-analyzer/tests/test_analyzer_utils.py b/presidio-analyzer/tests/test_analyzer_utils.py index 751736dc9..4aabedd81 100644 --- a/presidio-analyzer/tests/test_analyzer_utils.py +++ b/presidio-analyzer/tests/test_analyzer_utils.py @@ -9,6 +9,12 @@ ["NotAPalindrome", True, False], ] +luhn_mod_n_test_set = [ + ["27AAACM6094R1ZP", True], + ["36AAICA3369H1ZJ", True], + ["36AAHAA2262Q1ZF", True], +] + sanitizer_test_set = [ [" a|b:c ::-", [("-", ""), (" ", ""), (":", ""), ("|", "")], "abc"], ["def", "", "def"], @@ -21,6 +27,16 @@ ] +@pytest.mark.parametrize("input_string , expected_output", luhn_mod_n_test_set) +def test_get_luhn_mod_n(input_string, expected_output): + """ + Test if the checksum is matching for a module-36 LUHN input + :param input_string: string value + :return: match if calculated checksum is same as input + """ + assert PresidioAnalyzerUtils.get_luhn_mod_n(input_string) == expected_output + + @pytest.mark.parametrize( "input_text,case_sensitive, expected_output", palindrome_test_set ) diff --git a/presidio-analyzer/tests/test_in_gstin_recognizer.py b/presidio-analyzer/tests/test_in_gstin_recognizer.py new file mode 100644 index 000000000..61882acdd --- /dev/null +++ b/presidio-analyzer/tests/test_in_gstin_recognizer.py @@ -0,0 +1,48 @@ +import pytest + +from tests import assert_result +from presidio_analyzer.predefined_recognizers import InGstinRecognizer + + +@pytest.fixture(scope="module") +def recognizer(): + return InGstinRecognizer() + + +@pytest.fixture(scope="module") +def entities(): + return ["IN_GSTIN"] + + +@pytest.mark.parametrize( + "text, expected_len, expected_position, expected_score", + [ + # fmt: off + ("09ACQPI2284L1Z0", 1, (0, 15), 1), + ("99ABCPG1111T1NX", 0, (0, 15), 1), + 
("03AAGFL0883Q2ZW", 1, (0, 15), 1), + ("9917USA00015OS3", 1, (0, 15), 0.6), + ("My GSTIN is 29AAHCR6226R1ZJ with a lot of text beyond it", 1, (12, 27), 1), + # fmt: on + ], +) +def test_when_gstin_in_text_then_all_gstins_found( + text, + expected_len, + expected_position, + expected_score, + recognizer, + entities, +): + results = recognizer.analyze(text, entities) + print(results) + + assert len(results) == expected_len + if results: + assert_result( + results[0], + entities[0], + expected_position[0], + expected_position[1], + expected_score, + ) diff --git a/presidio-analyzer/tests/test_recognizer_registry.py b/presidio-analyzer/tests/test_recognizer_registry.py index 46ac00ac5..83eaac1f7 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -57,7 +57,7 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi registry = mock_recognizer_registry registry.load_predefined_recognizers() recognizers = registry.get_recognizers(language="en", all_fields=True) - # 1 custom recognizer in english + 26 predefined + # 1 custom recognizer in english + 27 predefined assert len(recognizers) == 1 + 26 From d1f2fc649536544debc943ab6ab8077c47c12ddb Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Sun, 10 Mar 2024 03:15:06 +0530 Subject: [PATCH 19/23] removed pycountry Removed pycountry per feedback on its license. Built the utility in analyzer_utils.py & removed all references.
--- .../presidio_analyzer/analyzer_utils.py | 145 ++++++++++ .../presidio_analyzer/country_master.csv | 255 ++++++++++++++++++ .../in_gstin_recognizer.py | 12 +- .../predefined_recognizers/isin_recognizer.py | 7 +- presidio-analyzer/setup.py | 1 - .../tests/test_analyzer_utils.py | 75 ++++++ 6 files changed, 487 insertions(+), 8 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/country_master.csv diff --git a/presidio-analyzer/presidio_analyzer/analyzer_utils.py b/presidio-analyzer/presidio_analyzer/analyzer_utils.py index 744b34926..bd6a1aa95 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_utils.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_utils.py @@ -1,4 +1,6 @@ from typing import List, Tuple +import csv +import os class PresidioAnalyzerUtils: @@ -9,6 +11,12 @@ class PresidioAnalyzerUtils: logic for re-usability and maintainability """ + __country_master_file_path__ = "presidio_analyzer/country_master.csv" + __country_master__ = [] + + def __init__(self): + self.__load_country_master__() + @staticmethod def is_palindrome(text: str, case_insensitive: bool = False): """ @@ -93,3 +101,140 @@ def is_verhoeff_number(input_number: int): for i in range(len(inverted_number)): c = __d__[c][__p__[i % 8][inverted_number[i]]] return __inv__[c] == 0 + + def __load_country_master__(self): + """ + Load various standards as defined in Country specific metadata. 
+ + :return: None + """ + if os.path.isfile(self.__country_master_file_path__) is not True: + raise FileNotFoundError() + else: + with open( + file=self.__country_master_file_path__, + mode="r", + newline="", + encoding="utf-8", + ) as csvfile: + if csv.Sniffer().has_header(csvfile.readline()) is not True: + raise Exception( + "Header missing in file: {}".format( + self.__country_master_file_path__ + ) + ) + csvfile.seek(0) # read the header as well, hence start from beginning + country_info = csv.DictReader(csvfile, fieldnames=None) + self.__country_master__ = list(country_info) + + if len(self.__country_master__) <= 1: + raise Exception( + "Blank file: {} detected.".format(self.__country_master_file_path__) + ) + + def __get_country_master_full_data__(self, iso_code: str = ""): + """ + Fetch all country information for a specific column (index). + + :param iso_code: + :return: + """ + supported_codes = [ + "ISO3166-1-Alpha-2", + "ISO3166-1-Alpha-3", + "ISO3166-1-Numeric", + "ISO4217-Alpha-3", + "ISO4217-Numeric", + ] + if iso_code.strip() not in supported_codes: + return None + else: + # return full country list for given code + country_information = [ + country[iso_code] for country in self.__country_master__ + ] + country_information = list(filter(None, country_information)) + return country_information + + def get_country_codes(self, iso_code: str): + """ + Fetch all defined country codes per required ISO format. + + :param iso_code: currently supporting : ISO3166-1-Alpha-2, + ISO3166-1-Alpha-3, ISO3166-1-Numeric + :return: List of country codes in provided ISO format. 
+ """ + supported_codes = [ + "ISO3166-1-Alpha-2", + "ISO3166-1-Alpha-3", + "ISO3166-1-Numeric", + ] + if iso_code.strip() not in supported_codes: + print("Code Invalid: ") + return None + else: + # return full country list for given code + return self.__get_country_master_full_data__(iso_code=iso_code) + + def get_currency_codes(self, iso_code: str = ""): + """ + Retrieve all defined currency codes across countries. + + :param iso_code: currently supporting : ISO4217-Alpha-3, ISO4217-Numeric + :return: List of currency codes in provided ISO format. + """ + supported_codes = ["ISO4217-Alpha-3", "ISO4217-Numeric"] + if iso_code.strip() not in supported_codes: + return None + else: + # return full country list for given code + return self.__get_country_master_full_data__(iso_code=iso_code) + + def get_full_country_information(self, lookup_key: str, lookup_index: str): + """ + Fetch additional information through lookup_index in index of lookup_key. + + :param lookup_key: Item to be searched + :param lookup_index: A valid index_name out of available values + English_short_name_using_title_case, English_full_name, + FIFA_country_code, International_olympic_committee_country_code, + ISO3166-1-Alpha-2,ISO3166-1-Alpha-3, ISO3166-1-Numeric, + International_licence_plate_country_code, Country_code_top_level_domain, + Currency_Name, ISO4217-Alpha-3, ISO4217-Numeric, Capital_City, Dialing_Code + :return: Dictionary object with additional information enriched from + master lookup + + """ + allowed_indices = [ + "English_short_name_using_title_case", + "English_full_name", + "FIFA_country_code", + "International_olympic_committee_country_code", + "ISO3166-1-Alpha-2", + "ISO3166-1-Alpha-3", + "ISO3166-1-Numeric", + "International_licence_plate_country_code", + "Country_code_top_level_domain", + "Currency_Name", + "ISO4217-Alpha-3", + "ISO4217-Numeric", + "Capital_City", + "Dialing_Code", + ] + if ( + lookup_index is None + or len(lookup_index.strip()) == 0 + or lookup_index not 
in allowed_indices + ): + print("Lookup Index problem") + return None + elif lookup_key is None or len(lookup_key.strip()) == 0: + print("Lookup Key issue") + return None + else: + return list( + filter( + lambda country: country[lookup_index] == lookup_key, + self.__country_master__, + ) + ) diff --git a/presidio-analyzer/presidio_analyzer/country_master.csv b/presidio-analyzer/presidio_analyzer/country_master.csv new file mode 100644 index 000000000..06ae62c67 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/country_master.csv @@ -0,0 +1,255 @@ +English_full_name,English_short_name_using_title_case,FIFA_country_code,International_olympic_committee_country_code,ISO3166-1-Alpha-2,ISO3166-1-Alpha-3,ISO3166-1-Numeric,international licence plate country code,Country_code_top_level_domain,Currency Name,ISO4217-Alpha-3,ISO4217-Numeric,Currency Minor Unit,Capital_City,Recognized_Country_Flag,Dialing_Code,Mobile_Country_Code,Mobile_Network_Code +Islamic Republic of Afghanistan,Afghanistan,AFG,AFG,AF,AFG,4,AFG,.af,Afghani,AFN,971,2,Kabul,Y,93,412,"01,03,20th,30th,40,50,80,88" +,Åland Islands,,,AX,ALA,248,,.ax,Euro,EUR,978,2,Mariehamn,Y,358 18,, +Republic of Albania,Albania,ALB,ALB,AL,ALB,8,AL,.al,Lek,ALL,8,2,Tirana,Y,355,276,"01,02,03,04" +People's Democratic Republic of Algeria,Algeria,ALG,ALG,DZ,DZA,12,DZ,.dz,Algerian Dinar,DZD,12,2,Algiers,Y,213,603,"01,02,03" +,American Samoa,ASA,ASA,AS,ASM,16,AND,.as,US Dollar,USD,840,2,Pago Pago,Y,1 684,544,11 +Principality of Andorra,Andorra,AND,AND,AD,AND,20,,.ad,Euro,EUR,978,2,Andorra la Vella,Y,376,213,3 +Republic of Angola,Angola,ANG,ANG,AO,AGO,24,,.ao,Kwanza,AOA,973,2,Luanda,Y,244,631,"02,04" +,Anguilla,AIA,,AI,AIA,660,,.ai,East Caribbean Dollar,XCD,951,2,The Valley,Y,1 264,365,"10,840" +,Antarctica,,,AQ,ATA,10,,.aq,,,,,,Y,672,, +,Antigua and Barbuda,ATG,ANT,AG,ATG,28,,.ag,East Caribbean Dollar,XCD,951,2,St. 
John's,Y,1 268,344,"3,09,20,930" +Argentine Republic,Argentina,ARG,ARG,AR,ARG,32,RA,.ar,Argentine Peso,ARS,32,2,Buenos Aires,Y,54,722,"10,02,00,70,31,03,20,30,00,00,000" +Republic of Armenia,Armenia,ARM,ARM,AM,ARM,51,AM,.am,Armenian Dram,AMD,51,2,Yerevan,Y,374,283,"01,04,05,10" +,Aruba,ARU,ARU,AW,ABW,533,,.aw,Aruban Florin,AWG,533,2,Oranjestad,Y,297,363,"01,02,20th" +Commonwealth of Australia,Australia,AUS,AUS,AU,AUS,36,AUS,.au,Australian Dollar,AUD,36,2,Canberra,Y,61,505,"01,02,03,04,05,06,07,08,09,11,12,13th,14th,16,19th,24,26th,71,72,88,90,99" +Republic of Austria,Austria,AUT,AUT,AT,AUT,40,A,.at,Euro,EUR,978,2,Vienna,Y,43,43,"01,02,03,04,05,06,07,08,09,10,11,12,13th,14th,15th,17th,19th" +Republic of Azerbaijan,Azerbaijan,AZE,AZE,AZ,AZE,31,AZ,.az,Azerbaijan Manat,AZN,944,2,Baku,Y,994,400,"01,02,03,04" +The – Commonwealth of The Bahamas,Bahamas,BAH,BAH,BS,BHS,44,BS,.bs,Bahamian Dollar,BSD,44,2,Nassau,Y,1 242,364,"03,30th,39,390" +Kingdom of Bahrain,Bahrain,BHR,BRN,BH,BHR,48,BRN,.bh,Bahraini Dinar,BHD,48,3,Manama,Y,973,426,"01,02,04" +People's Republic of Bangladesh,Bangladesh,BAN,BAN,BD,BGD,50,BD,.bd,Taka,BDT,50,2,Dhaka,Y,880,470,"01,02,03,04,05,06,07" +,Barbados,BRB,BAR,BB,BRB,52,BDS,.bb,Barbados Dollar,BBD,52,2,Bridgetown,Y,1 246,342,"5,06,00,75,08,10,820" +Republic of Belarus,Belarus,BLR,BLR,BY,BLR,112,BY,.by,Belarusian Ruble,BYN,933,2,Minsk,Y,375,257,"01,02,03,04" +Kingdom of Belgium,Belgium,BEL,BEL,BE,BEL,56,B,.be,Euro,EUR,978,2,Brussels,Y,32,206,"01,02,05,06,10,20th" +,Belize,BLZ,BIZ,BZ,BLZ,84,BH,.bz,Belize Dollar,BZD,84,2,Belmopan,Y,501,702,"67,68" +Republic of Benin,Benin,BEN,BEN,BJ,BEN,204,DY,.bj,CFA Franc BCEAO,XOF,952,0,Porto-Novo,Y,229,616,"01,02,03,04,05" +,Bermuda,BER,BER,BM,BMU,60,,.bm,Bermudian Dollar,BMD,60,2,Hamilton,Y,1 441,350,"000,01,02,10,99" +Kingdom of Bhutan,Bhutan,BHU,BHU,BT,BTN,64,,.bt,Ngultrum,BTN,64,2,Thimpu,Y,975,402,"11,17th,77" +Plurinational State of Bolivia,Bolivia,BOL,BOL,BO,BOL,68,BOL,.bo,Boliviano,BOB,68,2,La
Paz,Y,591,736,"01,02,03" +,"Bonaire, Sint Eustatius and Saba",,,BQ,BES,535,,.bq,US Dollar,USD,840,2,,Y,599 7,362,"5,19,16,30,951" +,Bosnia and Herzegovina,BIH,BIH,BA,BIH,70,BIH,.ba,Convertible Mark,BAM,977,2,Sarajevo,Y,387,218,"03,05,90" +Republic of Botswana,Botswana,BOT,BOT,BW,BWA,72,BW,.bw,Pula,BWP,72,2,Gaborone,Y,267,652,"01,02,04" +,Bouvet Island,,,BV,BVT,74,,,Norwegian Krone,NOK,578,2,,Y,55,, +Federative Republic of Brazil,Brazil,BRA,BRA,BR,BRA,76,BR,.br,Brazilian Real,BRL,986,2,Brasília,Y,55,724,"00,01,02,03,04,05,06,07,08,10,11,12,15th,16,19th,23,24,30th,31,32,33,34,37,38,39,54" +,British Indian Ocean Territory,,,IO,IOT,86,,.io,US Dollar,USD,840,2,,Y,246,995,1 +"Nation of Brunei, the Abode of Peace",Brunei Darussalam,BRU,BRU,BN,BRN,96,BRU,.bn,Brunei Dollar,BND,96,2,Bandar Seri Begawan,Y,673,528,"01,02,11" +Republic of Bulgaria,Bulgaria,BUL,BUL,BG,BGR,100,BG,.bg,Bulgarian Lev,BGN,975,2,Sofia,Y,359,284,"01,03,05,06" +,Burkina Faso,BFA,BUR,BF,BFA,854,BF,.bf,CFA Franc BCEAO,XOF,952,0,Ouagadougou,Y,226,613,"01,02,03" +Republic of Burundi,Burundi,BDI,BDI,BI,BDI,108,RU,.bi,Burundi Franc,BIF,108,0,Gitega,Y,257,642,"01,02,03,07,08,82" +Republic of Cabo Verde,Cabo Verde,CPV,CPV,CV,CPV,132,,.cv,Cabo Verde Escudo,CVE,132,2,Praia,Y,238,625,"01,02" +Kingdom of Cambodia,Cambodia,CAM,CAM,KH,KHM,116,K,.kh,Riel,KHR,116,2,Phnom Penh,Y,855,456,"01,02,03,04,05,06,08,09,18th" +Republic of Cameroon,Cameroon,CMR,CMR,CM,CMR,120,CAM,.cm,CFA Franc BEAC,XAF,950,0,Yaoundé,Y,237,624,"01,02,04" +,Canada,CAN,CAN,CA,CAN,124,CDN,.ca,Canadian Dollar,CAD,124,2,Ottawa,Y,1,302,"2,20,32,03,60,36,13,70,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,00,000" +The Cayman Islands,Cayman Islands,CAY,CAY,KY,CYM,136,,.ky,Cayman Islands Dollar,KYD,136,2,George Town,Y,1 345,346,"60,50,140" +,Central African Republic,CTA,CAF,CF,CAF,140,RCA,.cf,CFA Franc BEAC,XAF,950,0,Bangui,Y,236,623,"01,02,03,04" +Republic of Chad,Chad,CHA,CHA,TD,TCD,148,"TCH,TD",.td,CFA 
Franc BEAC,XAF,950,0,N'Djamena,Y,235,622,"01,02,03,04" +Republic of Chile,Chile,CHI,CHI,CL,CHL,152,RCH,.cl,Chilean Peso,CLP,152,0,Santiago,Y,56,730,"00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,19" +People's Republic of China,China,CHN,CHN,CN,CHN,156,,.cn,Yuan Renminbi,CNY,156,2,Beijing,Y,86,"4,60,461","460-00,460-01,460-02,460-03,460-04,460-05,460-06,460-07" +,Christmas Island,,,CX,CXR,162,,.cx,Australian Dollar,AUD,36,2,Flying Fish Cove,Y,61 89164,, +,Cocos (Keeling) Islands,,,CC,CCK,166,,.cc,Australian Dollar,AUD,36,2,West Island,Y,61 89162,, +Republic of Colombia,Colombia,COL,COL,CO,COL,170,,.co,Colombian Peso,COP,170,2,Bogotá,Y,57,732,"1,00,20,20,10,11,02,10,00,00,00,00,00,00,00,00,000" +Union of the Comoros,Comoros,COM,COM,KM,COM,174,,.km,Comorian Franc ,KMF,174,0,Moroni,Y,269,654,1 +Republic of the Congo,Congo,CGO,CGO,CG,COG,178,RCB,,CFA Franc BEAC,XAF,950,0,Brazzaville,Y,242,629,"01,02,07,10" +,Democratic Republic of the Congo,COD,COD,CD,COD,180,CGO,.cd,Congolese Franc,CDF,976,2,Kinshasa,Y,243,630,"01,02,05,86,88,89,90" +,Cook Islands,COK,COK,CK,COK,184,,.ck,New Zealand Dollar,NZD,554,2,Avarua,Y,682,548,1 +Republic of Costa Rica,Costa Rica,CRC,CRC,CR,CRI,188,CR,.cr,Costa Rican Colon,CRC,188,2,San José,Y,506,712,"01,02,03,04,20" +Ivory Coast,Côte d'Ivoire,CIV,CIV,CI,CIV,384,CI,.ci,CFA Franc BCEAO,XOF,952,0,,Y,225,612,"01,02,03,04,05,06,07" +Republic of Croatia,Croatia,CRO,CRO,HR,HRV,191,HR,.hr,Kuna,HRK,191,2,Zagreb,Y,385,219,"01,02,10" +Republic of Cuba,Cuba,CUB,CUB,CU,CUB,192,C,.cu,Cuban Peso,CUP,192,2,Havana,Y,53,368,1 +,Curaçao,CUW,,CW,CUW,531,,.cw,Netherlands Antillean Guilder,ANG,532,2,Willemstad,Y,599 9,362,"69,95" +Republic of Cyprus,Cyprus,CYP,CYP,CY,CYP,196,CY,.cy,Euro,EUR,978,2,Nicosia,Y,357,280,"01,10,20" +Czech Republic,Czechia,CZE,CZE,CZ,CZE,203,CZ,.cz,Czech Koruna,CZK,203,2,Prague,Y,420,230,"01,02,03,04,05,08,99" +Kingdom of Denmark,Denmark,DEN,DEN,DK,DNK,208,DK,.dk,Danish 
Krone,DKK,208,2,Copenhagen,Y,45,238,"01,02,03,04,05,06,07,10,12,20,23,28,30,77" +Republic of Djibouti,Djibouti,DJI,DJI,DJ,DJI,262,,.dj,Djibouti Franc,DJF,262,0,Djibouti,Y,253,638,1 +Commonwealth of Dominica,Dominica,DMA,DMA,DM,DMA,212,WD,.dm,East Caribbean Dollar,XCD,951,2,Roseau,Y,1 767,366,"2,00,50,110" +,Dominican Republic,DOM,DOM,DO,DOM,214,DOM,.do,Dominican Peso,DOP,214,2,Santo Domingo,Y,"1 809, 1 829, 1 849",370,"01,02,03,04" +Republic of Ecuador,Ecuador,ECU,ECU,EC,ECU,218,EC,.ec,US Dollar,USD,840,2,Quito,Y,593,740,"00,01,02" +Arab Republic of Egypt,Egypt,EGY,EGY,EG,EGY,818,ET,.eg,Egyptian Pound,EGP,818,2,Cairo,Y,20,602,"01,02,03" +Republic of El Salvador,El Salvador,SLV,ESA,SV,SLV,222,ES,.sv,El Salvador Colon,SVC,222,2,San Salvador,Y,503,706,"01,02,03,04,05" +Republic of Equatorial Guinea,Equatorial Guinea,EQG,GEQ,GQ,GNQ,226,,.gq,CFA Franc BEAC,XAF,950,0,Malabo,Y,240,627,"01,03" +State of Eritrea,Eritrea,ERI,ERI,ER,ERI,232,ER,.er,Nakfa,ERN,232,2,Asmara,Y,291,657,1 +Republic of Estonia,Estonia,EST,EST,EE,EST,233,EST,.ee,Euro,EUR,978,2,Tallinn,Y,372,248,"01,02,03,04" +Kingdom of Eswatini,Eswatini,SWZ,SWZ,SZ,SWZ,748,SD,.sz,Lilangeni,SZL,748,2,Lobamba,Y,268,653,"01,10" +Federal Democratic Republic of Ethiopia,Ethiopia,ETH,ETH,ET,ETH,231,ETH,.et,Ethiopian Birr,ETB,230,2,Addis Ababa,Y,251,636,1 +,Falkland Islands (Malvinas),,,FK,FLK,238,,.fk,Falkland Islands Pound,FKP,238,2,Stanley,Y,500,750,1 +,Faroe Islands,FRO,,FO,FRO,234,FO,.fo,Danish Krone,DKK,208,2,Tórshavn,Y,298,288,"01,02,03" +Republic of Fiji,Fiji,FIJ,FIJ,FJ,FJI,242,FJI,.fj,Fiji Dollar,FJD,242,2,Suva,Y,679,542,"01,02" +Republic of Finland,Finland,FIN,FIN,FI,FIN,246,FIN,.fi,Euro,EUR,978,2,Helsinki,Y,358,244,"03,04,05,09,10,11,12,13,14,21,26,82,91" +French Republic,France,FRA,FRA,FR,FRA,250,F,.fr,Euro,EUR,978,2,Paris,Y,33,208,"00,01,02,03,04,05,06,07,09,10,11,13,14,15,16,17,20,21,22,23,24,25,26,27,28,29,31,88,89,91,92" +,French Guiana,,,GF,GUF,254,,.gf,Euro,EUR,978,2,,Y,594,340,"01,02,03,11,20" +,French
Polynesia,,,PF,PYF,258,,.pf,CFP Franc,XPF,953,0,Papeete,Y,689,547,"15,20" +,French Southern Territories,,,TF,ATF,260,,.tf,Euro,EUR,978,2,,Y,262,, +Gabonese Republic,Gabon,GAB,GAB,GA,GAB,266,G,.ga,CFA Franc BEAC,XAF,950,0,Libreville,Y,241,628,"01,02,03,04" +Republic of The Gambia,Gambia,GAM,GAM,GM,GMB,270,WAG,.gm,Dalasi,GMD,270,2,,Y,220,607,"01,02,03,04" +,Georgia,GEO,GEO,GE,GEO,268,GE,.ge,Lari,GEL,981,2,Tbilisi,Y,995,282,"01,02,03,04,05" +Federal Republic of Germany,Germany,GER,GER,DE,DEU,276,D,.de,Euro,EUR,978,2,Berlin,Y,49,262,"01,02,03,04,05,06,07,08,09,10,11,12,13,14,16,17,20,42,43,77" +Republic of Ghana,Ghana,GHA,GHA,GH,GHA,288,GH,.gh,Ghana Cedi,GHS,936,2,Accra,Y,233,620,"01,02,03,04,06,07" +,Gibraltar,GIB,,GI,GIB,292,GBZ,.gi,Gibraltar Pound,GIP,292,2,Gibraltar,Y,350,266,"01,06,09" +Hellenic Republic,Greece,GRE,GRE,GR,GRC,300,GR,.gr,Euro,EUR,978,2,Athens,Y,30,202,"01,02,03,04,05,07,09,10,14" +,Greenland,,,GL,GRL,304,,.gl,Danish Krone,DKK,208,2,Nuuk,Y,299,290,1 +,Grenada,GRN,GRN,GD,GRD,308,WG,.gd,East Caribbean Dollar,XCD,951,2,St. George's,Y,1 473,352,"3,00,50,110" +,Guadeloupe,,,GP,GLP,312,,.gp,Euro,EUR,978,2,,Y,590,340,"08,10" +,Guam,GUM,GUM,GU,GUM,316,,.gu,US Dollar,USD,840,2,Hagåtña,Y,1 671,"3,10,311","310-032,310-033,310-140,310-370,310-470,311-250" +Republic of Guatemala,Guatemala,GUA,GUA,GT,GTM,320,GCA,.gt,Quetzal,GTQ,320,2,Guatemala City,Y,502,704,"01,02,03" +,Guernsey,,,GG,GGY,831,GBG,.gg,Pound Sterling,GBP,826,2,St.
Peter Port,Y,"44 1481, 44 7781, 44 7839, 44 7911",234,"03,50,55" +Republic of Guinea,Guinea,GUI,GUI,GN,GIN,324,RG,.gn,Guinean Franc,GNF,324,0,,Y,224,611,"01,02,03,04,05" +Republic of Guinea-Bissau,Guinea-Bissau,GNB,GBS,GW,GNB,624,,.gw,CFA Franc BCEAO,XOF,952,0,Conakry,Y,245,632,"01,02,03" +Co-operative Republic of Guyana,Guyana,GUY,GUY,GY,GUY,328,GUY,.gy,Guyana Dollar,GYD,328,2,Bissau,Y,592,738,"01,02" +Republic of Haiti,Haiti,HAI,HAI,HT,HTI,332,RH,.ht,Gourde,HTG,332,2,Port-au-Prince,Y,509,372,"01,02,03" +,Heard Island and McDonald Islands,,,HM,HMD,334,,.hm,Australian Dollar,AUD,36,2,,Y,672,, +Vatican City State,Holy See,,,VA,VAT,336,V,.va,Euro,EUR,978,2,,Y,"39 06 698, 379",, +Republic of Honduras,Honduras,HON,HON,HN,HND,340,HN,.hn,Lempira,HNL,340,2,Tegucigalpa,Y,504,708,"1,00,20,30,040" +Hong Kong Special Administrative Region of the People's Republic of China,Hong Kong,HKG,HKG,HK,HKG,344,,.hk,Hong Kong Dollar,HKD,344,2,,Y,852,454,"00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,28,29,40,47" +,Hungary,HUN,HUN,HU,HUN,348,H,.hu,Forint,HUF,348,2,Budapest,Y,36,216,"01,30,70,71" +,Iceland,ISL,ISL,IS,ISL,352,IS,.is,Iceland Krona,ISK,352,0,Reykjavík,Y,354,274,"01,02,03,04,05,07,08,09,11" +Republic of India,India,IND,IND,IN,IND,356,IND,.in,Indian Rupee,INR,356,2,New Delhi,Y,91,"40,44,05,406","404-01,404-04,404-045,404-05,404-07,404-09,404-10,404-11,404-12,404-13,404-14,404-15,404-16,404-17,404-18,404-19,404-22,404-24,404-25,404-28,404-29,404-30,404-33,404-34,404-36,404-38,404-41,404-42,404-44,404-50,404-51,404-52,404-53,404-54,404-55,404-56,404-57,404-58,404-59,404-60,404-62,404-64,404-66,404-67,404-68,404-69,404-70,404-71,404-72,404-73,404-74,404-75,404-76,404-77,404-78,404-79,404-80,404-81,404-82,404-83,404-85,404-86,404-87,404-88,404-89,405-05,405-034,405-53" +Republic of Indonesia,Indonesia,IDN,INA,ID,IDN,360,RI,.id,Rupiah,IDR,360,2,Jakarta,Y,62,510,"00,01,07,08,09,10,11,21,27,28,89,99" +Islamic Republic of Iran,Iran,IRN,IRI,IR,IRN,364,IR,.ir,Iranian 
Rial,IRR,364,2,Tehran,Y,98,432,"11,14,19,20,32,35,70" +Republic of Iraq,Iraq,IRQ,IRQ,IQ,IRQ,368,IRQ,.iq,Iraqi Dinar,IQD,368,3,Baghdad,Y,964,418,"05,08,20,30,40,45,82,92" +,Ireland,IRL,IRL,IE,IRL,372,IRL,.ie,Euro,EUR,978,2,Dublin,Y,353,272,"01,02,03,04,05,07,09,11,13" +,Isle of Man,,,IM,IMN,833,GBM,.im,Pound Sterling,GBP,826,2,Douglas,Y,"44 1624, 44 7524, 44 7624, 44 7924",234,"18,36,58,73" +State of Israel,Israel,ISR,ISR,IL,ISR,376,IL,.il,New Israeli Sheqel,ILS,376,2,Jerusalem,Y,972,425,"01,02,03,07,08,12,14,15,16,19,77" +Italian Republic,Italy,ITA,ITA,IT,ITA,380,I,.it,Euro,EUR,978,2,Rome,Y,39,222,"00,01,02,06,07,08,10,30,33,34,35,43,44,48,77,88,99" +,Jamaica,JAM,JAM,JM,JAM,388,JA,.jm,Jamaican Dollar,JMD,388,2,Kingston,Y,1 876,338,"20,05,01,10,180" +,Japan,JPN,JPN,JP,JPN,392,J,.jp,Yen,JPY,392,0,Tokyo,Y,81,"4,40,441","440-00,440-01,440-02,440-03,440-04,440-06,440-07,440-08,440-09,440-10,440-11,440-12,440-13,440-14,440-15,440-16,440-17,440-18,440-19,440-20,440-21,440-22,440-23,440-24,440-25,440-26,440-27,440-28,440-29,440-30,440-31,440-32,440-33,440-34,440-35,440-36,440-37,440-38,440-39,440-40,440-41,440-42,440-43,440-44,440-45,440-46,440-47,440-48,440-49,440-50,440-51,440-52,440-53,440-54,440-55,440-56,440-58,440-60,440-61,440-62,440-63,440-64,440-65,440-66,440-67,440-68,440-69,440-70,440-71,440-72,440-73,440-74,440-75,440-76,440-77,440-78,440-79,440-80,440-81,440-82,440-83,440-84,440-85,440-86,440-87,440-88,440-89,440-90,440-92,440-93,440-94,440-95,440-96,440-97,440-98,440-99,441-40,441-41,441-42,441-43,441-44,441-45,441-61,441-62,441-63,441-64,441-65,441-70,441-90,441-91,441-92,441-93,441-94,441-98,441-99" +,Jersey,,,JE,JEY,832,GBJ,.je,Pound Sterling,GBP,826,2,St. 
Helier,Y,44 1534,234,"03,28,50,55" +Hashemite Kingdom of Jordan,Jordan,JOR,JOR,JO,JOR,400,HKJ,.jo,Jordanian Dinar,JOD,400,3,Amman,Y,962,416,"01,02,03,77" +Republic of Kazakhstan,Kazakhstan,KAZ,KAZ,KZ,KAZ,398,KZ,.kz,Tenge,KZT,398,2,Nur-Sultan,Y,"7 6, 7 7",401,"01,02,07,77" +Republic of Kenya,Kenya,KEN,KEN,KE,KEN,404,EAK,.ke,Kenyan Shilling,KES,404,2,Nairobi,Y,254,639,"02,03,05,07" +Republic of Kiribati,Kiribati,,KIR,KI,KIR,296,,.ki,Australian Dollar,AUD,36,2,South Tarawa,Y,686,545,9 +Democratic People's Republic of Korea,North Korea,PRK,PRK,KP,PRK,408,,.kp,North Korean Won,KPW,408,2,Pyongyang,Y,850,467,193 +Republic of Korea,South Korea,KOR,KOR,KR,KOR,410,ROK,.kr,Won,KRW,410,0,Seoul,Y,82,450,"02,03,04,05,06,08" +State of Kuwait,Kuwait,KUW,KUW,KW,KWT,414,KWT,.kw,Kuwaiti Dinar,KWD,414,3,Kuwait City,Y,965,419,"02,03,04" +Kyrgyz Republic,Kyrgyzstan,KGZ,KGZ,KG,KGZ,417,KS,.kg,Som,KGS,417,2,Bishkek,Y,996,437,"01,03,05,09" + Lao People's Democratic Republic,Laos,LAO,LAO,LA,LAO,418,LAO,.la,Lao Kip,LAK,418,2,Vientiane,Y,856,457,"01,02,03,08" +Republic of Latvia,Latvia,LVA,LAT,LV,LVA,428,LV,.lv,Euro,EUR,978,2,Riga,Y,371,247,"01,02,03,05,06,07,08,09" +Lebanese Republic,Lebanon,LBN,LBN,LB,LBN,422,RL,.lb,Lebanese Pound,LBP,422,2,Beirut,Y,961,415,"01,03,32,33,34,35,36,37,38,39" +Kingdom of Lesotho,Lesotho,LES,LES,LS,LSO,426,LS,.ls,Loti,LSL,426,2,Maseru,Y,266,651,"01,02" +Republic of Liberia,Liberia,LBR,LBR,LR,LBR,430,LB,.lr,Liberian Dollar,LRD,430,2,Monrovia,Y,231,618,"01,02,04,07,20" +State of Libya,Libya,LBY,LBA,LY,LBY,434,LAR,.ly,Libyan Dinar,LYD,434,3,Tripoli,Y,218,606,"00,01,02,03,06" +Principality of Liechtenstein,Liechtenstein,LIE,LIE,LI,LIE,438,FL,.li,Swiss Franc,CHF,756,2,Vaduz,Y,423,295,"01,02,05,06,07,77" +Republic of Lithuania,Lithuania,LTU,LTU,LT,LTU,440,LT,.lt,Euro,EUR,978,2,Vilnius,Y,370,246,"01,02,03" +Grand Duchy of Luxembourg,Luxembourg,LUX,LUX,LU,LUX,442,L,.lu,Euro,EUR,978,2,Luxembourg,Y,352,270,"01,77,99" +Macao Special Administrative Region of the People's 
Republic of China,Macao,MAC,,MO,MAC,446,,.mo,Pataca,MOP,446,2,,Y,853,455,"00,01,02,03,04,05,06" +Republic of Madagascar,Madagascar,MAD,MAD,MG,MDG,450,RM,.mg,Malagasy Ariary,MGA,969,2,Antananarivo,Y,261,646,"01,02,03,04" +Republic of Malawi,Malawi,MWI,MAW,MW,MWI,454,MW,.mw,Malawi Kwacha,MWK,454,2,Lilongwe,Y,265,650,"01,10" +,Malaysia,MAS,MAS,MY,MYS,458,MAL,.my,Malaysian Ringgit,MYR,458,2,Kuala Lumpur,Y,60,502,"01,10,11,12,13,151,152,153,154,155,16,17,18,19,195,198,20" +,Maldives,MDV,MDV,MV,MDV,462,,.mv,Rufiyaa,MVR,462,2,Malé,Y,960,472,"01,02" +Republic of Mali,Mali,MLI,MLI,ML,MLI,466,RMM,.ml,CFA Franc BCEAO,XOF,952,0,Bamako,Y,223,610,"01,02" +Republic of Malta,Malta,MLT,MLT,MT,MLT,470,M,.mt,Euro,EUR,978,2,Valletta,Y,356,278,"01,21,77" +Republic of the Marshall Islands,Marshall Islands,,MHL,MH,MHL,584,,.mh,US Dollar,USD,840,2,Majuro,Y,692,551,1 +,Martinique,,,MQ,MTQ,474,,.mq,Euro,EUR,978,2,,Y,596,340,12 +Islamic Republic of Mauritania,Mauritania,MTN,MTN,MR,MRT,478,RIM,.mr,Ouguiya,MRU,929,2,Nouakchott,Y,222,609,"01,02,10" +Republic of Mauritius,Mauritius,MRI,MRI,MU,MUS,480,MS,.mu,Mauritius Rupee,MUR,480,2,Port Louis,Y,230,617,"01,02,03,10" +,Mayotte,,,YT,MYT,175,,.yt,Euro,EUR,978,2,,Y,"262 269, 262 639",647,"00,01,02,10" +United Mexican States,Mexico,MEX,MEX,MX,MEX,484,MEX,.mx,Mexican Peso,MXN,484,2,Mexico City,Y,52,334,"01,010,02,020,03,030,04,040,050,060,070,080,09,090,50" +Federated States of Micronesia,Micronesia,,FSM,FM,FSM,583,,.fm,US Dollar,USD,840,2,Palikir,Y,691,550,1 +Republic of Moldova,Moldova,MDA,MDA,MD,MDA,498,MD,.md,Moldovan Leu,MDL,498,2,Chișinău,Y,373,259,"01,02,03,04,05,99" +Principality of Monaco,Monaco,,MON,MC,MCO,492,MC,.mc,Euro,EUR,978,2,Monaco,Y,377,212,"01,10" +,Mongolia,MNG,MGL,MN,MNG,496,MGL,.mn,Tugrik,MNT,496,2,Ulaanbaatar,Y,976,428,"00,88,91,98,99" +,Montenegro,MNE,MNE,ME,MNE,499,MNE,.me,Euro,EUR,978,2,Podgorica,Y,382,297,"01,02,03" +,Montserrat,MSR,,MS,MSR,500,,.ms,East Caribbean Dollar,XCD,951,2,Brades Estate,Y,1 664,354,860 +Kingdom of 
Morocco,Morocco,MAR,MAR,MA,MAR,504,MA,.ma,Moroccan Dirham,MAD,504,2,Rabat,Y,212,604,"00,01,02" +Republic of Mozambique,Mozambique,MOZ,MOZ,MZ,MOZ,508,MOC,.mz,Mozambique Metical,MZN,943,2,Maputo,Y,258,643,"01,03,04" +Republic of the Union of Myanmar,Myanmar,MYA,MYA,MM,MMR,104,BUR,.mm,Kyat,MMK,104,2,Naypyidaw,Y,95,414,"01,05,06" +Republic of Namibia,Namibia,NAM,NAM,NA,NAM,516,NAM,.na,Namibia Dollar,NAD,516,2,Windhoek,Y,264,649,"01,02,03" +Republic of Nauru,Nauru,,NRU,NR,NRU,520,NAU,.nr,Australian Dollar,AUD,36,2,Yaren,Y,674,536,2 +Federal Democratic Republic of Nepal,Nepal,NEP,NEP,NP,NPL,524,NEP,.np,Nepalese Rupee,NPR,524,2,Kathmandu,Y,977,429,"01,02,04" +Kingdom of the Netherlands,Netherlands,NED,NED,NL,NLD,528,NL,.nl,Euro,EUR,978,2,Amsterdam,Y,31,204,"02,03,04,05,06,07,08,09,10,12,14,15,16,17,18,20,21,23,24,28,68,69,98" +,New Caledonia,NCL,,NC,NCL,540,,.nc,CFP Franc,XPF,953,0,Nouméa,Y,687,546,1 +,New Zealand,NZL,NZL,NZ,NZL,554,NZ,.nz,New Zealand Dollar,NZD,554,2,Wellington,Y,64,530,"01,02,03,04,05,24,28" +Republic of Nicaragua,Nicaragua,NCA,NCA,NI,NIC,558,NIC,.ni,Cordoba Oro,NIO,558,2,Managua,Y,505,710,"21,30,73" +Republic of Niger,Niger,NIG,NIG,NE,NER,562,RN,.ne,CFA Franc BCEAO,XOF,952,0,Niamey,Y,227,614,"01,02,03,04" +Federal Republic of Nigeria,Nigeria,NGA,NGR,NG,NGA,566,WAN,.ng,Naira,NGN,566,2,Abuja,Y,234,621,"01,20,25,30,40,50,60,99" +,Niue,,,NU,NIU,570,,.nu,New Zealand Dollar,NZD,554,2,Alofi,Y,683,555,1 +,Norfolk Island,,,NF,NFK,574,,.nf,Australian Dollar,AUD,36,2,Kingston,Y,672,505,10 +Republic of North Macedonia,North Macedonia,MKD,MKD,MK,MKD,807,NMK,.mk,Denar,MKD,807,2,Skopje,Y,672 3,294,"01,02,03,75" +,Northern Mariana Islands,,,MP,MNP,580,,.mp,US Dollar,USD,840,2,Saipan,Y,1 670,310,"1,10,370" +Kingdom of Norway,Norway,NOR,NOR,NO,NOR,578,N,.no,Norwegian Krone,NOK,578,2,Oslo,Y,47,242,"01,017,02,03,04,05,06,07,08,09,12,14,20,21,22,23" +Sultanate of Oman,Oman,OMA,OMA,OM,OMN,512,,.om,Rial Omani,OMR,512,3,Muscat,Y,968,422,"02,03" +Islamic Republic of 
Pakistan,Pakistan,PAK,PAK,PK,PAK,586,PK,.pk,Pakistan Rupee,PKR,586,2,Islamabad,Y,92,410,"01,03,04,06,07,08" +Republic of Palau,Palau,,PLW,PW,PLW,585,,.pw,US Dollar,USD,840,2,Ngerulmud,Y,680,552,"01,80" +State of Palestine,Palestine,PLE,PLE,PS,PSE,275,,.ps,,,,,Ramallah,Y,970,425,"05,06" +Republic of Panama,Panama,PAN,PAN,PA,PAN,591,PA,.pa,Balboa,PAB,590,2,Panama City,Y,507,714,"1,02,03,04,020" +Independent State of Papua New Guinea,Papua New Guinea,PNG,PNG,PG,PNG,598,PNG,.pg,Kina,PGK,598,2,Port Moresby,Y,675,537,"01,02,03" +Republic of Paraguay,Paraguay,PAR,PAR,PY,PRY,600,PY,.py,Guarani,PYG,600,0,Asunción,Y,595,744,"01,02,03,04,05" +Republic of Peru,Peru,PER,PER,PE,PER,604,PE,.pe,Sol,PEN,604,2,Lima,Y,51,716,"01,02,06,07,10,15,17,20" +Republic of the Philippines,Philippines,PHI,PHI,PH,PHL,608,RP,.ph,Philippine Peso,PHP,608,2,Manila,Y,63,515,"00,01,02,03,05,18,88" +,Pitcairn,,,PN,PCN,612,,.pn,New Zealand Dollar,NZD,554,2,Adamstown,Y,64,, +Republic of Poland,Poland,POL,POL,PL,POL,616,PL,.pl,Zloty,PLN,985,2,Warsaw,Y,48,260,"01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,34,35,36,38,98" +Portuguese Republic,Portugal,POR,POR,PT,PRT,620,P,.pt,Euro,EUR,978,2,Lisbon,Y,351,268,"01,03,04,06,07" +,Puerto Rico,PUR,PUR,PR,PRI,630,,.pr,US Dollar,USD,840,2,San Juan,Y,"1 787, 1 939",330,"11,110" +State of Qatar,Qatar,QAT,QAT,QA,QAT,634,Q,.qa,Qatari Rial,QAR,634,2,Doha,Y,974,427,"01,02" +,Réunion,,,RE,REU,638,,.re,Euro,EUR,978,2,,Y,262,647,"00,02,10" +,Romania,ROU,ROU,RO,ROU,642,RO,.ro,Romanian Leu,RON,946,2,Bucharest,Y,40,226,"01,02,03,04,05,06,10,11,16" +,Russian Federation,RUS,RUS,RU,RUS,643,RUS,.ru,Russian Ruble,RUB,643,2,Moscow,Y,7,250,"01,02,03,04,05,07,10,11,12,13,15,16,17,19,20,28,35,39,44,92,93,99" +Republic of Rwanda,Rwanda,RWA,RWA,RW,RWA,646,RWA,.rw,Rwanda Franc,RWF,646,0,Kigali,Y,250,635,"10,13,14" +,Saint Barthélemy,,,BL,BLM,652,,,Euro,EUR,978,2,Gustavia,Y,590,340,"01,03,08,20" +,"Saint Helena, Ascension and Tristan da Cunha",,,SH,SHN,654,,.sh,Saint Helena 
Pound,SHP,654,2,Jamestown,Y,290,658,1 +Federation of Saint Christopher and Nevis,Saint Kitts and Nevis,SKN,SKN,KN,KNA,659,,.kn,East Caribbean Dollar,XCD,951,2,Basseterre,Y,1 869,356,"5,00,70,110" +,Saint Lucia,LCA,LCA,LC,LCA,662,WL,.lc,East Caribbean Dollar,XCD,951,2,Castries,Y,1 758,358,"30,50,110" +,Saint Martin (French part),,,MF,MAF,663,,,Euro,EUR,978,2,Marigot,Y,590,340,"01,08,20" +,Saint Pierre and Miquelon,,,PM,SPM,666,,.pm,Euro,EUR,978,2,St. Pierre,Y,508,308,"01,02,03" +,Saint Vincent and the Grenadines,VIN,VIN,VC,VCT,670,WV,.vc,East Caribbean Dollar,XCD,951,2,Kingstown,Y,1 784,360,"5,01,00,110" +Independent State of Samoa,Samoa,SAM,SAM,WS,WSM,882,WS,.ws,Tala,WST,882,2,Apia,Y,685,549,"01,27" +Republic of San Marino,San Marino,SMR,SMR,SM,SMR,674,RSM,.sm,Euro,EUR,978,2,San Marino,Y,378,292,1 +Democratic Republic of São Tomé and Príncipe,Sao Tome and Principe,STP,STP,ST,STP,678,,.st,Dobra,STN,930,2,São Tomé,Y,239,626,1 +Kingdom of Saudi Arabia,Saudi Arabia,KSA,KSA,SA,SAU,682,KSA,.sa,Saudi Riyal,SAR,682,2,Riyadh,Y,966,420,"01,03,04,05,06,07" +Republic of Senegal,Senegal,SEN,SEN,SN,SEN,686,SN,.sn,CFA Franc BCEAO,XOF,952,0,Dakar,Y,221,608,"01,02,03" +Republic of Serbia,Serbia,SRB,SRB,RS,SRB,688,,.rs,Serbian Dinar,RSD,941,2,Belgrade,Y,381,220,"01,02,03,05" +Republic of Seychelles,Seychelles,SEY,SEY,SC,SYC,690,SY,.sc,Seychelles Rupee,SCR,690,2,Victoria,Y,248,633,"01,02,10" +Republic of Sierra Leone,Sierra Leone,SLE,SLE,SL,SLE,694,WAL,.sl,Leone,SLL,694,2,Freetown,Y,232,619,"01,02,03,04,05,25" +Republic of Singapore,Singapore,SIN,SGP,SG,SGP,702,SGP,.sg,Singapore Dollar,SGD,702,2,Singapore,Y,65,525,"01,02,03,05,06,07,12" +,Sint Maarten (Dutch part),,,SX,SXM,534,,.sx,Netherlands Antillean Guilder,ANG,532,2,Philipsburg,Y,1 721,362,"51,59,60" +Slovak Republic,Slovakia,SVK,SVK,SK,SVK,703,SK,.sk,Euro,EUR,978,2,Bratislava,Y,421,231,"01,02,03,04,05,06,15,99" +Republic of Slovenia,Slovenia,SVN,SLO,SK,SVN,705,SLO,.si,Euro,EUR,978,2,Ljubljana,Y,386,293,"10,40,41,64,70" 
+,Solomon Islands,SOL,SOL,SB,SLB,90,,.sb,Solomon Islands Dollar,SBD,90,2,Honiara,Y,677,540,"01,02,10" +Federal Republic of Somalia,Somalia,SOM,SOM,SO,SOM,706,SO,.so,Somali Shilling,SOS,706,2,Mogadishu,Y,252,637,"01,04,10,19,30,60,71,82" +Republic of South Africa,South Africa,RSA,RSA,ZA,ZAF,710,ZA,.za,Rand,ZAR,710,2,Cape Town,Y,27,655,"01,02,06,07,10,12,19,21" +,South Georgia and the South Sandwich Islands,,,GS,SGS,239,,.gs,,,,,King Edward Point,Y,500,, +Republic of South Sudan,South Sudan,SSD,SSD,SS,SSD,728,,.ss,South Sudanese Pound,SSP,728,2,Juba,Y,211,659,"02,03,04,06" +Kingdom of Spain,Spain,ESP,ESP,ES,ESP,724,E,.es,Euro,EUR,978,2,Madrid,Y,34,214,"01,03,04,05,06,07,08,09,11,15,16,17,18,19,20,21,22,23,25,26,27,32" +Democratic Socialist Republic of Sri Lanka,Sri Lanka,SRI,SRI,LK,LKA,144,CL,.lk,Sri Lanka Rupee,LKR,144,2,Sri Jayawardenepura Kotte,Y,94,413,"01,02,03,05,08" +Republic of the Sudan,Sudan,SDN,SUD,SD,SDN,729,SUD,.sd,Sudanese Pound,SDG,938,2,Khartoum,Y,249,634,"00,01,02,05,06,07,08,15,22" +Republic of Suriname,Suriname,SUR,SUR,SR,SUR,740,SME,.sr,Surinam Dollar,SRD,968,2,Paramaribo,Y,597,746,"01,02,03,04" +,Svalbard and Jan Mayen,,,SJ,SJM,744,,,Norwegian Krone,NOK,578,2,,Y,47 79,, +Kingdom of Sweden,Sweden,SWE,SWE,SE,SWE,752,S,.se,Swedish Krona,SEK,752,2,Stockholm,Y,46,240,"01,02,04,05,06,07,08,10,11,12,13th,14th,15th,16,17,18,19th,20th,22,23,24,25,26,27,28,29,30,35,36" +Swiss Confederation,Switzerland,SUI,SUI,CH,CHE,756,CH,.ch,Swiss Franc,CHF,756,2,Bern,Y,41,228,"01,02,03,05,07,08,09,12,51,52,53,54" +Syrian Arab Republic,Syria,SYR,SYR,SY,SYR,760,SYR,.sy,Syrian Pound,SYP,760,2,Damascus,Y,963,417,"01,02,09" +Republic of China,Taiwan,TPE,TPE,TW,TWN,158,RC,.tw,New Taiwan Dollar,TWD,901,2,Taipei,Y,886,466,"01,02,03,05,06,07,09,10,11,56,68,88,89,92,93,97,99" +Republic of Tajikistan,Tajikistan,TJK,TJK,TJ,TJK,762,TJ,.tj,Somoni,TJS,972,2,Dushanbe,Y,992,436,"01,02,03,04,05,12" +United Republic of Tanzania,Tanzania,TAN,TAN,TZ,TZA,834,EAT,.tz,Tanzanian 
Shilling,TZS,834,2,Dodoma,Y,255,640,"01,02,03,04,05,06,07,08,09,11" +Kingdom of Thailand,Thailand,THA,THA,TH,THA,764,T,.th,Baht,THB,764,2,Bangkok,Y,66,520,"00,01,03,04,05,15,18,20,23,99" +Democratic Republic of Timor-Leste,Timor-Leste,TLS,,TL,TLS,626,,.tl,US Dollar,USD,840,2,Dili,Y,670,514,"01,02" +Togolese Republic,Togo,TOG,TOG,TG,TGO,768,TG,.tg,CFA Franc BCEAO,XOF,952,0,Lomé,Y,228,615,"01,02,03" +,Tokelau,,,TK,TKL,772,,.tk,New Zealand Dollar,NZD,554,2,,Y,690,554,1 +Kingdom of Tonga,Tonga,TGA,TGA,TO,TON,776,,.to,Pa’anga,TOP,776,2,Nukuʻalofa,Y,676,539,"01,43" +Republic of Trinidad and Tobago,Trinidad and Tobago,TRI,TTO,TT,TTO,780,TT,.tt,Trinidad and Tobago Dollar,TTD,780,2,Port of Spain,Y,1 868,374,"12,12,01,30,140" +Republic of Tunisia,Tunisia,TUN,TUN,TN,TUN,788,TN,.tn,Tunisian Dinar,TND,788,3,Tunis,Y,216,605,"01,02,03,06" +Republic of Turkey,Turkey,TUR,TUR,TR,TUR,792,TR,.tr,Turkish Lira,TRY,949,2,Ankara,Y,90,286,"01,02,03,04" +,Turkmenistan,TKM,TKM,TM,TKM,795,TM,.tm,Turkmenistan New Manat,TMT,934,2,Ashgabat,Y,993,438,"01,02" +,Turks and Caicos Islands,TCA,,TC,TCA,796,,.tc,US Dollar,USD,840,2,Cockburn Town,Y,1 649,376,"5,03,50,352" +,Tuvalu,,TUV,TV,TUV,798,,.tv,Australian Dollar,AUD,36,2,Funafuti,Y,688,553,1 +Republic of Uganda,Uganda,UGA,UGA,UG,UGA,800,EAU,.ug,Uganda Shilling,UGX,800,0,Kampala,Y,256,641,"01,10,11,14,18,22,30,33,66" +,Ukraine,UKR,UKR,UA,UKR,804,UA,.ua,Hryvnia,UAH,980,2,Kyiv,Y,380,255,"01,02,03,04,05,06,07,21,39,50,67,68" +,United Arab Emirates,UAE,UAE,AE,ARE,784,UAE,.ae,UAE Dirham,AED,784,2,Abu Dhabi,Y,971,"42,44,30,431","424-02,424-03,430-02,431-02" +United Kingdom of Great Britain and Northern Ireland,United Kingdom,ENG,GBR,GB,GBR,826,GB,.uk,Pound 
Sterling,GBP,826,2,London,Y,44,"2,34,235","234-01,234-02,234-03,234-07,234-08,234-09,234-10,234-11,234-12,234-14,234-15th,234-16,234-17,234-18,234-19th,234-20,234-22nd,234-23,234-24,234-25th,234-26,234-27,234-28,234-30th,234-31,234-32,234-33,234-34,234-35,234-36,234-37,234-50,234-51,234-55,234-57,234-58,234-75,234-76,234-77,234-78,234-91,234-92,234-94,235-02" +United States of America,United States,USA,USA,US,USA,840,US,.us,US Dollar,USD,840,2,"Washington, D.C.",Y,1,"31,03,11,31,23,13,31,40,00,000","310-003,310-004,310-010,310-011,310-012,310-013,310-016,310-020,310-030,310-040,310-050,310-06,310-070,310-080,310-090,310-100,310-120,310-130,310-14th,310-150,310-15th,310-160,310-170,310-180,310-190,310-200,310-210,310-220,310-23,310-230,310-24,310-240,310-250,310-25th,310-260,310-26th,310-270,310-280,310-290,310-300,310-31,310-310,310-320,310-330,310-34,310-340,310-350,310-360,310-38,310-380,310-390,310-400,310-410,310-420,310-430,310-440,310-450,310-46,310-460,310-480,310-490,310-500,310-510,310-520,310-530,310-540,310-560,310-570,310-580,310-590,310-60,310-600,310-610,310-620,310-630,310-640,310-650,310-660,310-670,310-680,310-690,310-700,310-710,310-730,310-740,310-750,310-760,310-770,310-780,310-790,310-800,310-830,310-850,310-860,310-870,310-880,310-890,310-900,310-910,310-920,310-930,310-940,310-950,310-960,310-970,310-980,310-990,311-000,311-010,311-020,311-030,311-040,311-050,311-070,311-080,311-090,311-100,311-110,311-120,311-130,311-140,311-150,311-170,311-190,311-210,311-220,311-240,311-260,311-270,311-271,311-272,311-273,311-274,311-275,311-276,311-277,311-278,311-279,311-280,311-281,311-282,311-283,311-284,311-285,311-286,311-287,311-288,311-289,311-300,311-310,311-311,311-330,311-340,311-350,311-370,311-380,311-390,311-410,311-420,311-430,311-440,311-460,311-480,311-481,311-482,311-483,311-484,311-485,311-486,311-487,311-488,311-489,311-490,311-500,311-520,311-540,311-590,311-610,311-650,311-660,311-670,311-710,311-730,311-740,311-800,311-810,311-830,311
-860,311-870,311-880,311-910,311-920,312-010,312-030,312-040,312-090,312-120,312-130,312-160,312-170,312-180,312-190,312-220,312-230,312-270,312-280,312-290,312-380,312-530,316-010,316-011" +,United States Minor Outlying Islands,,,UM,UMI,581,,,US Dollar,USD,840,2,,Y,246,, +Oriental Republic of Uruguay,Uruguay,URU,URU,UY,URY,858,UY,.uy,Peso Uruguayo,UYU,858,2,Montevideo,Y,598,748,"01,03,07,10" +Republic of Uzbekistan,Uzbekistan,UZB,UZB,UZ,UZB,860,UZ,.uz,Uzbekistan Sum,UZS,860,2,Tashkent,Y,998,434,"01,02,04,05,07" +Republic of Vanuatu,Vanuatu,VAN,VAN,VU,VUT,548,,.vu,Vatu,VUV,548,0,Port Vila,Y,678,541,"01,05" +Bolivarian Republic of Venezuela,Venezuela,VEN,VEN,VE,VEN,862,YV,.ve,Bolívar Soberano,VES,928,2,Caracas,Y,58,734,"01,02,03,04,06" +Socialist Republic of Vietnam,Viet Nam,VIE,VIE,VN,VNM,704,VN,.vn,Dong,VND,704,0,Hanoi,Y,84,452,"01,02,03,04,05,06,07,08" +,Virgin Islands (British),VGB,IVB,VG,VGB,92,BVI,.vg,US Dollar,USD,840,2,Road Town,Y,1 284,348,"17,05,70,770" +,Virgin Islands (U.S.),VIR,ISV,VI,VIR,850,,.vi,US Dollar,USD,840,2,Charlotte Amalie,Y,1 340,332,50 +,Wallis and Futuna,,,WF,WLF,876,,.wf,CFP Franc,XPF,953,0,Mata Utu,Y,681,543,1 +,Western Sahara,,,EH,ESH,732,,.eh,Moroccan Dirham,MAD,504,2,Tifariti,Y,212,, +Republic of Yemen,Yemen,YEM,YEM,YE,YEM,887,YAR,.ye,Yemeni Rial,YER,886,2,Aden,Y,967,421,"01,02,03,04" +Republic of Zambia,Zambia,ZAM,ZAM,ZM,ZMB,894,Z,.zm,Zambian Kwacha,ZMW,967,2,Lusaka,Y,260,645,"01,02,03" +Republic of Zimbabwe,Zimbabwe,ZIM,ZIM,ZW,ZWE,716,ZW,.zw,Zimbabwe Dollar,ZWL,932,2,Harare,Y,263,648,"01,03,04" +Republic of Abkhazia,Abkhazia,,,,,,,,,,,,Sukhumi,N,7,289,"67,68,88" +Republic of Kosovo,Kosovo,KVX,KOS,,,,RKS,,,,,,Pristina,N,383,221,"01,02,06,07" +,Zanzibar,,,,,,EAZ,,,,,,,N,255,, +,Ascension Island,,,,,,,.ac,,,,,Georgetown,N,247,, +Pridnestrovian Moldavian Republic,Transnistria,,,,,,,,,PRB,,,Tiraspol,,"373 2, 373 5",, diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py 
b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py index 1a1bb9057..54179e2b4 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py @@ -1,7 +1,6 @@ from typing import Optional, List, Tuple from presidio_analyzer import Pattern, PatternRecognizer from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils -import pycountry class InGstinRecognizer(PatternRecognizer): @@ -25,9 +24,10 @@ class InGstinRecognizer(PatternRecognizer): """ gstin_country_codes_iso3a = "" - countries = pycountry.countries + utils = Utils() + countries = utils.get_country_codes(iso_code="ISO3166-1-Alpha-3") for country in countries: - gstin_country_codes_iso3a += country.alpha_3 + "|" + gstin_country_codes_iso3a += country + "|" pattern1 = ( "[0-9]{4}" + "(" @@ -52,7 +52,11 @@ class InGstinRecognizer(PatternRecognizer): r"\b([0-9]{2}[A-Z]{5}(?!0000)[0-9]{4}[A-Z]{1}[0-9A-Z]{2})\b", 0.2, ), - Pattern("GSTIN (Medium)", pattern1, 0.6), # NRTP pattern + Pattern( + "GSTIN (Medium)", + pattern1, + 0.6, # NRTP pattern + ), Pattern( "GSTIN (Very Low)", r"\b((?=.*?[A-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b", diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py index a858fbf22..0d78e4df7 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py @@ -1,6 +1,6 @@ from typing import List, Optional, Tuple from presidio_analyzer import Pattern, PatternRecognizer -import pycountry +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils class IsinRecognizer(PatternRecognizer): @@ -16,9 +16,10 @@ class IsinRecognizer(PatternRecognizer): """ iso2a_countryname = "" - countries = 
pycountry.countries + utils = Utils() + countries = utils.get_country_codes(iso_code="ISO3166-1-Alpha-2") for country in countries: - iso2a_countryname += country.alpha_2 + "|" + iso2a_countryname += country + "|" pattern = "^" + "(" + iso2a_countryname.rstrip("|") + ")" + "[A-Z0-9]{9}[0-9]{1}$" PATTERNS = [ diff --git a/presidio-analyzer/setup.py b/presidio-analyzer/setup.py index d06ca7abb..a1326e40a 100644 --- a/presidio-analyzer/setup.py +++ b/presidio-analyzer/setup.py @@ -39,7 +39,6 @@ "tldextract", "pyyaml", "phonenumbers>=8.12,<9.0.0", - "pycountry>=23.12.11", ], extras_require={ "transformers": ["spacy_huggingface_pipelines"], diff --git a/presidio-analyzer/tests/test_analyzer_utils.py b/presidio-analyzer/tests/test_analyzer_utils.py index 4aabedd81..ec3efd919 100644 --- a/presidio-analyzer/tests/test_analyzer_utils.py +++ b/presidio-analyzer/tests/test_analyzer_utils.py @@ -1,6 +1,17 @@ from presidio_analyzer import PresidioAnalyzerUtils import pytest + +@pytest.fixture(scope="module") +def recognizer(): + return PresidioAnalyzerUtils() + + +@pytest.fixture(scope="module") +def entities(): + return ["IN_PAN"] + + palindrome_test_set = [ ["abMA", False, False], ["abCba", False, True], @@ -26,6 +37,26 @@ [123456789012, False], ] +iso_country_format_test_set = [ + ["ISO3166-1-Alpha-2", 249], + ["ISO3166-1-Alpha-3", 249], + ["ISO3166-1-Numeric", 249], +] + +iso_currency_format_test_set = [ + ["ISO4217-Alpha-3", 247], + ["ISO4217-Numeric", 246], +] + +full_country_information_test_set = [ + ["Åland Islands", "English_short_name_using_title_case", 1], + # [], + # [], + # [], + # [], + # [], +] + @pytest.mark.parametrize("input_string , expected_output", luhn_mod_n_test_set) def test_get_luhn_mod_n(input_string, expected_output): @@ -78,3 +109,47 @@ def test_is_verhoeff(input_number, is_verhoeff): :return: True/False """ assert PresidioAnalyzerUtils.is_verhoeff_number(input_number) == is_verhoeff + + +@pytest.mark.parametrize("iso_code, count_of_records", 
iso_country_format_test_set) +def test_get_country_codes(iso_code, count_of_records): + """ + Test to get all country_codes for the given ISO format + :param iso_code: Valid/supported ISO code + :param count_of_records: count of total countries + :return: list of ISO codes for all countries + """ + pau = PresidioAnalyzerUtils() + assert len(pau.get_country_codes(iso_code=iso_code)) == count_of_records + + +@pytest.mark.parametrize("iso_code, count_of_records", iso_currency_format_test_set) +def test_get_currency_codes(iso_code, count_of_records): + """ + Test to get all country_currency_codes for the given ISO format + :param iso_code: Valid/supported ISO code + :param count_of_records: count of total countries + :return: List of ISO currency codes for all countries + """ + pau = PresidioAnalyzerUtils() + assert len(pau.get_currency_codes(iso_code=iso_code)) == count_of_records + + +@pytest.mark.parametrize( + "lookup_key, lookup_index, count_of_records", full_country_information_test_set +) +def test_get_full_country_information(lookup_key, lookup_index, count_of_records): + pau = PresidioAnalyzerUtils() + print( + pau.get_full_country_information( + lookup_key=lookup_key, lookup_index=lookup_index + ) + ) + assert ( + len( + pau.get_full_country_information( + lookup_key=lookup_key, lookup_index=lookup_index + ) + ) + == count_of_records + ) From 65a2e7004b0bdfaafe36f467e8e90fca8947da3b Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Thu, 14 Mar 2024 22:48:25 +0530 Subject: [PATCH 20/23] review feedback incorporation --- .../presidio_analyzer/analyzer_utils.py | 12 ++++++++++-- .../presidio_analyzer/{ => data}/country_master.csv | 0 .../predefined_recognizers/isin_recognizer.py | 2 +- presidio-analyzer/tests/test_isin_recognizer.py | 6 +++--- presidio-analyzer/tests/test_recognizer_registry.py | 2 +- 5 files changed, 15 insertions(+), 7 deletions(-) rename presidio-analyzer/presidio_analyzer/{ => data}/country_master.csv (100%) diff --git 
a/presidio-analyzer/presidio_analyzer/analyzer_utils.py b/presidio-analyzer/presidio_analyzer/analyzer_utils.py index bd6a1aa95..f2d294d9c 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_utils.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_utils.py @@ -11,10 +11,18 @@ class PresidioAnalyzerUtils: logic for re-usability and maintainability """ - __country_master_file_path__ = "presidio_analyzer/country_master.csv" + __country_master_file_path__ = "presidio_analyzer/data/country_master.csv" __country_master__ = [] def __init__(self): + # provision to override the default path for future need + __country_master_file_path__ = "presidio_analyzer/data/country_master.csv" + __country_master_file_path__ = ( + __country_master_file_path__ + if __country_master_file_path__ + else self.__country_master_file_path__ + ) + self.__load_country_master__() @staticmethod @@ -178,7 +186,7 @@ def get_country_codes(self, iso_code: str): def get_currency_codes(self, iso_code: str = ""): """ - Retrieve all defined currency codes across countries. + ...x .c ,xcRetrieve all defined currency codes across countries. :param iso_code: currently supporting : ISO4217-Alpha-3, ISO4217-Numeric :return: List of currency codes in provided ISO format. 
diff --git a/presidio-analyzer/presidio_analyzer/country_master.csv b/presidio-analyzer/presidio_analyzer/data/country_master.csv similarity index 100% rename from presidio-analyzer/presidio_analyzer/country_master.csv rename to presidio-analyzer/presidio_analyzer/data/country_master.csv diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py index 0d78e4df7..9f1944747 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py @@ -26,7 +26,7 @@ class IsinRecognizer(PatternRecognizer): Pattern( "ISIN (Medium)", r"\b[A-Z]{2}[A-Z0-9]{9}\d{1}\b", - 0.5, + 0.01, ), Pattern( "ISIN (Strong)", diff --git a/presidio-analyzer/tests/test_isin_recognizer.py b/presidio-analyzer/tests/test_isin_recognizer.py index 235fe7242..9b70bb11e 100644 --- a/presidio-analyzer/tests/test_isin_recognizer.py +++ b/presidio-analyzer/tests/test_isin_recognizer.py @@ -19,7 +19,7 @@ def entities(): [ # fmt: off ("IL0011762056", 1, (0, 12), 0.85), - ("ZZ12345ABCD1", 1, (0, 12), 0.50), + ("ZZ12345ABCD1", 1, (0, 12), 0.01), ("US0378331005", 1, (0, 12), 0.85), ("KR7000830000", 1, (0, 12), 0.85), ("IL0006290147", 1, (0, 12), 0.85), @@ -30,13 +30,13 @@ def entities(): ("SG1T75931496", 1, (0, 12), 0.85), ("GB00B16PRC61", 1, (0, 12), 0.85), ("DE0007236101", 1, (0, 12), 0.85), - ("XS1636274265", 1, (0, 12), 0.50), # exception to XS as a country code + ("XS1636274265", 1, (0, 12), 0.01), # exception to XS as a country code ("INF740KA1BM0", 1, (0, 12), 0.85), ("INE732I01013", 1, (0, 12), 0.85), ("ABNE123456", 0, (), (),), ("My Listed Company's stock trades with ISIN number SA14TG012N13 with a lot of " "text beyond the actual value", - 1, (50, 62), 0.50), + 1, (50, 62), 0.01), # fmt: on ], ) diff --git a/presidio-analyzer/tests/test_recognizer_registry.py 
b/presidio-analyzer/tests/test_recognizer_registry.py index 83eaac1f7..46ac00ac5 100644 --- a/presidio-analyzer/tests/test_recognizer_registry.py +++ b/presidio-analyzer/tests/test_recognizer_registry.py @@ -57,7 +57,7 @@ def test_when_get_recognizers_then_all_recognizers_returned(mock_recognizer_regi registry = mock_recognizer_registry registry.load_predefined_recognizers() recognizers = registry.get_recognizers(language="en", all_fields=True) - # 1 custom recognizer in english + 27 predefined + # 1 custom recognizer in english + 26 predefined assert len(recognizers) == 1 + 26 From 4391cd40c69220cd5c5fdd092e0208a54559143a Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Mon, 25 Mar 2024 19:08:09 +0530 Subject: [PATCH 21/23] interim commit - not ready for merging interim code with EntityRecognizer enhancement WIP --- .../presidio_analyzer/entity_recognizer.py | 6 +++ .../predefined_recognizers/isin_recognizer.py | 49 ++++++++++++------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/entity_recognizer.py b/presidio-analyzer/presidio_analyzer/entity_recognizer.py index 6e99a3dca..92f3ab4fa 100644 --- a/presidio-analyzer/presidio_analyzer/entity_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/entity_recognizer.py @@ -4,6 +4,7 @@ from presidio_analyzer import RecognizerResult from presidio_analyzer.nlp_engine import NlpArtifacts +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils logger = logging.getLogger("presidio-analyzer") @@ -39,8 +40,13 @@ def __init__( supported_language: str = "en", version: str = "0.0.1", context: Optional[List[str]] = None, + analyzer_utils: Optional[PresidioAnalyzerUtils] = None, ): + if analyzer_utils: + self.analyzer_utils = analyzer_utils + else: + self.analyzer_utils = PresidioAnalyzerUtils() self.supported_entities = supported_entities if name is None: diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py 
b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py index 9f1944747..d6f610b64 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py @@ -1,6 +1,7 @@ from typing import List, Optional, Tuple from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils + +# from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils class IsinRecognizer(PatternRecognizer): @@ -16,24 +17,8 @@ class IsinRecognizer(PatternRecognizer): """ iso2a_countryname = "" - utils = Utils() - countries = utils.get_country_codes(iso_code="ISO3166-1-Alpha-2") - for country in countries: - iso2a_countryname += country + "|" - pattern = "^" + "(" + iso2a_countryname.rstrip("|") + ")" + "[A-Z0-9]{9}[0-9]{1}$" - - PATTERNS = [ - Pattern( - "ISIN (Medium)", - r"\b[A-Z]{2}[A-Z0-9]{9}\d{1}\b", - 0.01, - ), - Pattern( - "ISIN (Strong)", - pattern, - 0.85, - ), - ] + # utils = Utils() + pattern: str = "" CONTEXT = ["ISIN", "ISIN_CODE"] @@ -51,6 +36,32 @@ def __init__( else [("-", ""), (" ", ""), (":", "")] ) + self.countries = self.analyzer_utils.get_country_codes( + iso_code="ISO3166-1-Alpha-2" + ) + for country in self.countries: + self.iso2a_countryname += country + "|" + self.pattern = ( + "^" + + "(" + + self.iso2a_countryname.rstrip("|") + + ")" + + "[A-Z0-9]{9}[0-9]{1}$" + ) + + self.PATTERNS = [ + Pattern( + "ISIN (Medium)", + r"\b[A-Z]{2}[A-Z0-9]{9}\d{1}\b", + 0.01, + ), + Pattern( + "ISIN (Strong)", + self.pattern, + 0.85, + ), + ] + patterns = patterns if patterns else self.PATTERNS context = context if context else self.CONTEXT super().__init__( From e040ecc93f909c3b864765368f9db3855983ec75 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Wed, 17 Apr 2024 14:39:35 +0530 Subject: [PATCH 22/23] incorporated review suggestions class instantiation changed for 
analyzer utils --- .../presidio_analyzer/entity_recognizer.py | 10 +- .../presidio_analyzer/pattern_recognizer.py | 5 +- .../in_gstin_recognizer.py | 97 ++++++++++--------- .../predefined_recognizers/isin_recognizer.py | 12 +-- 4 files changed, 67 insertions(+), 57 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/entity_recognizer.py b/presidio-analyzer/presidio_analyzer/entity_recognizer.py index 92f3ab4fa..238006c8d 100644 --- a/presidio-analyzer/presidio_analyzer/entity_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/entity_recognizer.py @@ -43,12 +43,6 @@ def __init__( analyzer_utils: Optional[PresidioAnalyzerUtils] = None, ): - if analyzer_utils: - self.analyzer_utils = analyzer_utils - else: - self.analyzer_utils = PresidioAnalyzerUtils() - self.supported_entities = supported_entities - if name is None: self.name = self.__class__.__name__ # assign class name as name else: @@ -60,11 +54,15 @@ def __init__( self.version = version self.is_loaded = False self.context = context if context else [] + self.supported_entities = supported_entities self.load() logger.info("Loaded recognizer: %s", self.name) self.is_loaded = True + if analyzer_utils is not None: + self.analyzer_utils = analyzer_utils + @property def id(self): """Return a unique identifier of this recognizer.""" diff --git a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py index 3aec751af..70927d2e5 100644 --- a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py @@ -12,6 +12,7 @@ AnalysisExplanation, ) from presidio_analyzer.nlp_engine import NlpArtifacts +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils logger = logging.getLogger("presidio-analyzer") @@ -41,6 +42,7 @@ def __init__( deny_list_score: float = 1.0, global_regex_flags: Optional[int] = re.DOTALL | re.MULTILINE | re.IGNORECASE, version: str = "0.0.1", + analyzer_utils: 
Optional[PresidioAnalyzerUtils] = None, ): if not supported_entity: raise ValueError("Pattern recognizer should be initialized with entity") @@ -50,12 +52,13 @@ def __init__( "Pattern recognizer should be initialized with patterns" " or with deny list" ) - + print(supported_entity) super().__init__( supported_entities=[supported_entity], supported_language=supported_language, name=name, version=version, + analyzer_utils=analyzer_utils, ) if patterns is None: self.patterns = [] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py index 54179e2b4..7dfd362d0 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/in_gstin_recognizer.py @@ -1,6 +1,9 @@ from typing import Optional, List, Tuple from presidio_analyzer import Pattern, PatternRecognizer -from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils + + +# from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils class InGstinRecognizer(PatternRecognizer): @@ -23,47 +26,6 @@ class InGstinRecognizer(PatternRecognizer): This can allow a greater variety in input, for example by removing dashes or spaces. 
""" - gstin_country_codes_iso3a = "" - utils = Utils() - countries = utils.get_country_codes(iso_code="ISO3166-1-Alpha-3") - for country in countries: - gstin_country_codes_iso3a += country + "|" - pattern1 = ( - "[0-9]{4}" - + "(" - + gstin_country_codes_iso3a.rstrip("|") - + ")" - + "(?!00000)[0-9]{5}[A-Z]{2}[A-Z0-9]{1}" - ) - - pattern2 = ( - "[0-9]{2}[A-Z]{3}[ABCFGHJLPT]{1}[A-Z]{1}(?!0000)[0-9]{4}" - + "[A-Z]{1}[1-9A-Z]{1}(Z)[0-9A-Z]{1}" - ) - - PATTERNS = [ - Pattern( - "GSTIN (High)", - pattern2, - 0.85, - ), # Regular registration pattern - Pattern( - "GSTIN (Low)", - r"\b([0-9]{2}[A-Z]{5}(?!0000)[0-9]{4}[A-Z]{1}[0-9A-Z]{2})\b", - 0.2, - ), - Pattern( - "GSTIN (Medium)", - pattern1, - 0.6, # NRTP pattern - ), - Pattern( - "GSTIN (Very Low)", - r"\b((?=.*?[A-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b", - 0.05, - ), - ] - CONTEXT = [ "GSTIN", "GST", @@ -116,10 +78,54 @@ def __init__( supported_language: str = "en", supported_entity: str = "IN_GSTIN", replacement_pairs: Optional[List[Tuple[str, str]]] = None, + analyzer_utils=PresidioAnalyzerUtils(), ): self.replacement_pairs = ( replacement_pairs if replacement_pairs else [("-", ""), (" ", "")] ) + + self.analyzer_utils = analyzer_utils + self.patterns = [] + gstin_country_codes_iso3a = "" + countries = self.analyzer_utils.get_country_codes("ISO3166-1-Alpha-3") + for country in countries: + gstin_country_codes_iso3a += country + "|" + pattern1 = ( + "[0-9]{4}" + + "(" + + gstin_country_codes_iso3a.rstrip("|") + + ")" + + "(?!00000)[0-9]{5}[A-Z]{2}[A-Z0-9]{1}" + ) + + pattern2 = ( + "[0-9]{2}[A-Z]{3}[ABCFGHJLPT]{1}[A-Z]{1}(?!0000)[0-9]{4}" + + "[A-Z]{1}[1-9A-Z]{1}(Z)[0-9A-Z]{1}" + ) + + self.PATTERNS = [ + Pattern( + "GSTIN (High)", + pattern2, + 0.85, + ), # Regular registration pattern + Pattern( + "GSTIN (Low)", + r"\b([0-9]{2}[A-Z]{5}(?!0000)[0-9]{4}[A-Z]{1}[0-9A-Z]{2})\b", + 0.2, + ), + Pattern( + "GSTIN (Medium)", + pattern1, + 0.6, # NRTP pattern + ), + Pattern( + "GSTIN (Very Low)", + 
r"\b((?=.*?[A-Z])(?=.*?[0-9]{4})[\w@#$%^?~-]{10})\b", + 0.05, + ), + ] + patterns = patterns if patterns else self.PATTERNS context = context if context else self.CONTEXT super().__init__( @@ -127,11 +133,14 @@ def __init__( patterns=patterns, context=context, supported_language=supported_language, + analyzer_utils=analyzer_utils, ) def validate_result(self, pattern_text: str) -> bool: """Determine absolute value based on calculation.""" - sanitized_value = Utils.sanitize_value(pattern_text, self.replacement_pairs) + sanitized_value = self.analyzer_utils.sanitize_value( + pattern_text, self.replacement_pairs + ) return self.__check_gstin(sanitized_value) def __check_gstin(self, sanitized_value: str) -> bool: @@ -142,7 +151,7 @@ def __check_gstin(self, sanitized_value: str) -> bool: else: if sanitized_value[13] != "Z" or sanitized_value[12] == "0": is_valid_gstin = False - elif Utils.get_luhn_mod_n(sanitized_value): + elif self.analyzer_utils.get_luhn_mod_n(sanitized_value): is_valid_gstin = True else: is_valid_gstin = False diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py index d6f610b64..4d927512e 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/isin_recognizer.py @@ -1,5 +1,7 @@ from typing import List, Optional, Tuple from presidio_analyzer import Pattern, PatternRecognizer +from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils + # from presidio_analyzer.analyzer_utils import PresidioAnalyzerUtils as Utils @@ -29,16 +31,15 @@ def __init__( supported_language: str = "en", supported_entity: str = "ISIN_CODE", replacement_pairs: Optional[List[Tuple[str, str]]] = None, + analyzer_utils=PresidioAnalyzerUtils(), ): self.replacement_pairs = ( replacement_pairs if replacement_pairs else [("-", ""), (" ", ""), (":", "")] ) - - 
self.countries = self.analyzer_utils.get_country_codes( - iso_code="ISO3166-1-Alpha-2" - ) + self.analyzer_utils = analyzer_utils + self.countries = self.analyzer_utils.get_country_codes("ISO3166-1-Alpha-2") for country in self.countries: self.iso2a_countryname += country + "|" self.pattern = ( @@ -48,7 +49,6 @@ def __init__( + ")" + "[A-Z0-9]{9}[0-9]{1}$" ) - self.PATTERNS = [ Pattern( "ISIN (Medium)", @@ -61,7 +61,6 @@ def __init__( 0.85, ), ] - patterns = patterns if patterns else self.PATTERNS context = context if context else self.CONTEXT super().__init__( @@ -69,4 +68,5 @@ def __init__( patterns=patterns, context=context, supported_language=supported_language, + analyzer_utils=analyzer_utils, ) From 64407fb8264bd15bebfe90c78e0bbc3978ebbe57 Mon Sep 17 00:00:00 2001 From: Devopam Mittra Date: Tue, 21 May 2024 21:38:13 +0530 Subject: [PATCH 23/23] interim commit --- .../presidio_analyzer/analyzer_utils.py | 18 +++++++++++------- .../presidio_analyzer/entity_recognizer.py | 4 ++-- .../presidio_analyzer/pattern_recognizer.py | 1 - .../predefined_recognizers/cfi_recognizer.py | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/analyzer_utils.py b/presidio-analyzer/presidio_analyzer/analyzer_utils.py index f2d294d9c..3f1fa40df 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_utils.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_utils.py @@ -26,7 +26,7 @@ def __init__(self): self.__load_country_master__() @staticmethod - def is_palindrome(text: str, case_insensitive: bool = False): + def is_palindrome(text: str, case_insensitive: bool = False) -> bool: """ Validate if input text is a true palindrome. 
@@ -53,7 +53,9 @@ def sanitize_value(text: str, replacement_pairs: List[Tuple[str, str]]) -> str: return text @staticmethod - def get_luhn_mod_n(input_str: str, alphabet="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"): + def get_luhn_mod_n( + input_str: str, alphabet="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + ) -> bool: """ Check if the given input number has a valid last checksum as per LUHN algorithm. @@ -73,7 +75,7 @@ def get_luhn_mod_n(input_str: str, alphabet="0123456789ABCDEFGHIJKLMNOPQRSTUVWXY ) % n == 0 @staticmethod - def is_verhoeff_number(input_number: int): + def is_verhoeff_number(input_number: int) -> bool: """ Check if the input number is a true verhoeff number. @@ -164,7 +166,7 @@ def __get_country_master_full_data__(self, iso_code: str = ""): country_information = list(filter(None, country_information)) return country_information - def get_country_codes(self, iso_code: str): + def get_country_codes(self, iso_code: str) -> List[str]: """ Fetch all defined country codes per required ISO format. @@ -184,7 +186,7 @@ def get_country_codes(self, iso_code: str): # return full country list for given code return self.__get_country_master_full_data__(iso_code=iso_code) - def get_currency_codes(self, iso_code: str = ""): + def get_currency_codes(self, iso_code: str = "") -> List[str]: """ ...x .c ,xcRetrieve all defined currency codes across countries. @@ -198,7 +200,9 @@ def get_currency_codes(self, iso_code: str = ""): # return full country list for given code return self.__get_country_master_full_data__(iso_code=iso_code) - def get_full_country_information(self, lookup_key: str, lookup_index: str): + def get_full_country_information( + self, lookup_key: str, lookup_index: str + ) -> List[str]: """ Fetch additional information through lookup_index in index of lookup_key. 
@@ -209,7 +213,7 @@ def get_full_country_information(self, lookup_key: str, lookup_index: str): ISO3166-1-Alpha-2,ISO3166-1-Alpha-3, ISO3166-1-Numeric, International_licence_plate_country_code, Country_code_top_level_domain, Currency_Name, ISO4217-Alpha-3, ISO4217-Numeric, Capital_City, Dialing_Code - :return: Dictionary object with additional information enriched from + :return: List of strings values with additional information enriched from master lookup """ diff --git a/presidio-analyzer/presidio_analyzer/entity_recognizer.py b/presidio-analyzer/presidio_analyzer/entity_recognizer.py index 238006c8d..73ec22d60 100644 --- a/presidio-analyzer/presidio_analyzer/entity_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/entity_recognizer.py @@ -28,6 +28,7 @@ class EntityRecognizer: :param version: the recognizer current version :param context: a list of words which can help boost confidence score when they appear in context of the matched entity + :param analyzer_utils: Presidio Analyzer Utility class object (optional) """ MIN_SCORE = 0 @@ -60,8 +61,7 @@ def __init__( logger.info("Loaded recognizer: %s", self.name) self.is_loaded = True - if analyzer_utils is not None: - self.analyzer_utils = analyzer_utils + self.analyzer_utils = analyzer_utils @property def id(self): diff --git a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py index 70927d2e5..c197f9ade 100644 --- a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py @@ -52,7 +52,6 @@ def __init__( "Pattern recognizer should be initialized with patterns" " or with deny list" ) - print(supported_entity) super().__init__( supported_entities=[supported_entity], supported_language=supported_language, diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py index 
975b868e5..e448f483e 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/cfi_recognizer.py @@ -34,7 +34,7 @@ class CfiRecognizer(PatternRecognizer): r"|DA|DN|DD|DM|DY|RA|RS|RP|RW|RF|RD|RM|OC|OP|OM|FF|FC|SR|ST|SE|SC|SF|SM|HR" r"|HT|HE|HC|HF|HM|IF|IT|JE|JF|JC|JR|JT|KR|KT|KE|KC|KF|KY|KM|LL" r"|LR|LS|TC|TT|TR|TI|TB|TD|TM|MC|MM)[A-Z]{4}$\b", - 0.50, + 0.20, ), ]