Update Image PII Detection Code.
Xu Han committed Oct 17, 2023
1 parent 7f1ab4f commit 8ae8871
Showing 25 changed files with 1,900 additions and 0 deletions.
27 changes: 27 additions & 0 deletions src/containers/image-pii-detection/Dockerfile
@@ -0,0 +1,27 @@
FROM public.ecr.aws/lambda/python:3.9

ARG FUNCTION_DIR="/opt/ml/code/"
COPY requirements.txt ${FUNCTION_DIR}/requirements.txt
RUN python3.9 -m pip install -r ${FUNCTION_DIR}/requirements.txt

COPY main.py parser_factory.py ${FUNCTION_DIR}/
COPY parsers/ ${FUNCTION_DIR}/parsers/

ARG OCR_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/infer-ocr-model/standard"
ARG OCR_MODEL_VERSION="v1.0.0"
ARG FD_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/face-detection"
ARG FD_MODEL_VERSION="1.2.0"

RUN yum install -y wget
RUN mkdir -p ${FUNCTION_DIR}/ocr_model
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/classifier.onnx -O ${FUNCTION_DIR}/ocr_model/classifier.onnx
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/det_standard.onnx -O ${FUNCTION_DIR}/ocr_model/det_standard.onnx
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/keys_v1.txt -O ${FUNCTION_DIR}/ocr_model/keys_v1.txt
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/rec_standard.onnx -O ${FUNCTION_DIR}/ocr_model/rec_standard.onnx
RUN mkdir -p ${FUNCTION_DIR}/fd_model
RUN wget -c ${FD_MODEL_URL}/${FD_MODEL_VERSION}/det.onnx -O ${FUNCTION_DIR}/fd_model/det.onnx

WORKDIR ${FUNCTION_DIR}

# Command can be overwritten by providing a different command in the template directly.
ENTRYPOINT ["python"]
208 changes: 208 additions & 0 deletions src/containers/image-pii-detection/main.py
@@ -0,0 +1,208 @@
import json
import boto3
import pandas as pd
import argparse
import copy
import tempfile

from parser_factory import ParserFactory

def check_include_file_type(file_info, include_file_types):
    """
    Check if the file type is in the include_file_types list.

    :param file_info: file info dict (must contain 'file_type')
    :param include_file_types: list of file types to include
    """
    file_type = file_info['file_type']

    return file_type in include_file_types

def organize_table_info(table_name, result_bucket_name, original_bucket_name, file_info, columns, file_category):

    description = json.dumps(file_info, ensure_ascii=False)
    s3_location = f"s3://{result_bucket_name}/parser_results/{table_name}/"
    input_format = 'org.apache.hadoop.mapred.TextInputFormat'
    output_format = 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    table_type = 'EXTERNAL_TABLE'
    serde_info = {'SerializationLibrary': 'org.apache.hadoop.hive.serde2.OpenCSVSerde',
                  'Parameters': {'field.delim': ','}}
    parameters = {'originalFileBucketName': original_bucket_name,
                  'originalFileType': file_info['file_type'],
                  'originalFilePath': file_info['file_path'],
                  'originalFileSample': ', '.join(file_info['sample_files'][:10]),
                  'originalFileCategory': file_category,
                  'Unstructured': 'true',
                  'classification': 'csv'}
    glue_table_columns = [{'Name': 'index', 'Type': 'string'}]
    for column in columns:
        glue_table_columns.append({'Name': column, 'Type': 'string'})

    glue_table_info = {
        'Name': table_name,
        'Description': description,
        'StorageDescriptor': {
            'Columns': glue_table_columns,
            'Location': s3_location,
            'InputFormat': input_format,
            'OutputFormat': output_format,
            'SerdeInfo': serde_info
        },
        'PartitionKeys': [],
        'TableType': table_type,
        'Parameters': parameters
    }
    return glue_table_info
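
For illustration, a hypothetical call (bucket, path, and column values below are made up, not from this commit) yields an external CSV table definition pointing at the parser-results prefix:

info = organize_table_info(
    table_name='my-data_test_images_human_faces',
    result_bucket_name='my-results',
    original_bucket_name='my-data',
    file_info={'file_type': '.jpeg', 'file_path': 'test_images/human_faces', 'sample_files': ['1']},
    columns=['1'],
    file_category='detection_files')
print(info['StorageDescriptor']['Location'])
# s3://my-results/parser_results/my-data_test_images_human_faces/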

def batch_process_files(s3_client, bucket_name, file_info, file_category):
    """
    Batch process files in a folder that share the same schema.

    :param s3_client: boto3 S3 client
    :param bucket_name: S3 bucket name
    :param file_info: file info dict
    :param file_category: one of 'detection_files', 'include_files', 'exclude_files'

    Sample file_info:
        {
            "file_type": ".jpeg",
            "file_path": "test_images/human_faces",
            "sample_files": [
                "1"
            ]
        }
    """
    file_contents = {}

    file_type = file_info['file_type']
    file_path = file_info['file_path']
    sample_files = file_info['sample_files']

    if file_category == 'detection_files':

        parser = ParserFactory.create_parser(file_type=file_type, s3_client=s3_client)

        for sample_file in sample_files:
            object_key = f"{file_path}/{sample_file}{file_type}"
            file_content = parser.load_content(bucket_name, object_key)
            file_contents[f"{sample_file}"] = file_content

    elif file_category == 'include_files':
        for sample_file in sample_files:
            file_contents[f"{sample_file}"] = ['This file is marked as Contains-PII.']

    elif file_category == 'exclude_files':
        for sample_file in sample_files:
            file_contents[f"{sample_file}"] = ['This file is marked as Non-PII.']

    return file_contents
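
As a quick sketch of the non-detection branches (values below are hypothetical), 'include_files' and 'exclude_files' never touch S3 and simply emit a fixed marker per sample file:

contents = batch_process_files(
    s3_client=None,  # unused on the 'exclude_files' branch
    bucket_name='my-data',
    file_info={'file_type': '.jpeg', 'file_path': 'test_images/faces', 'sample_files': ['1', '2']},
    file_category='exclude_files')
# -> {'1': ['This file is marked as Non-PII.'], '2': ['This file is marked as Non-PII.']}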

def process_file(parser, bucket_name, object_key):
    """
    Process a single file.
    """
    file_content = parser.load_content(bucket_name, object_key)

    json_format_content = {}
    json_format_content[f"{object_key}"] = file_content

    return json_format_content

def create_glue_table(glue_client, database_name, table_name, glue_table_info):

    # Update the table if it already exists, otherwise create it.
    try:
        glue_client.get_table(
            DatabaseName=database_name,
            Name=table_name
        )
        print(f"Table '{table_name}' exists in database '{database_name}'. Updating table...")
        response = glue_client.update_table(
            DatabaseName=database_name,
            TableInput=glue_table_info
        )
    except glue_client.exceptions.EntityNotFoundException:
        print(f"Table '{table_name}' does not exist in database '{database_name}'. Creating table...")
        response = glue_client.create_table(
            DatabaseName=database_name,
            TableInput=glue_table_info
        )

    print(response)

def main(param_dict):
    original_bucket_name = param_dict['SourceBucketName']
    crawler_result_bucket_name = param_dict['ResultBucketName']
    region_name = param_dict['RegionName']

    crawler_result_object_key = f"crawler_results/{original_bucket_name}_info.json"
    destination_database = f"SDPS-unstructured-{original_bucket_name}"

    s3_client = boto3.client('s3', region_name=region_name)
    glue_client = boto3.client('glue', region_name=region_name)

    # 1. Create a Glue database for the results
    try:
        glue_client.create_database(
            DatabaseInput={
                'Name': destination_database
            }
        )
    except glue_client.exceptions.AlreadyExistsException:
        print(f"Database '{destination_database}' already exists. Skipping database creation...")

    # 2. Download the crawler result from S3 and load it
    with tempfile.NamedTemporaryFile(mode='w') as temp:
        temp_file_path = temp.name
        s3_client.download_file(Bucket=crawler_result_bucket_name, Key=crawler_result_object_key, Filename=temp_file_path)
        with open(temp_file_path, 'r') as f:
            bucket_info = json.load(f)

    # 3. Batch process the files in each folder that share the same type
    original_file_bucket_name = bucket_info['bucket_name']
    for file_category in ['detection_files', 'include_files', 'exclude_files']:
        files = bucket_info[file_category]
        for file_path, file_info in files.items():
            print(f"Processing {file_path}...")
            file_contents = batch_process_files(s3_client, original_file_bucket_name, file_info, file_category)

            # convert file_contents to a dataframe (one column per sample file)
            df = pd.DataFrame.from_dict(file_contents, orient='index')
            df = df.transpose()
            columns = df.columns.tolist()

            # build a Glue-safe table name from the file path
            table_name = file_path.replace('/', '_')
            table_name = table_name.replace('.', '_')
            table_name = original_file_bucket_name + '_' + table_name

            # save to csv and upload to s3
            with tempfile.NamedTemporaryFile(mode='w') as temp:
                csv_file_path = temp.name
                df.to_csv(csv_file_path, header=False)
                s3_client.upload_file(csv_file_path, crawler_result_bucket_name, f"parser_results/{table_name}/result.csv")

            glue_table_info = organize_table_info(table_name, crawler_result_bucket_name, original_file_bucket_name, file_info, columns, file_category)
            create_glue_table(glue_client, destination_database, table_name, glue_table_info)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--SourceBucketName', type=str, default='icyxu-glue-assets-member-a',
                        help='bucket holding the source files to scan')
    parser.add_argument('--ResultBucketName', type=str, default='icyxu-glue-assets-member-a',
                        help='bucket holding crawler and parser results')
    parser.add_argument('--RegionName', type=str, default='us-west-2',
                        help='AWS region of the buckets and Glue catalog')

    args, _ = parser.parse_known_args()
    param_dict = copy.copy(vars(args))

    main(param_dict)
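
For reference, a local run might look like the following (bucket names and region are placeholders; credentials with S3 and Glue permissions are assumed):

python main.py --SourceBucketName my-source-bucket --ResultBucketName my-result-bucket --RegionName us-west-2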
20 changes: 20 additions & 0 deletions src/containers/image-pii-detection/parser_factory.py
@@ -0,0 +1,20 @@
from parsers import PdfParser, TxtParser, DocParser, HtmlParser, EmailParser, ImageParser

class ParserFactory:
    @staticmethod
    def create_parser(file_type, s3_client):
        if file_type in ['.pdf', '.PDF']:
            return PdfParser(s3_client=s3_client)
        elif file_type in ['.txt', '.TXT']:
            return TxtParser(s3_client=s3_client)
        elif file_type in ['.doc', '.docx', '.DOC', '.DOCX']:
            return DocParser(s3_client=s3_client)
        elif file_type in ['.html', '.htm', '.HTML', '.HTM']:
            return HtmlParser(s3_client=s3_client)
        elif file_type in ['.eml', '.EML']:
            return EmailParser(s3_client=s3_client)
        elif file_type in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
            return ImageParser(s3_client=s3_client, fd_model_path='./fd_model/',
                               ocr_model_path='./ocr_model/')
        else:
            raise ValueError(f'Unsupported file type: {file_type}')
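
A minimal usage sketch (bucket and key are hypothetical; load_content is the BaseParser method that main.py calls, which presumably downloads the object and dispatches to the parser's parse_file):

import boto3

s3 = boto3.client('s3')
parser = ParserFactory.create_parser(file_type='.png', s3_client=s3)
content = parser.load_content('my-bucket', 'images/photo.png')  # list of extracted text strings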
7 changes: 7 additions & 0 deletions src/containers/image-pii-detection/parsers/__init__.py
@@ -0,0 +1,7 @@
from .pdf_parser import PdfParser
from .txt_parser import TxtParser
from .doc_parser import DocParser
from .html_parser import HtmlParser
from .email_parser import EmailParser

from .image_parser import ImageParser
19 changes: 19 additions & 0 deletions src/containers/image-pii-detection/parsers/doc_parser.py
@@ -0,0 +1,19 @@

import docx
from .parser import BaseParser

class DocParser(BaseParser):
    def __init__(self, s3_client):
        super().__init__(s3_client=s3_client)

    def parse_file(self, doc_path):
        """
        Extracts text from a docx file and returns a list containing the content string.
        """
        # Note: python-docx reads .docx (OOXML) files only; legacy binary .doc
        # files routed here by ParserFactory would need a separate converter.
        doc = docx.Document(doc_path)
        file_content = ""
        for para in doc.paragraphs:
            file_content += para.text + "\n"

        return [file_content]
25 changes: 25 additions & 0 deletions src/containers/image-pii-detection/parsers/email_parser.py
@@ -0,0 +1,25 @@

from .parser import BaseParser
from email.parser import Parser as PyEmailParser

class EmailParser(BaseParser):
    def __init__(self, s3_client):
        super().__init__(s3_client=s3_client)

    def parse_file(self, eml_path):
        """
        Extracts text from an .eml file and returns a list containing the content string.
        """
        with open(eml_path) as stream:
            parser = PyEmailParser()
            message = parser.parse(stream)

        file_content = []
        for part in message.walk():
            if part.get_content_type().startswith('text/plain'):
                # get_payload() returns the raw body; transfer-encoded parts
                # (base64, quoted-printable) would need get_payload(decode=True).
                file_content.append(part.get_payload())

        return ['\n'.join(file_content)]