Update Image PII Detection Code.
Xu Han committed Oct 17, 2023
1 parent 7f1ab4f commit 8ae8871
Showing 25 changed files with 1,900 additions and 0 deletions.
27 changes: 27 additions & 0 deletions src/containers/image-pii-detection/Dockerfile
@@ -0,0 +1,27 @@
FROM public.ecr.aws/lambda/python:3.9

ARG FUNCTION_DIR="/opt/ml/code/"
COPY requirements.txt ${FUNCTION_DIR}/requirements.txt
RUN python3.9 -m pip install -r ${FUNCTION_DIR}/requirements.txt

COPY main.py parser_factory.py ${FUNCTION_DIR}/
COPY parsers/ ${FUNCTION_DIR}/parsers/

ARG OCR_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/infer-ocr-model/standard"
ARG OCR_MODEL_VERSION="v1.0.0"
ARG FD_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/face-detection"
ARG FD_MODEL_VERSION="1.2.0"

RUN yum install -y wget
RUN mkdir -p ${FUNCTION_DIR}/ocr_model
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/classifier.onnx -O ${FUNCTION_DIR}/ocr_model/classifier.onnx
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/det_standard.onnx -O ${FUNCTION_DIR}/ocr_model/det_standard.onnx
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/keys_v1.txt -O ${FUNCTION_DIR}/ocr_model/keys_v1.txt
RUN wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/rec_standard.onnx -O ${FUNCTION_DIR}/ocr_model/rec_standard.onnx
RUN mkdir -p ${FUNCTION_DIR}/fd_model
RUN wget -c ${FD_MODEL_URL}/${FD_MODEL_VERSION}/det.onnx -O ${FUNCTION_DIR}/fd_model/det.onnx

WORKDIR ${FUNCTION_DIR}

# Command can be overwritten by providing a different command in the template directly.
ENTRYPOINT ["python"]
208 changes: 208 additions & 0 deletions src/containers/image-pii-detection/main.py
@@ -0,0 +1,208 @@
import json
import boto3
import pandas as pd
import argparse
import copy
import tempfile

from parser_factory import ParserFactory

def check_include_file_type(file_info, include_file_types):
    """
    Check if the file type is in the include_file_types list.

    :param file_info: file info dict (must contain 'file_type')
    :param include_file_types: list of file types to include
    """
    file_type = file_info['file_type']

    return file_type in include_file_types

def organize_table_info(table_name, result_bucket_name, original_bucket_name, file_info, columns, file_category):

    description = json.dumps(file_info, ensure_ascii=False)
    s3_location = f"s3://{result_bucket_name}/parser_results/{table_name}/"
    input_format = 'org.apache.hadoop.mapred.TextInputFormat'
    output_format = 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
    table_type = 'EXTERNAL_TABLE'
    serde_info = {'SerializationLibrary': 'org.apache.hadoop.hive.serde2.OpenCSVSerde',
                  'Parameters': {'field.delim': ','}}
    parameters = {'originalFileBucketName': original_bucket_name,
                  'originalFileType': file_info['file_type'],
                  'originalFilePath': file_info['file_path'],
                  'originalFileSample': ', '.join(file_info['sample_files'][:10]),
                  'originalFileCategory': file_category,
                  'Unstructured': 'true',
                  'classification': 'csv'}
    glue_table_columns = [{'Name': 'index', 'Type': 'string'}]
    for column in columns:
        glue_table_columns.append({'Name': column, 'Type': 'string'})

    glue_table_info = {
        'Name': table_name,
        'Description': description,
        'StorageDescriptor': {
            'Columns': glue_table_columns,
            'Location': s3_location,
            'InputFormat': input_format,
            'OutputFormat': output_format,
            'SerdeInfo': serde_info
        },
        'PartitionKeys': [],
        'TableType': table_type,
        'Parameters': parameters
    }
    return glue_table_info
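
For illustration, a hypothetical call (bucket, path, and column values below are made up, not from this commit) yields an external CSV table definition pointing at the parser-results prefix:

info = organize_table_info(
    table_name='my-data_test_images_human_faces',
    result_bucket_name='my-results',
    original_bucket_name='my-data',
    file_info={'file_type': '.jpeg', 'file_path': 'test_images/human_faces', 'sample_files': ['1']},
    columns=['1'],
    file_category='detection_files')
print(info['StorageDescriptor']['Location'])
# s3://my-results/parser_results/my-data_test_images_human_faces/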

def batch_process_files(s3_client, bucket_name, file_info, file_category):
    """
    Batch process files in a folder that share the same schema.

    :param s3_client: boto3 S3 client
    :param bucket_name: S3 bucket name
    :param file_info: file info dict
    :param file_category: one of 'detection_files', 'include_files', 'exclude_files'

    Sample file_info:
        {
            "file_type": ".jpeg",
            "file_path": "test_images/human_faces",
            "sample_files": [
                "1"
            ]
        }
    """
    file_contents = {}

    file_type = file_info['file_type']
    file_path = file_info['file_path']
    sample_files = file_info['sample_files']

    if file_category == 'detection_files':

        parser = ParserFactory.create_parser(file_type=file_type, s3_client=s3_client)

        for sample_file in sample_files:
            object_key = f"{file_path}/{sample_file}{file_type}"
            file_content = parser.load_content(bucket_name, object_key)
            file_contents[f"{sample_file}"] = file_content

    elif file_category == 'include_files':
        for sample_file in sample_files:
            file_contents[f"{sample_file}"] = ['This file is marked as Contains-PII.']

    elif file_category == 'exclude_files':
        for sample_file in sample_files:
            file_contents[f"{sample_file}"] = ['This file is marked as Non-PII.']

    return file_contents
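
As a quick sketch of the non-detection branches (values below are hypothetical), 'include_files' and 'exclude_files' never touch S3 and simply emit a fixed marker per sample file:

contents = batch_process_files(
    s3_client=None,  # unused on the 'exclude_files' branch
    bucket_name='my-data',
    file_info={'file_type': '.jpeg', 'file_path': 'test_images/faces', 'sample_files': ['1', '2']},
    file_category='exclude_files')
# -> {'1': ['This file is marked as Non-PII.'], '2': ['This file is marked as Non-PII.']}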

def process_file(parser, bucket_name, object_key):
    """
    Process a single file.
    """
    file_content = parser.load_content(bucket_name, object_key)

    json_format_content = {}
    json_format_content[f"{object_key}"] = file_content

    return json_format_content

def create_glue_table(glue_client, database_name, table_name, glue_table_info):

    # Update the table if it already exists, otherwise create it.
    try:
        glue_client.get_table(
            DatabaseName=database_name,
            Name=table_name
        )
        print(f"Table '{table_name}' exists in database '{database_name}'. Updating table...")
        response = glue_client.update_table(
            DatabaseName=database_name,
            TableInput=glue_table_info
        )
    except glue_client.exceptions.EntityNotFoundException:
        print(f"Table '{table_name}' does not exist in database '{database_name}'. Creating table...")
        response = glue_client.create_table(
            DatabaseName=database_name,
            TableInput=glue_table_info
        )

    print(response)

def main(param_dict):
    original_bucket_name = param_dict['SourceBucketName']
    crawler_result_bucket_name = param_dict['ResultBucketName']
    region_name = param_dict['RegionName']

    crawler_result_object_key = f"crawler_results/{original_bucket_name}_info.json"
    destination_database = f"SDPS-unstructured-{original_bucket_name}"

    s3_client = boto3.client('s3', region_name=region_name)
    glue_client = boto3.client('glue', region_name=region_name)

    # 1. Create a Glue database for the results
    try:
        glue_client.create_database(
            DatabaseInput={
                'Name': destination_database
            }
        )
    except glue_client.exceptions.AlreadyExistsException:
        print(f"Database '{destination_database}' already exists. Skipping database creation...")

    # 2. Download the crawler result from S3 and load it
    with tempfile.NamedTemporaryFile(mode='w') as temp:
        temp_file_path = temp.name
        s3_client.download_file(Bucket=crawler_result_bucket_name, Key=crawler_result_object_key, Filename=temp_file_path)
        with open(temp_file_path, 'r') as f:
            bucket_info = json.load(f)

    # 3. Batch process the files in each folder that share the same type
    original_file_bucket_name = bucket_info['bucket_name']
    for file_category in ['detection_files', 'include_files', 'exclude_files']:
        files = bucket_info[file_category]
        for file_path, file_info in files.items():
            print(f"Processing {file_path}...")
            file_contents = batch_process_files(s3_client, original_file_bucket_name, file_info, file_category)

            # convert file_contents to a dataframe (one column per sample file)
            df = pd.DataFrame.from_dict(file_contents, orient='index')
            df = df.transpose()
            columns = df.columns.tolist()

            # build a Glue-safe table name from the file path
            table_name = file_path.replace('/', '_')
            table_name = table_name.replace('.', '_')
            table_name = original_file_bucket_name + '_' + table_name

            # save to csv and upload to s3
            with tempfile.NamedTemporaryFile(mode='w') as temp:
                csv_file_path = temp.name
                df.to_csv(csv_file_path, header=False)
                s3_client.upload_file(csv_file_path, crawler_result_bucket_name, f"parser_results/{table_name}/result.csv")

            glue_table_info = organize_table_info(table_name, crawler_result_bucket_name, original_file_bucket_name, file_info, columns, file_category)
            create_glue_table(glue_client, destination_database, table_name, glue_table_info)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--SourceBucketName', type=str, default='icyxu-glue-assets-member-a',
                        help='bucket holding the source files to scan')
    parser.add_argument('--ResultBucketName', type=str, default='icyxu-glue-assets-member-a',
                        help='bucket holding crawler and parser results')
    parser.add_argument('--RegionName', type=str, default='us-west-2',
                        help='AWS region of the buckets and Glue catalog')

    args, _ = parser.parse_known_args()
    param_dict = copy.copy(vars(args))

    main(param_dict)
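
For reference, a local run might look like the following (bucket names and region are placeholders; credentials with S3 and Glue permissions are assumed):

python main.py --SourceBucketName my-source-bucket --ResultBucketName my-result-bucket --RegionName us-west-2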
20 changes: 20 additions & 0 deletions src/containers/image-pii-detection/parser_factory.py
@@ -0,0 +1,20 @@
from parsers import PdfParser, TxtParser, DocParser, HtmlParser, EmailParser, ImageParser

class ParserFactory:
    @staticmethod
    def create_parser(file_type, s3_client):
        if file_type in ['.pdf', '.PDF']:
            return PdfParser(s3_client=s3_client)
        elif file_type in ['.txt', '.TXT']:
            return TxtParser(s3_client=s3_client)
        elif file_type in ['.doc', '.docx', '.DOC', '.DOCX']:
            return DocParser(s3_client=s3_client)
        elif file_type in ['.html', '.htm', '.HTML', '.HTM']:
            return HtmlParser(s3_client=s3_client)
        elif file_type in ['.eml', '.EML']:
            return EmailParser(s3_client=s3_client)
        elif file_type in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
            return ImageParser(s3_client=s3_client, fd_model_path='./fd_model/',
                               ocr_model_path='./ocr_model/')
        else:
            raise ValueError(f'Unsupported file type: {file_type}')
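
A minimal usage sketch (bucket and key are hypothetical; load_content is the BaseParser method that main.py calls, which presumably downloads the object and dispatches to the parser's parse_file):

import boto3

s3 = boto3.client('s3')
parser = ParserFactory.create_parser(file_type='.png', s3_client=s3)
content = parser.load_content('my-bucket', 'images/photo.png')  # list of extracted text strings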
7 changes: 7 additions & 0 deletions src/containers/image-pii-detection/parsers/__init__.py
@@ -0,0 +1,7 @@
from .pdf_parser import PdfParser
from .txt_parser import TxtParser
from .doc_parser import DocParser
from .html_parser import HtmlParser
from .email_parser import EmailParser

from .image_parser import ImageParser
19 changes: 19 additions & 0 deletions src/containers/image-pii-detection/parsers/doc_parser.py
@@ -0,0 +1,19 @@

import docx
from .parser import BaseParser

class DocParser(BaseParser):
    def __init__(self, s3_client):
        super().__init__(s3_client=s3_client)

    def parse_file(self, doc_path):
        """
        Extracts text from a docx file and returns a list containing the content string.
        """
        # Note: python-docx reads .docx (OOXML) files only; legacy binary .doc
        # files routed here by ParserFactory would need a separate converter.
        doc = docx.Document(doc_path)
        file_content = ""
        for para in doc.paragraphs:
            file_content += para.text + "\n"

        return [file_content]
25 changes: 25 additions & 0 deletions src/containers/image-pii-detection/parsers/email_parser.py
@@ -0,0 +1,25 @@

from .parser import BaseParser
from email.parser import Parser as PyEmailParser

class EmailParser(BaseParser):
    def __init__(self, s3_client):
        super().__init__(s3_client=s3_client)

    def parse_file(self, eml_path):
        """
        Extracts text from an .eml file and returns a list containing the content string.
        """
        with open(eml_path) as stream:
            parser = PyEmailParser()
            message = parser.parse(stream)

        file_content = []
        for part in message.walk():
            if part.get_content_type().startswith('text/plain'):
                # get_payload() returns the raw body; transfer-encoded parts
                # (base64, quoted-printable) would need get_payload(decode=True).
                file_content.append(part.get_payload())

        return ['\n'.join(file_content)]