diff --git a/src/containers/image-pii-detection/Dockerfile b/src/containers/image-pii-detection/Dockerfile
new file mode 100644
index 00000000..3e08eb7b
--- /dev/null
+++ b/src/containers/image-pii-detection/Dockerfile
@@ -0,0 +1,27 @@
+FROM public.ecr.aws/lambda/python:3.9
+
+ARG FUNCTION_DIR="/opt/ml/code/"
+COPY requirements.txt ${FUNCTION_DIR}/requirements.txt
+RUN python3.9 -m pip install -r ${FUNCTION_DIR}/requirements.txt
+
+COPY main.py parser_factory.py ${FUNCTION_DIR}/
+COPY parsers/ ${FUNCTION_DIR}/parsers/
+
+ARG OCR_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/infer-ocr-model/standard"
+ARG OCR_MODEL_VERSION="v1.0.0"
+ARG FD_MODEL_URL="https://aws-gcr-solutions-assets.s3.cn-northwest-1.amazonaws.com.cn/ai-solution-kit/face-detection"
+ARG FD_MODEL_VERSION="1.2.0"
+
+# Fetch the OCR and face-detection ONNX models in single layers to keep the image small.
+RUN yum install -y wget && yum clean all
+RUN mkdir -p ${FUNCTION_DIR}/ocr_model && \
+    wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/classifier.onnx -O ${FUNCTION_DIR}/ocr_model/classifier.onnx && \
+    wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/det_standard.onnx -O ${FUNCTION_DIR}/ocr_model/det_standard.onnx && \
+    wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/keys_v1.txt -O ${FUNCTION_DIR}/ocr_model/keys_v1.txt && \
+    wget -c $OCR_MODEL_URL/$OCR_MODEL_VERSION/rec_standard.onnx -O ${FUNCTION_DIR}/ocr_model/rec_standard.onnx
+RUN mkdir -p ${FUNCTION_DIR}/fd_model && \
+    wget -c ${FD_MODEL_URL}/${FD_MODEL_VERSION}/det.onnx -O ${FUNCTION_DIR}/fd_model/det.onnx
+
+WORKDIR ${FUNCTION_DIR}
+
+# Command can be overwritten by providing a different command in the template directly.
+ENTRYPOINT ["python"]
diff --git a/src/containers/image-pii-detection/__init__.py b/src/containers/image-pii-detection/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/containers/image-pii-detection/main.py b/src/containers/image-pii-detection/main.py
new file mode 100644
index 00000000..ebc79204
--- /dev/null
+++ b/src/containers/image-pii-detection/main.py
@@ -0,0 +1,208 @@
+import json
+import boto3
+import pandas as pd
+import argparse
+import copy
+import tempfile
+
+from parser_factory import ParserFactory
+
+def check_include_file_type(file_info, include_file_types):
+    """
+    Check if the file type is included in the include_file_types list.
+ + :param file_info: file info + :param include_file_types: list of file types to include + + """ + file_type = file_info['file_type'] + + if file_type in include_file_types: + return True + else: + return False + +def organize_table_info(table_name, result_bucket_name, original_bucket_name, file_info, columns, file_category): + + description = json.dumps(file_info, ensure_ascii=False) + s3_location = f"s3://{result_bucket_name}/parser_results/{table_name}/" + input_format = 'org.apache.hadoop.mapred.TextInputFormat' + output_format = 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' + table_type = 'EXTERNAL_TABLE' + serde_info = {'SerializationLibrary': 'org.apache.hadoop.hive.serde2.OpenCSVSerde', + 'Parameters': {'field.delim': ','}} + parameters = {'originalFileBucketName': original_bucket_name, + 'originalFileType': file_info['file_type'], + 'originalFilePath': file_info['file_path'], + 'originalFileSample': ', '.join(file_info['sample_files'][:10]), + 'originalFileCategory': file_category, + 'Unstructured': 'true', + 'classification': 'csv'} + glue_table_columns = [{'Name': 'index', 'Type': 'string'}] + for column in columns: + glue_table_columns.append({'Name': column, 'Type': 'string'}) + + glue_table_info = { + 'Name': table_name, + 'Description': description, + 'StorageDescriptor': { + 'Columns': glue_table_columns, + 'Location': s3_location, + 'InputFormat': input_format, + 'OutputFormat': output_format, + 'SerdeInfo': serde_info + }, + 'PartitionKeys': [], + 'TableType': table_type, + 'Parameters': parameters + } + return glue_table_info + +def batch_process_files(s3_client, bucket_name, file_info, file_category): + """ + Batch process files in a folder with the same schema. + + :param bucket_name: S3 bucket name + :param file_info: file info + + Sample file_info: + { + "file_type": ".jpeg", + "file_path": "test_images/human_faces", + "sample_files": [ + "1" + ] + } + + """ + file_contents = {} + + file_type = file_info['file_type'] + file_path = file_info['file_path'] + sample_files = file_info['sample_files'] + + if file_category == 'detection_files': + + parser = ParserFactory.create_parser(file_type=file_type, s3_client=s3_client) + + for sample_file in sample_files: + object_key = f"{file_path}/{sample_file}{file_type}" + file_content = parser.load_content(bucket_name, object_key) + file_contents[f"{sample_file}"] = file_content + + elif file_category == 'include_files': + for sample_file in sample_files: + file_contents[f"{sample_file}"] = ['This file is marked as Contains-PII.'] + + elif file_category == 'exclude_files': + for sample_file in sample_files: + file_contents[f"{sample_file}"] = ['This file is marked as Non-PII.'] + + return file_contents + +def process_file(parser, bucket_name, object_key): + """ + Process a single file. + """ + file_content = parser.load_content(bucket_name, object_key) + + json_format_content = {} + json_format_content[f"{object_key}"] = file_content + + return json_format_content + +def create_glue_table(glue_client, database_name, table_name, glue_table_info): + + # Check if table exists + try: + response = glue_client.get_table( + DatabaseName=database_name, + Name=table_name + ) + print(f"Table '{table_name}' exists in database '{database_name}'. Updating table...") + response = glue_client.update_table( + DatabaseName=database_name, + TableInput=glue_table_info + ) + except glue_client.exceptions.EntityNotFoundException: + print(f"Table '{table_name}' does not exist in database '{database_name}'. 
Creating table...")
+        response = glue_client.create_table(
+            DatabaseName=database_name,
+            TableInput=glue_table_info
+        )
+
+    print(response)
+
+def main(param_dict):
+    original_bucket_name = param_dict['SourceBucketName']
+    crawler_result_bucket_name = param_dict['ResultBucketName']
+    region_name = param_dict['RegionName']
+
+    crawler_result_object_key = f"crawler_results/{original_bucket_name}_info.json"
+    destination_database = f"SDPS-unstructured-{original_bucket_name}"
+
+    s3_client = boto3.client('s3', region_name=region_name)
+    glue_client = boto3.client('glue', region_name=region_name)
+
+    # 1. Create a Glue database
+    try:
+        response = glue_client.create_database(
+            DatabaseInput={
+                'Name': destination_database
+            }
+        )
+    except glue_client.exceptions.AlreadyExistsException:
+        print(f"Database '{destination_database}' already exists. Skipping database creation...")
+
+    # 2. Download the crawler result from S3 and load it
+    with tempfile.NamedTemporaryFile(mode='w') as temp:
+        temp_file_path = temp.name
+        s3_client.download_file(Bucket=crawler_result_bucket_name, Key=crawler_result_object_key, Filename=temp_file_path)
+        with open(temp_file_path, 'r') as f:
+            bucket_info = json.load(f)
+
+    # 3. Batch process files in the same folder with the same type
+    original_file_bucket_name = bucket_info['bucket_name']
+    for file_category in ['detection_files', 'include_files', 'exclude_files']:
+        files = bucket_info[file_category]
+        for file_path, file_info in files.items():
+            print(f"Processing {file_path}...")
+            file_contents = batch_process_files(s3_client, original_file_bucket_name, file_info, file_category)
+
+            # convert file_contents to a dataframe, one column per sample file
+            df = pd.DataFrame.from_dict(file_contents, orient='index')
+            df = df.transpose()
+            columns = df.columns.tolist()
+
+            # derive a unique Glue table name from the bucket name and file path
+            table_name = file_path.replace('/', '_')
+            table_name = table_name.replace('.', '_')
+            table_name = original_file_bucket_name + '_' + table_name
+
+            # save to csv and upload to s3
+            with tempfile.NamedTemporaryFile(mode='w') as temp:
+                csv_file_path = temp.name
+                df.to_csv(csv_file_path, header=False)
+                s3_client.upload_file(csv_file_path, crawler_result_bucket_name, f"parser_results/{table_name}/result.csv")
+
+            glue_table_info = organize_table_info(table_name, crawler_result_bucket_name, original_file_bucket_name, file_info, columns, file_category)
+            create_glue_table(glue_client, destination_database, table_name, glue_table_info)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
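+    # the CLI arguments below map one-to-one onto the keys read by main()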
+    parser.add_argument('--SourceBucketName', type=str, default='icyxu-glue-assets-member-a',
+                        help='bucket that holds the original files to scan')
+    parser.add_argument('--ResultBucketName', type=str, default='icyxu-glue-assets-member-a',
+                        help='bucket that stores crawler and parser results')
+    parser.add_argument('--RegionName', type=str, default='us-west-2',
+                        help='AWS region of the buckets')
+
+    args, _ = parser.parse_known_args()
+    param_dict = copy.copy(vars(args))
+
+    main(param_dict)
diff --git a/src/containers/image-pii-detection/parser_factory.py b/src/containers/image-pii-detection/parser_factory.py
new file mode 100644
index 00000000..750d9559
--- /dev/null
+++ b/src/containers/image-pii-detection/parser_factory.py
@@ -0,0 +1,20 @@
+from parsers import PdfParser, TxtParser, DocParser, HtmlParser, EmailParser, ImageParser
+
+class ParserFactory:
+    @staticmethod
+    def create_parser(file_type, s3_client):
+        if file_type in ['.pdf', '.PDF']:
+            return PdfParser(s3_client=s3_client)
+        elif file_type in ['.txt', '.TXT']:
+            return TxtParser(s3_client=s3_client)
+        elif file_type in ['.doc', '.docx', '.DOC', '.DOCX']:
+            return DocParser(s3_client=s3_client)
+        elif file_type in ['.html', '.htm', '.HTML', '.HTM']:
+            return HtmlParser(s3_client=s3_client)
+        elif file_type in ['.eml', '.EML']:
+            return EmailParser(s3_client=s3_client)
+        elif file_type in ['.jpg', '.jpeg', '.png', '.JPG', '.JPEG', '.PNG']:
+            return ImageParser(s3_client=s3_client, fd_model_path='./fd_model/',
+                               ocr_model_path='./ocr_model/')
+        else:
+            raise ValueError(f'Unsupported file type: {file_type}')
\ No newline at end of file
diff --git a/src/containers/image-pii-detection/parsers/__init__.py b/src/containers/image-pii-detection/parsers/__init__.py
new file mode 100644
index 00000000..948b5962
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/__init__.py
@@ -0,0 +1,7 @@
+from .pdf_parser import PdfParser
+from .txt_parser import TxtParser
+from .doc_parser import DocParser
+from .html_parser import HtmlParser
+from .email_parser import EmailParser
+
+from .image_parser import ImageParser
\ No newline at end of file
diff --git a/src/containers/image-pii-detection/parsers/doc_parser.py b/src/containers/image-pii-detection/parsers/doc_parser.py
new file mode 100644
index 00000000..a0005470
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/doc_parser.py
@@ -0,0 +1,19 @@
+
+import docx
+from .parser import BaseParser
+
+class DocParser(BaseParser):
+    def __init__(self, s3_client):
+        super().__init__(s3_client=s3_client)
+
+    def parse_file(self, doc_path):
+        """
+        Extracts text from a .docx file and returns it as a single-item list.
+        Note: python-docx reads .docx only; legacy binary .doc files are not supported.
+        """
+
+        doc = docx.Document(doc_path)
+        file_content = ""
+        for para in doc.paragraphs:
+            file_content += para.text + "\n"
+
+        return [file_content]
diff --git a/src/containers/image-pii-detection/parsers/email_parser.py b/src/containers/image-pii-detection/parsers/email_parser.py
new file mode 100644
index 00000000..9f192aa7
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/email_parser.py
@@ -0,0 +1,25 @@
+
+from .parser import BaseParser
+from email.parser import Parser as PyEmailParser
+
+class EmailParser(BaseParser):
+    def __init__(self, s3_client):
+        super().__init__(s3_client=s3_client)
+
+
+    def parse_file(self, eml_path):
+        """
+        Extracts text from an .eml file and returns its plain-text content.
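+        Only text/plain parts of the message are collected.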
+        """
+
+        with open(eml_path) as stream:
+            parser = PyEmailParser()
+            message = parser.parse(stream)
+
+        file_content = []
+        for part in message.walk():
+            if part.get_content_type().startswith('text/plain'):
+                file_content.append(part.get_payload())
+
+        return ['\n'.join(file_content)]
diff --git a/src/containers/image-pii-detection/parsers/html_parser.py b/src/containers/image-pii-detection/parsers/html_parser.py
new file mode 100644
index 00000000..0c2b2989
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/html_parser.py
@@ -0,0 +1,152 @@
+
+import re
+import six
+
+from bs4 import BeautifulSoup
+
+from .parser import BaseParser
+
+class HtmlParser(BaseParser):
+    def __init__(self, s3_client):
+        super().__init__(s3_client=s3_client)
+        # additional HtmlParser constructor code here
+
+    def parse_file(self, html_path):
+        """
+        Extracts text from an HTML file and returns a string of content.
+        """
+
+        with open(html_path, "rb") as stream:
+            soup = BeautifulSoup(stream, 'lxml')
+
+        # Convert tables to ASCII ones
+        soup = self._replace_tables(soup)
+
+        # Join inline elements
+        soup = self._join_inlines(soup)
+
+        # Collect the visible text
+        html = ''
+        elements = soup.find_all(True)
+        elements = [el for el in elements if self._visible(el)]
+        for elem in elements:
+            string = elem.string
+            if string is None:
+                string = self._find_any_text(elem)
+            string = string.strip()
+            if len(string) > 0:
+                html += "\n" + string + "\n"
+        return [html]
+
+    _disallowed_names = [
+        'style', 'script', '[document]', 'head', 'title', 'html', 'meta',
+        'link', 'body',
+    ]
+
+    _inline_tags = [
+        'b', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'cite', 'code',
+        'dfn', 'em', 'kbd', 'strong', 'samp', 'var', 'a', 'bdo', 'br', 'img',
+        'map', 'object', 'q', 'script', 'span', 'sub', 'sup', 'button',
+        'input', 'label', 'select', 'textarea',
+    ]
+
+    def _visible(self, element):
+        """Used to filter out elements that carry no visible text on the page.
+        """
+        if element.name in self._disallowed_names:
+            return False
+        elif re.match(u'<!--.*-->', six.text_type(element.extract())):
+            return False
+        return True
+
+    def _inline(self, element):
+        """Used to check whether the given element can be treated as an inline
+        element (without a new line after).
+        """
+        return element.name in self._inline_tags
+
+    def _find_any_text(self, tag):
+        """Looks for any possible text within the given tag.
+        """
+        text = ''
+        if tag is not None:
+            text = six.text_type(tag)
+            text = re.sub(r'(<[^>]+>)', '', text)
+            text = re.sub(r'\s', ' ', text)
+            text = text.strip()
+        return text
+
+    def _parse_tables(self, soup):
+        """Returns an array containing basic information about tables for the ASCII
+        replacement (see _replace_tables()).
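+        Each entry records the table tag, its rows, and the widest cell seen per column.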
+        """
+        tables = []
+        for t in soup.find_all('table'):
+            t_dict = {'width': 0, 'table': t, 'trs': [], 'col_width': {}}
+            trs = t.find_all('tr')
+            if len(trs) > 0:
+                for tr in trs:
+                    tr_dict = []
+                    tds = tr.find_all('th') + tr.find_all('td')
+                    if len(tds) > 0:
+                        for i, td in enumerate(tds):
+                            td_text = self._find_any_text(td)
+                            length = len(td_text)
+                            if i in t_dict['col_width']:
+                                t_dict['col_width'][i] = max(
+                                    length,
+                                    t_dict['col_width'][i]
+                                )
+                            else:
+                                t_dict['col_width'][i] = length
+                            tr_dict.append({
+                                'text': td_text,
+                                'colspan': int(td.get('colspan', 1)),
+                            })
+                    t_dict['trs'].append(tr_dict)
+            for col in t_dict['col_width']:
+                t_dict['width'] += t_dict['col_width'][col]
+            tables.append(t_dict)
+        return tables
+
+    def _replace_tables(self, soup, v_separator=' | ', h_separator='-'):
+        """Replaces <table> elements with their ASCII equivalent.
+        """
+        tables = self._parse_tables(soup)
+        v_sep_len = len(v_separator)
+        v_left_sep = v_separator.lstrip()
+        for t in tables:
+            html = ''
+            trs = t['trs']
+            h_length = 1 + (v_sep_len * len(t['col_width'])) + t['width']
+            head_foot = (h_separator * h_length) + "\n"
+            html += head_foot
+            for tr in trs:
+                html += v_left_sep
+                for i, td in enumerate(tr):
+                    text = td['text']
+                    col_width = t['col_width'][i] + v_sep_len
+                    if td['colspan'] > 1:
+                        # a cell spanning several columns absorbs their widths too
+                        for j in range(1, td['colspan']):
+                            if (i+j) < len(t['col_width']):
+                                col_width += t['col_width'][i+j] + v_sep_len
+                    html += ('%' + str(col_width) + 's') % (text + v_separator)
+                html += "\n"
+            html += head_foot
+            new_table = soup.new_tag('div')
+            new_table.string = html
+            t['table'].replace_with(new_table)
+        return soup
+
+    def _join_inlines(self, soup):
+        """Unwraps inline elements defined in self._inline_tags.
+        """
+        elements = soup.find_all(True)
+        for elem in elements:
+            if self._inline(elem):
+                elem.unwrap()
+        return soup
+
diff --git a/src/containers/image-pii-detection/parsers/image_analysis/__init__.py b/src/containers/image-pii-detection/parsers/image_analysis/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/containers/image-pii-detection/parsers/image_analysis/face_detection/__init__.py b/src/containers/image-pii-detection/parsers/image_analysis/face_detection/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/containers/image-pii-detection/parsers/image_analysis/face_detection/face_detection_main.py b/src/containers/image-pii-detection/parsers/image_analysis/face_detection/face_detection_main.py
new file mode 100644
index 00000000..19960c8b
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/image_analysis/face_detection/face_detection_main.py
@@ -0,0 +1,281 @@
+
+import numpy as np
+import onnxruntime
+import os.path as osp
+import cv2
+
+cuda_available = False
+
+def distance2bbox(points, distance, max_shape=None):
+    """Decode distance prediction to bounding box.
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        distance (Tensor): Distance from the given point to 4
+            boundaries (left, top, right, bottom).
+        max_shape (tuple): Shape of the image.
+    Returns:
+        Tensor: Decoded bboxes.
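+    Coordinates are clipped to max_shape when it is provided.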
+    """
+    x1 = points[:, 0] - distance[:, 0]
+    y1 = points[:, 1] - distance[:, 1]
+    x2 = points[:, 0] + distance[:, 2]
+    y2 = points[:, 1] + distance[:, 3]
+    if max_shape is not None:
+        # numpy arrays have no .clamp(); clip to the image bounds instead
+        x1 = np.clip(x1, 0, max_shape[1])
+        y1 = np.clip(y1, 0, max_shape[0])
+        x2 = np.clip(x2, 0, max_shape[1])
+        y2 = np.clip(y2, 0, max_shape[0])
+    return np.stack([x1, y1, x2, y2], axis=-1)
+
+def distance2kps(points, distance, max_shape=None):
+    """Decode distance prediction to keypoints.
+    Args:
+        points (Tensor): Shape (n, 2), [x, y].
+        distance (Tensor): Distance from the given point to each
+            predicted keypoint.
+        max_shape (tuple): Shape of the image.
+    Returns:
+        Tensor: Decoded keypoints.
+    """
+    preds = []
+    for i in range(0, distance.shape[1], 2):
+        px = points[:, i%2] + distance[:, i]
+        py = points[:, i%2+1] + distance[:, i+1]
+        if max_shape is not None:
+            px = np.clip(px, 0, max_shape[1])
+            py = np.clip(py, 0, max_shape[0])
+        preds.append(px)
+        preds.append(py)
+    return np.stack(preds, axis=-1)
+
+class SCRFD:
+    def __init__(self, model_file=None, session=None):
+        self.model_file = model_file
+        self.session = session
+        self.taskname = 'detection'
+        self.batched = False
+        if self.session is None:
+            assert self.model_file is not None
+            assert osp.exists(self.model_file)
+            self.session = onnxruntime.InferenceSession(self.model_file, providers=['CUDAExecutionProvider'] if cuda_available else ['CPUExecutionProvider'])
+        self.center_cache = {}
+        self.nms_thresh = 0.4
+        self.det_thresh = 0.5
+        self._init_vars()
+
+    def _init_vars(self):
+        input_cfg = self.session.get_inputs()[0]
+        input_shape = input_cfg.shape
+        if isinstance(input_shape[2], str):
+            self.input_size = None
+        else:
+            self.input_size = tuple(input_shape[2:4][::-1])
+        # force a fixed detector input size for this deployment
+        self.input_size = (736, 736)
+        input_name = input_cfg.name
+        self.input_shape = input_shape
+        outputs = self.session.get_outputs()
+        if len(outputs[0].shape) == 3:
+            self.batched = True
+        output_names = []
+        for o in outputs:
+            output_names.append(o.name)
+        self.input_name = input_name
+        self.output_names = output_names
+        self.input_mean = 127.5
+        self.input_std = 128.0
+        # the number of output heads determines the FPN strides, the anchor
+        # count, and whether the model predicts keypoints
+        self.use_kps = False
+        self._anchor_ratio = 1.0
+        self._num_anchors = 1
+        if len(outputs)==6:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+        elif len(outputs)==9:
+            self.fmc = 3
+            self._feat_stride_fpn = [8, 16, 32]
+            self._num_anchors = 2
+            self.use_kps = True
+        elif len(outputs)==10:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+        elif len(outputs)==15:
+            self.fmc = 5
+            self._feat_stride_fpn = [8, 16, 32, 64, 128]
+            self._num_anchors = 1
+            self.use_kps = True
+
+    def prepare(self, ctx_id, **kwargs):
+        if ctx_id<0:
+            self.session.set_providers(['CPUExecutionProvider'])
+        nms_thresh = kwargs.get('nms_thresh', None)
+        if nms_thresh is not None:
+            self.nms_thresh = nms_thresh
+        det_thresh = kwargs.get('det_thresh', None)
+        if det_thresh is not None:
+            self.det_thresh = det_thresh
+        input_size = kwargs.get('input_size', None)
+        if input_size is not None:
+            if self.input_size is not None:
+                print('warning: det_size is already set in scrfd model, ignore')
+            else:
+                self.input_size = input_size
+
+    def forward(self, img, threshold):
+        scores_list = []
+        bboxes_list = []
+        kpss_list = []
+        input_size = tuple(img.shape[0:2][::-1])
+        blob = cv2.dnn.blobFromImage(img,
1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + net_outs = self.session.run(self.output_names, {self.input_name : blob}) + + input_height = blob.shape[2] + input_width = blob.shape[3] + fmc = self.fmc + for idx, stride in enumerate(self._feat_stride_fpn): + # If model support batch dim, take first output + if self.batched: + scores = net_outs[idx][0] + bbox_preds = net_outs[idx + fmc][0] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2][0] * stride + # If model doesn't support batching take output as is + else: + scores = net_outs[idx] + bbox_preds = net_outs[idx + fmc] + bbox_preds = bbox_preds * stride + if self.use_kps: + kps_preds = net_outs[idx + fmc * 2] * stride + height = input_height // stride + width = input_width // stride + K = height * width + key = (height, width, stride) + if key in self.center_cache: + anchor_centers = self.center_cache[key] + else: + #solution-1, c style: + #anchor_centers = np.zeros( (height, width, 2), dtype=np.float32 ) + #for i in range(height): + # anchor_centers[i, :, 1] = i + #for i in range(width): + # anchor_centers[:, i, 0] = i + + #solution-2: + #ax = np.arange(width, dtype=np.float32) + #ay = np.arange(height, dtype=np.float32) + #xv, yv = np.meshgrid(np.arange(width), np.arange(height)) + #anchor_centers = np.stack([xv, yv], axis=-1).astype(np.float32) + + #solution-3: + anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32) + #print(anchor_centers.shape) + + anchor_centers = (anchor_centers * stride).reshape( (-1, 2) ) + if self._num_anchors>1: + anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) ) + if len(self.center_cache)<100: + self.center_cache[key] = anchor_centers + + pos_inds = np.where(scores>=threshold)[0] + bboxes = distance2bbox(anchor_centers, bbox_preds) + pos_scores = scores[pos_inds] + pos_bboxes = bboxes[pos_inds] + scores_list.append(pos_scores) + bboxes_list.append(pos_bboxes) + #print(anchor_centers.shape, kps_preds.shape) + if self.use_kps: + kpss = distance2kps(anchor_centers, kps_preds) + #kpss = kps_preds + kpss = kpss.reshape( (kpss.shape[0], -1, 2) ) + pos_kpss = kpss[pos_inds] + kpss_list.append(pos_kpss) + return scores_list, bboxes_list, kpss_list + + def detect(self, img, input_size = None, max_num=0, metric='default'): + assert input_size is not None or self.input_size is not None + input_size = self.input_size if input_size is None else input_size + + im_ratio = float(img.shape[0]) / img.shape[1] + model_ratio = float(input_size[1]) / input_size[0] + if im_ratio>model_ratio: + new_height = input_size[1] + new_width = int(new_height / im_ratio) + else: + new_width = input_size[0] + new_height = int(new_width * im_ratio) + det_scale = float(new_height) / img.shape[0] + resized_img = cv2.resize(img, (new_width, new_height)) + det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 ) + det_img[:new_height, :new_width, :] = resized_img + + scores_list, bboxes_list, kpss_list = self.forward(det_img, self.det_thresh) + + scores = np.vstack(scores_list) + scores_ravel = scores.ravel() + order = scores_ravel.argsort()[::-1] + bboxes = np.vstack(bboxes_list) / det_scale + if self.use_kps: + kpss = np.vstack(kpss_list) / det_scale + pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False) + pre_det = pre_det[order, :] + keep = self.nms(pre_det) + det = pre_det[keep, :] + if self.use_kps: + kpss = kpss[order,:,:] + kpss = kpss[keep,:,:] + 
else: + kpss = None + if max_num > 0 and det.shape[0] > max_num: + area = (det[:, 2] - det[:, 0]) * (det[:, 3] - + det[:, 1]) + img_center = img.shape[0] // 2, img.shape[1] // 2 + offsets = np.vstack([ + (det[:, 0] + det[:, 2]) / 2 - img_center[1], + (det[:, 1] + det[:, 3]) / 2 - img_center[0] + ]) + offset_dist_squared = np.sum(np.power(offsets, 2.0), 0) + if metric=='max': + values = area + else: + values = area - offset_dist_squared * 2.0 # some extra weight on the centering + bindex = np.argsort( + values)[::-1] # some extra weight on the centering + bindex = bindex[0:max_num] + det = det[bindex, :] + if kpss is not None: + kpss = kpss[bindex, :] + return det, kpss + + def nms(self, dets): + thresh = self.nms_thresh + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep \ No newline at end of file diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/__init__.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/imaug/__init__.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/imaug/__init__.py new file mode 100644 index 00000000..2751a0d4 --- /dev/null +++ b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/imaug/__init__.py @@ -0,0 +1,35 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +from .operators import * + +def transform(data, ops=None): + """ transform """ + if ops is None: + ops = [] + for op in ops: + data = op(data) + if data is None: + return None + return data + + +def create_operators(op_param_list, global_config=None): + """ + create operators based on the config + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(op_param_list, list), ('operator config should be a list') + ops = [] + for operator in op_param_list: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + if global_config is not None: + param.update(global_config) + op = eval(op_name)(**param) + ops.append(op) + return ops \ No newline at end of file diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/imaug/operators.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/imaug/operators.py new file mode 100644 index 00000000..93a8eabe --- /dev/null +++ b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/imaug/operators.py @@ -0,0 +1,209 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import sys +import six +import cv2 +import numpy as np + + +class 
DecodeImage(object): + """ decode image """ + + def __init__(self, img_mode='RGB', channel_first=False, **kwargs): + self.img_mode = img_mode + self.channel_first = channel_first + + def __call__(self, data): + img = data['image'] + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + img = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(img, 1) + if img is None: + return None + if self.img_mode == 'GRAY': + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif self.img_mode == 'RGB': + assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + data['image'] = img + return data + + +class NormalizeImage(object): + """ normalize image such as substract mean, divide std + """ + + def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs): + if isinstance(scale, str): + scale = eval(scale) + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + data['image'] = ( + img.astype('float32') * self.scale - self.mean) / self.std + return data + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self, **kwargs): + pass + + def __call__(self, data): + img = data['image'] + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + data['image'] = img.transpose((2, 0, 1)) + return data + + +class KeepKeys(object): + def __init__(self, keep_keys, **kwargs): + self.keep_keys = keep_keys + + def __call__(self, data): + data_list = [] + for key in self.keep_keys: + data_list.append(data[key]) + return data_list + + +class DetResizeForTest(object): + def __init__(self, **kwargs): + super(DetResizeForTest, self).__init__() + self.resize_type = 0 + if 'image_shape' in kwargs: + self.image_shape = kwargs['image_shape'] + self.resize_type = 1 + elif 'limit_side_len' in kwargs: + self.limit_side_len = kwargs['limit_side_len'] + self.limit_type = kwargs.get('limit_type', 'min') + elif 'resize_long' in kwargs: + self.resize_type = 2 + self.resize_long = kwargs.get('resize_long', 960) + else: + self.limit_side_len = 736 + self.limit_type = 'min' + + def __call__(self, data): + img = data['image'] + src_h, src_w, _ = img.shape + + if self.resize_type == 0: + # img, shape = self.resize_image_type0(img) + img, [ratio_h, ratio_w] = self.resize_image_type0(img) + elif self.resize_type == 2: + img, [ratio_h, ratio_w] = self.resize_image_type2(img) + else: + # img, shape = self.resize_image_type1(img) + img, [ratio_h, ratio_w] = self.resize_image_type1(img) + data['image'] = img + data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w]) + return data + + def resize_image_type1(self, img): + resize_h, resize_w = self.image_shape + ori_h, ori_w = img.shape[:2] # (h, w, c) + ratio_h = float(resize_h) / ori_h + ratio_w = float(resize_w) / ori_w + img = cv2.resize(img, (int(resize_w), 
int(resize_h))) + # return img, np.array([ori_h, ori_w]) + return img, [ratio_h, ratio_w] + + def resize_image_type0(self, img): + """ + resize image to a size multiple of 32 which is required by the network + args: + img(array): array with shape [h, w, c] + return(tuple): + img, (ratio_h, ratio_w) + """ + limit_side_len = self.limit_side_len + h, w, _ = img.shape + + # limit the max side + if self.limit_type == 'max': + if max(h, w) > limit_side_len: + if h > w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + else: + if min(h, w) < limit_side_len: + if h < w: + ratio = float(limit_side_len) / h + else: + ratio = float(limit_side_len) / w + else: + ratio = 1. + resize_h = int(h * ratio) + resize_w = int(w * ratio) + + resize_h = int(round(resize_h / 32) * 32) + resize_w = int(round(resize_w / 32) * 32) + + try: + if int(resize_w) <= 0 or int(resize_h) <= 0: + return None, (None, None) + img = cv2.resize(img, (int(resize_w), int(resize_h))) + except: + print(img.shape, resize_w, resize_h) + sys.exit(0) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + # return img, np.array([h, w]) + return img, [ratio_h, ratio_w] + + def resize_image_type2(self, img): + h, w, _ = img.shape + + resize_w = w + resize_h = h + + # Fix the longer side + if resize_h > resize_w: + ratio = float(self.resize_long) / resize_h + else: + ratio = float(self.resize_long) / resize_w + + resize_h = int(resize_h * ratio) + resize_w = int(resize_w * ratio) + + max_stride = 128 + resize_h = (resize_h + max_stride - 1) // max_stride * max_stride + resize_w = (resize_w + max_stride - 1) // max_stride * max_stride + img = cv2.resize(img, (int(resize_w), int(resize_h))) + ratio_h = resize_h / float(h) + ratio_w = resize_w / float(w) + + return img, [ratio_h, ratio_w] \ No newline at end of file diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/ocr_main.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/ocr_main.py new file mode 100644 index 00000000..1005d476 --- /dev/null +++ b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/ocr_main.py @@ -0,0 +1,386 @@ +import copy +import math +import time +import os + +import numpy as np +import onnxruntime +from PIL import Image +import cv2 + +from .imaug import create_operators, transform +from .postprocess import build_post_process + +cuda_available = False + +def sorted_boxes(dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and ( + _boxes[i + 1][0][0] < _boxes[i][0][0] + ): + tmp = _boxes[i] + _boxes[i] = _boxes[i + 1] + _boxes[i + 1] = tmp + return _boxes + +class TextClassifier(): + def __init__(self, model_path): + self.weights_path = model_path + 'classifier.onnx' + + self.cls_image_shape = [3, 48, 192] + self.cls_batch_num = 30 + self.cls_thresh = 0.9 + self.use_zero_copy_run = False + postprocess_params = { + 'name': 'ClsPostProcess', + "label_list": ['0', '180'], + } + self.postprocess_op = build_post_process(postprocess_params) + + self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=['CUDAExecutionProvider'] if cuda_available else 
['CPUExecutionProvider']) + + def resize_norm_img(self, img): + imgC, imgH, imgW = self.cls_image_shape + h = img.shape[0] + w = img.shape[1] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = np.array(Image.fromarray(img).resize((resized_w, imgH))) + #resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + if self.cls_image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def __call__(self, img_list): + img_list = copy.deepcopy(img_list) + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting can speed up the cls process + indices = np.argsort(np.array(width_list)) + + cls_res = [['', 0.0]] * img_num + batch_num = self.cls_batch_num + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + max_wh_ratio = 0 + for ino in range(beg_img_no, end_img_no): + h, w = img_list[indices[ino]].shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for ino in range(beg_img_no, end_img_no): + norm_img = self.resize_norm_img(img_list[indices[ino]]) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + starttime = time.time() + ort_inputs = {self.ort_session.get_inputs()[0].name: norm_img_batch} + prob_out = self.ort_session.run(None, ort_inputs)[0] + cls_result = self.postprocess_op(prob_out) + for rno in range(len(cls_result)): + label, score = cls_result[rno] + cls_res[indices[beg_img_no + rno]] = [label, score] + if '180' in label and score > self.cls_thresh: + img_list[indices[beg_img_no + rno]] = np.array(Image.fromarray(img_list[indices[beg_img_no + rno]]).transpose(Image.ROTATE_180)) + return img_list, cls_res + +class TextDetector(): + def __init__(self, model_path): + self.weights_path = model_path + 'det_standard.onnx' + + self.det_algorithm = 'DB' + self.use_zero_copy_run = False + + pre_process_list = [{ + 'DetResizeForTest': { + 'limit_side_len': 960, + 'limit_type': 'max' + } + }, { + 'NormalizeImage': { + 'std': [0.229, 0.224, 0.225], + 'mean': [0.485, 0.456, 0.406], + 'scale': '1./255.', + 'order': 'hwc' + } + }, { + 'ToCHWImage': None + }, { + 'KeepKeys': { + 'keep_keys': ['image', 'shape'] + } + }] + + postprocess_params = {} + postprocess_params['name'] = 'DBPostProcess' + postprocess_params["thresh"] = 0.3 + postprocess_params["box_thresh"] = 0.3 + postprocess_params["max_candidates"] = 1000 + postprocess_params["unclip_ratio"] = 1.6 + postprocess_params["use_dilation"] = True + self.preprocess_op = create_operators(pre_process_list) + self.postprocess_op = build_post_process(postprocess_params) + self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=['CUDAExecutionProvider'] if cuda_available else ['CPUExecutionProvider']) + _ = self.ort_session.run(None, {"backbone": np.zeros([1, 3, 64, 64], dtype='float32')}) + + # load_pytorch_weights + + def order_points_clockwise(self, pts): + """ + reference 
from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
+        # sort the points based on their x-coordinates
+        """
+        xSorted = pts[np.argsort(pts[:, 0]), :]
+
+        # grab the left-most and right-most points from the sorted
+        # x-coordinate points
+        leftMost = xSorted[:2, :]
+        rightMost = xSorted[2:, :]
+
+        # now, sort the left-most coordinates according to their
+        # y-coordinates so we can grab the top-left and bottom-left
+        # points, respectively
+        leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
+        (tl, bl) = leftMost
+
+        rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
+        (tr, br) = rightMost
+
+        rect = np.array([tl, tr, br, bl], dtype="float32")
+        return rect
+
+    def clip_det_res(self, points, img_height, img_width):
+        for pno in range(points.shape[0]):
+            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
+            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
+        return points
+
+    def filter_tag_det_res(self, dt_boxes, image_shape):
+        img_height, img_width = image_shape[0:2]
+        dt_boxes_new = []
+        for box in dt_boxes:
+            box = self.order_points_clockwise(box)
+            box = self.clip_det_res(box, img_height, img_width)
+            rect_width = int(np.linalg.norm(box[0] - box[1]))
+            rect_height = int(np.linalg.norm(box[0] - box[3]))
+            # drop boxes that are too small to contain readable text
+            if rect_width <= 3 or rect_height <= 3:
+                continue
+            dt_boxes_new.append(box)
+        dt_boxes = np.array(dt_boxes_new)
+        return dt_boxes
+
+    def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
+        img_height, img_width = image_shape[0:2]
+        dt_boxes_new = []
+        for box in dt_boxes:
+            box = self.clip_det_res(box, img_height, img_width)
+            dt_boxes_new.append(box)
+        dt_boxes = np.array(dt_boxes_new)
+        return dt_boxes
+
+    def __call__(self, img):
+        ori_im = img.copy()
+        data = {'image': img}
+        data = transform(data, self.preprocess_op)
+        # transform() returns None when preprocessing fails; bail out before unpacking
+        if data is None:
+            return None
+        img, shape_list = data
+        img = np.expand_dims(img, axis=0)
+        shape_list = np.expand_dims(shape_list, axis=0)
+        img = img.copy()
+        ort_inputs = {self.ort_session.get_inputs()[0].name: img}
+        preds = {}
+        preds['maps'] = self.ort_session.run(None, ort_inputs)[0]
+
+        post_result = self.postprocess_op(preds, shape_list)
+        dt_boxes = post_result[0]['points']
+        if self.det_algorithm == "SAST" and getattr(self, 'det_sast_polygon', False):
+            dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
+        else:
+            dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
+        return dt_boxes
+
+class TextRecognizer():
+    def __init__(self, model_path):
+        self.weights_path = model_path + 'rec_standard.onnx'
+
+        self.limited_max_width = 1280
+        self.limited_min_width = 16
+
+        self.rec_image_shape = [3, 32, 320]
+        self.character_type = 'ch'
+        self.rec_batch_num = 6
+        self.rec_algorithm = 'CRNN'
+        self.use_zero_copy_run = False
+        postprocess_params = {
+            'name': 'CTCLabelDecode',
+            "character_type": 'ch',
+            "character_dict_path": model_path + 'keys_v1.txt',
+            "use_space_char": True
+        }
+        self.postprocess_op = build_post_process(postprocess_params)
+
+        self.ort_session = onnxruntime.InferenceSession(self.weights_path, providers=['CUDAExecutionProvider'] if cuda_available else ['CPUExecutionProvider'])
+        # warm-up run so the first real inference is not slowed by session setup
+        _ = self.ort_session.run(None, {"backbone": np.zeros([1, 3, 32, 64], dtype='float32')})
+
+    def resize_norm_img(self, img, max_wh_ratio):
+        imgC, imgH, imgW = self.rec_image_shape
+        assert imgC == img.shape[2]
+        if self.character_type == "ch":
+            imgW = int((32 * max_wh_ratio))
+        imgW = max(min(imgW, self.limited_max_width), self.limited_min_width)
+        h, w = img.shape[:2]
+        ratio = w / float(h)
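+        # scale the target width with the crop's aspect ratio, then zero-pad
+        # up to imgW below so every crop in a batch shares one input width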
ratio_imgH = math.ceil(imgH * ratio) + ratio_imgH = max(ratio_imgH, self.limited_min_width) + if ratio_imgH > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = np.array(Image.fromarray(img).resize((resized_w, imgH))) + #resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def __call__(self, img_list): + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting can speed up the recognition process + indices = np.argsort(np.array(width_list)) + + # rec_res = [] + rec_res = [['', 0.0]] * img_num + batch_num = self.rec_batch_num + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + max_wh_ratio = 0 + for ino in range(beg_img_no, end_img_no): + # h, w = img_list[ino].shape[0:2] + h, w = img_list[indices[ino]].shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for ino in range(beg_img_no, end_img_no): + # norm_img = self.resize_norm_img(img_list[ino], max_wh_ratio) + norm_img = self.resize_norm_img(img_list[indices[ino]], + max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + ort_inputs = {self.ort_session.get_inputs()[0].name: norm_img_batch} + preds = self.ort_session.run(None, ort_inputs)[0] + + rec_result = self.postprocess_op(preds) + for rno in range(len(rec_result)): + rec_res[indices[beg_img_no + rno]] = rec_result[rno] + return rec_res + +class TextSystem: + def __init__(self, model_path): + self.text_detector = TextDetector(model_path) + self.text_recognizer = TextRecognizer(model_path) + self.drop_score = 0.3 + self.text_classifier = TextClassifier(model_path) + + def get_rotate_crop_image(self, img, points): + """ + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + """ + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]), + ) + ) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]), + ) + ) + pts_std = np.float32( + [ + [0, 0], + [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height], + ] + ) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, + (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC, + ) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + + def __call__(self, img): + ori_im = img.copy() + dt_boxes = self.text_detector(img) + if dt_boxes is None: + return None, None + img_crop_list = [] + + dt_boxes = sorted_boxes(dt_boxes) + + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + img_crop = 
self.get_rotate_crop_image(ori_im, tmp_box)
+            img_crop_list.append(img_crop)
+        img_crop_list, angle_list = self.text_classifier(img_crop_list)
+
+        rec_res = self.text_recognizer(img_crop_list)
+        filter_boxes, filter_rec_res = [], []
+        for box, rec_result in zip(dt_boxes, rec_res):
+            text, score = rec_result
+            if score >= self.drop_score:
+                filter_boxes.append(box)
+                filter_rec_res.append(rec_result)
+        return filter_boxes, filter_rec_res
diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/__init__.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/__init__.py
new file mode 100644
index 00000000..85bc130c
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/__init__.py
@@ -0,0 +1,27 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import copy
+
+__all__ = ['build_post_process']
+
+
+def build_post_process(config, global_config=None):
+    from .db_postprocess import DBPostProcess
+    from .rec_postprocess import CTCLabelDecode, AttnLabelDecode
+    from .cls_postprocess import ClsPostProcess
+
+    support_dict = [
+        'DBPostProcess', 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess'
+    ]
+
+    config = copy.deepcopy(config)
+    module_name = config.pop('name')
+    if global_config is not None:
+        config.update(global_config)
+    assert module_name in support_dict, Exception(
+        'post process only supports {}'.format(support_dict))
+    module_class = eval(module_name)(**config)
+    return module_class
\ No newline at end of file
diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/cls_postprocess.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/cls_postprocess.py
new file mode 100644
index 00000000..f16536c3
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/cls_postprocess.py
@@ -0,0 +1,15 @@
+class ClsPostProcess(object):
+    """ Convert between text-label and text-index """
+
+    def __init__(self, label_list, **kwargs):
+        super(ClsPostProcess, self).__init__()
+        self.label_list = label_list
+
+    def __call__(self, preds, label=None, *args, **kwargs):
+        pred_idxs = preds.argmax(axis=1)
+        decode_out = [(self.label_list[idx], preds[i, idx])
+                      for i, idx in enumerate(pred_idxs)]
+        if label is None:
+            return decode_out
+        label = [(self.label_list[idx], 1.0) for idx in label]
+        return decode_out, label
\ No newline at end of file
diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/db_postprocess.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/db_postprocess.py
new file mode 100644
index 00000000..741a2c88
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/db_postprocess.py
@@ -0,0 +1,139 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import numpy as np
+import pyclipper
+from shapely.geometry import Polygon
+
+
+class DBPostProcess(object):
+    """
+    The post process for Differentiable Binarization (DB).
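+    Binarizes the probability map with `thresh`, traces box contours, and
+    unclips each contour back to full text-region size.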
+ """ + + def __init__(self, + thresh=0.3, + box_thresh=0.7, + max_candidates=1000, + unclip_ratio=2.0, + use_dilation=False, + **kwargs): + self.thresh = thresh + self.box_thresh = box_thresh + self.max_candidates = max_candidates + self.unclip_ratio = unclip_ratio + self.min_size = 3 + self.dilation_kernel = None if not use_dilation else np.array( + [[1, 1], [1, 1]]) + + def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height): + ''' + _bitmap: single map with shape (1, H, W), + whose values are binarized as {0, 1} + ''' + + bitmap = _bitmap + height, width = bitmap.shape + + outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST, + cv2.CHAIN_APPROX_SIMPLE) + if len(outs) == 3: + img, contours, _ = outs[0], outs[1], outs[2] + elif len(outs) == 2: + contours, _ = outs[0], outs[1] + + num_contours = min(len(contours), self.max_candidates) + + boxes = [] + scores = [] + for index in range(num_contours): + contour = contours[index] + points, sside = self.get_mini_boxes(contour) + if sside < self.min_size: + continue + points = np.array(points) + score = self.box_score_fast(pred, points.reshape(-1, 2)) + if self.box_thresh > score: + continue + + box = self.unclip(points).reshape(-1, 1, 2) + box, sside = self.get_mini_boxes(box) + if sside < self.min_size + 2: + continue + box = np.array(box) + + box[:, 0] = np.clip( + np.round(box[:, 0] / width * dest_width), 0, dest_width) + box[:, 1] = np.clip( + np.round(box[:, 1] / height * dest_height), 0, dest_height) + boxes.append(box.astype(np.int16)) + scores.append(score) + return np.array(boxes, dtype=np.int16), scores + + def unclip(self, box): + unclip_ratio = self.unclip_ratio + poly = Polygon(box) + distance = poly.area * unclip_ratio / poly.length + offset = pyclipper.PyclipperOffset() + offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON) + expanded = np.array(offset.Execute(distance)) + return expanded + + def get_mini_boxes(self, contour): + bounding_box = cv2.minAreaRect(contour) + points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0]) + + index_1, index_2, index_3, index_4 = 0, 1, 2, 3 + if points[1][1] > points[0][1]: + index_1 = 0 + index_4 = 1 + else: + index_1 = 1 + index_4 = 0 + if points[3][1] > points[2][1]: + index_2 = 2 + index_3 = 3 + else: + index_2 = 3 + index_3 = 2 + + box = [ + points[index_1], points[index_2], points[index_3], points[index_4] + ] + return box, min(bounding_box[1]) + + def box_score_fast(self, bitmap, _box): + h, w = bitmap.shape[:2] + box = _box.copy() + xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int), 0, w - 1) + xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int), 0, w - 1) + ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int), 0, h - 1) + ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int), 0, h - 1) + + mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8) + box[:, 0] = box[:, 0] - xmin + box[:, 1] = box[:, 1] - ymin + cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1) + return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0] + + def __call__(self, outs_dict, shape_list): + pred = outs_dict['maps'] + pred = pred[:, 0, :, :] + segmentation = pred > self.thresh + + boxes_batch = [] + for batch_index in range(pred.shape[0]): + src_h, src_w, ratio_h, ratio_w = shape_list[batch_index] + if self.dilation_kernel is not None: + mask = cv2.dilate( + np.array(segmentation[batch_index]).astype(np.uint8), + self.dilation_kernel) + else: + mask = segmentation[batch_index] + boxes, scores = 
self.boxes_from_bitmap(pred[batch_index], mask, + src_w, src_h) + + boxes_batch.append({'points': boxes}) + return boxes_batch \ No newline at end of file diff --git a/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/rec_postprocess.py b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/rec_postprocess.py new file mode 100644 index 00000000..5b3245c5 --- /dev/null +++ b/src/containers/image-pii-detection/parsers/image_analysis/general_ocr/postprocess/rec_postprocess.py @@ -0,0 +1,138 @@ +import numpy as np + + +class BaseRecLabelDecode(object): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + character_type='ch', + use_space_char=False): + support_character_type = [ + 'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean' + ] + assert character_type in support_character_type, "Only {} are supported now but get {}".format( + support_character_type, character_type) + + if character_type == "en": + self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz" + dict_character = list(self.character_str) + elif character_type in ["ch", "french", "german", "japan", "korean"]: + self.character_str = "" + assert character_dict_path is not None, "character_dict_path should not be None when character_type is ch" + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode('utf-8').strip("\n").strip("\r\n") + self.character_str += line + if use_space_char: + self.character_str += " " + dict_character = list(self.character_str) + elif character_type == "en_sensitive": + # same with ASTER setting (use 94 char). + import string + self.character_str = string.printable[:-6] + dict_character = list(self.character_str) + else: + raise NotImplementedError + self.character_type = character_type + dict_character = self.add_special_char(dict_character) + self.dict = {} + for i, char in enumerate(dict_character): + self.dict[char] = i + self.character = dict_character + + def add_special_char(self, dict_character): + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=True): + """ convert text-index into text-label. 
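+        Drops ignored tokens (the CTC blank) and, when is_remove_duplicate is
+        set, collapses repeated neighboring indices.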
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list))) + return result_list + + def get_ignored_tokens(self): + return [0] # for ctc blank + + +class CTCLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + character_type='ch', + use_space_char=False, + **kwargs): + super(CTCLabelDecode, self).__init__(character_dict_path, + character_type, use_space_char) + + def __call__(self, preds, label=None, *args, **kwargs): + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def add_special_char(self, dict_character): + dict_character = ['blank'] + dict_character + return dict_character + + +class AttnLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, + character_dict_path=None, + character_type='ch', + use_space_char=False, + **kwargs): + super(AttnLabelDecode, self).__init__(character_dict_path, + character_type, use_space_char) + self.beg_str = "sos" + self.end_str = "eos" + + def add_special_char(self, dict_character): + dict_character = [self.beg_str, self.end_str] + dict_character + return dict_character + + def __call__(self, text): + text = self.decode(text) + return text + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx \ No newline at end of file diff --git a/src/containers/image-pii-detection/parsers/image_parser.py b/src/containers/image-pii-detection/parsers/image_parser.py new file mode 100644 index 00000000..d33a0e23 --- /dev/null +++ b/src/containers/image-pii-detection/parsers/image_parser.py @@ -0,0 +1,79 @@ + +import os +from .parser import BaseParser + +from PIL import Image +import numpy as np + +from .image_analysis.face_detection import face_detection_main +from .image_analysis.general_ocr import ocr_main + +def check_keywords_exist(det_results, keywords): + for keyword in keywords: + found = False + for dt_result in det_results: + text, score = dt_result[1] + if keyword in text and score >= 0.5: + found = True + break + if not found: + return False + return True + +class ImageParser(BaseParser): + def __init__(self, s3_client, fd_model_path, ocr_model_path): + super().__init__(s3_client=s3_client) + self.face_detection_model = face_detection_main.SCRFD(model_file = fd_model_path + 'det.onnx') + self.ocr_model = ocr_main.TextSystem(model_path = ocr_model_path) + # additional 
+
+
+class AttnLabelDecode(BaseRecLabelDecode):
+    """ Convert between text-label and text-index """
+
+    def __init__(self,
+                 character_dict_path=None,
+                 character_type='ch',
+                 use_space_char=False,
+                 **kwargs):
+        super(AttnLabelDecode, self).__init__(character_dict_path,
+                                              character_type, use_space_char)
+        self.beg_str = "sos"
+        self.end_str = "eos"
+
+    def add_special_char(self, dict_character):
+        dict_character = [self.beg_str, self.end_str] + dict_character
+        return dict_character
+
+    def __call__(self, text):
+        text = self.decode(text)
+        return text
+
+    def get_ignored_tokens(self):
+        beg_idx = self.get_beg_end_flag_idx("beg")
+        end_idx = self.get_beg_end_flag_idx("end")
+        return [beg_idx, end_idx]
+
+    def get_beg_end_flag_idx(self, beg_or_end):
+        if beg_or_end == "beg":
+            idx = np.array(self.dict[self.beg_str])
+        elif beg_or_end == "end":
+            idx = np.array(self.dict[self.end_str])
+        else:
+            assert False, "unsupported type %s in get_beg_end_flag_idx" \
+                % beg_or_end
+        return idx
\ No newline at end of file
diff --git a/src/containers/image-pii-detection/parsers/image_parser.py b/src/containers/image-pii-detection/parsers/image_parser.py
new file mode 100644
index 00000000..d33a0e23
--- /dev/null
+++ b/src/containers/image-pii-detection/parsers/image_parser.py
@@ -0,0 +1,79 @@
+
+import os
+from .parser import BaseParser
+
+from PIL import Image
+import numpy as np
+
+from .image_analysis.face_detection import face_detection_main
+from .image_analysis.general_ocr import ocr_main
+
+def check_keywords_exist(det_results, keywords):
+    """
+    Return True only if every keyword appears in at least one OCR result
+    whose recognition score is at least 0.5.
+    """
+    for keyword in keywords:
+        found = False
+        for dt_result in det_results:
+            text, score = dt_result[1]
+            if keyword in text and score >= 0.5:
+                found = True
+                break
+        if not found:
+            return False
+    return True
+
+class ImageParser(BaseParser):
+    def __init__(self, s3_client, fd_model_path, ocr_model_path):
+        super().__init__(s3_client=s3_client)
+        self.face_detection_model = face_detection_main.SCRFD(model_file=fd_model_path + 'det.onnx')
+        self.ocr_model = ocr_main.TextSystem(model_path=ocr_model_path)
+        # additional ImageParser constructor code here
+
+    def read_img(self, file_path):
+        img = np.array(Image.open(file_path).convert('RGB'))[:, :, :3]
+        return img
+
+    def face_detection_pipeline(self, img):
+        bboxes, kpss = self.face_detection_model.detect(img)
+        return bboxes, kpss
+
+    def ocr_pipeline(self, img):
+        img = img[:, :, ::-1]  # RGB -> BGR, the channel order the OCR model expects
+        dt_boxes, rec_res = self.ocr_model(img)
+        dt_results = list(zip(dt_boxes, rec_res))
+        return dt_results
+
+    def parse_file(self, file_path):
+        file_content = []
+        img = self.read_img(file_path)
+
+        face_detection_result, _ = self.face_detection_pipeline(img)
+        ocr_pipeline_result = self.ocr_pipeline(img)
+
+        contain_face = len(face_detection_result) > 0
+        business_license_keywords = ['营', '业', '执', '照', '信用代码']  # "business license", "credit code"
+        cnid_keywords = ['公', '民', '身', '份', '号', '码']  # "citizen ID number"
+        car_license_keywords = ['机动车', '驾驶证']  # "motor vehicle", "driving licence"
+
+        contain_business_license = check_keywords_exist(ocr_pipeline_result, business_license_keywords)
+        contain_cnid = check_keywords_exist(ocr_pipeline_result, cnid_keywords)
+        contain_car_license = check_keywords_exist(ocr_pipeline_result, car_license_keywords)
+
+        if contain_face:
+            if contain_cnid:
+                file_content.append('ChineseID')
+            elif contain_car_license:
+                file_content.append('CarLicense')
+            else:
+                file_content.append('Face')
+        else:
+            if contain_business_license:
+                file_content.append('BusinessLicense')
+            elif contain_car_license:
+                file_content.append('CarLicense')
+
+        return file_content
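+
+# Usage sketch (illustrative; the boto3 client, model directories, and image
+# path are placeholders for wherever the ONNX models and input actually live):
+#
+#     import boto3
+#     parser = ImageParser(s3_client=boto3.client('s3'),
+#                          fd_model_path='/opt/models/fd/',
+#                          ocr_model_path='/opt/models/ocr/')
+#     labels = parser.parse_file('sample.jpg')  # e.g. ['ChineseID']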
+ """ + blob = open(file_path, 'rb').read() + m = magic.Magic(mime_encoding=True) + encoding = m.from_buffer(blob) + return encoding \ No newline at end of file diff --git a/src/containers/image-pii-detection/parsers/pdf_parser.py b/src/containers/image-pii-detection/parsers/pdf_parser.py new file mode 100644 index 00000000..e1a07bb1 --- /dev/null +++ b/src/containers/image-pii-detection/parsers/pdf_parser.py @@ -0,0 +1,30 @@ + +import os +import boto3 +from pypdf import PdfReader + +from .parser import BaseParser + +class PdfParser(BaseParser): + def __init__(self, s3_client): + super().__init__(s3_client=s3_client) + + + def parse_file(self, pdf_path): + """ + Extracts text from a PDF file and returns a list of lines. + """ + + # Create a PDF reader object + pdf_reader = PdfReader(pdf_path) + file_content = [] + + # Loop through each page in the PDF file + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + + # Extract the text from the page and append it to the string + page_content = page.extract_text() + file_content.append(page_content) + + return file_content diff --git a/src/containers/image-pii-detection/parsers/txt_parser.py b/src/containers/image-pii-detection/parsers/txt_parser.py new file mode 100644 index 00000000..7bf503bf --- /dev/null +++ b/src/containers/image-pii-detection/parsers/txt_parser.py @@ -0,0 +1,18 @@ + +import os +from .parser import BaseParser + +class TxtParser(BaseParser): + def __init__(self, s3_client): + super().__init__(s3_client=s3_client) + + def parse_file(self, txt_path): + """ + Extracts text from a TXT file and returns a list of lines. + """ + + # Read the file + with open(txt_path, 'r') as file: + file_content = file.read() + + return [file_content] diff --git a/src/containers/image-pii-detection/requirements.txt b/src/containers/image-pii-detection/requirements.txt new file mode 100644 index 00000000..a50b4853 --- /dev/null +++ b/src/containers/image-pii-detection/requirements.txt @@ -0,0 +1,20 @@ +requests +boto3 +six==1.16.0 +opencv-python-headless==4.5.3.56 +numpy<=1.23.5 +onnxruntime +Pillow==8.4.0 +pyclipper==1.3.0 +Shapely==1.7.1 +base64image==0.5.1 +urllib3==1.26.6 +python-dateutil==2.8.2 +certifi==2022.12.7 +idna==2.10 +chardet==4.0.0 +pypdf==3.12.1 +python-magic==0.4.27 +python-docx==0.8.11 +bs4==0.0.1 +pandas==1.5.3 \ No newline at end of file diff --git a/src/containers/image-pii-detection/utils.py b/src/containers/image-pii-detection/utils.py new file mode 100644 index 00000000..e69de29b