Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

language support #36

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 70 additions & 18 deletions email_reply_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,106 @@
"""
email_reply_parser is a python library port of GitHub's Email Reply Parser.

For more information, visit https://github.com/zapier/email-reply-parser
For more information, visit https://github.com/zapier/email_reply_parser
"""

import os
import re
import json


class EmailReplyParser(object):
""" Represents an email message that is parsed.
"""
def __init__(self, language='en'):
self.language = language

@staticmethod
def read(text):
def read(self, text):
""" Factory method that splits email into list of fragments

text - A string email body

Returns an EmailMessage instance
"""
return EmailMessage(text).read()
return EmailMessage(text, self.language).read()

@staticmethod
def parse_reply(text):
def parse_reply(self, text):
""" Provides the reply portion of email.

text - A string email body

Returns reply body message
"""
return EmailReplyParser.read(text).reply
return self.read(text).reply


class EmailMessage(object):
""" An email message represents a parsed email body.
"""

SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
QUOTED_REGEX = re.compile(r'(>+)')
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)

def __init__(self, text):
def __init__(self, text, language):
self.fragments = []
self.fragment = None
self.text = text.replace('\r\n', '\n')
self.found_visible = False
self.SIG_REGEX = None
self.QUOTE_HDR_REGEX = None
self.QUOTED_REGEX = None
self.HEADER_REGEX = None
self._MULTI_QUOTE_HDR_REGEX = None
self.MULTI_QUOTE_HDR_REGEX = None
self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None
dir_path = os.path.dirname(__file__)
with open(dir_path + "/languages_support.json", "r") as read_file:
self.words_diff_languages = json.load(read_file)
self.language = language
self.set_regex()

def default_quoted_header(self):
self.QUOTED_REGEX = re.compile(r'(>+)')
self.HEADER_REGEX = re.compile(
r'^\*?(' + self.words_diff_languages[self.language]['From'] +
'|' + self.words_diff_languages[self.language]['Sent'] +
'|' + self.words_diff_languages[self.language]['To'] +
'|' + self.words_diff_languages[self.language]['Subject'] +
'):\*? .+'
)

def nl_support(self):
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})')
self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$')
self.default_quoted_header()
self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)'

def de_support(self):
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})')
self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$')
self.QUOTED_REGEX = re.compile(r'(>+)')
self.HEADER_REGEX = re.compile(
r'^\*?(' + self.words_diff_languages[self.language]['From'] +
'|' + self.words_diff_languages[self.language]['Sent'] +
'|' + self.words_diff_languages[self.language]['To'] +
'|' + self.words_diff_languages[self.language]['Subject'] +
'):\*? .+'
)
self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)'

def en_support(self):
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
self.QUOTED_REGEX = re.compile(r'(>+)')
self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'

def set_regex(self):
if hasattr(self, self.language+"_support"):
getattr(self, self.language+"_support")()
else:
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})')
self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_diff_languages[self.language]['wrote'] + ':$')
self.default_quoted_header()
self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_diff_languages[self.language]['wrote'] + \
':)(On\s(.+?)' + self.words_diff_languages[self.language]['wrote'] + ':)'
self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL)

def read(self):
""" Creates new fragment for each line
Expand Down
162 changes: 162 additions & 0 deletions email_reply_parser/languages_support.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
{
"vi": {
"Sent from": "\u0110\u01b0\u1ee3c g\u1eedi t\u1eeb",
"From": "T\u1eeb",
"To": "\u0110\u1ebfn",
"wrote": "\u0111\u00e3 vi\u1ebft",
"Sent": "G\u1edfi",
"Subject": "M\u00f4n h\u1ecdc"
},
"ru": {
"Sent from": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e \u0438\u0437",
"From": "\u041e\u0442",
"To": "\u043a",
"wrote": "\u043f\u0438\u0441\u0430\u043b",
"Sent": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e",
"Subject": "\u041f\u0440\u0435\u0434\u043c\u0435\u0442"
},
"fr": {
"Sent from": "Envoy\u00e9 depuis",
"From": "De",
"To": "\u00c0",
"wrote": "a \u00e9crit",
"Sent": "Envoy\u00e9",
"Subject": "Objet"
},
"en": {
"Sent from": "Sent from",
"From": "From",
"To": "To",
"wrote": "wrote",
"Sent": "Sent",
"Subject": "Subject"
},
"nl": {
"Sent from": "Verzonden met",
"From": "Van",
"To": "Aan",
"wrote": "schreef",
"Sent": "Verzonden",
"Subject": "Onderwerp"
},
"pt": {
"Sent from": "Enviado de",
"From": "De",
"To": "Para",
"wrote": "escrevi",
"Sent": "Enviei",
"Subject": "Assunto"
},
"ko": {
"Sent from": "\ubd80\ud130 \ubcf4\ub0b4\uc9c4",
"From": "\uc5d0\uc11c",
"To": "\uc5d0",
"wrote": "\uc4f4",
"Sent": "\uc804\uc1a1 \ub428",
"Subject": "\uc81c\ubaa9"
},
"de": {
"Sent from": "Gesendet von",
"From": "Von",
"To": "An",
"wrote": "schrieb",
"Sent": "Gesendet",
"Subject": "Betreff"
},
"tr": {
"Sent from": "Den g\u00f6nderildi",
"From": "itibaren",
"To": "i\u00e7in",
"wrote": "yazd\u0131",
"Sent": "G\u00f6nderilen",
"Subject": "konu"
},
"it": {
"Sent from": "Inviato da",
"From": "Da",
"To": "A",
"wrote": "ha scritto",
"Sent": "Inviato",
"Subject": "Oggetto"
},
"id": {
"Sent from": "Dikirim dari",
"From": "Dari",
"To": "Untuk",
"wrote": "menulis",
"Sent": "Terkirim",
"Subject": "Subyek"
},
"sk": {
"Sent from": "Odoslan\u00e9 od",
"From": "z",
"To": "na",
"wrote": "nap\u00edsal",
"Sent": "odoslan\u00e9",
"Subject": "predmet"
},
"ar": {
"Sent from": "\u0627\u0631\u0633\u0644\u062a \u0645\u0646",
"From": "\u0645\u0646 \u0639\u0646\u062f",
"To": "\u0625\u0644\u0649",
"wrote": "\u0643\u062a\u0628",
"Sent": "\u0623\u0631\u0633\u0644\u062a",
"Subject": "\u0645\u0648\u0636\u0648\u0639"
},
"es": {
"Sent from": "Enviado desde",
"From": "De",
"To": "Para",
"wrote": "escribi\u00f3",
"Sent": "Expedido",
"Subject": "Asunto"
},
"th": {
"Sent from": "\u0e2a\u0e48\u0e07\u0e08\u0e32\u0e01",
"From": "\u0e08\u0e32\u0e01",
"To": "\u0e44\u0e1b\u0e22\u0e31\u0e07",
"wrote": "\u0e40\u0e02\u0e35\u0e22\u0e19",
"Sent": "\u0e2a\u0e48\u0e07",
"Subject": "\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07"
},
"fi": {
"Sent from": "L\u00e4hetetty",
"From": "alkaen",
"To": "jotta",
"wrote": "kirjoitti",
"Sent": "L\u00e4hetetyt",
"Subject": "aihe"
},
"zh": {
"Sent from": "\u6765\u81ea",
"From": "\u4ece",
"To": "\u81f3",
"wrote": "\u5199",
"Sent": "\u53d1\u9001",
"Subject": "\u5b66\u79d1"
},
"ja": {
"Sent from": "\u9001\u4fe1\u5143",
"From": "\u304b\u3089",
"To": "\u306b",
"wrote": "\u66f8\u304d\u307e\u3057\u305f",
"Sent": "\u9001\u4fe1\u6e08\u307f",
"Subject": "\u4ef6\u540d"
},
"pl": {
"Sent from": "Wys\u0142ane z",
"From": "Z",
"To": "Do",
"wrote": "napisa\u0142",
"Sent": "Wys\u0142ane",
"Subject": "Przedmiot"
},
"he": {
"Sent from": "\u05e0\u05e9\u05dc\u05d7 \u05de",
"From": "\u05de",
"To": "\u05dc",
"wrote": "\u05db\u05ea\u05d1\u05ea\u05d9",
"Sent": "\u05e0\u05e9\u05dc\u05d7",
"Subject": "\u05e0\u05d5\u05e9\u05d0"
}
}
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
version=version.VERSION,
description='Email reply parser',
packages=['email_reply_parser'],
package_data={'email_reply_parser': ['../VERSION']},
package_data={
'email_reply_parser': ['../VERSION'],
'': ['./languages_support.json']
},
author='Royce Haynes',
author_email='[email protected]',
url='https://github.com/zapier/email-reply-parser',
Expand All @@ -32,4 +35,4 @@
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
]
)
)