moss_scanner.py
import json
import logging
import os
import re
import shutil
import urllib.request
from datetime import datetime
from urllib.error import HTTPError

import html5lib
from bs4 import BeautifulSoup
from mosspy import Moss


class MossScanner:
    """
    Stores the connection information for Moss, sends files to Moss and parses the result.

    Attributes:
        user_id: The user id for the Moss connection
        language: The language of the submissions
        tmp_path: The path where the submissions are temporarily stored

    Methods:
        compare(files): Returns a JSON string of the form:
            {
                "error": ErrorMessage (empty string on success),
                "data": [
                    {
                        "file_index": FileIndex,
                        "match": MatchPercent,
                        "match_history": [
                            {
                                "start": StartLine,
                                "end": EndLine,
                                "match": MatchPercent
                            }
                        ]
                    }
                ]
            }
    """

    __DICT_IDX = 'file_index'           # json/dict key for the file index
    __DICT_MATCH = 'match'              # json/dict key for the match percentage
    __DICT_HISTORY = 'match_history'    # json/dict key for the match_history list
    __TARGET_FILE = 'match0-top.html'   # target file of the moss response
    __FILE_NAME = 'file'                # prefix of the file names that are sent to moss
    __TMP_FILE_NAME = 'tmp'             # prefix of the file names stored in the tmp directory
    __NO_MATCH_FOUND_ERROR = 'No match found'
    __TEMPERATURE_PATTERN = r'http://moss\.stanford\.edu/bitmaps/tm_\d+_(\d+)\.gif'

    def __init__(self, user_id=31398739, language='matlab'):
        self.user_id = user_id
        self.language = language
        self.tmp_path = os.path.join(os.path.dirname(__file__), f'{self.__TMP_FILE_NAME}_{self.__get_unique_id()}')

    def compare(self, files):
        """
        Sends the files to Moss and returns the result serialized as a JSON string.

        :param files: a list of file contents (strings) to compare
        :return: the comparison result as a JSON string
        """
        error: str = ''
        data = []
        paths = self.__setup(files)
        moss = self.__get_moss(paths)
        try:
            url = moss.send()
            logging.info(f"Moss URL: {url}")
            soup = self.__get_soup_from_url(url + '/' + self.__TARGET_FILE)
            data = self.__parse_soup(soup)
        except HTTPError:
            error = self.__NO_MATCH_FOUND_ERROR
        except Exception as e:
            error = str(e)
        return json.dumps({
            'error': error,
            'data': data
        })

    def __get_moss(self, paths):
        m = Moss(self.user_id, self.language)
        for i, path in enumerate(paths):
            m.addFile(path, f'{self.__FILE_NAME}_{i}')
        return m

    def __setup(self, files):
        os.makedirs(self.tmp_path)
        return [self.__write_file(file) for file in files]

    def __write_file(self, file):
        name = f"{self.__TMP_FILE_NAME}_{self.__get_unique_id()}"
        path = os.path.join(self.tmp_path, name)
        with open(path, 'w') as f:
            f.write(file)
        return path

    @classmethod
    def __get_soup_from_url(cls, url):
        with urllib.request.urlopen(url) as response:
            document = html5lib.parse(response, treebuilder='dom')
            formatted_html = document.toprettyxml()
            return BeautifulSoup(formatted_html, 'html.parser')
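
    # The class methods below parse the moss "match0-top.html" page. Judging from the
    # parsing logic, the page is assumed to contain a table roughly shaped like this
    # (a sketch; the concrete values are illustrative, not taken from a real response):
    #
    #   <tr><th>file_0 (84%)</th><th>file_1 (84%)</th></tr>
    #   <tr>
    #     <td>12-34</td><td><img src="http://moss.stanford.edu/bitmaps/tm_3_80.gif"></td>
    #     <td>10-32</td><td><img src="http://moss.stanford.edu/bitmaps/tm_3_80.gif"></td>
    #   </tr>
    #
    # The first row yields one entry per file (index and overall match percentage); every
    # following row yields one matched line range ("start-end") plus a temperature image
    # whose file name encodes the match percentage for that range.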

    @classmethod
    def __parse_header(cls, th: str):
        idx, perc = th.split()
        idx = idx.split('_')[1]
        perc = perc.translate(str.maketrans('', '', '()%'))
        return {
            cls.__DICT_IDX: idx,
            cls.__DICT_MATCH: perc,
            cls.__DICT_HISTORY: []
        }

    @classmethod
    def __parse_first_tr(cls, first_tr: BeautifulSoup):
        return [cls.__parse_header(th.text) for th in first_tr.children if th.text.strip()]

    @classmethod
    def __parse_match_lines(cls, td: BeautifulSoup):
        return td.text.strip('\n').split('-')

    @classmethod
    def __parse_temperature(cls, td: BeautifulSoup):
        img = td.find_next('img')
        match = re.match(cls.__TEMPERATURE_PATTERN, img.get('src'))
        return match.group(1)

    @classmethod
    def __add_to_match_history(cls, data_dict, start, end, match):
        data_dict[cls.__DICT_HISTORY].append({
            'start': start,
            'end': end,
            'match': match
        })

    @classmethod
    def __parse_data_tr(cls, data_tr: BeautifulSoup, data_list: list):
        td_list = data_tr.find_all('td')
        assert len(data_list) * 2 == len(td_list)
        for i in range(0, len(td_list), 2):
            data_list_idx = i // 2
            start, end = cls.__parse_match_lines(td_list[i])
            match = cls.__parse_temperature(td_list[i + 1])
            cls.__add_to_match_history(data_list[data_list_idx], start, end, match)

    @classmethod
    def __parse_data(cls, trs, data_list: list):
        for tr in trs:
            cls.__parse_data_tr(tr, data_list)

    @classmethod
    def __parse_soup(cls, soup: BeautifulSoup):
        trs = soup.find_all('tr')
        data_list = cls.__parse_first_tr(trs[0])  # first table row contains info like the match %
        cls.__parse_data(trs[1:], data_list)      # remaining table rows contain the line ranges of the matches
        return data_list

    @classmethod
    def __get_unique_id(cls):
        return datetime.now().strftime('%M%S%f')

    def __del__(self):
        # ignore_errors guards against the case where compare() was never called and the
        # tmp directory was therefore never created
        shutil.rmtree(self.tmp_path, ignore_errors=True)
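

# A minimal usage sketch, not part of the original module: it assumes the default Moss
# user id is valid, network access to moss.stanford.edu, and uses made-up file contents.
if __name__ == '__main__':
    _scanner = MossScanner(language='python')
    _result = json.loads(_scanner.compare([
        "def add(a, b):\n    return a + b\n",
        "def add(x, y):\n    return x + y\n",
    ]))
    if _result['error']:
        print('Moss error:', _result['error'])
    else:
        for _entry in _result['data']:
            print(_entry['file_index'], _entry['match'], _entry['match_history'])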