-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data.py
executable file
·100 lines (71 loc) · 3.31 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from service.database import Database
from service.cleaner import Cleaner
from service.file_manager import FileManager
import service.wikidata as wikidata
from database.remiss import Remiss
from database.answer import Answer
from database.document import Document
from database.consultee import Consultee
from database.file import File
from database.consultee_list import ConsulteeList
import pandas as pd
# Script switches. Neither flag is read by an active statement in the
# visible script; RESET_WIKIDATA only appears in the commented-out
# Wikidata section.
RESET_DB = True
RESET_WIKIDATA = False

# Load all remisser and all answers up front. The cleaning loops iterate
# over saved_remisser; saved_answers is not referenced again in the
# visible script.
saved_remisser = Remiss.query.all()
saved_answers = Answer.query.all()
# NOTE: the Wikidata lookup of government organisations below is disabled.
# GOV_LIST = 'tmp/government_organisations.csv'
# if RESET_WIKIDATA or FileManager.filepath_exists(GOV_LIST):
#     gov_orgs = wikidata.get_government_organisations()
#     pd.DataFrame(gov_orgs, columns=['organisation']).to_csv(GOV_LIST)
# gov_list = pd.read_csv(GOV_LIST)['organisation'].to_list()
# print(gov_list)
# II-1 (light pass): derive each answer's organisation from the name of its
# first attached file.
#
# For a remiss with more than three answers, the substring returned by
# Cleaner.long_substr over all first-file names is removed from every name
# when it is longer than three characters. The remaining text is passed
# through Cleaner.light_clean and stored on answer.organisation.
print('II-1 Light cleaning file names...')
nb_of_remisser = len(saved_remisser)  # invariant: computed once, not per remiss
for remiss_index, remiss in enumerate(saved_remisser, start=1):
    answers_for_remiss = Answer.query.filter_by(remiss_id=remiss.id).all()

    # The file names and their shared substring are the same for every answer
    # of this remiss, so compute them once here rather than once per answer.
    # NOTE(review): assumes Cleaner.long_substr depends only on its argument
    # and that every answer has at least one file - confirm.
    filenames = [a.files[0].name for a in answers_for_remiss]
    common = ''
    if len(answers_for_remiss) > 3:
        common = Cleaner.long_substr(filenames)
    strip_common = len(common) > 3

    for answer, org_name in zip(answers_for_remiss, filenames):
        if strip_common:
            org_name = org_name.replace(common, '')
        answer.organisation = Cleaner.light_clean(org_name)

    # Commit once per remiss, then report progress.
    Database.commit()
    print(f'{remiss_index}/{nb_of_remisser} - Cleaned')
# II-1 (deep pass): run Cleaner.deep_clean on the organisation value stored
# on each answer and write the result back to the same attribute.
print('II-1 Deep cleaning file names...')
nb_of_remisser = len(saved_remisser)
for remiss_index, current_remiss in enumerate(saved_remisser, start=1):
    remiss_answers = Answer.query.filter_by(remiss_id=current_remiss.id).all()
    for remiss_answer in remiss_answers:
        remiss_answer.organisation = Cleaner.deep_clean(
            remiss_answer.organisation
        )
    # Commit this remiss's answers before reporting progress.
    Database.commit()
    print(f'{remiss_index}/{nb_of_remisser} - Cleaned')
# Every document whose type is 'consultee_list'.
saved_lists = Document.query.filter(Document.type == 'consultee_list').all()

# II-2 (light pass): fill consultee.cleaned_name with the light-cleaned
# version of consultee.name, one consultee list at a time.
print('II-2 Light cleaning organisation names from consultee lists...')
nb_of_consultee_lists = len(saved_lists)
for document_index, list_document in enumerate(saved_lists, start=1):
    members_query = Consultee.query.filter_by(
        consultee_list_id=list_document.id
    )
    for member in members_query.all():
        member.cleaned_name = Cleaner.light_clean(member.name)
    # Commit this list's consultees before reporting progress.
    Database.commit()
    print(f'{document_index}/{nb_of_consultee_lists} - Cleaned')
# II-2 (deep pass): run Cleaner.deep_clean on each consultee's cleaned_name
# and store the result back in cleaned_name.
print('II-2 Deep cleaning organisation names from consultee lists...')
nb_of_consultee_lists = len(saved_lists)
for document_index, list_document in enumerate(saved_lists, start=1):
    members_query = Consultee.query.filter_by(
        consultee_list_id=list_document.id
    )
    for member in members_query.all():
        member.cleaned_name = Cleaner.deep_clean(member.cleaned_name)
    # Commit this list's consultees before reporting progress.
    Database.commit()
    print(f'{document_index}/{nb_of_consultee_lists} - Cleaned')