Skip to content

Commit

Permalink
Added a slightly thinner cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
PierreMesure committed Oct 24, 2018
1 parent 47cb577 commit c8db5f0
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 8 deletions.
14 changes: 10 additions & 4 deletions remiss_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,18 @@
f'{index}/{nb_of_remisser} remiss(er) - {len(files)} file(s) saved'
)

saved_files = db.get_all_remiss_files()
saved_files = db.get_all_remiss_answers()

print('Cleaning filenames to get organisation name...')
for index, file in enumerate(saved_files, start=1):
file.organisation = Cleaner.get_organisation_name(file.filename)
db.update_remiss_file(file, index)
db.commit()
if index % (len(saved_files) // 100) == 0:
print(
f'{(index + 1) * 100 // len(saved_files)} % cleaned'
)
organisation_name = Cleaner.get_organisation_name(file.filename)
if organisation_name != file.organisation:
file.organisation = organisation_name
db.update_remiss_file(file, file.id)
db.commit()

db.close()
15 changes: 15 additions & 0 deletions service/cleaner.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
from service.database import Database


class Cleaner(object):

def remove_leading_numbers(text):
    """Return ``text`` without its leading numbering/separator characters.

    ``str.lstrip`` treats its argument as a *set* of characters, not as a
    prefix: every leading character that is a digit, one of ``a``/``b``/``c``,
    a dot, a dash, an underscore or a space is removed, and stripping stops
    at the first character outside that set.
    """
    # NOTE(review): lowercase a/b/c are part of the set, so a name that
    # starts with those letters loses them too ('cab Org' -> 'Org').
    # Presumably meant for prefixes such as '1a.' -- confirm.
    leading_chars = '0123456789abc.-_ abc '
    return text.lstrip(leading_chars)

def replace_by_popular_contained(text):
    """Replace ``text`` by a popular organisation name contained in it.

    The popular names are read from ``remisser.db`` through
    ``Database.get_popular_remiss_file_organisations()``. The first name
    (in the order returned by that query) that is a proper substring of
    ``text`` is returned; if none matches, ``text`` is returned unchanged.
    The substring check is case-sensitive.

    :param text: candidate organisation name.
    :return: the matching popular organisation name, or ``text`` itself.
    """
    db = Database('remisser.db')
    try:
        popular_org_names = db.get_popular_remiss_file_organisations()
    finally:
        # A new Database is opened on every call, so it must be released
        # here; otherwise each cleaned filename leaves one open behind.
        db.close()

    for popular_org_name in popular_org_names:
        # An exact match means text already is the popular name, so only
        # strictly longer strings are replaced.
        if popular_org_name in text and popular_org_name != text:
            return popular_org_name
    return text

def get_organisation_name(text):
    """Derive an organisation name from ``text`` (a remiss file name).

    Applies the two cleaning steps in order: leading numbering characters
    are stripped first, then the result is replaced by a popular
    organisation name if it contains one.
    """
    without_prefix = Cleaner.remove_leading_numbers(text)
    return Cleaner.replace_by_popular_contained(without_prefix)

def is_instance(filename):
Expand Down
30 changes: 26 additions & 4 deletions service/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ def get_all_remisser(self):

return remisser

def get_all_remiss_files(self):
self.cursor.execute("SELECT * FROM files ORDER BY ID ASC")
def get_all_remiss_answers(self):
self.cursor.execute('''
SELECT *
FROM files
WHERE files.type = 'answer'
ORDER BY ID ASC
''')

files = []

for row in self.cursor.fetchall():
Expand All @@ -32,6 +38,22 @@ def get_all_remiss_files(self):

return files

def get_popular_remiss_file_organisations(self, min_count=100):
    """Return the organisation names that occur most often in ``files``.

    Rows are grouped case-insensitively (``lower(organisation)``) and only
    groups with at least ``min_count`` non-NULL organisation values are
    kept, most frequent first.

    :param min_count: minimum number of rows a group needs in order to be
        considered popular. Defaults to 100, the previously hard-coded
        threshold.
    :return: list of organisation names ordered by descending count.
    """
    # NOTE(review): `organisation` is selected as a bare column while the
    # grouping key is lower(organisation), so for a group mixing several
    # spellings the returned spelling is whichever row the database
    # picks -- confirm this is acceptable.
    self.cursor.execute('''
        SELECT organisation,
               COUNT(organisation) AS num
        FROM files
        GROUP BY lower(organisation)
        HAVING num >= ?
        ORDER BY num DESC
    ''', (min_count,))

    return [row[0] for row in self.cursor.fetchall()]

def save_remiss(self, remiss):
self.cursor.execute('''
INSERT INTO remisser (title, url, date, sender)
Expand Down Expand Up @@ -60,7 +82,7 @@ def save_remiss_file(self, file):
)
)

def update_remiss_file(self, file, index):
def update_remiss_file(self, file, id):
self.cursor.execute('''
UPDATE files SET remiss_id=?,
filename=?,
Expand All @@ -74,7 +96,7 @@ def update_remiss_file(self, file, index):
file.organisation,
file.url,
file.type,
index
id
)
)

Expand Down

0 comments on commit c8db5f0

Please sign in to comment.