-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
143 lines (111 loc) · 4.01 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re
import json
from rapidfuzz import fuzz
def remove_parenthesis(name):
    """Separate a name from a parenthesised 2-10 letter code that follows it.

    Searches ``name`` for some text followed by ``(XX)``, where ``XX`` is
    2 to 10 ASCII letters.

    Returns:
        ``[text, code]`` (a list, both parts stripped of surrounding
        whitespace) when the pattern is found; otherwise the tuple
        ``(name, None)`` with ``name`` returned unchanged.
    """
    found = re.search(r'(.+) ?\(([a-zA-Z]{2,10})\)', name)
    if found is None:
        return name, None
    # The pattern has exactly two capture groups: the text and the code.
    text, code = found.groups()
    return [text.strip(), code.strip()]
def get(row, label):
    """Fetch a cleaned cell value from ``row``.

    ``row`` must expose ``keys()`` and label-based ``.loc[...]`` access
    (presumably a pandas Series -- confirm against callers).

    Returns:
        ``''`` when ``label`` is absent or the value is NaN; the stripped
        text for strings; an ``int`` for floats holding a whole number;
        otherwise the value unchanged.
    """
    if label not in row.keys():
        return ''

    cell = row.loc[label]
    # NaN is the only value that compares unequal to itself.
    if cell != cell:
        return ''
    if isinstance(cell, str):
        return cell.strip()
    if isinstance(cell, float) and cell.is_integer():
        return int(cell)
    return cell
def read_json(filepath):
    """Load and return the JSON document stored at ``filepath`` (UTF-8)."""
    with open(filepath, encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def write_json(data, name):
    """Serialise ``data`` as JSON into ``<name>.json``.

    The file is written as UTF-8 with a 2-space indent, and non-ASCII
    characters are kept as-is rather than escaped.
    """
    target_path = f'{name}.json'
    with open(target_path, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
# Terms removed from BOTH strings, but only when both strings contain them.
# They are applied one after another, in this order.
_SMART_RATIO_SHARED_TERMS = (
    'förvaltningsrätten',
    'lokala',
    'arrendenämnd',
    'tingsrätt',
    'hovrätt',
    'domstol',
    'sverige',
    'svensk',
    'ambassad',
    'konsulat',
    'arkivet',
    'regionala',
    'staten',
    'institut',
    'styrelse',
    'universitet',
    'högskol',
    'länsstyrelse',
    'universitet',
    'avvecklingsmyndigheten för',
    'myndighet',
    'verk',
)

# Fragments removed from each string unconditionally, after the shared terms.
_SMART_RATIO_ALWAYS_REMOVED = (
    ' och arrende',
    ' o mark o miljö och patent',
    ' och mark och miljödomstol',
    ' och migrationsdomstol',
    ' och migrationsöverdomst',
    ' och migrationsdom',
    ' kärntekniska anläggningen',
    ' och patent och marknadsdom',
    'älhavaren',
)


def _strip_common_terms(name_lower, choice_lower):
    """Return both lowercase strings with the configured terms removed."""
    for term in _SMART_RATIO_SHARED_TERMS:
        if term in name_lower and term in choice_lower:
            name_lower = name_lower.replace(term, '')
            choice_lower = choice_lower.replace(term, '')
    for fragment in _SMART_RATIO_ALWAYS_REMOVED:
        name_lower = name_lower.replace(fragment, '')
        choice_lower = choice_lower.replace(fragment, '')
    return name_lower, choice_lower


def smart_ratio(names, choices, threshold=90):
    """Find the choice that best fuzzy-matches any of ``names``.

    Every name is compared with every key of ``choices`` using
    ``fuzz.ratio`` after lowercasing and removing a fixed set of terms.
    The first pair reaching the highest score wins.

    Args:
        names: Iterable of candidate name strings.
        choices: Mapping whose keys are the strings to match against and
            whose values are what gets returned for a match.
        threshold: Minimum similarity required to report a match.

    Returns:
        ``(choices[best_key], similarity)`` when ``similarity`` reaches
        ``threshold``, otherwise ``(None, similarity)``. ``similarity`` is
        forced to 0 when the best-scoring name or the matched key is
        shorter than 4 characters, and ``(None, 0)`` is returned when there
        is nothing to compare or nothing scores above 0.
    """
    best_ratio = 0
    best_name = None
    best_match = None

    # Lowercase every choice once rather than once per (name, choice) pair.
    lowered_choices = [(choice, choice.lower()) for choice in choices]

    for name in names:
        name_lower = name.lower()
        for choice, choice_lower in lowered_choices:
            name_cleaned, choice_cleaned = _strip_common_terms(
                name_lower, choice_lower
            )
            ratio = fuzz.ratio(name_cleaned, choice_cleaned)
            if ratio > best_ratio:
                best_ratio = ratio
                best_name = name
                best_match = choice

    # Empty inputs, or no pair with a positive score: nothing matched.
    if best_match is None:
        return None, 0

    # The length guard applies to the name that actually produced the best
    # score, not to whichever name happened to be iterated last.
    if len(best_name) >= 4 and len(best_match) >= 4:
        similarity = best_ratio
    else:
        similarity = 0

    if similarity >= threshold:
        return choices[best_match], similarity
    return None, similarity
def get_sfs(text):
    """Extract the first ``NNNN:N``-style identifier from ``text``.

    The identifier is four digits, a colon, then one to four digits
    (presumably an SFS statute number, going by the function name).

    Returns:
        The matched substring, or ``None`` when ``text`` is empty/falsy or
        contains no such identifier.
    """
    if not text:
        return None
    found = re.search(r'\d{4}:\d{1,4}', text)
    return found.group(0) if found else None
def get_names(data, include_alt=True, include_old=True):
    """Collect the secondary names found in an agency record.

    Args:
        data: Mapping that may contain ``name_en``, ``short_name``,
            ``old_names`` and ``other_names``.
        include_alt: When true, include ``name_en`` and ``short_name``.
        include_old: When true, include the entries of ``old_names``.

    Returns:
        A list ordered as: ``name_en``, ``short_name``, the ``old_names``
        entries, then the ``other_names`` entries (always included when
        present).
    """
    collected = []

    if include_alt:
        # Single-valued alternative names are appended as-is.
        for key in ('name_en', 'short_name'):
            if key in data:
                collected.append(data[key])

    # Multi-valued keys contribute each of their entries.
    list_keys = ('old_names', 'other_names') if include_old else ('other_names',)
    for key in list_keys:
        if key in data:
            collected.extend(data[key])

    return collected
def get_all_names(merged_data, include_alt=True, include_old=False):
    """Build a lookup from every known agency name to its main name.

    Args:
        merged_data: Mapping of main name -> mapping of source -> record,
            where each record is passed to ``get_names``.
        include_alt: Forwarded to ``get_names``.
        include_old: Forwarded to ``get_names``.

    Returns:
        Dict mapping each name of at least 4 characters (the main name plus
        whatever ``get_names`` returns for each source record) to its main
        name. A main name with no sources contributes nothing, and when the
        same name occurs under several main names the last one seen wins.
    """
    all_names = {}
    for main_name, sources in merged_data.items():
        for data in sources.values():
            agency_names = [main_name]
            # Forward include_old as well; it used to be dropped, so old
            # names were always included regardless of this argument.
            agency_names.extend(get_names(data, include_alt, include_old))
            for name in agency_names:
                if len(name) >= 4:
                    all_names[name] = main_name
    return all_names