# data_preparation_funcs.py
"""General functions for chapter data preparation."""

import re

import numpy as np
import pandas as pd
from IPython.display import display  # Used by the show_samples_* helpers below

def transform_dictionary_to_list(df, column, dict_element):
    df[column + '_' + dict_element] = [dictionary_pair.get(dict_element) for dictionary_pair in df[column]]
    # Fill missing values
    df[column + '_' + dict_element] = df[column + '_' + dict_element].fillna('')
    return df

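# A minimal usage sketch (hypothetical data): each cell of 'person' holds a dict,
# and the values stored under key '100' are pulled into a new 'person_100' column.
# >>> df = pd.DataFrame({'person': [{'100': 'Doe, J.'}, {'700': 'Roe, R.'}]})
# >>> transform_dictionary_to_list(df, 'person', '100')['person_100'].tolist()
# ['Doe, J.', '']
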
def extract_number_digits_from_string(df, attrib):
    df[attrib] = df[attrib].apply(lambda x: re.findall(r'\d+', x))
    df = transform_list_to_string(df, attrib, ' ')
    return df

def extract_scaling_from_scale(df):
    # Only this attribute is affected
    attrib = 'scale'
    # After digit extraction, the scale ratio survives as a leading '1 ',
    # which shall be removed
    df[attrib] = df[attrib].apply(lambda x: x.replace('1 ', ''))
    return df

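# A minimal sketch (hypothetical value): extract_number_digits_from_string first
# turns '1:25 000' into '1 25 000', then the '1 ' ratio prefix is stripped here.
# >>> df = pd.DataFrame({'scale': ['1:25 000']})
# >>> df = extract_number_digits_from_string(df, 'scale')
# >>> extract_scaling_from_scale(df)['scale'].tolist()
# ['25 000']
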
def transform_list_to_string(df, attrib, separator=', '):
    df[attrib] = [separator.join(map(str, list_element)) for list_element in df[attrib]]
    df[attrib] = df[attrib].str.lower()
    return df

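# A minimal sketch (hypothetical data): list cells are flattened to one
# lowercased string per row.
# >>> df = pd.DataFrame({'volumes': [['Vol. 1', 'Vol. 2'], []]})
# >>> transform_list_to_string(df, 'volumes')['volumes'].tolist()
# ['vol. 1, vol. 2', '']
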
def split_dictionary_column(df, attrib, split_columns):
    for ending in split_columns:
        df = transform_dictionary_to_list(df, attrib, ending)
        df = transform_list_to_string(df, attrib + '_' + ending)
    df = df.drop(columns=[attrib])
    return df

def isolate_doi(x_list):
    # Dedupe list
    x_list = list(dict.fromkeys(x_list))
    for x_element in x_list:
        # A DOI starts with '10.'
        if x_element.startswith('10.'):
            # Return first list element starting with '10.'
            return x_element.lower()
    # Return empty string if no DOI element has been found
    return ''

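# A minimal sketch (hypothetical identifiers): the first deduped '10.'-element wins.
# >>> isolate_doi(['urn:nbn:de:101', '10.1000/182', '10.1000/182'])
# '10.1000/182'
# >>> isolate_doi(['urn:nbn:de:101'])
# ''
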
def isolate_ismn(x_list):
    # Dedupe list
    x_list = list(dict.fromkeys(x_list))
    for x_element in x_list:
        # An ISMN starts with '979' or 'M'
        if x_element.startswith('979') or x_element.lower().startswith('m'):
            # Return first list element starting with '979' or 'm'
            return x_element.lower()
    # Return empty string if no ISMN element has been found
    return ''

def reduce_to_attrib_element(df, attrib):
    if attrib == 'doi':
        apply_function = isolate_doi
    elif attrib == 'ismn':
        apply_function = isolate_ismn
    else:
        raise ValueError(f'Unsupported attribute: {attrib}')
    df[attrib] = df[attrib].apply(apply_function)
    return df

def isolate_number_from_string(df, attrib):
    # expand=False yields a Series (instead of a one-column DataFrame)
    df[attrib] = df[attrib].str.extract(r'(\d+)', expand=False).fillna('')
    return df

def build_delta_isbn(x_list, y_list, algorithm):
    # Dedupe lists
    x_list = list(dict.fromkeys(x_list))
    y_list = list(dict.fromkeys(y_list))
    if len(x_list) == 0 and len(y_list) == 0:
        # Return fixed value in case both lists are empty
        return 1.0
    length_min = min(len(x_list), len(y_list))
    if length_min == 0:
        # Return fixed value in case exactly one list is empty
        return 0.0
    # Both lists hold elements: sum up the pairwise similarities ...
    similarity_sum = 0.0
    for x_element in x_list:
        for y_element in y_list:
            similarity_sum += algorithm.normalized_similarity(x_element, y_element)
    # ... and normalize by the shorter list's length
    return similarity_sum / length_min

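# A minimal sketch, assuming `algorithm` exposes a normalized_similarity method
# as, e.g., the matchers of the textdistance package do:
# >>> import textdistance
# >>> build_delta_isbn(['123'], ['123'], textdistance.levenshtein)
# 1.0
# >>> build_delta_isbn([], ['123'], textdistance.levenshtein)
# 0.0
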
def build_delta_feature(df, attrib, algorithm, metadata_dict):
    if attrib in ['isbn']:
        df[attrib + '_delta'] = df[[attrib + '_x', attrib + '_y']].apply(
            lambda x: build_delta_isbn(x[attrib + '_x'], x[attrib + '_y'], algorithm), axis=1)
    else:
        df[attrib + '_delta'] = df[[attrib + '_x', attrib + '_y']].apply(
            lambda x: algorithm.normalized_similarity(x[attrib + '_x'], x[attrib + '_y']), axis=1)
    metadata_dict['features'].append(attrib + '_delta')
    return df, metadata_dict

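# A minimal usage sketch (hypothetical pair frame with 'exactDate_x'/'exactDate_y'
# columns, again assuming a textdistance-style matcher):
# >>> metadata = {'features': []}
# >>> df_pairs, metadata = build_delta_feature(df_pairs, 'exactDate', textdistance.levenshtein, metadata)
# >>> metadata['features']
# ['exactDate_delta']
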
def concatenate_corporate_keys(df):
    def combine(full, c710):
        # Concatenate if both keys are filled and differ
        if len(full) > 0 and len(c710) > 0 and full != c710:
            return full + ' ' + c710
        # Fall back to 710 if the 110 key is empty
        if len(full) == 0 and len(c710) > 0:
            return c710
        return full
    # Initial filling
    df['corporate_full'] = df['corporate_110']
    df['corporate_full'] = df[['corporate_full', 'corporate_710']].apply(
        lambda x: combine(x['corporate_full'], x['corporate_710']), axis=1)
    return df

def clean_exactDate_string(df):
    # Replace every non-digit character with 'u'
    df['exactDate'] = df['exactDate'].str.replace(r'\D', 'u', regex=True)
    return df

def norm_first_coordinate(df, suffix):
    # Keep the first list element, strip blanks and dots, truncate to 8 characters
    df['coordinate' + suffix] = (df['coordinate' + suffix]
                                 .map(lambda x: x[0] if len(x) > 0 else '')
                                 .str.replace(' ', '', regex=False)
                                 .str.replace('.', '', regex=False)
                                 .str[:8]
                                 .str.lower())
    return df

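# A minimal sketch (hypothetical MARC-style coordinate strings): the first list
# element is kept, blanks and dots are stripped, and the rest is truncated.
# >>> df = pd.DataFrame({'coordinate_E': [['E 007 26 22', 'N 046 57 04'], []]})
# >>> norm_first_coordinate(df, '_E')['coordinate_E'].tolist()
# ['e0072622', '']
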
def reduce_list_to_north(df):
    # Remove leading 'E'/'W' coordinates (in place) so an 'N'/'S' entry comes first
    for i in range(len(df)):
        while len(df.coordinate_N[i]) > 0 and df.coordinate_N[i][0][0] in ('E', 'W'):
            df.coordinate_N[i].pop(0)
    return df

def split_coordinate(df):
    df['coordinate_E'] = df['coordinate']
    df = norm_first_coordinate(df, '_E')
    # Reduce list to N and S, then apply the same procedure
    df['coordinate_N'] = df['coordinate']
    df = reduce_list_to_north(df)
    df = norm_first_coordinate(df, '_N')
    return df

def split_format(df):
    df['format_prefix'] = df['format'].str[:2]
    df['format_postfix'] = df['format'].str[2:8]
    # Fill empty pre/postfix attributes with spaces for comparison
    df.loc[df['format_prefix'] == '', 'format_prefix'] = ' '
    df.loc[df['format_postfix'] == '', 'format_postfix'] = ' '
    df = df.drop(columns=['format'])
    return df

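# A minimal sketch (hypothetical format codes):
# >>> df = pd.DataFrame({'format': ['bk020000', '']})
# >>> split_format(df)[['format_prefix', 'format_postfix']].values.tolist()
# [['bk', '020000'], [' ', ' ']]
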
# Full preprocessing of all attributes of Swissbib's gold standard
def attribute_preprocessing(df, columns, strip_digits):
    for attrib in columns:
        if attrib in ['isbn',            # Take as is
                      'coordinate_N',    # See 'coordinate_E'
                      'format_postfix',  # See 'format_prefix'
                      'person_100', 'person_700',  # See 'person_245c'
                      'ttlfull_246',     # See 'ttlfull_245'
                      ]:
            continue  # Explicitly: do nothing!
        elif attrib in ['coordinate_E']:
            df = split_coordinate(df)
        elif attrib in ['corporate_full']:
            df = split_dictionary_column(df, 'corporate', ['110', '710'])  # '810' currently excluded
            df = concatenate_corporate_keys(df)
        elif attrib in ['doi', 'ismn']:
            df = reduce_to_attrib_element(df, attrib)
        elif attrib in ['edition', 'musicid']:
            df = isolate_number_from_string(df, attrib)
        elif attrib in ['exactDate']:
            df = clean_exactDate_string(df)
        elif attrib in ['format_prefix']:
            df = transform_list_to_string(df, 'format')
            df = split_format(df)
        elif attrib in ['person_245c']:
            df = split_dictionary_column(df, 'person', ['100', '700', '245c'])  # '800' currently excluded
        elif attrib in ['scale'] and strip_digits:
            # Keep digit groups only ...
            df = extract_number_digits_from_string(df, attrib)
            # ... then strip the leading '1 ' of the scale ratio
            df = extract_scaling_from_scale(df)
        elif attrib in ['ttlfull_245']:
            df = split_dictionary_column(df, 'ttlfull', ['245', '246'])
        elif attrib in ['part', 'pubinit', 'volumes']:
            df = transform_list_to_string(df, attrib)
            if attrib in ['part', 'volumes'] and strip_digits:
                df = extract_number_digits_from_string(df, attrib)
        else:  # Not explicitly handled yet
            print('Attribute', attrib, 'is missing in this processing step!')
    return df

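# A minimal usage sketch (hypothetical column selection):
# >>> df = attribute_preprocessing(df, columns=['doi', 'exactDate', 'scale'], strip_digits=True)
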
def show_samples_interval(df, attrib, lower, upper, no_sample=5):
    columns = ['duplicates', attrib + '_delta', attrib + '_x', attrib + '_y']
    selection = df[columns][(df[attrib + '_delta'] >= lower) & (df[attrib + '_delta'] <= upper)]
    # If there are not enough samples, take the largest possible number
    no_sample = min(len(selection), no_sample)
    display(selection.sample(n=no_sample))
    print(f'{lower} <= {attrib}_delta <= {upper}')
    return None

def show_samples_distinct(df, attrib, pos_value, no_sample=5):
    columns = ['duplicates', attrib + '_delta', attrib + '_x', attrib + '_y']
    selection = df[columns][df[attrib + '_delta'] == pos_value]
    # If there are not enough samples, take the largest possible number
    no_sample = min(len(selection), no_sample)
    display(selection.sample(n=no_sample))
    return None

def mark_both_missing(df, attrib, target_value):
    # Both attributes of the pair are empty
    both_empty = df[attrib + '_x'].apply(lambda x: len(x) == 0) & df[attrib + '_y'].apply(lambda x: len(x) == 0)
    df.loc[both_empty, attrib + '_delta'] = target_value
    return df

def mark_one_missing(df, attrib, target_value):
    # XOR: exactly one of the two attributes is empty
    one_empty = df[attrib + '_x'].apply(lambda x: len(x) == 0) ^ df[attrib + '_y'].apply(lambda x: len(x) == 0)
    df.loc[one_empty, attrib + '_delta'] = target_value
    return df

def mark_missing(df, attrib, fctr, one_or_two='all'):
    value_ob_missing = [-0.5 * fctr, -1.0 * fctr]
    if one_or_two == 'all':
        df = mark_both_missing(df, attrib, value_ob_missing[1])
        df = mark_one_missing(df, attrib, value_ob_missing[0])
    elif one_or_two == 'both':
        df = mark_both_missing(df, attrib, value_ob_missing[1])
    elif one_or_two == 'one':
        df = mark_one_missing(df, attrib, value_ob_missing[0])
    return df

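# A minimal sketch (hypothetical frame): with fctr=1.0, pairs where both values
# are empty get delta -1.0; pairs where exactly one is empty get -0.5.
# >>> df = pd.DataFrame({'isbn_x': [[], ['123'], []],
# ...                    'isbn_y': [[], [], ['123']],
# ...                    'isbn_delta': [0.0, 1.0, 1.0]})
# >>> mark_missing(df, 'isbn', 1.0)['isbn_delta'].tolist()
# [-1.0, -0.5, -0.5]
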
def determine_similarity_values(df, attrib):
    attribute_uniques = np.sort(df[attrib + '_delta'].unique())
    attribute_uniques_len = len(attribute_uniques)
    print(attrib, 'values range', attribute_uniques)
    return attribute_uniques, attribute_uniques_len