-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuhm.py
97 lines (76 loc) · 3.06 KB
/
uhm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from requests import Session
from services.writer import Writer
import pandas as pd
BASE_URL = 'https://www.upphandlingsmyndigheten.se'
URL = '/api/sv/statisticsservice/bridgeapi/statistics/export/{}'
EXT = {
'csv': 'csv',
'excel': 'xlsx'
}
def download(params, filepath, format='excel', cleaned=True):
print('Downloading the data...')
response = Session().get(BASE_URL + URL.format(format), params=params)
if response.status_code == 200:
print('Data downloaded succesfully!')
print(f'Url: {response.url}')
if format == 'csv':
content = __fix_csv(response.content)
else:
content = response.content
Writer.write_file(content, f'{filepath}.{EXT[format]}')
if cleaned:
__clean_file(f'{filepath}.{EXT[format]}')
else:
print(f'There was an error {response.status_code}.')
print(response.url)
def __fix_csv(bytes):
return bytes.decode('utf-8-sig').replace('; ', ',').encode('utf-8')
def __clean_file(filepath):
print('Opening the file...')
if filepath.endswith('.csv'):
df = pd.read_csv(filepath, sep=';')
elif filepath.endswith('.xlsx'):
df = pd.read_excel(filepath)
else:
print('The file does not seem to be a CSV or Excel.')
return
print('Cleaning the file to reduce it size...')
print("- Cleaning all the 'Uppgift saknas'")
df = df.replace('Uppgift saknas', '')
columns_to_clean = [
'Direktivstyrd',
'Dynamiskt inköpssystem',
'Elektronisk auktion',
'Anbudsområden',
'Samordnad upphandling',
'Miljömässigt hållbar upphandling',
'Socialt hållbar upphandling',
'Innovationsupphandling',
'Reserverad upphandling',
'Reserverat genomförande',
'Överprövad'
]
for column in columns_to_clean:
if column in df.columns:
print(f'- Cleaning column {column}')
df[column] = df[column].apply(lambda x: 'False' if 'Inte' in x or 'Inga' in x else x)
df[column] = df[column].apply(lambda x: 'True' if x != 'False' and x != '' else x)
if 'Upphandlings-ID' in df and 'Publiceringsdatum' in df:
print('- Sorting the file by date then procurement ID')
df = df.sort_values('Upphandlings-ID').sort_values('Publiceringsdatum')
print('Saving the file...')
if filepath.endswith('.csv'):
df.to_csv(filepath, index=False)
elif filepath.endswith('.xlsx'):
sheet_name = f'{df.columns[-1]}'[:31].replace('*', '')
save_excel_with_adjusted_columns(df, filepath, sheet_name)
def save_excel_with_adjusted_columns(df, filepath, sheet_name):
writer = pd.ExcelWriter(filepath)
df.to_excel(writer, sheet_name=sheet_name, index=False, na_rep='')
for column in df:
column_width = max(df[column].astype(str).map(len).max(), len(column))
col_idx = df.columns.get_loc(column)
writer.sheets[sheet_name].set_column(col_idx, col_idx, column_width)
writer.close()
def big_merge():
df = pd.read_csv('number_of_procurements.csv')