-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path0.download_ecotaxa_project.py
executable file
·118 lines (97 loc) · 3.96 KB
/
0.download_ecotaxa_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
#
# Download data from one or several EcoTaxa project(s) to serve as training and test sets
#
# (c) 2022 Jean-Olivier Irisson, GNU General Public License v3
## Changeable settings ----
# EcoTaxa login info
# !! replace with your own !!
ecotaxa_user = '[email protected]'
ecotaxa_pass = 'test!'
# numeric id(s) of project(s) to download
# you should have appropriate access rights to it:
# - either the project should be visible to all
# - or you should be explicitely registered to it, as viewer at least
proj_ids = [185]
# path or URL of a taxonomic grouping file
# should have level0, level1, level2 columns where level0 is the current EcoTaxa
# classif and level1 and 2 are increasingly coarse groupings
grouping_url = 'https://docs.google.com/spreadsheets/d/1Rn73hMWGI3N-WzqlRX2X1H1aa94OLZGZIppzhQnOj-8/export?format=csv'
# NB: replace the end by `export?format=csv`
grouping_level = 'level1'
# where to store the raw data (including the images)
data_dir = '~/datasets/raw/'
## Download data ----
import os
import pandas as pd
import urllib.request
from tqdm import tqdm
import shutil
import ecotaxa_py_client
from ecotaxa_py_client.api import authentification_api
from ecotaxa_py_client.model.login_req import LoginReq
from ecotaxa_py_client.api import objects_api
from ecotaxa_py_client.model.project_filters import ProjectFilters
# authenticate
print("Log in EcoTaxa")
with ecotaxa_py_client.ApiClient() as client:
api = authentification_api.AuthentificationApi(client)
token = api.login(LoginReq(username=ecotaxa_user, password=ecotaxa_pass))
config = ecotaxa_py_client.Configuration(access_token=token, discard_unknown_keys=True)
print("Get data from project(s)")
# get validated objects (and their names + images paths) from a project
def get_objects_df(ecotaxa_py_client, proj_id):
with ecotaxa_py_client.ApiClient(config) as client:
objects_instance = objects_api.ObjectsApi(client)
# only validated
filters = ProjectFilters(statusfilter="V")
# get taxonomic name and image file name
fields = 'txo.display_name,img.file_name'
# get objects
objs = objects_instance.get_object_set(proj_id, project_filters=filters, fields=fields)
# format result as DataFrame
df = pd.DataFrame(objs['details'], columns=fields.split(','))
df['id'] = objs['object_ids']
return(df)
objs = [get_objects_df(ecotaxa_py_client, proj_id) for proj_id in proj_ids]
# format as a single DataFrame
df = pd.concat(objs).reset_index()
print("Download images")
# compute final path for images
img_dir = os.path.join(os.path.expanduser(data_dir), 'imgs')
os.makedirs(img_dir, exist_ok=True)
# name image according to internal object_id to ensure uniqueness
ext = os.path.splitext(df['img.file_name'][0])[-1].lower()
df['img_path'] = [os.path.join(img_dir, str(this_id)+ext) for this_id in df['id']]
# download images (with a nice progress bar)
vault_path = '/remote/ecotaxa/vault'
for i in tqdm(range(df.shape[0])):
# if the file has not been copied already
if not os.path.isfile(df['img_path'][i]):
# copy from vault
if os.path.isdir(vault_path):
res = shutil.copyfile(
src=os.path.join(vault_path, df['img.file_name'][i]),
dst=df['img_path'][i]
)
# or copy through the internet
else:
res = urllib.request.urlretrieve(
url='https://ecotaxa.obs-vlfr.fr/vault/'+df['img.file_name'][i],
filename=df['img_path'][i]
)
print("Perform taxonomic regrouping")
# read taxonomic grouping
groups = pd.read_csv(grouping_url, index_col='level0')
groups = groups[['level1', 'level2']]
# add grouping levels
dfg = df.join(groups, on='txo.display_name')
# save to disk
dfg.to_csv(os.path.join(data_dir, 'taxa.csv.gz'), index=False)
# choose level
dfg = dfg.rename(columns={grouping_level: 'label'})
dfg = dfg[['id', 'img_path', 'label']]
# drop images that end up with no taxon name
dfg = dfg.dropna(subset=['label'])
# save to disk
dfg.to_csv('io/training_labels.csv', index=False)