From 2c0bc34d4faecaaedc07dc9bd2a43b7acc496df6 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Tue, 6 Apr 2021 10:07:01 -0400 Subject: [PATCH 01/10] First draft of OpenDataPlatform input --- sdg/inputs/InputOpenDataPlatform.py | 77 +++++++++++++++++++++++++++++ sdg/inputs/__init__.py | 1 + sdg/open_sdg.py | 3 ++ 3 files changed, 81 insertions(+) create mode 100644 sdg/inputs/InputOpenDataPlatform.py diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py new file mode 100644 index 00000000..1c9b8ba1 --- /dev/null +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -0,0 +1,77 @@ +from urllib.request import urlopen +import json +from sdg.inputs import InputBase + +class InputOpenDataPlatform(InputBase): + + + def __init__(self, source=None, logging=None): + InputBase.__init__(self, logging=logging) + self.source = source + + + def execute(self, indicator_options): + payload = self.fetch_file(self.source) + parsed = json.loads(payload) + indicators = {} + names = {} + for item in parsed['data']: + indicator_id = self.normalize_indicator_id(self.get_indicator_id(item)) + indicator_name = self.normalize_indicator_name(self.get_indicator_name(item), indicator_id) + indicator_name = indicator_name.strip(':').strip('.').strip() + if indicator_id not in indicators: + indicators[indicator_id] = [] + dimensions = self.get_dimensions(item) + + idx = 0 + for year in self.get_years(item): + value = item['values'][idx] + if value is not None: + disaggregations = dimensions.copy() + disaggregations['UNIT_MEASURE'] = self.get_unit(item) + disaggregations['UNIT_MULT'] = self.get_unit_multiplier(item) + row = self.get_row(year, value, disaggregations) + indicators[indicator_id].append(row) + idx += 1 + + for indicator_id in indicators: + df = self.create_dataframe(indicators[indicator_id]) + self.add_indicator(indicator_id, name=indicator_name, data=df, options=indicator_options) + + + def get_dimensions(self, row): + dimensions = {} + non_dimension_props = 
['goal', 'target', 'indicator'] + for prop in row: + if prop in non_dimension_props: + continue + try: + prop_id = row[prop]['id'] + dimensions[prop_id] = row[prop]['id'] + except: + pass + return dimensions + + + def get_years(self, row): + start = int(row['startDate'][0:4]) + end = int(row['endDate'][0:4]) + if start == end: + return [start] + return list(range(start, end)) + + + def get_unit(self, row): + return row['unit'] + + + def get_unit_multiplier(self, row): + return row['scale'] + + + def get_indicator_id(self, row): + return row['indicator']['id'] + + + def get_indicator_name(self, row): + return row['indicator']['name'] diff --git a/sdg/inputs/__init__.py b/sdg/inputs/__init__.py index a8585fd9..f2e8189a 100644 --- a/sdg/inputs/__init__.py +++ b/sdg/inputs/__init__.py @@ -15,3 +15,4 @@ from .InputApi import InputApi from .InputCkan import InputCkan from .InputSdmxMl_UnitedNationsApi import InputSdmxMl_UnitedNationsApi +from .InputOpenDataPlatform import InputOpenDataPlatform diff --git a/sdg/open_sdg.py b/sdg/open_sdg.py index a35d65ac..51c89617 100644 --- a/sdg/open_sdg.py +++ b/sdg/open_sdg.py @@ -448,6 +448,7 @@ def open_sdg_input_from_dict(params, options): 'InputExcelMeta', 'InputYamlMeta', 'InputSdmxMeta', + 'InputOpenDataPlatform', ] if input_class not in allowed: raise KeyError("Input class '%s' is not one of: %s." 
% (input_class, ', '.join(allowed))) @@ -486,6 +487,8 @@ def open_sdg_input_from_dict(params, options): input_instance = sdg.inputs.InputYamlMeta(**params) elif input_class == 'InputSdmxMeta': input_instance = sdg.inputs.InputSdmxMeta(**params) + elif input_class == 'InputOpenDataPlatform': + input_instance = sdg.inputs.InputOpenDataPlatform(**params) return input_instance From 0edcc688cf6d6397b6088dd2e0769356717a8869 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Tue, 6 Apr 2021 10:20:58 -0400 Subject: [PATCH 02/10] Bugfix --- sdg/inputs/InputOpenDataPlatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py index 1c9b8ba1..f37423ed 100644 --- a/sdg/inputs/InputOpenDataPlatform.py +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -47,7 +47,7 @@ def get_dimensions(self, row): continue try: prop_id = row[prop]['id'] - dimensions[prop_id] = row[prop]['id'] + dimensions[prop] = row[prop]['id'] except: pass return dimensions From 91fc5da8a3c7e14253979a46c0888ad3fa358705 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Tue, 6 Apr 2021 11:11:06 -0400 Subject: [PATCH 03/10] Fixes --- sdg/inputs/InputOpenDataPlatform.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py index f37423ed..76d58bc3 100644 --- a/sdg/inputs/InputOpenDataPlatform.py +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -28,8 +28,8 @@ def execute(self, indicator_options): value = item['values'][idx] if value is not None: disaggregations = dimensions.copy() - disaggregations['UNIT_MEASURE'] = self.get_unit(item) - disaggregations['UNIT_MULT'] = self.get_unit_multiplier(item) + disaggregations[self.get_unit_key()] = self.get_unit(item) + disaggregations[self.get_unit_multiplier_key()] = self.get_unit_multiplier(item) row = self.get_row(year, value, disaggregations) indicators[indicator_id].append(row) idx += 1 
@@ -46,7 +46,6 @@ def get_dimensions(self, row): if prop in non_dimension_props: continue try: - prop_id = row[prop]['id'] dimensions[prop] = row[prop]['id'] except: pass @@ -62,11 +61,19 @@ def get_years(self, row): def get_unit(self, row): - return row['unit'] + return row[self.get_unit_key()] + + + def get_unit_key(self): + return 'unit' def get_unit_multiplier(self, row): - return row['scale'] + return row[self.get_unit_multiplier_key()] + + + def get_unit_multiplier_key(self): + return 'scale' def get_indicator_id(self, row): From f0798d1942c10d7c800c233e21022818749337d1 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Tue, 6 Apr 2021 14:44:14 -0400 Subject: [PATCH 04/10] Allow configurable keys --- sdg/inputs/InputOpenDataPlatform.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py index 76d58bc3..cd37749c 100644 --- a/sdg/inputs/InputOpenDataPlatform.py +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -5,8 +5,12 @@ class InputOpenDataPlatform(InputBase): - def __init__(self, source=None, logging=None): + def __init__(self, source=None, logging=None, unit_key='unit', + unit_multiplier_key='scale', indicator_key='indicator'): InputBase.__init__(self, logging=logging) + self.unit_key = unit_key + self.unit_multiplier_key = unit_multiplier_key + self.indicator_key = indicator_key self.source = source @@ -28,8 +32,8 @@ def execute(self, indicator_options): value = item['values'][idx] if value is not None: disaggregations = dimensions.copy() - disaggregations[self.get_unit_key()] = self.get_unit(item) - disaggregations[self.get_unit_multiplier_key()] = self.get_unit_multiplier(item) + disaggregations[self.unit_key] = self.get_unit(item) + disaggregations[self.unit_multiplier_key] = self.get_unit_multiplier(item) row = self.get_row(year, value, disaggregations) indicators[indicator_id].append(row) idx += 1 @@ -61,24 +65,16 @@ def get_years(self, 
row): def get_unit(self, row): - return row[self.get_unit_key()] - - - def get_unit_key(self): - return 'unit' + return row[self.unit_key] def get_unit_multiplier(self, row): - return row[self.get_unit_multiplier_key()] - - - def get_unit_multiplier_key(self): - return 'scale' + return row[self.unit_multiplier_key] def get_indicator_id(self, row): - return row['indicator']['id'] + return row[self.indicator_key]['id'] def get_indicator_name(self, row): - return row['indicator']['name'] + return row[self.indicator_key]['name'] From cc0590cf7a5d5d187c027afdb79f48dfce07515f Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Tue, 6 Apr 2021 17:50:10 -0400 Subject: [PATCH 05/10] Remove arbitrary name transformation --- sdg/inputs/InputOpenDataPlatform.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py index cd37749c..3f9c7a89 100644 --- a/sdg/inputs/InputOpenDataPlatform.py +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -22,7 +22,6 @@ def execute(self, indicator_options): for item in parsed['data']: indicator_id = self.normalize_indicator_id(self.get_indicator_id(item)) indicator_name = self.normalize_indicator_name(self.get_indicator_name(item), indicator_id) - indicator_name = indicator_name.strip(':').strip('.').strip() if indicator_id not in indicators: indicators[indicator_id] = [] dimensions = self.get_dimensions(item) From 91ad84e1ff8e53e89a810f7fa4d8f8fd36c1eb4b Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Mon, 19 Apr 2021 14:37:42 -0400 Subject: [PATCH 06/10] Support column/code map --- sdg/inputs/InputOpenDataPlatform.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py index 3f9c7a89..c294558d 100644 --- a/sdg/inputs/InputOpenDataPlatform.py +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -6,8 +6,10 @@ class InputOpenDataPlatform(InputBase): def __init__(self, source=None, logging=None, 
unit_key='unit', - unit_multiplier_key='scale', indicator_key='indicator'): - InputBase.__init__(self, logging=logging) + unit_multiplier_key='scale', indicator_key='indicator', + column_map=None, code_map=None): + InputBase.__init__(self, logging=logging, column_map=column_map, + code_map=code_map) self.unit_key = unit_key self.unit_multiplier_key = unit_multiplier_key self.indicator_key = indicator_key From 947eeb73ff1af2cecf4d36d7d5b66d5602594430 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Fri, 7 May 2021 07:50:07 -0400 Subject: [PATCH 07/10] Fix for year range --- sdg/inputs/InputOpenDataPlatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py index c294558d..d7854090 100644 --- a/sdg/inputs/InputOpenDataPlatform.py +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -62,7 +62,7 @@ def get_years(self, row): end = int(row['endDate'][0:4]) if start == end: return [start] - return list(range(start, end)) + return list(range(start, end + 1)) def get_unit(self, row): From a4d34fba52ece4f58f0ba280048ac8d1d40a9a86 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Mon, 31 May 2021 16:21:49 -0400 Subject: [PATCH 08/10] Separate input for metadata from ODP --- sdg/inputs/InputOpenDataPlatform.py | 1 - sdg/inputs/InputOpenDataPlatformMeta.py | 63 +++++++++++++++++++++++++ sdg/inputs/__init__.py | 1 + sdg/open_sdg.py | 3 ++ 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 sdg/inputs/InputOpenDataPlatformMeta.py diff --git a/sdg/inputs/InputOpenDataPlatform.py b/sdg/inputs/InputOpenDataPlatform.py index d7854090..66820371 100644 --- a/sdg/inputs/InputOpenDataPlatform.py +++ b/sdg/inputs/InputOpenDataPlatform.py @@ -20,7 +20,6 @@ def execute(self, indicator_options): payload = self.fetch_file(self.source) parsed = json.loads(payload) indicators = {} - names = {} for item in parsed['data']: indicator_id = self.normalize_indicator_id(self.get_indicator_id(item)) 
indicator_name = self.normalize_indicator_name(self.get_indicator_name(item), indicator_id) diff --git a/sdg/inputs/InputOpenDataPlatformMeta.py b/sdg/inputs/InputOpenDataPlatformMeta.py new file mode 100644 index 00000000..b7b5c768 --- /dev/null +++ b/sdg/inputs/InputOpenDataPlatformMeta.py @@ -0,0 +1,63 @@ +import json +import os +import pandas as pd +from sdg.inputs import InputBase + +class InputOpenDataPlatformMeta(InputBase): + + + def __init__(self, source=None, logging=None, indicator_key='SDG_INDICATOR', + metadata_mapping=None): + InputBase.__init__(self, logging=logging) + self.indicator_key = indicator_key + self.metadata_mapping = metadata_mapping + self.source = source + + + def load_metadata_mapping(self): + mapping = None + if self.metadata_mapping is None: + mapping = {} + elif isinstance(self.metadata_mapping, dict): + mapping = self.metadata_mapping + # Otherwise assume it is a path to a file. + else: + extension = os.path.splitext(self.metadata_mapping)[1] + if extension.lower() == '.csv': + mapping = pd.read_csv(self.metadata_mapping, header=None, index_col=0, squeeze=True).to_dict() + + if mapping is None: + raise Exception('Format of metadata_mapping should be a dict or a path to a CSV file.') + + self.metadata_mapping = mapping + + + def apply_metadata_mapping(self, metadata): + for human_key in self.metadata_mapping: + machine_key = self.metadata_mapping[human_key] + if human_key in metadata and human_key != machine_key: + metadata[machine_key] = metadata[human_key] + del metadata[human_key] + + + def execute(self, indicator_options): + self.load_metadata_mapping() + payload = self.fetch_file(self.source) + parsed = json.loads(payload) + for item in parsed['data']: + meta = item.copy() + self.apply_metadata_mapping(meta) + indicator_id = self.normalize_indicator_id(self.get_indicator_id(item)) + indicator_name = self.normalize_indicator_name(self.get_indicator_name(item), indicator_id) + self.add_indicator(indicator_id, name=indicator_name, 
meta=meta, options=indicator_options) + + + def get_indicator_id(self, row): + if self.indicator_key not in row: + print(row) + raise Exception('The indicator_key was not found in the metadata shown above.') + return row[self.indicator_key] + + + def get_indicator_name(self, row): + return self.get_indicator_id(row) diff --git a/sdg/inputs/__init__.py b/sdg/inputs/__init__.py index 20bc5140..38db2a1d 100644 --- a/sdg/inputs/__init__.py +++ b/sdg/inputs/__init__.py @@ -17,3 +17,4 @@ from .InputCkan import InputCkan from .InputSdmxMl_UnitedNationsApi import InputSdmxMl_UnitedNationsApi from .InputOpenDataPlatform import InputOpenDataPlatform +from .InputOpenDataPlatformMeta import InputOpenDataPlatformMeta diff --git a/sdg/open_sdg.py b/sdg/open_sdg.py index cd85a3e8..fb500b39 100644 --- a/sdg/open_sdg.py +++ b/sdg/open_sdg.py @@ -449,6 +449,7 @@ def open_sdg_input_from_dict(params, options): 'InputYamlMeta', 'InputSdmxMeta', 'InputOpenDataPlatform', + 'InputOpenDataPlatformMeta', 'InputWordMeta', ] if input_class not in allowed: @@ -490,6 +491,8 @@ def open_sdg_input_from_dict(params, options): input_instance = sdg.inputs.InputSdmxMeta(**params) elif input_class == 'InputOpenDataPlatform': input_instance = sdg.inputs.InputOpenDataPlatform(**params) + elif input_class == 'InputOpenDataPlatformMeta': + input_instance = sdg.inputs.InputOpenDataPlatformMeta(**params) elif input_class == 'InputWordMeta': input_instance = sdg.inputs.InputWordMeta(**params) From 67cef636c0c512e49931560c40326e43ace53987 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Mon, 31 May 2021 16:34:29 -0400 Subject: [PATCH 09/10] Use mapped metadata --- sdg/inputs/InputOpenDataPlatformMeta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdg/inputs/InputOpenDataPlatformMeta.py b/sdg/inputs/InputOpenDataPlatformMeta.py index b7b5c768..58c819fc 100644 --- a/sdg/inputs/InputOpenDataPlatformMeta.py +++ b/sdg/inputs/InputOpenDataPlatformMeta.py @@ -47,8 +47,8 @@ def 
execute(self, indicator_options): for item in parsed['data']: meta = item.copy() self.apply_metadata_mapping(meta) - indicator_id = self.normalize_indicator_id(self.get_indicator_id(item)) - indicator_name = self.normalize_indicator_name(self.get_indicator_name(item), indicator_id) + indicator_id = self.normalize_indicator_id(self.get_indicator_id(meta)) + indicator_name = self.normalize_indicator_name(self.get_indicator_name(meta), indicator_id) self.add_indicator(indicator_id, name=indicator_name, meta=meta, options=indicator_options) From 888c96bc83545ab562df2a13203ad7b87b9a4b81 Mon Sep 17 00:00:00 2001 From: Brock Fanning Date: Mon, 31 May 2021 17:55:27 -0400 Subject: [PATCH 10/10] Fixes for meta input --- sdg/inputs/InputOpenDataPlatformMeta.py | 26 ++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/sdg/inputs/InputOpenDataPlatformMeta.py b/sdg/inputs/InputOpenDataPlatformMeta.py index 58c819fc..8eef5c9f 100644 --- a/sdg/inputs/InputOpenDataPlatformMeta.py +++ b/sdg/inputs/InputOpenDataPlatformMeta.py @@ -36,6 +36,9 @@ def apply_metadata_mapping(self, metadata): for human_key in self.metadata_mapping: machine_key = self.metadata_mapping[human_key] if human_key in metadata and human_key != machine_key: + # If it has already been mapped, skip it. + if machine_key in metadata and metadata[machine_key] is not None: + continue metadata[machine_key] = metadata[human_key] del metadata[human_key] @@ -47,9 +50,26 @@ def execute(self, indicator_options): for item in parsed['data']: meta = item.copy() self.apply_metadata_mapping(meta) - indicator_id = self.normalize_indicator_id(self.get_indicator_id(meta)) - indicator_name = self.normalize_indicator_name(self.get_indicator_name(meta), indicator_id) - self.add_indicator(indicator_id, name=indicator_name, meta=meta, options=indicator_options) + try: + indicator_id = self.normalize_indicator_id(self.get_indicator_id(meta)) + # Safety check that we got a real indicator id. 
+ assert len(indicator_id.split('-')) >= 3 + indicator_name = self.normalize_indicator_name(self.get_indicator_name(meta), indicator_id) + self.add_indicator(indicator_id, name=indicator_name, meta=meta, options=indicator_options) + except Exception as e: + print('Unable to parse an indicator in InputOpenDataPlatformMeta. Error below:') + print(e) + + + def normalize_indicator_id(self, indicator_id): + normalized = InputBase.normalize_indicator_id(self, indicator_id) + # A common issue is when an extra fourth part gets added. + parts = normalized.split('-') + if len(parts) == 4 and len(parts[3]) > 2: + # Assume the part is unneeded. + parts.pop() + normalized = '-'.join(parts) + return normalized def get_indicator_id(self, row):