From 1a7a4c81f015eb586cae711850a8dc731a74bd86 Mon Sep 17 00:00:00 2001 From: Lewis Date: Tue, 17 Dec 2024 16:04:12 +0000 Subject: [PATCH 1/3] changed parameter from str to intended df --- src/staging/staging_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/staging/staging_main.py b/src/staging/staging_main.py index 40f246484..938efacda 100644 --- a/src/staging/staging_main.py +++ b/src/staging/staging_main.py @@ -216,7 +216,7 @@ def run_staging( # noqa: C901 backdata_path = staging_dict["backdata_path"] rd_file_exists(backdata_path, raise_error=True) backdata = rd_read_csv(backdata_path) - val.validate_data_with_schema(backdata_path, "./config/backdata_schema.toml") + val.validate_data_with_schema(backdata, "./config/backdata_schema.toml") StagingMainLogger.info("Backdata File Loaded Successfully...") From 01f1446f3fd09d021a2d7e3899201a01737285ea Mon Sep 17 00:00:00 2001 From: Lewis Date: Wed, 18 Dec 2024 16:56:37 +0000 Subject: [PATCH 2/3] modified config to point to latest PNP output --- src/dev_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dev_config.yaml b/src/dev_config.yaml index 0b12542fa..70e92a499 100644 --- a/src/dev_config.yaml +++ b/src/dev_config.yaml @@ -81,7 +81,7 @@ network_paths: # Imputation and outliers input paths # backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_oct_24.csv" backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_published_v347.csv" - pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_output/PNP_2021_cleaned_backdata.csv" + pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_prep/PNP_2021_backdata_for_checking.csv" manual_imp_trim_path: 
"06_imputation/manual_trimming/2023_manual_trimming_v1.csv" manual_outliers_path: "07_outliers/manual_outliers/2023_manual_outliers_v1.csv" # Construction paths From fa2418f47ef70356555e880b02cfdfabc0dc8327 Mon Sep 17 00:00:00 2001 From: Lewis Date: Fri, 20 Dec 2024 13:29:37 +0000 Subject: [PATCH 3/3] changed backdata schema 604 to str, backdata name in config, and modified validation to account for columns that only contain nan values --- config/backdata_schema.toml | 2 +- src/dev_config.yaml | 2 +- src/staging/validation.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/config/backdata_schema.toml b/config/backdata_schema.toml index c2cfe4a8e..94e687095 100644 --- a/config/backdata_schema.toml +++ b/config/backdata_schema.toml @@ -272,7 +272,7 @@ Deduced_Data_Type = "float64" [604] old_name = "604" -Deduced_Data_Type = "float64" +Deduced_Data_Type = "object" [708] old_name = "708" diff --git a/src/dev_config.yaml b/src/dev_config.yaml index 70e92a499..ff390477e 100644 --- a/src/dev_config.yaml +++ b/src/dev_config.yaml @@ -81,7 +81,7 @@ network_paths: # Imputation and outliers input paths # backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_oct_24.csv" backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_published_v347.csv" - pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_prep/PNP_2021_backdata_for_checking.csv" + pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_prep/PNP_2021_backdata_clean.csv" manual_imp_trim_path: "06_imputation/manual_trimming/2023_manual_trimming_v1.csv" manual_outliers_path: "07_outliers/manual_outliers/2023_manual_outliers_v1.csv" # Construction paths diff --git a/src/staging/validation.py b/src/staging/validation.py index 
816804745..261ecc016 100644 --- a/src/staging/validation.py +++ b/src/staging/validation.py @@ -156,7 +156,8 @@ def validate_data_with_schema(survey_df: pd.DataFrame, schema_path: str): # noq " the data." ) else: - survey_df[column] = survey_df[column].astype(dtypes_dict[column]) + if not survey_df[column].isna().all(): + survey_df[column] = survey_df[column].astype(dtypes_dict[column]) except Exception as e: ValidationLogger.error(e) ValidationLogger.info("Validation successful")