
Commit 1e20a04: merge in develop

AnneONS committed Dec 18, 2024
2 parents: 24fd6dd + d7628af
Showing 27 changed files with 1,333 additions and 179 deletions.
7 changes: 7 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,7 @@
# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
#
# These owners will be the default owners for everything in
# the repo (*). Unless a later match takes precedence,
# the following will be requested for
# review when someone opens a pull request.
* @adriano-lopresti @AnneONS @Ryan2Y79 @woodact
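
CODEOWNERS rules are evaluated top to bottom and the last matching pattern takes precedence, so a later, more specific entry (for example, a hypothetical `/docs/ @AnneONS` line) would override the global `*` rule for files under docs/.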
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
@@ -2,7 +2,7 @@
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/kynan/nbstripout
rev: 0.4.0
rev: 0.8.1
hooks:
- id: nbstripout
name: nbstripout - Strip outputs from notebooks (auto-fixes)
@@ -21,28 +21,28 @@ repos:
- id: trailing-whitespace
name: Check for trailing whitespaces (auto-fixes)
- repo: https://github.com/pycqa/isort
rev: 5.8.0
rev: 5.13.2
hooks:
- id: isort
name: isort - Sort Python imports (auto-fixes)
types: [cython, pyi, python]
args: ["--profile", "black", "--filter-files"]
- repo: https://github.com/psf/black
rev: 22.8.0 # Replace by any tag/version: https://github.com/psf/black/tags
rev: 24.10.0 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
name: black - consistent Python code formatting (auto-fixes)
language_version: python # Should be a command that runs python3.6+
exclude: .*/tests/.*|^\.cruft\.json$|.*\tests\.*|helpers/.*|^tests
args: ["--preview", "--line-length=88"]
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
rev: 7.1.1
hooks:
- id: flake8
name: flake8 - Python linting
exclude: .*/tests*|^\.cruft\.json|helpers/.*|.*\tests*$|^tests
- repo: https://github.com/nbQA-dev/nbQA
rev: 0.12.0
rev: 1.9.1
hooks:
- id: nbqa-isort
name: nbqa-isort - Sort Python imports (notebooks; auto-fixes)
@@ -59,7 +59,7 @@ repos:
# name: nbqa-flake8 - Python linting (notebooks)
# additional_dependencies: [ flake8==3.9.2 ]
- repo: https://github.com/Yelp/detect-secrets
rev: v1.0.3
rev: v1.5.0
hooks:
- id: detect-secrets
name: detect-secrets - Detect secrets in staged code
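The rev bumps above are what a run of `pre-commit autoupdate` would typically produce; after updating, `pre-commit run --all-files` re-checks the whole repository against the new hook versions.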
116 changes: 116 additions & 0 deletions helpers/convert_berd_2021_backdata.py
@@ -0,0 +1,116 @@
"""NOTE: This is a temporary script to convert the 2021 backdata to the format required
for MoR imputation. When the mapping module is complete, we can produce a one-off update
of the 2021 data and remove this script."""

import logging
import os
import re

import pandas as pd

from src.imputation.apportionment import run_apportionment
from src.imputation.tmi_imputation import create_imp_class_col
from src.mapping.pg_conversion import pg_to_pg_mapper
from src.staging import postcode_validation as pcval
from src.staging import staging_helpers as stage_hlp
from src.utils.local_file_mods import rd_read_csv, rd_write_csv

MappingMainLogger = logging.getLogger(__name__)


def do_pg_conv(backdata: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Map product group (PG) codes in the backdata using the numeric-to-alpha mapper.

    Args:
        backdata (pd.DataFrame): Backdata for the current year.
        config (dict): The configuration settings.

    Returns:
        pd.DataFrame: Backdata with PG codes converted.
    """
    # Load and validate the numeric-to-alpha PG mapper
    pg_num_alpha = stage_hlp.load_validate_mapper(
        "pg_num_alpha_mapper_path",
        config,
        MappingMainLogger,
    )

    backdata = pg_to_pg_mapper(
        backdata,
        pg_num_alpha,
    )
    return backdata


def prep_2021_backdata(backdata: pd.DataFrame) -> pd.DataFrame:
    """Prepare the backdata for MoR imputation.

    Args:
        backdata (pd.DataFrame): Backdata for the current year.

    Returns:
        pd.DataFrame: Prepped backdata.
    """
    # Convert backdata column names from qXXX to XXX, e.g. "q601" becomes "601".
    # Note that this is only applicable when using the backdata on the network.
    p = re.compile(r"q\d{3}")
    cols = [col for col in backdata.columns if p.match(col)]
    to_rename = {col: col[1:] for col in cols}
    backdata = backdata.rename(columns=to_rename)

    # Apply postcode formatting to clean the postcodes in column 601 of the backdata
    backdata["601"] = backdata["601"].apply(pcval.format_postcodes)

    return backdata


def get_backdate_wanted_cols(backdata: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Get the columns required for the backdata.

    Args:
        backdata (pd.DataFrame): The backdata.
        config (dict): The configuration settings.

    Returns:
        pd.DataFrame: The backdata with only the required columns.
    """
    # Load the list of columns to keep
    backdata_cols = stage_hlp.load_required_columns(
        "backdata_required_cols_path",
        config,
        MappingMainLogger,
    )

    # Keep only the required columns that are present in the backdata
    cols = list(backdata.columns)
    wanted_cols = [col for col in backdata_cols if col in cols]

    return backdata[wanted_cols]


def create_imp_marker_col(df: pd.DataFrame) -> pd.DataFrame:
    """Create the imp_marker column for the backdata.

    Clear responders are marked "R"; all other records are marked
    "no_imputation".

    Args:
        df (pd.DataFrame): The backdata.

    Returns:
        pd.DataFrame: The backdata with the imp_marker column.
    """
    clear_responders_mask = df.status.isin(["Clear", "Clear - overridden"])
    df.loc[clear_responders_mask, "imp_marker"] = "R"
    df.loc[~clear_responders_mask, "imp_marker"] = "no_imputation"

    return df


def create_backdata(config: dict) -> pd.DataFrame:
    """Read, prepare and write out the 2021 backdata.

    Args:
        config (dict): The configuration settings.

    Returns:
        pd.DataFrame: The converted backdata, which is also written to csv.
    """
    staging_dict = config["staging_paths"]
    backdata_path = staging_dict["backdata_path"]

    # Read the raw 2021 backdata
    backdata = rd_read_csv(backdata_path)

    # Rename qXXX columns and clean postcodes
    backdata = prep_2021_backdata(backdata)

    # Map numeric PG codes to their alpha equivalents
    backdata = do_pg_conv(backdata, config)

    # Apportion the data
    backdata = run_apportionment(backdata)

    # Create the imputation class column from columns 200 and 201
    backdata = create_imp_class_col(backdata, "200", "201")

    # Mark clear responders with "R" and everything else with "no_imputation"
    backdata = create_imp_marker_col(backdata)

    backdata_out_path = config["imputation_paths"]["backdata_out_path"]
    rd_write_csv(os.path.join(backdata_out_path, "2021_backdata.csv"), backdata)

    return backdata
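
For reference, a minimal sketch of how this one-off script might be driven, assuming the pipeline config is a YAML file containing the staging_paths and imputation_paths keys used above (the config path and yaml loading here are illustrative, not part of this commit):

import yaml

from helpers.convert_berd_2021_backdata import create_backdata

# Hypothetical config location; the real pipeline may load its config differently.
with open("config/userconfig.yaml", "r") as f:
    config = yaml.safe_load(f)

# Requires config["staging_paths"]["backdata_path"] and
# config["imputation_paths"]["backdata_out_path"] to be set.
backdata_2021 = create_backdata(config)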