Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RDRP-1087 outliers #395

Merged
merged 4 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/dev_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ network_paths:
# Imputation and outliers input paths
# backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_oct_24.csv"
backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_published_v347.csv"
pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_output/PNP_2021_backdata_thousands.csv"
pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_output/PNP_2021_backdata.csv"
manual_imp_trim_path: "06_imputation/manual_trimming/2023_manual_trimming_v1.csv"
manual_outliers_path: "07_outliers/manual_outliers/2023_manual_outliers_v1.csv"
# Construction paths
Expand Down
62 changes: 43 additions & 19 deletions src/outlier_detection/auto_outliers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Apply outlier detection to the dataset."""

import logging
import pandas as pd
from typing import List
Expand Down Expand Up @@ -72,6 +73,29 @@ def filter_valid(df: pd.DataFrame, value_col: str) -> pd.DataFrame:
return filtered_df


def get_clip_bands(
    count_df: pd.DataFrame, clip: float, upper: bool = True
) -> pd.DataFrame:
    """Calculate the number of rows to clip based on the count and clip percentage.

    The calculation is done on a copy, so the dataframe passed in by the
    caller is left unchanged.

    Args:
        count_df (pd.DataFrame): The dataframe with the count of rows for each
            group, held in a column named "gr_count".
        clip (float): The percentage to clip as a float (e.g. 0.05).
        upper (bool): Whether to clip the highest values (True) or the
            lowest values (False).

    Returns:
        pd.DataFrame: A copy of count_df with the columns "band" and
            "rows_to_clip" added, plus "clip_higher_than" when upper is True
            or "clip_lower_than" when upper is False.
    """
    # Work on a copy so the caller's dataframe is not modified in place
    banded_df = count_df.copy()

    # The unrounded number of rows that fall within the clip percentage
    banded_df["band"] = banded_df["gr_count"] * clip
    banded_df["rows_to_clip"] = banded_df["band"].apply(normal_round)

    if upper:
        # Ranks above this threshold are in the upper clip band
        banded_df["clip_higher_than"] = (
            banded_df["gr_count"] - banded_df["rows_to_clip"]
        )
    else:
        # Ranks up to and including this threshold are in the lower clip band
        banded_df["clip_lower_than"] = banded_df["rows_to_clip"]

    return banded_df


def flag_outliers(
df: pd.DataFrame, upper_clip: float, lower_clip: float, value_col: str
) -> pd.DataFrame:
Expand All @@ -95,32 +119,32 @@ def flag_outliers(
# Filter for valid sampled data and positive values in the value column
filtered_df = filter_valid(df, value_col)

# Add group count - how many RU refs there are in a cell, period
filtered_df["group_count"] = filtered_df.groupby(groupby_cols)[value_col].transform(
"count"
) # noqa
# Create a dataframe with the group count, from which the number of rows to clip is derived
count_df = filtered_df.groupby(groupby_cols).size().reset_index(name="gr_count")

# Rank margins
filtered_df["high"] = filtered_df["group_count"] * upper_clip
filtered_df["high_rounded"] = filtered_df.apply(
lambda row: normal_round(row["high"]), axis=1
)
filtered_df["upper_band"] = filtered_df["group_count"] - filtered_df["high_rounded"]

filtered_df["low"] = filtered_df["group_count"] * lower_clip
filtered_df["lower_band"] = filtered_df.apply(
lambda row: normal_round(row["low"]), axis=1
)
# Calculate the number of rows to clip for each group
count_df = get_clip_bands(count_df, upper_clip, upper=True)

# Ranks of RU refs in each group, depending on their value
filtered_df["group_rank"] = filtered_df.groupby(groupby_cols)[value_col].rank(
method="first", ascending=True
)

# Outlier conditions
outlier_cond = (filtered_df["group_rank"] > filtered_df["upper_band"]) | (
filtered_df["group_rank"] <= filtered_df["lower_band"]
)
# Calculate the number of rows to clip by merging in the group count
filtered_df = filtered_df.merge(count_df, how="left", on=groupby_cols)

# Create outlier condition
outlier_cond = filtered_df["group_rank"] > filtered_df["clip_higher_than"]

# If lower clipping is specified, add the condition
if lower_clip > 0:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The case where the function "get_clip_bands" is called with a non-zero lower_clip has not been tested. I'm wondering whether it is worth adding a test for it.
Also, has "flag_outliers" been unit-tested for the case where lower_clip is non-zero?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the flag_outliers has been unit tested where the lower_clip is non zero. I think you're right, I'll add in a unit test for get_clip_bands for non-zero lower clip as well, and for when both are non-zero.

count_df = get_clip_bands(count_df, lower_clip, upper=False)
filtered_df = filtered_df.merge(
count_df[groupby_cols + ["clip_lower_than"]], how="left", on=groupby_cols
)
lower_cond = filtered_df["group_rank"] <= filtered_df["clip_lower_than"]

outlier_cond = outlier_cond | lower_cond

# Create outlier flag
filtered_df[f"{value_col}_outlier_flag"] = outlier_cond
Expand Down
44 changes: 43 additions & 1 deletion tests/test_outlier_detection/test_auto_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

from pandas._testing import assert_frame_equal
from pandas import DataFrame as pandasDF
import numpy as np
import pytest

import src.outlier_detection.auto_outliers as auto
Expand Down Expand Up @@ -159,6 +158,49 @@ def test_filter_valid(self, caplog):
assert "has no valid returns for outliers" in caplog.text


class TestGetClipBands:
    """Unit tests for get_clip_bands function."""

    def input_df(self):
        """Create an input dataframe holding a group count for each cell."""
        cols = ["cell_no", "gr_count"]

        data = [
            [111, 9],
            [222, 10],
            [333, 19],
            [444, 20],
            [555, 29],
            [666, 30],
        ]

        return pandasDF(data=data, columns=cols)

    def expected_df(self):
        """Create the expected output when clipping the upper 5% of rows."""
        cols = ["cell_no", "gr_count", "band", "rows_to_clip", "clip_higher_than"]

        data = [
            [111, 9, 0.45, 0, 9],  # no rows to clip
            [222, 10, 0.5, 1, 9],  # 1 row to clip, clip rows higher than rank 9
            [333, 19, 0.95, 1, 18],
            [444, 20, 1.0, 1, 19],
            [555, 29, 1.45, 1, 28],
            [666, 30, 1.5, 2, 28],
        ]

        return pandasDF(data=data, columns=cols)

    def expected_lower_df(self):
        """Create the expected output when clipping the lower 5% of rows."""
        cols = ["cell_no", "gr_count", "band", "rows_to_clip", "clip_lower_than"]

        data = [
            [111, 9, 0.45, 0, 0],  # no rows to clip
            [222, 10, 0.5, 1, 1],  # 1 row to clip, clip rows up to rank 1
            [333, 19, 0.95, 1, 1],
            [444, 20, 1.0, 1, 1],
            [555, 29, 1.45, 1, 1],
            [666, 30, 1.5, 2, 2],
        ]

        return pandasDF(data=data, columns=cols)

    def test_get_clip_bands(self):
        """Test for get_clip_bands function with the default upper clipping."""
        upper_clip = 0.05
        df = self.input_df()

        result_df = auto.get_clip_bands(df, upper_clip)
        expected_df = self.expected_df()

        assert_frame_equal(result_df, expected_df)

    def test_get_clip_bands_lower(self):
        """Test for get_clip_bands function with a non-zero lower clip."""
        lower_clip = 0.05
        df = self.input_df()

        result_df = auto.get_clip_bands(df, lower_clip, upper=False)
        expected_df = self.expected_lower_df()

        assert_frame_equal(result_df, expected_df)



# Seven tests for `flag_outliers()`:
# Test the correct rows are flagged as outliers
# Test that no outliers are flagged if both clips = 0
Expand Down
Loading