ONSdigital · AnneONS · Jan 7, 2025 · Jan 2, 2025 · Jan 2, 2025 · Jan 7, 2025
@@ -80,7 +80,7 @@ network_paths:
   # Imputation and outliers input paths
   # backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_oct_24.csv"
   backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_published_v347.csv"
-  pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_output/PNP_2021_backdata_thousands.csv"
+  pnp_backdata_path: "R:/BERD Results System Development 2023/DAP_emulation/2021_surveys/PNP/06_imputation/backdata_output/PNP_2021_backdata.csv"
   manual_imp_trim_path: "06_imputation/manual_trimming/2023_manual_trimming_v1.csv"
   manual_outliers_path: "07_outliers/manual_outliers/2023_manual_outliers_v1.csv"
   # Construction paths

@@ -1,4 +1,5 @@
 """Apply outlier detection to the dataset."""
+
 import logging
 import pandas as pd
 from typing import List
@@ -72,6 +73,29 @@ def filter_valid(df: pd.DataFrame, value_col: str) -> pd.DataFrame:
     return filtered_df
 
 
+def get_clip_bands(
+    count_df: pd.DataFrame, clip: float, upper: bool = True
+) -> pd.DataFrame:
+    """Calculate how many rows to clip in each group and the rank cut-off.
+
+    Args:
+        count_df (pd.DataFrame): One row per group; "gr_count" holds the group size
+        clip (float): The fraction of each group to clip, e.g. 0.05 for 5%
+        upper (bool): True adds "clip_higher_than", False adds "clip_lower_than"
+
+    Returns:
+        pd.DataFrame: A copy of count_df with "band", "rows_to_clip" and the cut-off
+    """
+    bands_df = count_df.copy()  # leave the caller's dataframe untouched
+    bands_df["band"] = bands_df["gr_count"] * clip
+    bands_df["rows_to_clip"] = bands_df["band"].apply(normal_round)
+    if upper:
+        bands_df["clip_higher_than"] = bands_df["gr_count"] - bands_df["rows_to_clip"]
+    else:
+        bands_df["clip_lower_than"] = bands_df["rows_to_clip"]
+    return bands_df
+
+
 def flag_outliers(
     df: pd.DataFrame, upper_clip: float, lower_clip: float, value_col: str
 ) -> pd.DataFrame:
@@ -95,32 +119,32 @@ def flag_outliers(
     # Filter for valid sampled data and positive values in the value column
     filtered_df = filter_valid(df, value_col)
 
-    # Add group count - how many RU refs there are in a cell, perod
-    filtered_df["group_count"] = filtered_df.groupby(groupby_cols)[value_col].transform(
-        "count"
-    )  # noqa
+    # Create a dataframe with one row per group and its row count ("gr_count")
+    count_df = filtered_df.groupby(groupby_cols).size().reset_index(name="gr_count")
 
-    # Rank margins
-    filtered_df["high"] = filtered_df["group_count"] * upper_clip
-    filtered_df["high_rounded"] = filtered_df.apply(
-        lambda row: normal_round(row["high"]), axis=1
-    )
-    filtered_df["upper_band"] = filtered_df["group_count"] - filtered_df["high_rounded"]
-
-    filtered_df["low"] = filtered_df["group_count"] * lower_clip
-    filtered_df["lower_band"] = filtered_df.apply(
-        lambda row: normal_round(row["low"]), axis=1
-    )
+    # Calculate the number of rows to clip for each group
+    count_df = get_clip_bands(count_df, upper_clip, upper=True)
 
     # Ranks of RU refs in each group, depending on their value
     filtered_df["group_rank"] = filtered_df.groupby(groupby_cols)[value_col].rank(
         method="first", ascending=True
     )
 
-    # Outlier conditions
-    outlier_cond = (filtered_df["group_rank"] > filtered_df["upper_band"]) | (
-        filtered_df["group_rank"] <= filtered_df["lower_band"]
-    )
+    # Merge the per-group counts and clip bands onto each row of the filtered data
+    filtered_df = filtered_df.merge(count_df, how="left", on=groupby_cols)
+
+    # Create outlier condition
+    outlier_cond = filtered_df["group_rank"] > filtered_df["clip_higher_than"]
+
+    # If lower clipping is specified, add the condition
+    if lower_clip > 0:
+        count_df = get_clip_bands(count_df, lower_clip, upper=False)
+        filtered_df = filtered_df.merge(
+            count_df[groupby_cols + ["clip_lower_than"]], how="left", on=groupby_cols
+        )
+        lower_cond = filtered_df["group_rank"] <= filtered_df["clip_lower_than"]
+
+        outlier_cond = outlier_cond | lower_cond
 
     # Create outlier flag
     filtered_df[f"{value_col}_outlier_flag"] = outlier_cond

@@ -2,7 +2,6 @@
 
 from pandas._testing import assert_frame_equal
 from pandas import DataFrame as pandasDF
-import numpy as np
 import pytest
 
 import src.outlier_detection.auto_outliers as auto
@@ -159,6 +158,49 @@ def test_filter_valid(self, caplog):
         assert "has no valid returns for outliers" in caplog.text
 
 
+class TestGetClipBands:
+    """Unit tests for get_clip_bands function."""
+
+    def input_df(self):
+        """Return one row per cell with its group size in "gr_count"."""
+        cols = ["cell_no", "gr_count"]
+        data = [
+            [111, 9],
+            [222, 10],
+            [333, 19],
+            [444, 20],
+            [555, 29],
+            [666, 30],
+        ]
+
+        return pandasDF(data=data, columns=cols)
+
+    def expected_df(self):
+        """Return the expected upper clip bands for a clip of 0.05."""
+        cols = ["cell_no", "gr_count", "band", "rows_to_clip", "clip_higher_than"]
+        data = [
+            [111, 9, 0.45, 0, 9],  # no rows to clip
+            [222, 10, 0.5, 1, 9],  # 1 row to clip, clip rows higher than rank 9
+            [333, 19, 0.95, 1, 18],
+            [444, 20, 1.0, 1, 19],
+            [555, 29, 1.45, 1, 28],
+            [666, 30, 1.5, 2, 28],
+        ]
+
+        return pandasDF(data=data, columns=cols)
+
+    def test_get_clip_bands(self):
+        """Test for get_clip_bands function."""
+        upper_clip = 0.05
+        df = self.input_df()
+
+        result_df = auto.get_clip_bands(df, upper_clip)
+        expected_df = self.expected_df()
+
+        assert_frame_equal(result_df, expected_df)
+
+
+
 # Seven tests for `flag_outliers()`:
 # Test the correct rows are flagged as outliers
 # Test that no outliers are flagged if both clips = 0