Skip to content

Commit

Permalink
fix(eap): fix bug with data_present (#6726)
Browse files Browse the repository at this point in the history
Fixes getsentry/eap-planning#144

## Additional Context
When we perform aggregations over attributes, the function is converted
into function_nameIf (e.g. countIf), which returns a default value when no
rows match the condition. As a result, there is no way to distinguish an
aggregate result that is 0 because the aggregated values genuinely total
zero (for example, a sum of values that add up to 0) from a result that is
0 because no rows matched the condition. To deal with this, we now compute
the number of events being aggregated even when we are not extrapolating,
so we can determine whether data was present.
  • Loading branch information
davidtsuk authored Jan 8, 2025
1 parent 6c5213c commit 33453a0
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 37 deletions.
22 changes: 5 additions & 17 deletions snuba/web/rpc/common/aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class ExtrapolationContext(ABC):
sample_count: int

@property
def extrapolated_data_present(self) -> bool:
def is_data_present(self) -> bool:
return self.sample_count > 0

@property
Expand All @@ -67,8 +67,8 @@ def from_row(
value = row_data[column_label]

confidence_interval = None
average_sample_rate = None
sample_count = None
average_sample_rate = 0
sample_count = 0

percentile = 0.0
granularity = 0.0
Expand Down Expand Up @@ -108,18 +108,6 @@ def from_row(
elif custom_column_information.custom_column_id == "count":
sample_count = col_value

if (
confidence_interval is None
or average_sample_rate is None
or sample_count is None
):
return GenericExtrapolationContext(
value=value,
confidence_interval=None,
average_sample_rate=0,
sample_count=0,
)

if is_percentile:
return PercentileExtrapolationContext(
value=value,
Expand Down Expand Up @@ -150,7 +138,7 @@ def is_extrapolated(self) -> bool:

@cached_property
def reliability(self) -> Reliability.ValueType:
if not self.is_extrapolated or not self.extrapolated_data_present:
if not self.is_extrapolated or not self.is_data_present:
return Reliability.RELIABILITY_UNSPECIFIED

relative_confidence = (
Expand Down Expand Up @@ -183,7 +171,7 @@ def is_extrapolated(self) -> bool:

@cached_property
def reliability(self) -> Reliability.ValueType:
if not self.is_extrapolated or not self.extrapolated_data_present:
if not self.is_extrapolated or not self.is_data_present:
return Reliability.RELIABILITY_UNSPECIFIED

lower_bound, upper_bound = _calculate_approximate_ci_percentile_levels(
Expand Down
26 changes: 10 additions & 16 deletions snuba/web/rpc/v1/endpoint_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,7 @@ def _convert_result_timeseries(
extrapolation_context = ExtrapolationContext.from_row(
timeseries.label, row_data
)
if (
# This isn't quite right but all non extrapolated aggregates
# are assumed to be present.
not extrapolation_context.is_extrapolated
# This checks if the extrapolated aggregate is present by
# inspecting the sample count
or extrapolation_context.extrapolated_data_present
):
if extrapolation_context.is_data_present:
timeseries.data_points.append(
DataPoint(
data=row_data[timeseries.label],
Expand Down Expand Up @@ -209,32 +202,33 @@ def _build_query(request: TimeSeriesRequest) -> Query:
for aggregation in request.aggregations
]

extrapolation_columns = []
additional_context_columns = []
for aggregation in request.aggregations:
if (
aggregation.extrapolation_mode
== ExtrapolationMode.EXTRAPOLATION_MODE_SAMPLE_WEIGHTED
):
confidence_interval_column = get_confidence_interval_column(aggregation)
if confidence_interval_column is not None:
extrapolation_columns.append(
additional_context_columns.append(
SelectedExpression(
name=confidence_interval_column.alias,
expression=confidence_interval_column,
)
)

average_sample_rate_column = get_average_sample_rate_column(aggregation)
count_column = get_count_column(aggregation)
extrapolation_columns.append(
additional_context_columns.append(
SelectedExpression(
name=average_sample_rate_column.alias,
expression=average_sample_rate_column,
)
)
extrapolation_columns.append(
SelectedExpression(name=count_column.alias, expression=count_column)
)

count_column = get_count_column(aggregation)
additional_context_columns.append(
SelectedExpression(name=count_column.alias, expression=count_column)
)

groupby_columns = [
SelectedExpression(
Expand Down Expand Up @@ -275,7 +269,7 @@ def _build_query(request: TimeSeriesRequest) -> Query:
),
*aggregation_columns,
*groupby_columns,
*extrapolation_columns,
*additional_context_columns,
],
granularity=request.granularity_secs,
condition=base_conditions_and(
Expand Down
8 changes: 4 additions & 4 deletions tests/web/rpc/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_get_confidence_interval_column_for_non_extrapolatable_column() -> None:
"column_name",
"average_sample_rate",
"reliability",
"extrapolated_data_present",
"is_data_present",
"is_extrapolated",
),
[
Expand Down Expand Up @@ -180,18 +180,18 @@ def test_get_confidence_interval_column_for_non_extrapolatable_column() -> None:
),
],
)
def test_get_extrapolation_meta(
def test_get_extrapolation_context(
row_data: dict[str, Any],
column_name: str,
average_sample_rate: float,
reliability: Reliability.ValueType,
extrapolated_data_present: bool,
is_data_present: bool,
is_extrapolated: bool,
) -> None:
extrapolation_context = ExtrapolationContext.from_row(column_name, row_data)
assert extrapolation_context.average_sample_rate == average_sample_rate
assert extrapolation_context.reliability == reliability
assert extrapolation_context.extrapolated_data_present == extrapolated_data_present
assert extrapolation_context.is_data_present == is_data_present
assert extrapolation_context.is_extrapolated == is_extrapolated


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,54 @@ def test_start_time_not_divisible_by_time_buckets_returns_valid_data(self) -> No
assert len(ts.data_points) == 1
assert ts.data_points[0].data == 300

def test_with_non_existent_attribute(self) -> None:
    """Aggregating over an attribute that no stored event has must yield
    buckets marked data_present=False rather than a default value of 0.

    This pins the fix: a SUM over missing data is reported as "no data",
    not as a sum that happens to equal zero.
    """
    # Seed the store with one real metric so the time range is non-empty;
    # the query below deliberately targets a *different* attribute name.
    store_timeseries(
        BASE_TIME,
        1,
        3600,
        metrics=[
            DummyMetric("test_metric", get_value=lambda x: 1),
        ],
    )

    # Request a 30-minute window in 300s buckets, summing an attribute
    # ("non_existent_metric") that was never written.
    # NOTE(review): extrapolation is disabled here — this exercises the
    # non-extrapolated path, where data presence is now derived from the
    # event count rather than assumed.
    message = TimeSeriesRequest(
        meta=RequestMeta(
            project_ids=[1, 2, 3],
            organization_id=1,
            cogs_category="something",
            referrer="something",
            start_timestamp=Timestamp(seconds=int(BASE_TIME.timestamp())),
            end_timestamp=Timestamp(seconds=int(BASE_TIME.timestamp() + 60 * 30)),
        ),
        aggregations=[
            AttributeAggregation(
                aggregate=Function.FUNCTION_SUM,
                key=AttributeKey(
                    type=AttributeKey.TYPE_FLOAT, name="non_existent_metric"
                ),
                label="sum",
                extrapolation_mode=ExtrapolationMode.EXTRAPOLATION_MODE_NONE,
            ),
        ],
        granularity_secs=300,
    )

    response = EndpointTimeSeries().execute(message)
    # 30 minutes / 300s granularity -> 6 expected bucket timestamps.
    expected_buckets = [
        Timestamp(seconds=int(BASE_TIME.timestamp()) + secs)
        for secs in range(0, 60 * 30, 300)
    ]

    # Every bucket must be an explicit "no data" point: data_present=False
    # and no data value, instead of DataPoint(data=0).
    assert response.result_timeseries == [
        TimeSeries(
            label="sum",
            buckets=expected_buckets,
            data_points=[
                DataPoint(data_present=False) for _ in range(len(expected_buckets))
            ],
        )
    ]


class TestUtils:
def test_no_duplicate_labels(self) -> None:
Expand Down

0 comments on commit 33453a0

Please sign in to comment.