Skip to content

Commit

Permalink
feat: reorganize TPC-H test scripts to make them more similar (#171)
Browse files Browse the repository at this point in the history
This PR splits the `test_*_tpch.py` scripts into several
`test_X_on_Y.py` scripts that have minimal differences among them. More
precisely, `test_duckdb_on_acero.py` and `test_duckdb_on_duckdb.py` now
only differ on which consumer is being built and
`test_duckdb_on_acero.py` and `test_isthmus_on_acero.py` differ only on
which plan is used (the former produces it on the fly, the latter loads
it from a file). The splitting essentially consisted of moving/copying
entire functions from one file to another, changing the order of lines,
renaming variables, factoring out common expressions into variables, and
changing comments. This is a preparatory step for reusing test
functionality in `common.py` in all three scripts.

Signed-off-by: Ingo Müller <[email protected]>
  • Loading branch information
ingomueller-net authored Dec 17, 2024
1 parent 525f26e commit 6659d6f
Show file tree
Hide file tree
Showing 70 changed files with 191 additions and 177 deletions.
176 changes: 0 additions & 176 deletions substrait_consumer/tests/integration/test_acero_tpch.py

This file was deleted.

93 changes: 93 additions & 0 deletions substrait_consumer/tests/integration/test_duckdb_on_acero.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from pathlib import Path

import duckdb
import pytest
from pytest_snapshot.plugin import Snapshot

from substrait_consumer.consumers.acero_consumer import AceroConsumer
from substrait_consumer.functional.utils import load_json
from substrait_consumer.producers.duckdb_producer import DuckDBProducer

# Directory containing the per-query JSON test-case configurations.
CONFIG_DIR = Path(__file__).parent.parent / "integration"
TPCH_CONFIG_DIR = CONFIG_DIR / "tpch"
# One-element tuples so that each path binds to the single "path" parameter
# of the parametrized test below.
TEST_CASE_PATHS = [
    (path.relative_to(CONFIG_DIR),) for path in TPCH_CONFIG_DIR.rglob("*.json")
]
# Human-readable pytest IDs: the config path without its ".json" suffix.
IDS = [str(path[0]).removesuffix(".json") for path in TEST_CASE_PATHS]


@pytest.mark.parametrize(["path"], TEST_CASE_PATHS, ids=IDS)
@pytest.mark.usefixtures("prepare_tpch_parquet_data")
def test_substrait_query(
    path: Path,
    snapshot: Snapshot,
    db_con: duckdb.DuckDBPyConnection,
) -> None:
    """
    Produce a Substrait plan from SQL with DuckDB, run it on Acero, and
    compare the result against running the same SQL directly on DuckDB.

    Steps:
    1. Load the JSON test-case configuration addressed by ``path``.
    2. Set up the Acero consumer and the DuckDB producer with the test
       case's local files and named tables.
    3. Produce a Substrait plan from the SQL query via DuckDB.
    4. Run the Substrait plan on Acero.
    5. Run the SQL directly on DuckDB and compare both results.
    The comparison outcome (or the exception type of the first failing
    step) is recorded as a snapshot.

    Parameters:
        path:
            Path to the JSON test-case configuration, relative to
            ``CONFIG_DIR``.
        snapshot:
            Snapshot fixture used to record the test outcome.
        db_con:
            DuckDB connection shared by producer and consumer.
    """
    test_case = load_json(CONFIG_DIR / path)
    test_name = test_case["test_name"]
    local_files = test_case["local_files"]
    named_tables = test_case["named_tables"]
    sql_query, supported_producers = test_case["sql_query"]

    # This test can only produce plans with DuckDB; skip-proof the config.
    assert "duckdb" in supported_producers

    # Test names end in the TPC-H query number, e.g. "..._07".
    tpch_num = int(test_name.split("_")[-1])

    # Reuse the snapshot directory of the corresponding SQL test.
    snapshot.snapshot_dir = snapshot.snapshot_dir.parent / f"test_tpch_sql_{tpch_num}"

    consumer = AceroConsumer()
    producer = DuckDBProducer()

    consumer.setup(db_con, local_files, named_tables)
    producer.setup(db_con, local_files, named_tables)

    outcome_path = f"query_{tpch_num:02d}_outcome.txt"

    # Produce DuckDB plan from SQL query.
    try:
        proto_bytes = producer.produce_substrait(sql_query)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Run the Substrait plan on Acero.
    try:
        substrait_query_result_tb = consumer.run_substrait_query(proto_bytes)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Calculate results to verify against by running the SQL query on DuckDB
    try:
        duckdb_sql_result_tb = producer.run_sql_query(sql_query)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Compare column names case-insensitively: the engines may differ in
    # the casing they report.
    col_names = [x.lower() for x in substrait_query_result_tb.column_names]
    exp_col_names = [x.lower() for x in duckdb_sql_result_tb.column_names]

    # Verify results between substrait plan query and sql running against
    # duckdb are equal.
    outcome = {
        "column_names": col_names == exp_col_names,
        "table": substrait_query_result_tb == duckdb_sql_result_tb,
    }
    snapshot.assert_match(str(outcome), outcome_path)
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def test_substrait_query(

outcome_path = f"query_{tpch_num:02d}_outcome.txt"

# Convert the SQL into a substrait query plan and run the plan.
# Produce DuckDB plan from SQL query.
try:
proto_bytes = producer.produce_substrait(sql_query)
except BaseException as e:
Expand Down
97 changes: 97 additions & 0 deletions substrait_consumer/tests/integration/test_isthmus_on_acero.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from pathlib import Path

import duckdb
import pytest
from pytest_snapshot.plugin import Snapshot

from substrait_consumer.consumers.acero_consumer import AceroConsumer
from substrait_consumer.functional.utils import load_json
from substrait_consumer.producers.duckdb_producer import DuckDBProducer

# Directory containing the pre-generated (Isthmus) Substrait plan files.
PLAN_DIR = Path(__file__).parent / "queries" / "tpch_substrait_plans"

# Directory containing the per-query JSON test-case configurations.
CONFIG_DIR = Path(__file__).parent.parent / "integration"
TPCH_CONFIG_DIR = CONFIG_DIR / "tpch"
# One-element tuples so that each path binds to the single "path" parameter
# of the parametrized test below.
TEST_CASE_PATHS = [
    (path.relative_to(CONFIG_DIR),) for path in TPCH_CONFIG_DIR.rglob("*.json")
]
# Human-readable pytest IDs: the config path without its ".json" suffix.
IDS = [str(path[0]).removesuffix(".json") for path in TEST_CASE_PATHS]


@pytest.mark.parametrize(["path"], TEST_CASE_PATHS, ids=IDS)
@pytest.mark.usefixtures("prepare_tpch_parquet_data")
def test_isthmus_substrait_plan(
    path: Path,
    snapshot: Snapshot,
    db_con: duckdb.DuckDBPyConnection,
) -> None:
    """
    Run a pre-generated Isthmus Substrait plan on Acero and compare the
    result against running the corresponding SQL directly on DuckDB.

    Steps:
    1. Load the JSON test-case configuration addressed by ``path``.
    2. Set up the Acero consumer and the DuckDB producer with the test
       case's local files and named tables.
    3. Load the Isthmus-produced Substrait plan from ``PLAN_DIR``.
    4. Run the Substrait plan on Acero.
    5. Run the SQL directly on DuckDB and compare both results.
    The comparison outcome (or the exception type of the first failing
    step) is recorded as a snapshot.

    Parameters:
        path:
            Path to the JSON test-case configuration, relative to
            ``CONFIG_DIR``.
        snapshot:
            Snapshot fixture used to record the test outcome.
        db_con:
            DuckDB connection shared by producer and consumer.
    """
    test_case = load_json(CONFIG_DIR / path)
    test_name = test_case["test_name"]
    local_files = test_case["local_files"]
    named_tables = test_case["named_tables"]
    sql_query, supported_producers = test_case["sql_query"]

    # The reference results are produced by DuckDB; the config must allow it.
    assert "duckdb" in supported_producers

    # Test names end in the TPC-H query number, e.g. "..._07".
    tpch_num = int(test_name.split("_")[-1])

    # Reuse the snapshot directory of the corresponding SQL test.
    snapshot.snapshot_dir = snapshot.snapshot_dir.parent / f"test_tpch_sql_{tpch_num}"

    consumer = AceroConsumer()
    producer = DuckDBProducer()

    consumer.setup(db_con, local_files, named_tables)
    producer.setup(db_con, local_files, named_tables)

    outcome_path = f"query_{tpch_num:02d}_outcome.txt"

    # Load Isthmus plan from file. NOTE(review): this is the JSON text of
    # the plan, not serialized protobuf bytes; run_substrait_query appears
    # to accept it as-is.
    substrait_plan_path = PLAN_DIR / f"query_{tpch_num:02d}_plan.json"
    with open(substrait_plan_path, "r") as f:
        plan_json = f.read()

    # Run the Substrait plan on Acero.
    try:
        substrait_query_result_tb = consumer.run_substrait_query(plan_json)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Calculate results to verify against by running the SQL query on DuckDB
    try:
        duckdb_sql_result_tb = producer.run_sql_query(sql_query)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Compare column names case-insensitively: the engines may differ in
    # the casing they report.
    col_names = [x.lower() for x in substrait_query_result_tb.column_names]
    exp_col_names = [x.lower() for x in duckdb_sql_result_tb.column_names]

    # Verify results between substrait plan query and sql running against
    # duckdb are equal.
    outcome = {
        "column_names": col_names == exp_col_names,
        "table": substrait_query_result_tb == duckdb_sql_result_tb,
    }
    snapshot.assert_match(str(outcome), outcome_path)

0 comments on commit 6659d6f

Please sign in to comment.