-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: reorganize TPC-H test scripts to make them more similar (#171)
This PR splits the `test_*_tpch.py` scripts into several `test_X_on_Y.py` scripts that have minimal differences among them. More precisely, `test_duckdb_on_acero.py` and `test_duckdb_on_duckdb.py` now only differ on which consumer is being built and `test_duckdb_on_acero.py` and `test_isthmus_on_acero.py` differ only on which plan is used (the former produces it on the fly, the latter loads it from a file). The splitting essentially consisted of moving/copying entire functions from one file to another, changing the order of lines, renaming variables, factoring out common expressions into variables, and changing comments. This is a preparatory step for reusing test functionality in `common.py` in all three scripts. Signed-off-by: Ingo Müller <[email protected]>
- Loading branch information
1 parent
525f26e
commit 6659d6f
Showing
70 changed files
with
191 additions
and
177 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
176 changes: 0 additions & 176 deletions
176
substrait_consumer/tests/integration/test_acero_tpch.py
This file was deleted.
Oops, something went wrong.
93 changes: 93 additions & 0 deletions
93
substrait_consumer/tests/integration/test_duckdb_on_acero.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
from pathlib import Path | ||
|
||
import duckdb | ||
import pytest | ||
from pytest_snapshot.plugin import Snapshot | ||
|
||
from substrait_consumer.consumers.acero_consumer import AceroConsumer | ||
from substrait_consumer.functional.utils import load_json | ||
from substrait_consumer.producers.duckdb_producer import DuckDBProducer | ||
|
||
CONFIG_DIR = Path(__file__).parent.parent / "integration"
TPCH_CONFIG_DIR = CONFIG_DIR / "tpch"
# One single-element tuple per JSON test-case config (relative to CONFIG_DIR),
# in the shape expected by pytest.mark.parametrize(["path"], ...).
TEST_CASE_PATHS = [
    (config_path.relative_to(CONFIG_DIR),)
    for config_path in TPCH_CONFIG_DIR.rglob("*.json")
]
# Human-readable test IDs: the relative config path without its ".json" suffix.
IDS = [str(case[0]).removesuffix(".json") for case in TEST_CASE_PATHS]
|
||
|
||
@pytest.mark.parametrize(["path"], TEST_CASE_PATHS, ids=IDS)
@pytest.mark.usefixtures("prepare_tpch_parquet_data")
def test_substrait_query(
    path: Path,
    snapshot: Snapshot,
    db_con: duckdb.DuckDBPyConnection,
) -> None:
    """
    Produce a substrait plan with DuckDB, run it on Acero, and compare the
    result against running the original SQL query directly on DuckDB.

    1. Load the test-case config and set up producer and consumer with the
       TPC-H parquet files as named tables.
    2. Produce a substrait plan from the SQL query with DuckDB.
    3. Run the substrait plan on Acero.
    4. Run the SQL query directly on DuckDB.
    5. Snapshot-compare the outcome (column names and table equality).

    Parameters:
        path:
            Path of the JSON test-case config, relative to CONFIG_DIR.
        snapshot:
            Snapshot fixture used to record/compare the test outcome.
        db_con:
            DuckDB connection provided by fixture.
    """
    test_case = load_json(CONFIG_DIR / path)
    test_name = test_case["test_name"]
    local_files = test_case["local_files"]
    named_tables = test_case["named_tables"]
    # The "sql_query" entry is a pair of the query text and the list of
    # producers that support this test case.
    sql_query, supported_producers = test_case["sql_query"]

    assert "duckdb" in supported_producers

    # Test names end in the TPC-H query number, e.g. "..._07" -> 7.
    tpch_num = int(test_name.split("_")[-1])

    snapshot.snapshot_dir = snapshot.snapshot_dir.parent / f"test_tpch_sql_{tpch_num}"

    consumer = AceroConsumer()
    producer = DuckDBProducer()

    consumer.setup(db_con, local_files, named_tables)
    producer.setup(db_con, local_files, named_tables)

    outcome_path = f"query_{tpch_num:02d}_outcome.txt"

    # Produce DuckDB plan from SQL query. Any failure is itself the test
    # outcome, recorded as the exception type in the snapshot.
    try:
        proto_bytes = producer.produce_substrait(sql_query)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    try:
        substrait_query_result_tb = consumer.run_substrait_query(proto_bytes)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Calculate results to verify against by running the SQL query on DuckDB.
    try:
        duckdb_sql_result_tb = producer.run_sql_query(sql_query)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Compare column names case-insensitively: the engines may differ in
    # casing only.
    col_names = [x.lower() for x in substrait_query_result_tb.column_names]
    exp_col_names = [x.lower() for x in duckdb_sql_result_tb.column_names]

    # Verify results between substrait plan query and sql running against
    # duckdb are equal.
    outcome = {
        "column_names": col_names == exp_col_names,
        "table": substrait_query_result_tb == duckdb_sql_result_tb,
    }
    snapshot.assert_match(str(outcome), outcome_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
97 changes: 97 additions & 0 deletions
97
substrait_consumer/tests/integration/test_isthmus_on_acero.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
from pathlib import Path | ||
|
||
import duckdb | ||
import pytest | ||
from pytest_snapshot.plugin import Snapshot | ||
|
||
from substrait_consumer.consumers.acero_consumer import AceroConsumer | ||
from substrait_consumer.functional.utils import load_json | ||
from substrait_consumer.producers.duckdb_producer import DuckDBProducer | ||
|
||
# Pre-generated Isthmus substrait plans, one JSON file per TPC-H query.
PLAN_DIR = Path(__file__).parent / "queries" / "tpch_substrait_plans"

CONFIG_DIR = Path(__file__).parent.parent / "integration"
TPCH_CONFIG_DIR = CONFIG_DIR / "tpch"
# One single-element tuple per JSON test-case config (relative to CONFIG_DIR),
# in the shape expected by pytest.mark.parametrize(["path"], ...).
TEST_CASE_PATHS = [
    (config_path.relative_to(CONFIG_DIR),)
    for config_path in TPCH_CONFIG_DIR.rglob("*.json")
]
# Human-readable test IDs: the relative config path without its ".json" suffix.
IDS = [str(case[0]).removesuffix(".json") for case in TEST_CASE_PATHS]
|
||
|
||
@pytest.mark.parametrize(["path"], TEST_CASE_PATHS, ids=IDS)
@pytest.mark.usefixtures("prepare_tpch_parquet_data")
def test_isthmus_substrait_plan(
    path: Path,
    snapshot: Snapshot,
    db_con: duckdb.DuckDBPyConnection,
) -> None:
    """
    Run a pre-generated Isthmus substrait plan on Acero and compare the
    result against running the original SQL query directly on DuckDB.

    1. Load the test-case config and set up producer and consumer with the
       TPC-H parquet files as named tables.
    2. Load the Isthmus-produced substrait plan for this query from PLAN_DIR.
    3. Run the substrait plan on Acero.
    4. Run the SQL query directly on DuckDB.
    5. Snapshot-compare the outcome (column names and table equality).

    Parameters:
        path:
            Path of the JSON test-case config, relative to CONFIG_DIR.
        snapshot:
            Snapshot fixture used to record/compare the test outcome.
        db_con:
            DuckDB connection provided by fixture.
    """
    test_case = load_json(CONFIG_DIR / path)
    test_name = test_case["test_name"]
    local_files = test_case["local_files"]
    named_tables = test_case["named_tables"]
    # The "sql_query" entry is a pair of the query text and the list of
    # producers that support this test case.
    sql_query, supported_producers = test_case["sql_query"]

    assert "duckdb" in supported_producers

    # Test names end in the TPC-H query number, e.g. "..._07" -> 7.
    tpch_num = int(test_name.split("_")[-1])

    snapshot.snapshot_dir = snapshot.snapshot_dir.parent / f"test_tpch_sql_{tpch_num}"

    consumer = AceroConsumer()
    producer = DuckDBProducer()

    consumer.setup(db_con, local_files, named_tables)
    producer.setup(db_con, local_files, named_tables)

    outcome_path = f"query_{tpch_num:02d}_outcome.txt"

    # Load Isthmus plan from file. The plan is JSON text, read as str.
    substrait_plan_path = PLAN_DIR / f"query_{tpch_num:02d}_plan.json"
    with open(substrait_plan_path, "r") as f:
        substrait_plan = f.read()

    # Any failure is itself the test outcome, recorded as the exception type
    # in the snapshot.
    try:
        substrait_query_result_tb = consumer.run_substrait_query(substrait_plan)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Calculate results to verify against by running the SQL query on DuckDB.
    try:
        duckdb_sql_result_tb = producer.run_sql_query(sql_query)
    except BaseException as e:
        snapshot.assert_match(str(type(e)), outcome_path)
        return

    # Compare column names case-insensitively: the engines may differ in
    # casing only.
    col_names = [x.lower() for x in substrait_query_result_tb.column_names]
    exp_col_names = [x.lower() for x in duckdb_sql_result_tb.column_names]

    # Verify results between substrait plan query and sql running against
    # duckdb are equal.
    outcome = {
        "column_names": col_names == exp_col_names,
        "table": substrait_query_result_tb == duckdb_sql_result_tb,
    }
    snapshot.assert_match(str(outcome), outcome_path)