Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support oldstyle jsons in examples/genomicsdb_query #68

Merged
merged 8 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/basic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,7 @@ jobs:
run: |
PYTHONPATH=. pytest
PYTHONPATH=. python test/test.py
tar xzf examples/examples_ws.tgz
PYTHONPATH=. WORKSPACE=ws ./examples/test.sh
PYTHONPATH=. examples/test.sh

- name: Cloud Test - Azurite
run: |
Expand Down
12 changes: 10 additions & 2 deletions examples/genomicsdb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,17 @@ def parse_vidmap_json(vidmap_file, intervals=None):
if not intervals:
intervals = []
for contig in contigs:
contigs_map[contig["name"]] = contig
if isinstance(contig, str): # Old style vidmap json
contig_name = contig
contigs_map[contig] = {
"length": contigs[contig]["length"],
"tiledb_column_offset": contigs[contig]["tiledb_column_offset"],
}
else: # Generated with vcf2genomicsdb_init
contig_name = contig["name"]
contigs_map[contig["name"]] = contig
if all_intervals:
intervals.append(contig["name"])
intervals.append(contig_name)
return contigs_map, intervals


Expand Down
40 changes: 33 additions & 7 deletions examples/genomicsdb_query
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ from genomicsdb.protobuf import genomicsdb_export_config_pb2 as query_pb
def parse_callset_json(callset_file):
callset = json.loads(genomicsdb.read_entire_file(callset_file))
callsets = callset["callsets"]
samples = [callset["sample_name"] for callset in callsets]
samples = [callset if isinstance(callset, str) else callset["sample_name"] for callset in callsets]
return samples


Expand All @@ -57,7 +57,16 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None):
if isinstance(samples, str):
with open(samples) as file:
samples = [line.rstrip() for line in file]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we make samples a set and then we don't need to do the nested loop below?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not fixed the nested loop issue, but fixed a bug for now. I will get to it soon.

rows = [callset["row_idx"] for sample in samples for callset in callsets if sample == callset["sample_name"]]
if not samples:
return None
# Old style json has type(callset) == str whereas the one generated with vcf2genomicsdb_init is a list
rows = [
callsets[callset]["row_idx"] if isinstance(callset, str) else callset["row_idx"]
for sample in samples
for callset in callsets
if (not isinstance(callset, str) and sample == callset["sample_name"])
or (isinstance(callset, str) and sample == callset)
]
if len(rows) == 0:
print(f"None of the samples{samples} specified were found in the workspace")
return []
Expand All @@ -81,7 +90,9 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None):
def parse_loader_json(loader_file):
loader = json.loads(genomicsdb.read_entire_file(loader_file))
partitions = loader["column_partitions"]
array_names = [partition["array_name"] for partition in partitions]
array_names = [
partition["array_name"] if "array_name" in partition else partition["array"] for partition in partitions
]
return [name.replace("$", ":", 1).replace("$", "-", 1) for name in array_names]


Expand Down Expand Up @@ -140,7 +151,7 @@ def setup_gdb():
parser.add_argument(
"--list-partitions",
action="store_true",
help="List interval partitions for the ingested samples in the workspace and exit",
help="List interval partitions(genomicsdb arrays in the workspace) for the ingested samples in the workspace and exit", # noqa
)
parser.add_argument(
"-i",
Expand Down Expand Up @@ -233,6 +244,7 @@ def setup_gdb():
sys.exit(0)

if args.list_partitions:
contigs_map, _ = genomicsdb_common.parse_vidmap_json(vidmap_file)
# parse loader.json for partitions
partitions = parse_loader_json(loader_file)
print(*partitions, sep="\n")
Expand Down Expand Up @@ -326,10 +338,24 @@ def main():
continue

arrays = []
for partition in partitions:
if contig_end < partition["begin"]["tiledb_column"] or contig_offset > partition["end"]["tiledb_column"]:
for idx, partition in enumerate(partitions):
if isinstance(partition["begin"], int): # Old style vidmap json
column_begin = partition["begin"]
if "end" in partition.keys():
column_end = partition["end"]
elif idx + 1 < len(partitions):
column_end = partitions[idx + 1]["begin"] - 1
else:
column_end = sys.maxsize
else: # Generated with vcf2genomicsdb_init
column_begin = partition["begin"]["tiledb_column"]
column_end = partition["end"]["tiledb_column"]
if contig_end < column_begin or contig_offset > column_end:
continue
arrays.append(partition["array_name"])
if "array_name" in partition.keys():
arrays.append(partition["array_name"])
elif "array" in partition.keys():
arrays.append(partition["array"])

arrays_length = len(arrays)
if arrays_length == 0:
Expand Down
12 changes: 10 additions & 2 deletions examples/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
#
#

WORKSPACE=${WORKSPACE:-my_workspace}
set -e

export WORKSPACE=${WORKSPACE:-my_workspace}
export CALLSET_FILE=${CALLSET_FILE:-$WORKSPACE/callset.json}
export VIDMAP_FILE=${VIDMAP_FILE:-$WORKSPACE/vidmap.json}
export LOADER_FILE=${LOADER_FILE:-$WORKSPACE/loader.json}
Expand Down Expand Up @@ -97,11 +99,17 @@ if [[ -f vidmap.json ]]; then
export VIDMAP_FILE="vidmap.json"
fi

if [[ $(uname) == "Darwin" ]]; then
export MEASURE_PERFORMANCE="/usr/bin/time -l"
else
export MEASURE_PERFORMANCE="/usr/bin/time -v"
fi

run_query() {
INTERVAL=$1
OUTPUT_FILE=$2
echo genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE
/usr/bin/time -l genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE
$MEASURE_PERFORMANCE genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE
}

export -f run_query
Expand Down
59 changes: 45 additions & 14 deletions examples/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
# ~/GenomicsDB/install/bin/vcf2genomicsdb -r 2 ws/loader.json
# ~/GenomicsDB/install/bin/vcf2genomicsdb -r 80 ws/loader.json

WORKSPACE=${WORKSPACE:-ws}
INTERVAL_ARGS="-i 1:1-40000000 -i 2:3000-40000 -i 2:40001 -i 3"
SAMPLE_ARGS="-s HG00096 -s HG00097 -s HG00099"
VIDMAP_FILE=vidmap_file.json
Expand Down Expand Up @@ -93,7 +92,13 @@ run_command() {
fi
}

if [[ -z $WORKSPACE ]]; then
tar xzf $(dirname $0)/examples_ws.tgz -C $TEMP_DIR
WORKSPACE=$TEMP_DIR/ws
fi

PATH=$(dirname $0):$PATH

run_command "genomicsdb_query" 2
run_command "genomicsdb_query -h"
run_command "genomicsdb_query --version"
Expand All @@ -102,11 +107,6 @@ run_command "genomicsdb_query --list-contigs" 2
run_command "genomicsdb_query -w $WORKSPACE" 1
run_command "genomicsdb_query -w $WORKSPACE -i xx -S XX" 1

if [[ -z $WORKSPACE ]]; then
echo "Specify an existing workspace in env WORKSPACE"
exit 1
fi

genomicsdb_query -w $WORKSPACE --list-contigs > $TEMP_DIR/contigs.list
genomicsdb_query -w $WORKSPACE --list-samples > $TEMP_DIR/samples.list

Expand Down Expand Up @@ -137,23 +137,54 @@ do
fi
done

run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_DIR/samples.list"
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list"
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_DIR/samples.list -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER -o $OUTPUT"

rm -f loader.json callset.json vidmap.json
run_command "genomicsdb_cache -w $WORKSPACE $INTERVAL_ARGS"
export TILEDB_CACHE=1
if [[ $WORKSPACE == *://* ]]; then
if [[ ! -f loader.json ]] || [[ ! -f callset.json ]] || [[ ! -f vidmap.json ]]; then
die "Could not cache workspace metadata for cloud URL=$WORKSPACE"
fi
echo "Running from cached metadata for workspace=$WORKSPACE..."
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json"
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json -o $OUTPUT"
echo "Running from cached metadata for workspace=$WORKSPACE DONE"
fi
rm -f loader.json callset.json vidmap.json

if [[ $WORKSPACE == *://* ]]; then
cleanup
exit 0
fi

####################################################################
#
# Check old style workspaces with genomicsdb_query/genomicsdb_cache
#
####################################################################

OLDSTYLE_DIR=$TEMP_DIR/old_style
mkdir -p $OLDSTYLE_DIR
tar xzf $(dirname $0)/../test/inputs/sanity.test.tgz -C $OLDSTYLE_DIR
WORKSPACE=$OLDSTYLE_DIR/ws

run_command "genomicsdb_query -w $WORKSPACE --list-samples"
run_command "genomicsdb_query -w $WORKSPACE --list-contigs"
run_command "genomicsdb_query -w $WORKSPACE --list-partitions"
run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT"

OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT"

cleanup