diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml index 7e18d4a..31093a3 100644 --- a/.github/workflows/basic.yml +++ b/.github/workflows/basic.yml @@ -96,8 +96,7 @@ jobs: run: | PYTHONPATH=. pytest PYTHONPATH=. python test/test.py - tar xzf examples/examples_ws.tgz - PYTHONPATH=. WORKSPACE=ws ./examples/test.sh + PYTHONPATH=. examples/test.sh - name: Cloud Test - Azurite run: | diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py index 2a926f9..c0617d2 100644 --- a/examples/genomicsdb_common.py +++ b/examples/genomicsdb_common.py @@ -53,9 +53,17 @@ def parse_vidmap_json(vidmap_file, intervals=None): if not intervals: intervals = [] for contig in contigs: - contigs_map[contig["name"]] = contig + if isinstance(contig, str): # Old style vidmap json + contig_name = contig + contigs_map[contig] = { + "length": contigs[contig]["length"], + "tiledb_column_offset": contigs[contig]["tiledb_column_offset"], + } + else: # Generated with vcf2genomicsdb_init + contig_name = contig["name"] + contigs_map[contig["name"]] = contig if all_intervals: - intervals.append(contig["name"]) + intervals.append(contig_name) return contigs_map, intervals diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 5015c02..62c89b4 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -45,7 +45,7 @@ from genomicsdb.protobuf import genomicsdb_export_config_pb2 as query_pb def parse_callset_json(callset_file): callset = json.loads(genomicsdb.read_entire_file(callset_file)) callsets = callset["callsets"] - samples = [callset["sample_name"] for callset in callsets] + samples = [callset if isinstance(callset, str) else callset["sample_name"] for callset in callsets] return samples @@ -57,7 +57,16 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): if isinstance(samples, str): with open(samples) as file: samples = [line.rstrip() for line in file] - rows = [callset["row_idx"] for sample in samples for callset in callsets if sample == callset["sample_name"]] + if not samples: + return None + # Old style json has type(callset) == str whereas the one generated with vcf2genomicsdb_init is a list + rows = [ + callsets[callset]["row_idx"] if isinstance(callset, str) else callset["row_idx"] + for sample in samples + for callset in callsets + if (not isinstance(callset, str) and sample == callset["sample_name"]) + or (isinstance(callset, str) and sample == callset) + ] if len(rows) == 0: print(f"None of the samples{samples} specified were found in the workspace") return [] @@ -81,7 +90,9 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): def parse_loader_json(loader_file): loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] - array_names = [partition["array_name"] for partition in partitions] + array_names = [ + partition["array_name"] if "array_name" in partition else partition["array"] for partition in partitions + ] return [name.replace("$", ":", 1).replace("$", "-", 1) for name in array_names] @@ -140,7 +151,7 @@ def setup_gdb(): parser.add_argument( "--list-partitions", action="store_true", - help="List interval partitions for the ingested samples in the workspace and exit", + help="List interval partitions(genomicsdb arrays in the workspace) for the ingested samples in the workspace and exit", # noqa ) parser.add_argument( "-i", @@ -233,6 +244,7 @@ def setup_gdb(): sys.exit(0) if args.list_partitions: + contigs_map, _ = genomicsdb_common.parse_vidmap_json(vidmap_file) # parse loader.json for partitions partitions = parse_loader_json(loader_file) print(*partitions, sep="\n") @@ -326,10 +338,24 @@ def main(): continue arrays = [] - for partition in partitions: - if contig_end < partition["begin"]["tiledb_column"] or contig_offset > partition["end"]["tiledb_column"]: + for idx, partition in enumerate(partitions): + if isinstance(partition["begin"], int): # Old style vidmap json + column_begin = partition["begin"] + if "end" in partition.keys(): + column_end = partition["end"] + elif idx + 1 < len(partitions): + column_end = partitions[idx + 1]["begin"] - 1 + else: + column_end = sys.maxsize + else: # Generated with vcf2genomicsdb_init + column_begin = partition["begin"]["tiledb_column"] + column_end = partition["end"]["tiledb_column"] + if contig_end < column_begin or contig_offset > column_end: continue - arrays.append(partition["array_name"]) + if "array_name" in partition.keys(): + arrays.append(partition["array_name"]) + elif "array" in partition.keys(): + arrays.append(partition["array"]) arrays_length = len(arrays) if arrays_length == 0: diff --git a/examples/run.sh b/examples/run.sh index 49c4b90..0689575 100755 --- a/examples/run.sh +++ b/examples/run.sh @@ -26,7 +26,9 @@ # # -WORKSPACE=${WORKSPACE:-my_workspace} +set -e + +export WORKSPACE=${WORKSPACE:-my_workspace} export CALLSET_FILE=${CALLSET_FILE:-$WORKSPACE/callset.json} export VIDMAP_FILE=${VIDMAP_FILE:-$WORKSPACE/vidmap.json} export LOADER_FILE=${LOADER_FILE:-$WORKSPACE/loader.json} @@ -97,11 +99,17 @@ if [[ -f vidmap.json ]]; then export VIDMAP_FILE="vidmap.json" fi +if [[ $(uname) == "Darwin" ]]; then + export MEASURE_PERFORMANCE="/usr/bin/time -l" +else + export MEASURE_PERFORMANCE="/usr/bin/time -v" +fi + run_query() { INTERVAL=$1 OUTPUT_FILE=$2 echo genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE - /usr/bin/time -l genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE + $MEASURE_PERFORMANCE genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE } export -f run_query diff --git a/examples/test.sh b/examples/test.sh index e10e185..85b2667 100755 --- a/examples/test.sh +++ b/examples/test.sh @@ -39,7 +39,6 @@ # ~/GenomicsDB/install/bin/vcf2genomicsdb -r 2 ws/loader.json # ~/GenomicsDB/install/bin/vcf2genomicsdb -r 80 ws/loader.json -WORKSPACE=${WORKSPACE:-ws} INTERVAL_ARGS="-i 1:1-40000000 -i 2:3000-40000 -i 2:40001 -i 3" SAMPLE_ARGS="-s HG00096 -s HG00097 -s HG00099" VIDMAP_FILE=vidmap_file.json @@ -93,7 +92,13 @@ run_command() { fi } +if [[ -z $WORKSPACE ]]; then + tar xzf $(dirname $0)/examples_ws.tgz -C $TEMP_DIR + WORKSPACE=$TEMP_DIR/ws +fi + PATH=$(dirname $0):$PATH + run_command "genomicsdb_query" 2 run_command "genomicsdb_query -h" run_command "genomicsdb_query --version" @@ -102,11 +107,6 @@ run_command "genomicsdb_query --list-contigs" 2 run_command "genomicsdb_query -w $WORKSPACE" 1 run_command "genomicsdb_query -w $WORKSPACE -i xx -S XX" 1 -if [[ -z $WORKSPACE ]]; then - echo "Specify an existing workspace in env WORKSPACE" - exit 1 -fi - genomicsdb_query -w $WORKSPACE --list-contigs > $TEMP_DIR/contigs.list genomicsdb_query -w $WORKSPACE --list-samples > $TEMP_DIR/samples.list @@ -137,14 +137,15 @@ do fi done -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_DIR/samples.list" -run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list" -run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_DIR/samples.list -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER -o $OUTPUT" +rm -f loader.json callset.json vidmap.json run_command "genomicsdb_cache -w $WORKSPACE $INTERVAL_ARGS" export TILEDB_CACHE=1 if [[ $WORKSPACE == *://* ]]; then @@ -152,8 +153,38 @@ if [[ $WORKSPACE == *://* ]]; then die "Could not cache workspace metadata for cloud URL=$WORKSPACE" fi echo "Running from cached metadata for workspace=$WORKSPACE..." - run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json" + run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json -o $OUTPUT" echo "Running from cached metadata for workspace=$WORKSPACE DONE" fi +rm -f loader.json callset.json vidmap.json + +if [[ $WORKSPACE == *://* ]]; then + cleanup + exit 0 +fi + +#################################################################### +# +# Check old style workspaces with genomicsdb_query/genomicsdb_cache +# +#################################################################### + +OLDSTYLE_DIR=$TEMP_DIR/old_style +mkdir -p $OLDSTYLE_DIR +tar xzf $(dirname $0)/../test/inputs/sanity.test.tgz -C $OLDSTYLE_DIR +WORKSPACE=$OLDSTYLE_DIR/ws + +run_command "genomicsdb_query -w $WORKSPACE --list-samples" +run_command "genomicsdb_query -w $WORKSPACE --list-contigs" +run_command "genomicsdb_query -w $WORKSPACE --list-partitions" +run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" + +OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" cleanup