From 15cd41f58dda9d3f36378aaed2bd1e7d65d34a5a Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Tue, 12 Nov 2024 14:35:12 +0000 Subject: [PATCH 01/15] [DOR-895] Clarify usage of num_samples --- dorado/read_pipeline/messages.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dorado/read_pipeline/messages.h b/dorado/read_pipeline/messages.h index 1bd5818fb..e40b9d809 100644 --- a/dorado/read_pipeline/messages.h +++ b/dorado/read_pipeline/messages.h @@ -26,9 +26,11 @@ struct Attributes { int32_t channel_number{-1}; //Channel ID std::string start_time{}; //Read acquisition start time std::string fast5_filename{}; - uint64_t num_samples; // Indicates if this read had end reason `mux_change` or `unblock_mux_change` bool is_end_reason_mux_change{false}; + + // Only used by tests, and only valid for POD5 data. + uint64_t num_samples{}; }; } // namespace details From 99f9aec001a713eb9ca77e1f7c43d446f61a47ed Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Tue, 12 Nov 2024 14:40:18 +0000 Subject: [PATCH 02/15] [DOR-895] Drop fast5 from filename We use this for pod5s too. --- dorado/data_loader/DataLoader.cpp | 6 +++--- dorado/read_pipeline/messages.cpp | 4 ++-- dorado/read_pipeline/messages.h | 2 +- tests/NodeSmokeTest.cpp | 2 +- tests/ReadFilterNodeTest.cpp | 12 ++++++------ tests/ReadTest.cpp | 6 +++--- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/dorado/data_loader/DataLoader.cpp b/dorado/data_loader/DataLoader.cpp index c9d5297ae..918e91056 100644 --- a/dorado/data_loader/DataLoader.cpp +++ b/dorado/data_loader/DataLoader.cpp @@ -177,7 +177,7 @@ SimplexReadPtr process_pod5_thread_fn( new_read->read_common.read_id = std::move(read_id_str); new_read->read_common.num_trimmed_samples = 0; new_read->read_common.attributes.read_number = read_data.read_number; - new_read->read_common.attributes.fast5_filename = + new_read->read_common.attributes.filename = std::filesystem::path(path.c_str()).filename().string(); new_read->read_common.attributes.mux = read_data.well; new_read->read_common.attributes.num_samples = read_data.num_samples; @@ -658,7 +658,7 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) { start_time_attr.read(start_time); string_reader(read_id_attr, read_id); - std::string fast5_filename = std::filesystem::path(path).filename().string(); + std::string filename = std::filesystem::path(path).filename().string(); HighFive::Group tracking_id_group = read.getGroup("tracking_id"); std::string exp_start_time = get_string_attribute(tracking_id_group, "exp_start_time"); @@ -685,7 +685,7 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) { new_read->read_common.attributes.read_number = read_number; new_read->read_common.attributes.channel_number = channel_number; new_read->read_common.attributes.start_time = start_time_str; - new_read->read_common.attributes.fast5_filename = fast5_filename; + new_read->read_common.attributes.filename = filename; new_read->read_common.flowcell_id = flow_cell_id; new_read->read_common.flow_cell_product_code = flow_cell_product_code; new_read->read_common.position_id = device_id; diff --git a/dorado/read_pipeline/messages.cpp b/dorado/read_pipeline/messages.cpp index 4390d9a2d..39c92d199 100644 --- a/dorado/read_pipeline/messages.cpp +++ b/dorado/read_pipeline/messages.cpp @@ -62,8 +62,8 @@ void ReadCommon::generate_read_tags(bam1_t *aln, bool emit_moves, bool is_duplex int rn = attributes.read_number; bam_aux_append(aln, "rn", 'i', sizeof(rn), (uint8_t *)&rn); - bam_aux_append(aln, "fn", 'Z', int(attributes.fast5_filename.length() + 1), - (uint8_t *)attributes.fast5_filename.c_str()); + bam_aux_append(aln, "fn", 'Z', int(attributes.filename.length() + 1), + (uint8_t *)attributes.filename.c_str()); float sm = shift; bam_aux_append(aln, "sm", 'f', sizeof(sm), (uint8_t *)&sm); diff --git a/dorado/read_pipeline/messages.h b/dorado/read_pipeline/messages.h index e40b9d809..09d35bf5c 100644 --- a/dorado/read_pipeline/messages.h +++ b/dorado/read_pipeline/messages.h @@ -25,7 +25,7 @@ struct Attributes { int32_t read_number{-1}; // Per-channel number of each read as it was acquired by minknow int32_t channel_number{-1}; //Channel ID std::string start_time{}; //Read acquisition start time - std::string fast5_filename{}; + std::string filename{}; // Indicates if this read had end reason `mux_change` or `unblock_mux_change` bool is_end_reason_mux_change{false}; diff --git a/tests/NodeSmokeTest.cpp b/tests/NodeSmokeTest.cpp index c7acb1565..f6bbf36e6 100644 --- a/tests/NodeSmokeTest.cpp +++ b/tests/NodeSmokeTest.cpp @@ -79,7 +79,7 @@ class NodeSmokeTestBase { read->read_common.attributes.read_number = 12345; read->read_common.attributes.channel_number = 5; read->read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read->read_common.attributes.fast5_filename = "test.fast5"; + read->read_common.attributes.filename = "test.fast5"; read->read_common.client_info = client_info; return read; } diff --git a/tests/ReadFilterNodeTest.cpp b/tests/ReadFilterNodeTest.cpp index f7959b629..a6e7aef8a 100644 --- a/tests/ReadFilterNodeTest.cpp +++ b/tests/ReadFilterNodeTest.cpp @@ -38,7 +38,7 @@ TEST_CASE("ReadFilterNode: Filter read based on qscore", TEST_GROUP) { read_1->read_common.attributes.read_number = 18501; read_1->read_common.attributes.channel_number = 5; read_1->read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_1->read_common.attributes.fast5_filename = "batch_0.fast5"; + read_1->read_common.attributes.filename = "batch_0.fast5"; auto read_2 = std::make_unique(); read_2->read_common.raw_data = at::empty(100); @@ -53,7 +53,7 @@ TEST_CASE("ReadFilterNode: Filter read based on qscore", TEST_GROUP) { read_2->read_common.attributes.read_number = 18501; read_2->read_common.attributes.channel_number = 5; read_2->read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_2->read_common.attributes.fast5_filename = "batch_0.fast5"; + read_2->read_common.attributes.filename = "batch_0.fast5"; pipeline->push_message(std::move(read_1)); pipeline->push_message(std::move(read_2)); @@ -82,7 +82,7 @@ TEST_CASE("ReadFilterNode: Filter read based on read name", TEST_GROUP) { read_1->read_common.attributes.read_number = 18501; read_1->read_common.attributes.channel_number = 5; read_1->read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_1->read_common.attributes.fast5_filename = "batch_0.fast5"; + read_1->read_common.attributes.filename = "batch_0.fast5"; auto read_2 = std::make_unique(); read_2->read_common.raw_data = at::empty(100); @@ -97,7 +97,7 @@ TEST_CASE("ReadFilterNode: Filter read based on read name", TEST_GROUP) { read_2->read_common.attributes.read_number = 18501; read_2->read_common.attributes.channel_number = 5; read_2->read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_2->read_common.attributes.fast5_filename = "batch_0.fast5"; + read_2->read_common.attributes.filename = "batch_0.fast5"; pipeline->push_message(std::move(read_1)); pipeline->push_message(std::move(read_2)); @@ -126,7 +126,7 @@ TEST_CASE("ReadFilterNode: Filter read based on read length", TEST_GROUP) { read_1->read_common.attributes.read_number = 18501; read_1->read_common.attributes.channel_number = 5; read_1->read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_1->read_common.attributes.fast5_filename = "batch_0.fast5"; + read_1->read_common.attributes.filename = "batch_0.fast5"; auto read_2 = std::make_unique(); read_2->read_common.raw_data = at::empty(100); @@ -141,7 +141,7 @@ TEST_CASE("ReadFilterNode: Filter read based on read length", TEST_GROUP) { read_2->read_common.attributes.read_number = 18501; read_2->read_common.attributes.channel_number = 5; read_2->read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_2->read_common.attributes.fast5_filename = "batch_0.fast5"; + read_2->read_common.attributes.filename = "batch_0.fast5"; pipeline->push_message(std::move(read_1)); pipeline->push_message(std::move(read_2)); diff --git a/tests/ReadTest.cpp b/tests/ReadTest.cpp index 8c050b2ef..3b5108605 100644 --- a/tests/ReadTest.cpp +++ b/tests/ReadTest.cpp @@ -24,7 +24,7 @@ TEST_CASE(TEST_GROUP ": Test tag generation", TEST_GROUP) { read_common.attributes.read_number = 18501; read_common.attributes.channel_number = 5; read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_common.attributes.fast5_filename = "batch_0.fast5"; + read_common.attributes.filename = "batch_0.fast5"; read_common.run_id = "xyz"; read_common.model_name = "test_model"; read_common.is_duplex = false; @@ -178,7 +178,7 @@ TEST_CASE(TEST_GROUP ": Test sam record generation", TEST_GROUP) { test_read.read_common.attributes.read_number = 18501; test_read.read_common.attributes.channel_number = 5; test_read.read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - test_read.read_common.attributes.fast5_filename = "batch_0.fast5"; + test_read.read_common.attributes.filename = "batch_0.fast5"; auto lines = test_read.read_common.extract_sam_lines(false, 0, false); REQUIRE(!lines.empty()); @@ -350,7 +350,7 @@ TEST_CASE(TEST_GROUP ": Test mean q-score generation", TEST_GROUP) { read_common.attributes.read_number = 18501; read_common.attributes.channel_number = 5; read_common.attributes.start_time = "2017-04-29T09:10:04Z"; - read_common.attributes.fast5_filename = "batch_0.fast5"; + read_common.attributes.filename = "batch_0.fast5"; read_common.run_id = "xyz"; read_common.model_name = "test_model"; read_common.is_duplex = false; From 38f13d2ed03e0bb7040b1c834c66125b5b2fc7d7 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Tue, 12 Nov 2024 14:54:24 +0000 Subject: [PATCH 03/15] [DOR-895] Fix a crash when there's a type mismatch Also extend tests to cover this and other similar situations. --- dorado/read_pipeline/HtsReader.h | 3 ++- tests/BamReaderTest.cpp | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dorado/read_pipeline/HtsReader.h b/dorado/read_pipeline/HtsReader.h index a3a5a4e5c..7a898c7c9 100644 --- a/dorado/read_pipeline/HtsReader.h +++ b/dorado/read_pipeline/HtsReader.h @@ -69,7 +69,8 @@ T HtsReader::get_tag(const char* tagname) { } else if constexpr (std::is_floating_point_v) { tag_value = static_cast(bam_aux2f(tag)); } else { - tag_value = static_cast(bam_aux2Z(tag)); + const char* val = bam_aux2Z(tag); + tag_value = val ? val : T{}; } return tag_value; diff --git a/tests/BamReaderTest.cpp b/tests/BamReaderTest.cpp index db193182a..286519b42 100644 --- a/tests/BamReaderTest.cpp +++ b/tests/BamReaderTest.cpp @@ -87,7 +87,12 @@ TEST_CASE("HtsReaderTest: get_tag", TEST_GROUP) { // All records in small.sam have this set to 0. CHECK(reader.get_tag("rl") == 0); // Intentionally bad tag to test that missing tags don't return garbage. + CHECK(reader.get_tag("##") == 0); CHECK(reader.get_tag("##") == 0); + CHECK(reader.get_tag("##") == ""); + // Type mismatch doesn't crash. + CHECK(reader.get_tag("rl") == 0); + CHECK(reader.get_tag("rl") == ""); } } From 6d956a8d0e884ef3cec93d9be70cbd4ce7fa716a Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Tue, 12 Nov 2024 15:27:37 +0000 Subject: [PATCH 04/15] [DOR-895] Replace allocations with constants A vector of strings needs to allocate the vectors and the strings at runtime then copy the constant data into them. Instead reference the constant data directly. --- dorado/summary/summary.cpp | 60 +++++++++++++++++++++++++------------- dorado/summary/summary.h | 5 ---- 2 files changed, 39 insertions(+), 26 deletions(-) diff --git a/dorado/summary/summary.cpp b/dorado/summary/summary.cpp index 66f7a732e..2400f4c0e 100644 --- a/dorado/summary/summary.cpp +++ b/dorado/summary/summary.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace { @@ -37,27 +38,44 @@ volatile sig_atomic_t SigIntHandler::interrupt{}; namespace dorado { -std::vector SummaryData::s_required_fields = {"filename", "read_id"}; - -std::vector SummaryData::s_general_fields = {"run_id", - "channel", - "mux", - "start_time", - "duration", - "template_start", - "template_duration", - "sequence_length_template", - "mean_qscore_template"}; - -std::vector SummaryData::s_barcoding_fields = {"barcode"}; - -std::vector SummaryData::s_alignment_fields = { - "alignment_genome", "alignment_genome_start", "alignment_genome_end", - "alignment_strand_start", "alignment_strand_end", "alignment_direction", - "alignment_length", "alignment_num_aligned", "alignment_num_correct", - "alignment_num_insertions", "alignment_num_deletions", "alignment_num_substitutions", - "alignment_mapq", "alignment_strand_coverage", "alignment_identity", - "alignment_accuracy", "alignment_bed_hits"}; +namespace { + +using namespace std::string_view_literals; + +const std::array s_required_fields = { + "filename"sv, + "read_id"sv, +}; + +const std::array s_general_fields = { + "run_id"sv, + "channel"sv, + "mux"sv, + "start_time"sv, + "duration"sv, + "template_start"sv, + "template_duration"sv, + "sequence_length_template"sv, + "mean_qscore_template"sv, +}; + +const std::array s_barcoding_fields = { + "barcode"sv, +}; + +const std::array s_alignment_fields = { + "alignment_genome"sv, "alignment_genome_start"sv, + "alignment_genome_end"sv, "alignment_strand_start"sv, + "alignment_strand_end"sv, "alignment_direction"sv, + "alignment_length"sv, "alignment_num_aligned"sv, + "alignment_num_correct"sv, "alignment_num_insertions"sv, + "alignment_num_deletions"sv, "alignment_num_substitutions"sv, + "alignment_mapq"sv, "alignment_strand_coverage"sv, + "alignment_identity"sv, "alignment_accuracy"sv, + "alignment_bed_hits"sv, +}; + +} // namespace SummaryData::SummaryData() = default; diff --git a/dorado/summary/summary.h b/dorado/summary/summary.h index bec105c2c..824917fc2 100644 --- a/dorado/summary/summary.h +++ b/dorado/summary/summary.h @@ -29,11 +29,6 @@ class SummaryData { bool process_tree(const std::string& folder, std::ostream& writer); private: - static std::vector s_required_fields; - static std::vector s_general_fields; - static std::vector s_barcoding_fields; - static std::vector s_alignment_fields; - char m_separator{'\t'}; FieldFlags m_field_flags{}; From d019531a981c8cba2a090e2ec097c54969d8da45 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Tue, 12 Nov 2024 15:35:09 +0000 Subject: [PATCH 05/15] [DOR-895] Do some constant folding The return value wasn't used by the caller anyway. --- dorado/summary/summary.cpp | 12 ++++-------- dorado/summary/summary.h | 4 ++-- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/dorado/summary/summary.cpp b/dorado/summary/summary.cpp index 2400f4c0e..5e0167da8 100644 --- a/dorado/summary/summary.cpp +++ b/dorado/summary/summary.cpp @@ -91,7 +91,7 @@ void SummaryData::set_fields(FieldFlags flags) { m_field_flags = flags; } -bool SummaryData::process_file(const std::string& filename, std::ostream& writer) { +void SummaryData::process_file(const std::string& filename, std::ostream& writer) { SigIntHandler sig_handler; HtsReader reader(filename, std::nullopt); m_field_flags = GENERAL_FIELDS | BARCODING_FIELDS; @@ -100,7 +100,7 @@ bool SummaryData::process_file(const std::string& filename, std::ostream& writer } auto read_group_exp_start_time = utils::get_read_group_info(reader.header(), "DT"); write_header(writer); - return write_rows_from_reader(reader, writer, read_group_exp_start_time); + write_rows_from_reader(reader, writer, read_group_exp_start_time); } bool SummaryData::process_tree(const std::string& folder, std::ostream& writer) { @@ -122,10 +122,7 @@ bool SummaryData::process_tree(const std::string& folder, std::ostream& writer) for (const auto& read_file : files) { HtsReader reader(read_file, std::nullopt); auto read_group_exp_start_time = utils::get_read_group_info(reader.header(), "DT"); - bool ok = write_rows_from_reader(reader, writer, read_group_exp_start_time); - if (!ok) { - spdlog::error("File {} could not be processed. Skipping file.", read_file); - } + write_rows_from_reader(reader, writer, read_group_exp_start_time); } return true; } @@ -155,7 +152,7 @@ void SummaryData::write_header(std::ostream& writer) { writer << '\n'; } -bool SummaryData::write_rows_from_reader( +void SummaryData::write_rows_from_reader( HtsReader& reader, std::ostream& writer, const std::map& read_group_exp_start_time) { @@ -284,7 +281,6 @@ bool SummaryData::write_rows_from_reader( } writer << '\n'; } - return true; } } // namespace dorado diff --git a/dorado/summary/summary.h b/dorado/summary/summary.h index 824917fc2..fab40f854 100644 --- a/dorado/summary/summary.h +++ b/dorado/summary/summary.h @@ -23,7 +23,7 @@ class SummaryData { void set_fields(FieldFlags flags); /// This will automatically set the fields based on the contents of the file. - bool process_file(const std::string& filename, std::ostream& writer); + void process_file(const std::string& filename, std::ostream& writer); /// For this method the fields must already be set. bool process_tree(const std::string& folder, std::ostream& writer); @@ -33,7 +33,7 @@ class SummaryData { FieldFlags m_field_flags{}; void write_header(std::ostream& writer); - bool write_rows_from_reader(HtsReader& reader, + void write_rows_from_reader(HtsReader& reader, std::ostream& writer, const std::map& rgst); }; From 4454e556de50e7e7b16973b05b55a58facdf907d Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Tue, 12 Nov 2024 17:24:30 +0000 Subject: [PATCH 06/15] [DOR-895] Fix empty filename in summaries FASTQ/FASTA files don't have information about where the read came from so in those cases we want to use the name of FASTA/FASTQ file instead. Note that FASTA is read in by htslib so we can't localise this fix to just the FASTQ generator. --- dorado/read_pipeline/HtsReader.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/dorado/read_pipeline/HtsReader.cpp b/dorado/read_pipeline/HtsReader.cpp index 58c39246d..8ed4ce197 100644 --- a/dorado/read_pipeline/HtsReader.cpp +++ b/dorado/read_pipeline/HtsReader.cpp @@ -160,16 +160,28 @@ HtsReader::HtsReader(const std::string& filename, } template -bool HtsReader::try_initialise_generator(const std::string& filename) { - auto generator = std::make_shared(filename); // shared to allow copy assignment +bool HtsReader::try_initialise_generator(const std::string& filepath) { + auto generator = std::make_shared(filepath); // shared to allow copy assignment if (!generator->is_valid()) { return false; } m_header = generator->header(); m_format = generator->format(); - m_bam_record_generator = [generator_ = std::move(generator)](bam1_t& bam_record) { - return generator_->try_get_next_record(bam_record); - }; + m_bam_record_generator = + [generator_ = std::move(generator), + filename = std::filesystem::path(filepath).filename().string()](bam1_t& bam_record) { + if (!generator_->try_get_next_record(bam_record)) { + return false; + } + + // If the record doesn't have a filename set then say that it came from the currently processing file. + if (!bam_aux_get(&bam_record, "fn")) { + bam_aux_append(&bam_record, "fn", 'Z', filename.size() + 1, + reinterpret_cast(filename.c_str())); + } + + return true; + }; return true; } From c7fe9a18a93192f4ffbb2e420b64f7db59bd98d3 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 10:33:23 +0000 Subject: [PATCH 07/15] [DOR-895] Add unit test --- tests/BamReaderTest.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/BamReaderTest.cpp b/tests/BamReaderTest.cpp index 286519b42..2c3a10063 100644 --- a/tests/BamReaderTest.cpp +++ b/tests/BamReaderTest.cpp @@ -134,4 +134,16 @@ TEST_CASE( "read=1728 ch=332 start_time=2017-06-16T15:31:55Z"); } -} // namespace dorado::hts_reader::test \ No newline at end of file +TEST_CASE("HtsReaderTest: filename tag added if missing", TEST_GROUP) { + fs::path aligner_test_dir = fs::path(get_data_dir("bam_reader")); + auto filename = GENERATE("input.fa", "fastq_with_tags.fq"); + auto fasta = aligner_test_dir / filename; + + dorado::HtsReader reader(fasta.string(), std::nullopt); + while (reader.read()) { + // All should be given the name of input file. + CHECK(reader.get_tag("fn") == filename); + } +} + +} // namespace dorado::hts_reader::test From 0087e6819a6c6a23f29fe84c1e0ce5c98856ca7e Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 10:28:32 +0000 Subject: [PATCH 08/15] [DOR-895] Add a way to opt out of having the filename added to a record This is necessary for some existing tests to pass. --- dorado/read_pipeline/HtsReader.cpp | 30 +++++++++++++++--------------- dorado/read_pipeline/HtsReader.h | 6 ++++++ tests/AlignerTest.cpp | 1 + tests/BamUtilsTest.cpp | 1 + 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/dorado/read_pipeline/HtsReader.cpp b/dorado/read_pipeline/HtsReader.cpp index 8ed4ce197..0cd99cd29 100644 --- a/dorado/read_pipeline/HtsReader.cpp +++ b/dorado/read_pipeline/HtsReader.cpp @@ -167,21 +167,21 @@ bool HtsReader::try_initialise_generator(const std::string& filepath) { } m_header = generator->header(); m_format = generator->format(); - m_bam_record_generator = - [generator_ = std::move(generator), - filename = std::filesystem::path(filepath).filename().string()](bam1_t& bam_record) { - if (!generator_->try_get_next_record(bam_record)) { - return false; - } - - // If the record doesn't have a filename set then say that it came from the currently processing file. - if (!bam_aux_get(&bam_record, "fn")) { - bam_aux_append(&bam_record, "fn", 'Z', filename.size() + 1, - reinterpret_cast(filename.c_str())); - } - - return true; - }; + m_bam_record_generator = [generator_ = std::move(generator), + filename = std::filesystem::path(filepath).filename().string(), + this](bam1_t& bam_record) { + if (!generator_->try_get_next_record(bam_record)) { + return false; + } + + // If the record doesn't have a filename set then say that it came from the currently processing file. + if (m_add_filename_tag && !bam_aux_get(&bam_record, "fn")) { + bam_aux_append(&bam_record, "fn", 'Z', filename.size() + 1, + reinterpret_cast(filename.c_str())); + } + + return true; + }; return true; } diff --git a/dorado/read_pipeline/HtsReader.h b/dorado/read_pipeline/HtsReader.h index 7a898c7c9..a3069b03a 100644 --- a/dorado/read_pipeline/HtsReader.h +++ b/dorado/read_pipeline/HtsReader.h @@ -25,6 +25,11 @@ class HtsReader { public: HtsReader(const std::string& filename, std::optional> read_list); + + // By default we'll add a filename tag to each record to match the current file + // if one isn't included in the data, but that can be disabled with this method. + void set_add_filename_tag(bool should) { m_add_filename_tag = should; } + bool read(); // If reading directly into a pipeline need to set the client info on the messages @@ -51,6 +56,7 @@ class HtsReader { std::optional> m_read_list; std::function m_bam_record_generator{}; + bool m_add_filename_tag{true}; template bool try_initialise_generator(const std::string& filename); diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index f991350c2..78e4bbba0 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -544,6 +544,7 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, // Get the sam line from BAM pipeline dorado::HtsReader bam_reader(query, std::nullopt); + bam_reader.set_add_filename_tag(false); auto bam_records = RunPipelineWithBamMessages(bam_reader, ref, "", options, 2); CHECK(bam_records.size() == 1); auto sam_line_from_bam_ptr = get_sam_line_from_bam(std::move(bam_records[0])); diff --git a/tests/BamUtilsTest.cpp b/tests/BamUtilsTest.cpp index ca710eabb..586398640 100644 --- a/tests/BamUtilsTest.cpp +++ b/tests/BamUtilsTest.cpp @@ -294,6 +294,7 @@ TEST_CASE("BamUtilsTest: Remove all alignment tags", TEST_GROUP) { auto sam = bam_utils_test_dir / "aligned_record.bam"; HtsReader reader(sam.string(), std::nullopt); + reader.set_add_filename_tag(false); REQUIRE(reader.read()); // Parse first and only record. auto record = reader.record.get(); From 83d0c243d83bdf1c925b593b9d416f84260f4ab4 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 10:30:35 +0000 Subject: [PATCH 09/15] [DOR-895] Drop initialisers for non-trivial types These types have constructors that will do what we want and having them in the header only slows down the compilation of every TU including it. --- dorado/read_pipeline/HtsReader.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dorado/read_pipeline/HtsReader.h b/dorado/read_pipeline/HtsReader.h index a3069b03a..2101841b2 100644 --- a/dorado/read_pipeline/HtsReader.h +++ b/dorado/read_pipeline/HtsReader.h @@ -41,7 +41,7 @@ class HtsReader { void set_record_mutator(std::function mutator); bool is_aligned{false}; - BamPtr record{nullptr}; + BamPtr record; sam_hdr_t* header(); const sam_hdr_t* header() const; @@ -49,13 +49,13 @@ class HtsReader { private: sam_hdr_t* m_header{nullptr}; // non-owning - std::string m_format{}; + std::string m_format; std::shared_ptr m_client_info; - std::function m_record_mutator{}; + std::function m_record_mutator; std::optional> m_read_list; - std::function m_bam_record_generator{}; + std::function m_bam_record_generator; bool m_add_filename_tag{true}; template From 131c5e1c2252f31deb8f0a8305c5086ed85c30f5 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 10:42:25 +0000 Subject: [PATCH 10/15] [DOR-895] Simplify some code and remove some logging --- tests/AlignerTest.cpp | 20 ++++++++------------ tests/FastxRandomReaderTest.cpp | 1 - 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index 78e4bbba0..83dbb8aef 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -110,16 +110,14 @@ class AlignerNodeTestFixture { create_pipeline(index_file_access, bed_file_access, thread_pool, dorado::utils::concurrency::TaskPriority::normal); - dorado::ReadCommon read_common{}; auto client_info = std::make_shared(); client_info->contexts().register_context( client_align_info); - read_common.client_info = client_info; - read_common.read_id = std::move(read_id); - read_common.seq = std::move(sequence); auto read = std::make_unique(); - read->read_common = std::move(read_common); + read->read_common.client_info = std::move(client_info); + read->read_common.read_id = std::move(read_id); + read->read_common.seq = std::move(sequence); pipeline->push_message(std::move(read)); pipeline->terminate({}); @@ -560,17 +558,15 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, std::move(simplex_read->read_common.alignment_results[0].sam_string); // Do the comparison checks - CHECK_FALSE(sam_line_from_read_common.empty()); - - if (sam_line_from_read_common.at(sam_line_from_read_common.size() - 1) == '\n') { - sam_line_from_read_common = - sam_line_from_read_common.substr(0, sam_line_from_read_common.size() - 1); + REQUIRE_FALSE(sam_line_from_read_common.empty()); + if (sam_line_from_read_common.back() == '\n') { + sam_line_from_read_common.resize(sam_line_from_read_common.size() - 1); } const auto bam_fields = dorado::utils::split(sam_line_from_bam_ptr, '\t'); const auto read_common_fields = dorado::utils::split(sam_line_from_read_common, '\t'); - CHECK(bam_fields.size() == read_common_fields.size()); - CHECK(bam_fields.size() >= 11); + REQUIRE(bam_fields.size() == read_common_fields.size()); + REQUIRE(bam_fields.size() >= 11); // first 11 mandatory fields should be identical for (std::size_t field_index{0}; field_index < 11; ++field_index) { CAPTURE(field_index); diff --git a/tests/FastxRandomReaderTest.cpp b/tests/FastxRandomReaderTest.cpp index 05f951050..45f7c7a6d 100644 --- a/tests/FastxRandomReaderTest.cpp +++ b/tests/FastxRandomReaderTest.cpp @@ -23,7 +23,6 @@ BamPtr generate_bam_entry(const std::string& read_id, TEST_CASE("Check if read can be loaded correctly.", "FastxRandomReader") { auto temp_dir = tests::make_temp_dir("fastx_random_reader_test"); auto temp_input_file = temp_dir.m_path / "input.fq"; - spdlog::info("{}", temp_dir.m_path.string()); const std::string seq = "ACTGATCG"; const std::vector qscore = {20, 20, 30, 30, 20, 20, 40, 40}; From deef19bb88d5db8000886766cff8db9929f47d6d Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 10:54:37 +0000 Subject: [PATCH 11/15] [DOR-895] Avoid implicit conversions for CI --- dorado/read_pipeline/HtsReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dorado/read_pipeline/HtsReader.cpp b/dorado/read_pipeline/HtsReader.cpp index 0cd99cd29..84f9868da 100644 --- a/dorado/read_pipeline/HtsReader.cpp +++ b/dorado/read_pipeline/HtsReader.cpp @@ -176,7 +176,7 @@ bool HtsReader::try_initialise_generator(const std::string& filepath) { // If the record doesn't have a filename set then say that it came from the currently processing file. if (m_add_filename_tag && !bam_aux_get(&bam_record, "fn")) { - bam_aux_append(&bam_record, "fn", 'Z', filename.size() + 1, + bam_aux_append(&bam_record, "fn", 'Z', static_cast(filename.size() + 1), reinterpret_cast(filename.c_str())); } From 297f9a6b5db73b0ff5baf41f9b985dabf3ca4f99 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 10:56:46 +0000 Subject: [PATCH 12/15] [DOR-895] Workaround clang-tidy bug clang-tidy complains that these are only copied so should be taken by reference even though they're clearly being moved, so do what it says to appease CI. --- tests/AlignerTest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/AlignerTest.cpp b/tests/AlignerTest.cpp index 83dbb8aef..abe351e6e 100644 --- a/tests/AlignerTest.cpp +++ b/tests/AlignerTest.cpp @@ -99,8 +99,8 @@ class AlignerNodeTestFixture { MessageTypePtr RunPipelineForRead( const std::shared_ptr& loaded_align_info, const std::shared_ptr& client_align_info, - std::string read_id, - std::string sequence) { + const std::string& read_id, + const std::string& sequence) { auto index_file_access = std::make_shared(); auto bed_file_access = std::make_shared(); CHECK(index_file_access->load_index(loaded_align_info->reference_file, @@ -116,8 +116,8 @@ class AlignerNodeTestFixture { auto read = std::make_unique(); read->read_common.client_info = std::move(client_info); - read->read_common.read_id = std::move(read_id); - read->read_common.seq = std::move(sequence); + read->read_common.read_id = read_id; + read->read_common.seq = sequence; pipeline->push_message(std::move(read)); pipeline->terminate({}); @@ -552,8 +552,8 @@ TEST_CASE_METHOD(AlignerNodeTestFixture, auto align_info = std::make_shared(); align_info->minimap_options = options; align_info->reference_file = ref; - auto simplex_read = RunPipelineForRead( - align_info, align_info, std::move(read_id), std::move(sequence)); + auto simplex_read = + RunPipelineForRead(align_info, align_info, read_id, sequence); auto sam_line_from_read_common = std::move(simplex_read->read_common.alignment_results[0].sam_string); From 04631d813564864eccb6e188fe69d525701d939f Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 11:38:14 +0000 Subject: [PATCH 13/15] [DOR-895] Add missing filenames to test expectation --- .../read_correction/expected.alignment_summary.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/data/read_correction/expected.alignment_summary.txt b/tests/data/read_correction/expected.alignment_summary.txt index d57e5d49b..38d6a4c6a 100644 --- a/tests/data/read_correction/expected.alignment_summary.txt +++ b/tests/data/read_correction/expected.alignment_summary.txt @@ -1,7 +1,7 @@ filename read_id alignment_genome alignment_genome_start alignment_genome_end alignment_strand_start alignment_strand_end alignment_direction alignment_length alignment_num_aligned alignment_num_correct alignment_num_insertions alignment_num_deletions alignment_num_substitutions alignment_mapq alignment_strand_coverage alignment_identity alignment_accuracy alignment_bed_hits - 3c88104d-7964-43dc-ac47-c22b12cdc994 chr6_PATERNAL:115004800-115077200 164 59219 35 59080 + 59060 59040 59039 5 15 1 60 0.999272 0.999983 0.999644 0 - e3066d3e-2bdf-4803-89b9-0f077ac7ff7f chr6_PATERNAL:115004800-115077200 1917 70731 0 68796 + 68832 68778 68769 18 36 9 60 0.999797 0.999869 0.999085 0 - 73d5eb75-700e-42a0-9a5d-f9952bd7d829 chr6_PATERNAL:115004800-115077200 2873 68539 0 65645 - 65678 65633 65624 12 33 9 60 1 0.999863 0.999178 0 - df814002-1961-4262-aaf5-e8f2760aa77a chr6_PATERNAL:115004800-115077200 9503 72207 12 62700 - 62722 62670 62660 18 34 10 60 0.999315 0.99984 0.999012 0 - b93514e5-c61b-48d8-b730-f6c97d169ff7 chr6_PATERNAL:115004800-115077200 11003 60600 38 49627 + 49601 49585 49585 4 12 0 60 0.999234 1 0.999677 0 - 3855985e-bb9b-4df4-9825-cc08f373342b chr6_PATERNAL:115004800-115077200 12624 68476 10 55847 - 55864 55825 55816 12 27 9 60 0.999821 0.999839 0.999141 0 +corrected_reads.fasta 3c88104d-7964-43dc-ac47-c22b12cdc994 chr6_PATERNAL:115004800-115077200 164 59219 35 59080 + 59060 59040 59039 5 15 1 60 0.999272 0.999983 0.999644 0 +corrected_reads.fasta e3066d3e-2bdf-4803-89b9-0f077ac7ff7f chr6_PATERNAL:115004800-115077200 1917 70731 0 68796 + 68832 68778 68769 18 36 9 60 0.999797 0.999869 0.999085 0 +corrected_reads.fasta 73d5eb75-700e-42a0-9a5d-f9952bd7d829 chr6_PATERNAL:115004800-115077200 2873 68539 0 65645 - 65678 65633 65624 12 33 9 60 1 0.999863 0.999178 0 +corrected_reads.fasta df814002-1961-4262-aaf5-e8f2760aa77a chr6_PATERNAL:115004800-115077200 9503 72207 12 62700 - 62722 62670 62660 18 34 10 60 0.999315 0.99984 0.999012 0 +corrected_reads.fasta b93514e5-c61b-48d8-b730-f6c97d169ff7 chr6_PATERNAL:115004800-115077200 11003 60600 38 49627 + 49601 49585 49585 4 12 0 60 0.999234 1 0.999677 0 +corrected_reads.fasta 3855985e-bb9b-4df4-9825-cc08f373342b chr6_PATERNAL:115004800-115077200 12624 68476 10 55847 - 55864 55825 55816 12 27 9 60 0.999821 0.999839 0.999141 0 From b9521e66186682c7038ca33dce78f491f64b8258 Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 13:35:53 +0000 Subject: [PATCH 14/15] [DOR-895] Drop unnecessary c_str() call std::string already has the length so we don't need fs::path's ctor to recalculate it. Also remove some more unnecessary lambdas from this file. --- dorado/data_loader/DataLoader.cpp | 267 +++++++++++++++--------------- 1 file changed, 129 insertions(+), 138 deletions(-) diff --git a/dorado/data_loader/DataLoader.cpp b/dorado/data_loader/DataLoader.cpp index 918e91056..1e5d03fb0 100644 --- a/dorado/data_loader/DataLoader.cpp +++ b/dorado/data_loader/DataLoader.cpp @@ -177,8 +177,7 @@ SimplexReadPtr process_pod5_thread_fn( new_read->read_common.read_id = std::move(read_id_str); new_read->read_common.num_trimmed_samples = 0; new_read->read_common.attributes.read_number = read_data.read_number; - new_read->read_common.attributes.filename = - std::filesystem::path(path.c_str()).filename().string(); + new_read->read_common.attributes.filename = std::filesystem::path(path).filename().string(); new_read->read_common.attributes.mux = read_data.well; new_read->read_common.attributes.num_samples = read_data.num_samples; new_read->read_common.attributes.channel_number = read_data.channel; @@ -271,176 +270,168 @@ void DataLoader::load_reads(const std::filesystem::path& path, return; } - auto iterate_directory = [&](const auto& iterator) { - switch (traversal_order) { - case ReadOrder::BY_CHANNEL: - // If traversal in channel order is required, the following algorithm - // is used - - // 1. iterate through all the read metadata to collect channel information - // across all pod5 files - // 2. store the read list sorted by channel number - spdlog::info("> Reading read channel info"); - load_read_channels(path, recursive_file_loading); - spdlog::info("> Processed read channel info"); - // 3. for each channel, iterate through all files and in each iteration - // only load the reads that correspond to that channel. - for (int channel = 0; channel <= m_max_channel; channel++) { - if (m_reads_by_channel.find(channel) != m_reads_by_channel.end()) { - // Sort the read ids within a channel by its mux - // and start time. - spdlog::debug("Sort channel {}", channel); - auto& reads = m_reads_by_channel.at(channel); - std::sort(reads.begin(), reads.end(), [](ReadSortInfo& a, ReadSortInfo& b) { - if (a.mux != b.mux) { - return a.mux < b.mux; - } else { - return a.read_number < b.read_number; - } - }); - // Once sorted, create a hash table from read id - // to index in the sorted list to quickly fetch the - // read location and its neighbors. - for (size_t i = 0; i < reads.size(); i++) { - m_read_id_to_index[reads[i].read_id] = i; - } - spdlog::debug("Sorted channel {}", channel); - } - for (const auto& entry : iterator) { - if (m_loaded_read_count == m_max_reads) { - break; - } - auto entry_path = std::filesystem::path(entry); - std::string ext = entry_path.extension().string(); - std::transform(ext.begin(), ext.end(), ext.begin(), - [](unsigned char c) { return std::tolower(c); }); - if (ext == ".fast5") { - throw std::runtime_error( - "Traversing reads by channel is only available for POD5. " - "Encountered FAST5 at " + - entry_path.string()); - } else if (ext == ".pod5") { - auto& channel_to_read_ids = - m_file_channel_read_order_map.at(entry_path.string()); - auto& read_ids = channel_to_read_ids[channel]; - if (!read_ids.empty()) { - load_pod5_reads_from_file_by_read_ids(entry_path.string(), read_ids); - } + const auto filtered_entries = filter_fast5_for_mixed_datasets( + utils::fetch_directory_entries(path, recursive_file_loading)); + + switch (traversal_order) { + case ReadOrder::BY_CHANNEL: + // If traversal in channel order is required, the following algorithm + // is used - + // 1. iterate through all the read metadata to collect channel information + // across all pod5 files + // 2. store the read list sorted by channel number + spdlog::info("> Reading read channel info"); + load_read_channels(path, recursive_file_loading); + spdlog::info("> Processed read channel info"); + // 3. for each channel, iterate through all files and in each iteration + // only load the reads that correspond to that channel. + for (int channel = 0; channel <= m_max_channel; channel++) { + if (m_reads_by_channel.find(channel) != m_reads_by_channel.end()) { + // Sort the read ids within a channel by its mux + // and start time. + spdlog::debug("Sort channel {}", channel); + auto& reads = m_reads_by_channel.at(channel); + std::sort(reads.begin(), reads.end(), [](ReadSortInfo& a, ReadSortInfo& b) { + if (a.mux != b.mux) { + return a.mux < b.mux; + } else { + return a.read_number < b.read_number; } + }); + // Once sorted, create a hash table from read id + // to index in the sorted list to quickly fetch the + // read location and its neighbors. + for (size_t i = 0; i < reads.size(); i++) { + m_read_id_to_index[reads[i].read_id] = i; } - // Erase sorted list as it's not needed anymore. - m_reads_by_channel.erase(channel); + spdlog::debug("Sorted channel {}", channel); } - break; - case ReadOrder::UNRESTRICTED: - for (const auto& entry : iterator) { + for (const auto& entry : filtered_entries) { if (m_loaded_read_count == m_max_reads) { break; } - spdlog::debug("Load reads from file {}", entry.path().string()); - std::string ext = std::filesystem::path(entry).extension().string(); + auto entry_path = std::filesystem::path(entry); + std::string ext = entry_path.extension().string(); std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) { return std::tolower(c); }); if (ext == ".fast5") { - load_fast5_reads_from_file(entry.path().string()); + throw std::runtime_error( + "Traversing reads by channel is only available for POD5. " + "Encountered FAST5 at " + + entry_path.string()); } else if (ext == ".pod5") { - load_pod5_reads_from_file(entry.path().string()); + auto& channel_to_read_ids = + m_file_channel_read_order_map.at(entry_path.string()); + auto& read_ids = channel_to_read_ids[channel]; + if (!read_ids.empty()) { + load_pod5_reads_from_file_by_read_ids(entry_path.string(), read_ids); + } } } - break; - default: - throw std::runtime_error("Unsupported traversal order detected: " + - dorado::to_string(traversal_order)); + // Erase sorted list as it's not needed anymore. + m_reads_by_channel.erase(channel); } - }; - - auto filtered_entries = filter_fast5_for_mixed_datasets( - utils::fetch_directory_entries(path, recursive_file_loading)); - iterate_directory(filtered_entries); + break; + case ReadOrder::UNRESTRICTED: + for (const auto& entry : filtered_entries) { + if (m_loaded_read_count == m_max_reads) { + break; + } + spdlog::debug("Load reads from file {}", entry.path().string()); + std::string ext = std::filesystem::path(entry).extension().string(); + std::transform(ext.begin(), ext.end(), ext.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (ext == ".fast5") { + load_fast5_reads_from_file(entry.path().string()); + } else if (ext == ".pod5") { + load_pod5_reads_from_file(entry.path().string()); + } + } + break; + default: + throw std::runtime_error("Unsupported traversal order detected: " + + dorado::to_string(traversal_order)); + } } void DataLoader::load_read_channels(const std::filesystem::path& data_path, bool recursive_file_loading) { - auto iterate_directory = [&](const auto& iterator) { - for (const auto& entry : iterator) { - auto file_path = std::filesystem::path(entry); - std::string ext = file_path.extension().string(); - std::transform(ext.begin(), ext.end(), ext.begin(), - [](unsigned char c) { return std::tolower(c); }); - if (ext != ".pod5") { - continue; - } - pod5_init(); + const auto entries = utils::fetch_directory_entries(data_path, recursive_file_loading); + for (const auto& entry : entries) { + auto file_path = std::filesystem::path(entry); + std::string ext = file_path.extension().string(); + std::transform(ext.begin(), ext.end(), ext.begin(), + [](unsigned char c) { return std::tolower(c); }); + if (ext != ".pod5") { + continue; + } + pod5_init(); + + // Use a std::map to store by sorted channel order. + m_file_channel_read_order_map.emplace(file_path.string(), channel_to_read_id_t()); + auto& channel_to_read_id = m_file_channel_read_order_map[file_path.string()]; - // Use a std::map to store by sorted channel order. - m_file_channel_read_order_map.emplace(file_path.string(), channel_to_read_id_t()); - auto& channel_to_read_id = m_file_channel_read_order_map[file_path.string()]; + // Open the file ready for walking: + Pod5FileReader_t* file = pod5_open_file(file_path.string().c_str()); - // Open the file ready for walking: - Pod5FileReader_t* file = pod5_open_file(file_path.string().c_str()); + if (!file) { + spdlog::error("Failed to open file {}: {}", file_path.string().c_str(), + pod5_get_error_string()); + continue; + } + std::size_t batch_count = 0; + if (pod5_get_read_batch_count(&batch_count, file) != POD5_OK) { + spdlog::error("Failed to query batch count: {}", pod5_get_error_string()); + } - if (!file) { - spdlog::error("Failed to open file {}: {}", file_path.string().c_str(), - pod5_get_error_string()); + for (std::size_t batch_index = 0; batch_index < batch_count; ++batch_index) { + Pod5ReadRecordBatch_t* batch = nullptr; + if (pod5_get_read_batch(&batch, file, batch_index) != POD5_OK) { + spdlog::error("Failed to get batch: {}", pod5_get_error_string()); continue; } - std::size_t batch_count = 0; - if (pod5_get_read_batch_count(&batch_count, file) != POD5_OK) { - spdlog::error("Failed to query batch count: {}", pod5_get_error_string()); - } - for (std::size_t batch_index = 0; batch_index < batch_count; ++batch_index) { - Pod5ReadRecordBatch_t* batch = nullptr; - if (pod5_get_read_batch(&batch, file, batch_index) != POD5_OK) { - spdlog::error("Failed to get batch: {}", pod5_get_error_string()); - continue; - } + std::size_t batch_row_count = 0; + if (pod5_get_read_batch_row_count(&batch_row_count, batch) != POD5_OK) { + spdlog::error("Failed to get batch row count"); + continue; + } - std::size_t batch_row_count = 0; - if (pod5_get_read_batch_row_count(&batch_row_count, batch) != POD5_OK) { - spdlog::error("Failed to get batch row count"); + for (std::size_t row = 0; row < batch_row_count; ++row) { + uint16_t read_table_version = 0; + ReadBatchRowInfo_t read_data; + if (pod5_get_read_batch_row_info_data(batch, row, READ_BATCH_ROW_INFO_VERSION, + &read_data, &read_table_version) != POD5_OK) { + spdlog::error("Failed to get read {}", row); continue; } - for (std::size_t row = 0; row < batch_row_count; ++row) { - uint16_t read_table_version = 0; - ReadBatchRowInfo_t read_data; - if (pod5_get_read_batch_row_info_data(batch, row, READ_BATCH_ROW_INFO_VERSION, - &read_data, - &read_table_version) != POD5_OK) { - spdlog::error("Failed to get read {}", row); - continue; - } - - int channel = read_data.channel; - - // Update maximum number of channels encountered. - m_max_channel = std::max(m_max_channel, channel); + int channel = read_data.channel; - // Store the read_id in the channel's list. - ReadID read_id; - std::memcpy(read_id.data(), read_data.read_id, POD5_READ_ID_SIZE); - channel_to_read_id[channel].push_back(read_id); + // Update maximum number of channels encountered. + m_max_channel = std::max(m_max_channel, channel); - char read_id_tmp[POD5_READ_ID_LEN]; - if (pod5_format_read_id(read_data.read_id, read_id_tmp) != POD5_OK) { - spdlog::error("Failed to format read id"); - } - std::string rid(read_id_tmp); - m_reads_by_channel[channel].push_back( - {rid, read_data.well, read_data.read_number}); - } + // Store the read_id in the channel's list. + ReadID read_id; + std::memcpy(read_id.data(), read_data.read_id, POD5_READ_ID_SIZE); + channel_to_read_id[channel].push_back(read_id); - if (pod5_free_read_batch(batch) != POD5_OK) { - spdlog::error("Failed to release batch"); + char read_id_tmp[POD5_READ_ID_LEN]; + if (pod5_format_read_id(read_data.read_id, read_id_tmp) != POD5_OK) { + spdlog::error("Failed to format read id"); } + std::string rid(read_id_tmp); + m_reads_by_channel[channel].push_back({rid, read_data.well, read_data.read_number}); } - if (pod5_close_and_free_reader(file) != POD5_OK) { - spdlog::error("Failed to close and free POD5 reader"); + + if (pod5_free_read_batch(batch) != POD5_OK) { + spdlog::error("Failed to release batch"); } } - }; - - iterate_directory(utils::fetch_directory_entries(data_path, recursive_file_loading)); + if (pod5_close_and_free_reader(file) != POD5_OK) { + spdlog::error("Failed to close and free POD5 reader"); + } + } } void DataLoader::load_pod5_reads_from_file_by_read_ids(const std::string& path, From 86ff2f991ac981d16e2569037cf56087f1e998da Mon Sep 17 00:00:00 2001 From: Ben Lawrence Date: Wed, 13 Nov 2024 13:38:21 +0000 Subject: [PATCH 15/15] [DOR-895] Change test value so that it doesn't match the default As spotted in !1266, there'd be no difference in behaviour between this test doing the right or wrong thing since they'd both return the same value. --- tests/BamReaderTest.cpp | 10 ++++++---- tests/data/bam_reader/small.sam | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/BamReaderTest.cpp b/tests/BamReaderTest.cpp index 2c3a10063..f16e9c1e1 100644 --- a/tests/BamReaderTest.cpp +++ b/tests/BamReaderTest.cpp @@ -84,15 +84,17 @@ TEST_CASE("HtsReaderTest: get_tag", TEST_GROUP) { dorado::HtsReader reader(sam.string(), std::nullopt); while (reader.read()) { - // All records in small.sam have this set to 0. - CHECK(reader.get_tag("rl") == 0); + // All records in small.sam have these set. + CHECK(reader.get_tag("XA") == 42); + CHECK(reader.get_tag("XB") == "test"); // Intentionally bad tag to test that missing tags don't return garbage. CHECK(reader.get_tag("##") == 0); CHECK(reader.get_tag("##") == 0); CHECK(reader.get_tag("##") == ""); // Type mismatch doesn't crash. - CHECK(reader.get_tag("rl") == 0); - CHECK(reader.get_tag("rl") == ""); + CHECK(reader.get_tag("XB") == 0); + CHECK(reader.get_tag("XB") == 0); + CHECK(reader.get_tag("XA") == ""); } } diff --git a/tests/data/bam_reader/small.sam b/tests/data/bam_reader/small.sam index 8a5ab5e96..2e79b5f4c 100644 --- a/tests/data/bam_reader/small.sam +++ b/tests/data/bam_reader/small.sam @@ -2,14 +2,14 @@ @SQ SN:NZ_CP062160.1 LN:5498557 @SQ SN:NZ_CP062161.1 LN:104844 @PG ID:minimap2 PN:minimap2 VN:2.24-r1155-dirty CL:minimap2 -k 19 -w 19 -H -a --MD -o test2.sam /Users/joyjit.daw/references/ecoli-GCF_014825845.1_ASM1482584v1_genomic.fn /Users/joyjit.daw/Downloads/temp.fasta -f25635d3-3b81-4ff2-8510-f28330590801 4 * 0 0 * * 0 0 TGCTCAGATGGGGGAATGAAAACAATACTCGACAGGAACTACAACGCCACAGCGGCGATTGTATCAGAAAGTTGACACTGGAAGATAAACTGGTGGAGAAACCTGTCACGGGGAAGAGCAGAGCTTGACGCGCTTCGCTTGTCCCGGCCTCACGACCCGCGAAAGAGTTTCGCGAACCGTTCACCGTTTTTCAATATGATTCTGTCTGCTTTGGTCATTAAAGCAGACGATAGTGTCCAACTTCCCCGTATATAACGGTACGTGCTCATCGGTAAGAGCTGGCGGTATTGTGTTTATCAGGGGAAGTTTTCAGATACTCACGAAAATGCCTGGCAGTACCGCCGTCTACAATGCCATCGATAATATGTAGGTGTTTTCACCTAATTTTGCCATGATGATAATCCTTATATTACGGTTTGTGAATAGGTCTGCTTGTCAGGGTTTGTGTAACCTTTTAACCTTAGCAATGCGTAACGCGTGTACTTTGCGTTCAATGGTTACGTATTGCTGGGTTAGAGAGGACAAAGGTTTCAACTGCCCCCAGCACCTCTGGTTAATTGGATAGCGACACTTCCTGTATCATTCGGTGGCGGAAAATCAGTGGAATCGCCGAAATGGCTGGCGCGATTAACCATTTTGCGCCTGGCACCTGCTCTATTTTACCGATTTATCAACTGGCCTCGCCTGCGGCGAATTGTTAGTGGAATGATAGCTGAAGAGCTGCAAAGTTCAGG * rl:i:0 pi:Z:aaaaaaaaaa -95e6aa45-efe0-4ed9-b7b9-2f35164f9836 0 NZ_CP062160.1 1012347 60 1S43M1D8M1D11M1D5M1I10M1D2M5D2M1I37M1I102M2D14M2D25M3D90M2D7M1I35M3D8M1D73M1I28M1D9M1D2M1D9M3D5M3D28M1D21M4D62M1D6M1D5M1D52M1D2M1D39M1D45M1I30M2D119M1D2M2D20M2I76M1D12M1D4M1I176M1D22M780S * 0 0 TGTAATCACGCCGTCTGCCAGTAGGATCACTTTGTTGGCGGTTGAGCGGGATAATGCCAATCGTTCTGCTCAATGCTGCCCGGCATCTCAGCAAAGGCCGCGACACGGGTGCTGAGCTGCCACGACATCCGAGGTGGAGTAGTCTGTTTCAATAGCGATATAAGGAATGTTGTGCTGCTGGCGCACATGACGTTTAATCGCCAGCGATTCCACCGCGTAGGTATCGCGCCTGCAAAATCATCAACTACGCCATCGACCTGATATTCAACCATCTGGCTGAGCATTTGCGGGCGCTGATCGTTCGGCGAAACACAGGAGCAGCCAATCGCCAGATATTTATCCGCCAGCGCGTCATACATCACCCCGTTTCCGCCACGCATTGCTCGGTGGCTTTCGCGGTGCAGTTTCATAACCGACAACCCAGCCGCCATTCTCTTCAATCGCGCGCACCACTTTCTCTGCTGCGCCGCCAATTGGAGCAGCCGGTGATTAAAATACGTGGGCGCGGTCCTGTCATGCCTTCTTCCTGCTGAACGCGGGCGGTCATTGCATCCAGTTCGTGATCAACGCCTCTTTATCGGGCAGGTTGCGCCGTAGACTACTTTCAGAATGTCGCTGCCGCTAAGCGCTGGAGGATTTAACGCACAAATGATAAAGTTGGCCAGTGCGCGACGTTCGCGGTTTTTCAGCGCAATGGCATCGCGCGATATCTTCGCTAATCTCGTGCCCAAAACGTTCTTCTACTGTTTTTGCAAGCGCAGCATCTCGGCTTTCCATAATGCACGCGAGGCGCTCGTCCTTAACGCTGTTTGGCAACTGCATCATGAACGGGCTTAAATTCCGCCATGTATTCATACATTTTCTTTTTGCCGTCACAGGTGGTTTCACCGACCACCAGATCAGAAAAACAGAAGTAGGGGCATTTATCGGTTTTGCCGAAGCGGCTGCTTTTAATCAGCGGGCAGAGAGGTTGCGCGGCAGATCTTTCTCGGCTTCTTCAATGGTTTCATCAGAGGTGGAACATAGCGAAACCACAACCACCCGGCTGCCATCGGATACTCTTGCGGCATAAAGGTGCAGTAGGTGCCAACCAGCGGAATGCCGCGCTCCTTGAGATCCATGACGGTGAGAAAGCCTTTCTGGCGAGCTTCAGAGAACTGATCGAAAATGGCGGGTAGCTCGGTAACAAGTGACATGATTTTCCTTCCCCGTACCACGGGAGATAATGAAAAGAGCCACATTATTACCATTCTTTTGTTGTACTTCGTTCGATTGCATTACTGGGTGGCAGCGGCGTTCGAACGGCACTGATATCAACTGAAACAGACCATGGCGATGAGCTGTTGGGCGAGCAGGAGGAGCTGGTGGGCAACAAGGTCGCGAAAAAGCTGACCACTGACGCTTCTGAATGGAAGTCTTGCTGGATGGCGAAAAGGTAAAGTGAAATGGTCGCTGGTGACGAGCATAATATGCGTAGCTGATGGCGATTGCGGCGGCTCGCCATGTTGGTGTAGCGGCGGATACTCACCAGGTTCGTTTTAACGCTCGTCGCCGTCTGGAGTTGCACCATCCGCCAGCACGGATCTGCTCCAGCTACCTGGCTGTGGTTCTGAATAATGGTGAGCATGTTTTCCCGGCGTTAACCGTAATGGTCATCGGCTCGCGCCATCTGTCGCGGTGACTGACTTGCGGAATATCGGCGTTAGCCATAAAAGCAGAGAACAGCGCAGCCGCATAACTGCAATACGACAGTTAATGGTCTTGACGAGACTATCCGTGTATGTTGTAACTAAACAGAGTTTATCGTATTTGCCCACGGGCGGGACGCAGTCGCTAAGTGGCAGAACAAATGCCACCAGCGCCGGGATAGATAAATAAACCAGACGGCGCGCTTCGCTGACGCTCGGTGCTTCCTGATAGCCAAATCCCTTCAGTGTGCCAAACAGCGAAATTGTGAGTTGAGCACCGCGCTCATATCGAAGGCGATTTCCTGAAGATGGTTCCACAATCCGGCTTCATGAAATGCGCGAATGGCA * NM:i:124 ms:i:1853 AS:i:1832 nn:i:0 tp:A:P cm:i:24 s1:i:396 s2:i:0 de:f:0.0802 MD:Z:17G4A4C5G7A1^A1C1A1A2^C11^A0A1A5C4A1^T2^TTACA8A0T18A11A56A17T25^GG1A12^CA25^TTC3C19T1A64^GT1G5G8T17C7^CCC8^T70C20G2C2A3^G5A3^G0C1^G9^CCA5^GAC3T12C2G8^A0T20^AACC2A13A2C29C8G3^T2C3^G5^A3A48^A2^G0C12T20C2C1^C6T26C11A5T16G6^CA16C35G26C5G0T32^C2^TA3A41C32G15G1^A12^G28A24A70A5G49^G22 rl:i:0 pi:Z:aaaaaaaaaa -13d1b2d7-b6a8-4c58-a709-49f0f341c26f 4 * 0 0 * * 0 0 CTACCCAGATTTTTGTGCAAAACCGTGACGGTACTGATCGGCATGGGGAAATTCTTCGCTATCCTGGTCACCGGTATCGACCTTTCGGTTAGCGCGATTCTGGCGCTTTCCGGTATGGTGACCGCTAAACTGATGTTGGCAGGTATTGATCCGTTTCTCGCGGCGCTAATTGGCGGTGTACTGGTTGGCGGCGCACTGGGGCGATCAACGGCTGCCTGGTCAACTGGACGGGACTACACCCGTTCATTATCACCTTAGCACCAACGCTATTTTCCGTGGGATCACGCTGGTGATCTCCGATGCCAACTCGGTAGCCGACTTCTCATTTGACCGTGAACTTCTTTTGCCGCCAGCGTAATTGGGATACCTGTCCCCGTTATCTTCTCGCTAATTGCTTCGCGCTCATCCTTTGGTTTCTGACAACGCGTATGCGCTCGGGCGCAACATCTACGCACTGGGCGGCAGCAAAAACTCGGCGTTATTCCGGGATTGACGTGAAATAGGTGCTGAAGCGTTGAGTATCCTCTTGGCGCCTTAGCAGTACGTAGCCTTGTGTTGCCGCGTTTCAATTTACGTATTTGCTAAAAGGTTAAACAGGCGACTACAAACAGAATCGACGCTTTACCCGCAAACAATGCCCGTTGAAATTTATCCGCCGTTCATAGCCTCACCTCCGCAAATAACGGATGGCGTAGTTTTACACTGAGAAATGAAAGGATTTGAAAAAACCGCAAAGCGGGCGAAACGATATATACAGAAAGAAAGGAAAGCACTCTATCCAACAAACACACCCACAGTTGATCGGAATAAAAAGCAGA * rl:i:0 -489f7224-7d18-41d4-9129-d28c3e6aa780 16 NZ_CP062160.1 1629814 60 2S9M1I95M1D3M1D53M1I17M1D75M1I46M1I6M1I23M1I22M168S * 0 0 CTTATCATGTCTTTTTGGGTGAGTATTCATCCATAATGCGTCCTCTTCTTAGCGGTTGAACTAACGGACACCTTTCGGGATGGAAAAAACTTACTGACCTGGACTTGCCTTCGTTTGTTAGCTTAACTATAAGCCACTCTTTGCAGGTTTTCATCGCATTATTAATGAAAAATTGCAATTAGAAGGAAGCGGGACAGGACTCACTTTTTCAGCCTAGCACCAACCCGCAGCAGGTAAAAGCAGTTTGCCCGAACTGTTTCATAACATTTCAGAATTATCGCCAGAAGAAATTGGTGACACTAAGTACTGGTGATTGCTCTGAATATGACGTAATAAATCGAGGAATGAATAAAGAATATGTAACGTCCGGCTTCATCCGGATTTTCTGAGGCTACCGTGTTCACCACAACCGTGGTGCTGTTACGTTTTGTTTTGAGCTGAATGTGCAGTTTCAGTCGGTTTCCTGCACCGTCTTTCAGCACGCCTGAAATCTGTACTGCCATATTCACTCCACAAATAAAAAAG * NM:i:22 ms:i:566 AS:i:566 nn:i:0 tp:A:P cm:i:6 s1:i:121 s2:i:0 de:f:0.0615 MD:Z:33C6C63^C3^C54C8A6^G17G12T0A35T22G12T20C0A26C19 rl:i:0 -d7500028-dfcc-4404-b636-13edae804c55 16 NZ_CP062160.1 3164467 1 174M2D20M329S * 0 0 CGCCGCCGGTGCGGCAATCCGGAACGATACCGATGCCGGATCGCCCCGCTGCCCCCACGCATTTACTGCCCGGACTGTCAGCGTGTACCGCCCCAGCGCCAGTTGCGTGAAGCGGTATGTGGTTTCCGTCGTCCGGGCCGTGCTGACCAGCCGCTCACTGCCGTCATCCGCTGCCGGTCAGGCGAAGCATAAAGGTGCTGTCGATTCCGTTTGTAGTCGTCTGTTTAACCTTAGCAATACGTAACTGAACGAAGTACAACACATTTTTTTTTTTTTTTTTTCAGTTACGTATTGCCAAGGTTAAGAGAGGTCAGCGTTTCAACGCTCAGCACCTATCGCTCCGGAAGTGGACGTTGACGACGAGCCAGAAGAAGAATAATTTCATCGCTTTCATGCCAGGGAAAAGGGAGCCATCTCCTCTTTGAATTGAAAAGTCCAGGCTGTAAAGTCTGGGCTTTTGTCGTATTAGGCGCGGTGTTTGGCTGTGCCTCGTAAAAAATGGCTGGCTATACACAAGGAATAG * NM:i:12 ms:i:321 AS:i:320 nn:i:0 tp:A:P cm:i:4 s1:i:70 s2:i:77 de:f:0.0564 SA:Z:NZ_CP062160.1,4112332,+,2S187M334S,19,20; MD:Z:3T29C12T19C15C4G0T0T4G76G2^CA20 rl:i:0 -d7500028-dfcc-4404-b636-13edae804c55 256 NZ_CP062160.1 2003415 0 334S14M2D175M * 0 0 * * NM:i:12 ms:i:311 AS:i:310 nn:i:0 tp:A:S cm:i:3 s1:i:51 de:f:0.0579 MD:Z:14^GT72G12A0A0C4G15G14A4A12G29A3 rl:i:0 -d7500028-dfcc-4404-b636-13edae804c55 256 NZ_CP062160.1 2912586 0 334S14M2D175M * 0 0 * * NM:i:14 ms:i:299 AS:i:298 nn:i:0 tp:A:S cm:i:4 s1:i:77 de:f:0.0684 MD:Z:14^GT40A7A31C4A0A0C4G1T13G14A4A42A3 rl:i:0 -d7500028-dfcc-4404-b636-13edae804c55 272 NZ_CP062160.1 3313878 0 174M2D15M334S * 0 0 * * NM:i:15 ms:i:293 AS:i:292 nn:i:0 tp:A:S cm:i:3 s1:i:58 de:f:0.0737 MD:Z:3T42T4T14C8A6C4G0T0T44T6A20A2G8^CA15 rl:i:0 -d7500028-dfcc-4404-b636-13edae804c55 2048 NZ_CP062160.1 4112332 19 2H31M1I13M3D6M1D52M2D16M4I9M3I2M2D50M334H * 0 0 ATTCCTTGTGTATAGCCAGCCATTTTTTACGAGGCACAGCCAAACACCGCGCCTAATACGACAAAAGCCCAGACTTTACAGCCTGGACTTTTCAATTCAAAGAGGAGATGGCTCCCTTTTCCCTGGCATGAAAGCGATGAAATTATTCTTCTTCTGGCTCGTCGTCAACGTCCACTTCCGGAGCGAT * NM:i:20 ms:i:283 AS:i:274 nn:i:0 tp:A:P cm:i:3 s1:i:64 s2:i:0 de:f:0.0591 SA:Z:NZ_CP062160.1,3164467,-,194M2D329S,1,12; MD:Z:32A11^TTT6^C26G23A1^AG27^AG1A48 rl:i:0 -60588a89-f191-414e-b444-ad0815b7d9c9 4 * 0 0 * * 0 0 CTGCATTTCATCGACCACCGGCTTGGTCTGCCAGGCGATAAGCTCCAGGCAAATTCAGCCAGTGAGTCGGTGGTACGGCATCACCTGCAATAATCGGATACAGTTCCTCTTCCAGGGGAAGCGGAGCATTTCACCGGGTGGAGGAGGGCGTTGGCAGTACCTTGCTGTATTTTTGAGTTTCGAACGTGTCCATAATGAGGTTGATAAAACAACGGTAAAGCGGGTATTTTCATCAGGTATTTACCCACAGGACCGTTGTCGTTATTTTTATTTATTGTCGTTGCCCAGGGCGGCGAGACTAATACTGGTAGCATTTCTGAGTTTGCAAAATCTTCATTGCGGCTATTCCAGTAATTGCTGGCATTGTTATAAGTCCGCGGCGTGTGGCGATAACCTCATGACTAATTTTTCGGCCAGCAGGGTTGCATTGGTAATGAAACCAGCAATGTATGTTTGCGTTTATCTCTTTTTACCCCAATGCTTCGGCTAATGCTAACGCTATTGTGAAGGAAAATTTTCAGCACTTTTTATTAGTCGTTCATAACAGCGTTTACAGTATTATCGGTGGCTTCTATTAGGGCTGAAACAGCGCATAGGGATGGTTGACCATGTAAGTACATCACCAACTGTCATGTTGAGCATAGGGAACCAGGCGGTAGTGCCGTTGCCTTGTTCATAATACTGTAGCTGACCAACTGGTACCATAGATTAAACATCAATGTGTTGATAGTTTTCAAGTATTTCAAGCGTGACAGGATCTGTCGCGCTACCAGTTTTAACATTTCCAGTTGTGGAATATCGGCGAGCTGGTGCAATCGTTGGTATCTCATGGAGCGCTTCAGCGTTGGCATTAAAATAGCAATTAACGGTTGGCGATAGACTGATATATCTGGTAACACAACGAGTAATTTTGCAATACAGTTATTCAGACTTCCTTTTCAGCCAGCGATAAATGTTAG * rl:i:0 -dbd32a76-43d2-40d6-9ccc-610af70a4051 16 NZ_CP062160.1 2846079 60 130S32M3I13M1D9M1I13M4I85M3I16M3I24M1I16M1D86M2I14M3D113M3I1M1I25M5I1M2I191M3I57M2I4M1D30M2I1M2I31M3D59M1D10M2D35M1D31M341S * 0 0 TAAATAAAGTTTGTTTAATGGCTTCGTTTATTCGCCAGGCTGATTAATCACATGCAGGCTATCTGACCGCCAGCACCAAGAATCAGTACATTTTTCATGAAACTTTCCCAGGATTACGCGGCAAGCACATGCCATATCACTATTACGGATTTGAATATGTTTCCCCATTACAGAGTTTCTCAAAGAATCAGAGCTAATACTTCTCGAAATTACTTCAGCCACATAGACTTTGCCATCATAAAGATCCATATGGTTTGCGCCTTCAACAATGTGATAGCGTTTATCCTCGCGCTTTGATGCTCGATCGTACAGCAGGCCGTCACTCATCCATTTGCTCCCCAGCCAGGCTGCTCACATAATCTGTATCGGCTGAGTCAGGTACACTTCCGCCATATGGTAAGCATCATAGGTAATAATCTGGTTAAGGCTGCGCAAAGTAGCGCGTAACCCGGTGCTGGATACTGCGCGCGAGGGGTGTGGTAATATTCCCAGGCCTGACGCAGTTCTTCGTTCGGCGCATCGGACTCCTTCATTGGTGCCAGTGGCATAGTGGCGCATTCTCCGCTGCGTTTCAATATCGCTGGTTCGGGCGTTTGAAATTTTCTGGGGCGTCAACGTATGGTAGGGCATCAACAGATTTCACATTGTTTTCCCAACCATTACGGAACATCGAACCAATATTGACCGCACTAACAGTACCGATGGCCTTGATGCGCGGATCCCGAATTGCAGCATTGGCTGTATATCCTGCACCGGCACAAATTCCCATCGCACCAATTCGGGTATTGTCGACATAACCTGAAAGCGTTGTCAGGTAATCAATCACGGCACTGACGTCTTCAGTACGAATGTATGGGGCGTTTTAACTGACGCGGCTCGCCGCCGCTTTCTTCGTCTTTGATAAGATGCGTCATAAGCAATAGTGACAACTTTTTCCGCCAGTTTTTCGGCATAGGTTCCGGCCGTTTGCTCTTTAACGCCCCCACCTGGTGAGATACCAATTGCCGGATACTGACGGGTTTCATCAAATTTTGAGGGAAATAGATCACTGCAGACAAAGAGATAGGTGCTGAAGCGTTGAAACCTTTGTCCTCTCTTAACCTTAGCAATACGTAACCAAACGTGAAGTGCCGAAAGATAAGCAGCAACAGACGTGACTACGGAATCGACAGCACCCATAGCGTACCGAGCGCCACCAGAAAGATTTTTGCGGATTCATCATCGCGGTACCACAAAAATCACCAATAGAAGCGGATGCCAGGATGCGGCACGTTGCGCAACATCTGAATTGAGGTATCCAGGCAGTCGCTTCACTCCCAGCGACAGCCCGCCAATCAGCCCTTACAGTAGAATTCAATCCCCAGCGATCTCGCCAATTGAAAAGCCAACGAACGCCCGCCAGGAG * NM:i:87 ms:i:1407 AS:i:1380 nn:i:0 tp:A:P cm:i:19 s1:i:360 s2:i:0 de:f:0.0652 MD:Z:14G30^C10G17G14G7G52G9C12T22T3T6C1^G0C2C6C0G88^TGT30C63A5T27T11C0C2T11C10T60G20G0C5T131T3^C21A5A0C33^ATC1C26C11T18^G10^CA13T21^G4A26 rl:i:0 +f25635d3-3b81-4ff2-8510-f28330590801 4 * 0 0 * * 0 0 TGCTCAGATGGGGGAATGAAAACAATACTCGACAGGAACTACAACGCCACAGCGGCGATTGTATCAGAAAGTTGACACTGGAAGATAAACTGGTGGAGAAACCTGTCACGGGGAAGAGCAGAGCTTGACGCGCTTCGCTTGTCCCGGCCTCACGACCCGCGAAAGAGTTTCGCGAACCGTTCACCGTTTTTCAATATGATTCTGTCTGCTTTGGTCATTAAAGCAGACGATAGTGTCCAACTTCCCCGTATATAACGGTACGTGCTCATCGGTAAGAGCTGGCGGTATTGTGTTTATCAGGGGAAGTTTTCAGATACTCACGAAAATGCCTGGCAGTACCGCCGTCTACAATGCCATCGATAATATGTAGGTGTTTTCACCTAATTTTGCCATGATGATAATCCTTATATTACGGTTTGTGAATAGGTCTGCTTGTCAGGGTTTGTGTAACCTTTTAACCTTAGCAATGCGTAACGCGTGTACTTTGCGTTCAATGGTTACGTATTGCTGGGTTAGAGAGGACAAAGGTTTCAACTGCCCCCAGCACCTCTGGTTAATTGGATAGCGACACTTCCTGTATCATTCGGTGGCGGAAAATCAGTGGAATCGCCGAAATGGCTGGCGCGATTAACCATTTTGCGCCTGGCACCTGCTCTATTTTACCGATTTATCAACTGGCCTCGCCTGCGGCGAATTGTTAGTGGAATGATAGCTGAAGAGCTGCAAAGTTCAGG * rl:i:0 pi:Z:aaaaaaaaaa XA:i:42 XB:Z:test +95e6aa45-efe0-4ed9-b7b9-2f35164f9836 0 NZ_CP062160.1 1012347 60 1S43M1D8M1D11M1D5M1I10M1D2M5D2M1I37M1I102M2D14M2D25M3D90M2D7M1I35M3D8M1D73M1I28M1D9M1D2M1D9M3D5M3D28M1D21M4D62M1D6M1D5M1D52M1D2M1D39M1D45M1I30M2D119M1D2M2D20M2I76M1D12M1D4M1I176M1D22M780S * 0 0 TGTAATCACGCCGTCTGCCAGTAGGATCACTTTGTTGGCGGTTGAGCGGGATAATGCCAATCGTTCTGCTCAATGCTGCCCGGCATCTCAGCAAAGGCCGCGACACGGGTGCTGAGCTGCCACGACATCCGAGGTGGAGTAGTCTGTTTCAATAGCGATATAAGGAATGTTGTGCTGCTGGCGCACATGACGTTTAATCGCCAGCGATTCCACCGCGTAGGTATCGCGCCTGCAAAATCATCAACTACGCCATCGACCTGATATTCAACCATCTGGCTGAGCATTTGCGGGCGCTGATCGTTCGGCGAAACACAGGAGCAGCCAATCGCCAGATATTTATCCGCCAGCGCGTCATACATCACCCCGTTTCCGCCACGCATTGCTCGGTGGCTTTCGCGGTGCAGTTTCATAACCGACAACCCAGCCGCCATTCTCTTCAATCGCGCGCACCACTTTCTCTGCTGCGCCGCCAATTGGAGCAGCCGGTGATTAAAATACGTGGGCGCGGTCCTGTCATGCCTTCTTCCTGCTGAACGCGGGCGGTCATTGCATCCAGTTCGTGATCAACGCCTCTTTATCGGGCAGGTTGCGCCGTAGACTACTTTCAGAATGTCGCTGCCGCTAAGCGCTGGAGGATTTAACGCACAAATGATAAAGTTGGCCAGTGCGCGACGTTCGCGGTTTTTCAGCGCAATGGCATCGCGCGATATCTTCGCTAATCTCGTGCCCAAAACGTTCTTCTACTGTTTTTGCAAGCGCAGCATCTCGGCTTTCCATAATGCACGCGAGGCGCTCGTCCTTAACGCTGTTTGGCAACTGCATCATGAACGGGCTTAAATTCCGCCATGTATTCATACATTTTCTTTTTGCCGTCACAGGTGGTTTCACCGACCACCAGATCAGAAAAACAGAAGTAGGGGCATTTATCGGTTTTGCCGAAGCGGCTGCTTTTAATCAGCGGGCAGAGAGGTTGCGCGGCAGATCTTTCTCGGCTTCTTCAATGGTTTCATCAGAGGTGGAACATAGCGAAACCACAACCACCCGGCTGCCATCGGATACTCTTGCGGCATAAAGGTGCAGTAGGTGCCAACCAGCGGAATGCCGCGCTCCTTGAGATCCATGACGGTGAGAAAGCCTTTCTGGCGAGCTTCAGAGAACTGATCGAAAATGGCGGGTAGCTCGGTAACAAGTGACATGATTTTCCTTCCCCGTACCACGGGAGATAATGAAAAGAGCCACATTATTACCATTCTTTTGTTGTACTTCGTTCGATTGCATTACTGGGTGGCAGCGGCGTTCGAACGGCACTGATATCAACTGAAACAGACCATGGCGATGAGCTGTTGGGCGAGCAGGAGGAGCTGGTGGGCAACAAGGTCGCGAAAAAGCTGACCACTGACGCTTCTGAATGGAAGTCTTGCTGGATGGCGAAAAGGTAAAGTGAAATGGTCGCTGGTGACGAGCATAATATGCGTAGCTGATGGCGATTGCGGCGGCTCGCCATGTTGGTGTAGCGGCGGATACTCACCAGGTTCGTTTTAACGCTCGTCGCCGTCTGGAGTTGCACCATCCGCCAGCACGGATCTGCTCCAGCTACCTGGCTGTGGTTCTGAATAATGGTGAGCATGTTTTCCCGGCGTTAACCGTAATGGTCATCGGCTCGCGCCATCTGTCGCGGTGACTGACTTGCGGAATATCGGCGTTAGCCATAAAAGCAGAGAACAGCGCAGCCGCATAACTGCAATACGACAGTTAATGGTCTTGACGAGACTATCCGTGTATGTTGTAACTAAACAGAGTTTATCGTATTTGCCCACGGGCGGGACGCAGTCGCTAAGTGGCAGAACAAATGCCACCAGCGCCGGGATAGATAAATAAACCAGACGGCGCGCTTCGCTGACGCTCGGTGCTTCCTGATAGCCAAATCCCTTCAGTGTGCCAAACAGCGAAATTGTGAGTTGAGCACCGCGCTCATATCGAAGGCGATTTCCTGAAGATGGTTCCACAATCCGGCTTCATGAAATGCGCGAATGGCA * NM:i:124 ms:i:1853 AS:i:1832 nn:i:0 tp:A:P cm:i:24 s1:i:396 s2:i:0 de:f:0.0802 MD:Z:17G4A4C5G7A1^A1C1A1A2^C11^A0A1A5C4A1^T2^TTACA8A0T18A11A56A17T25^GG1A12^CA25^TTC3C19T1A64^GT1G5G8T17C7^CCC8^T70C20G2C2A3^G5A3^G0C1^G9^CCA5^GAC3T12C2G8^A0T20^AACC2A13A2C29C8G3^T2C3^G5^A3A48^A2^G0C12T20C2C1^C6T26C11A5T16G6^CA16C35G26C5G0T32^C2^TA3A41C32G15G1^A12^G28A24A70A5G49^G22 rl:i:0 pi:Z:aaaaaaaaaa XA:i:42 XB:Z:test +13d1b2d7-b6a8-4c58-a709-49f0f341c26f 4 * 0 0 * * 0 0 CTACCCAGATTTTTGTGCAAAACCGTGACGGTACTGATCGGCATGGGGAAATTCTTCGCTATCCTGGTCACCGGTATCGACCTTTCGGTTAGCGCGATTCTGGCGCTTTCCGGTATGGTGACCGCTAAACTGATGTTGGCAGGTATTGATCCGTTTCTCGCGGCGCTAATTGGCGGTGTACTGGTTGGCGGCGCACTGGGGCGATCAACGGCTGCCTGGTCAACTGGACGGGACTACACCCGTTCATTATCACCTTAGCACCAACGCTATTTTCCGTGGGATCACGCTGGTGATCTCCGATGCCAACTCGGTAGCCGACTTCTCATTTGACCGTGAACTTCTTTTGCCGCCAGCGTAATTGGGATACCTGTCCCCGTTATCTTCTCGCTAATTGCTTCGCGCTCATCCTTTGGTTTCTGACAACGCGTATGCGCTCGGGCGCAACATCTACGCACTGGGCGGCAGCAAAAACTCGGCGTTATTCCGGGATTGACGTGAAATAGGTGCTGAAGCGTTGAGTATCCTCTTGGCGCCTTAGCAGTACGTAGCCTTGTGTTGCCGCGTTTCAATTTACGTATTTGCTAAAAGGTTAAACAGGCGACTACAAACAGAATCGACGCTTTACCCGCAAACAATGCCCGTTGAAATTTATCCGCCGTTCATAGCCTCACCTCCGCAAATAACGGATGGCGTAGTTTTACACTGAGAAATGAAAGGATTTGAAAAAACCGCAAAGCGGGCGAAACGATATATACAGAAAGAAAGGAAAGCACTCTATCCAACAAACACACCCACAGTTGATCGGAATAAAAAGCAGA * rl:i:0 XA:i:42 XB:Z:test +489f7224-7d18-41d4-9129-d28c3e6aa780 16 NZ_CP062160.1 1629814 60 2S9M1I95M1D3M1D53M1I17M1D75M1I46M1I6M1I23M1I22M168S * 0 0 CTTATCATGTCTTTTTGGGTGAGTATTCATCCATAATGCGTCCTCTTCTTAGCGGTTGAACTAACGGACACCTTTCGGGATGGAAAAAACTTACTGACCTGGACTTGCCTTCGTTTGTTAGCTTAACTATAAGCCACTCTTTGCAGGTTTTCATCGCATTATTAATGAAAAATTGCAATTAGAAGGAAGCGGGACAGGACTCACTTTTTCAGCCTAGCACCAACCCGCAGCAGGTAAAAGCAGTTTGCCCGAACTGTTTCATAACATTTCAGAATTATCGCCAGAAGAAATTGGTGACACTAAGTACTGGTGATTGCTCTGAATATGACGTAATAAATCGAGGAATGAATAAAGAATATGTAACGTCCGGCTTCATCCGGATTTTCTGAGGCTACCGTGTTCACCACAACCGTGGTGCTGTTACGTTTTGTTTTGAGCTGAATGTGCAGTTTCAGTCGGTTTCCTGCACCGTCTTTCAGCACGCCTGAAATCTGTACTGCCATATTCACTCCACAAATAAAAAAG * NM:i:22 ms:i:566 AS:i:566 nn:i:0 tp:A:P cm:i:6 s1:i:121 s2:i:0 de:f:0.0615 MD:Z:33C6C63^C3^C54C8A6^G17G12T0A35T22G12T20C0A26C19 rl:i:0 XA:i:42 XB:Z:test +d7500028-dfcc-4404-b636-13edae804c55 16 NZ_CP062160.1 3164467 1 174M2D20M329S * 0 0 CGCCGCCGGTGCGGCAATCCGGAACGATACCGATGCCGGATCGCCCCGCTGCCCCCACGCATTTACTGCCCGGACTGTCAGCGTGTACCGCCCCAGCGCCAGTTGCGTGAAGCGGTATGTGGTTTCCGTCGTCCGGGCCGTGCTGACCAGCCGCTCACTGCCGTCATCCGCTGCCGGTCAGGCGAAGCATAAAGGTGCTGTCGATTCCGTTTGTAGTCGTCTGTTTAACCTTAGCAATACGTAACTGAACGAAGTACAACACATTTTTTTTTTTTTTTTTTCAGTTACGTATTGCCAAGGTTAAGAGAGGTCAGCGTTTCAACGCTCAGCACCTATCGCTCCGGAAGTGGACGTTGACGACGAGCCAGAAGAAGAATAATTTCATCGCTTTCATGCCAGGGAAAAGGGAGCCATCTCCTCTTTGAATTGAAAAGTCCAGGCTGTAAAGTCTGGGCTTTTGTCGTATTAGGCGCGGTGTTTGGCTGTGCCTCGTAAAAAATGGCTGGCTATACACAAGGAATAG * NM:i:12 ms:i:321 AS:i:320 nn:i:0 tp:A:P cm:i:4 s1:i:70 s2:i:77 de:f:0.0564 SA:Z:NZ_CP062160.1,4112332,+,2S187M334S,19,20; MD:Z:3T29C12T19C15C4G0T0T4G76G2^CA20 rl:i:0 XA:i:42 XB:Z:test +d7500028-dfcc-4404-b636-13edae804c55 256 NZ_CP062160.1 2003415 0 334S14M2D175M * 0 0 * * NM:i:12 ms:i:311 AS:i:310 nn:i:0 tp:A:S cm:i:3 s1:i:51 de:f:0.0579 MD:Z:14^GT72G12A0A0C4G15G14A4A12G29A3 rl:i:0 XA:i:42 XB:Z:test +d7500028-dfcc-4404-b636-13edae804c55 256 NZ_CP062160.1 2912586 0 334S14M2D175M * 0 0 * * NM:i:14 ms:i:299 AS:i:298 nn:i:0 tp:A:S cm:i:4 s1:i:77 de:f:0.0684 MD:Z:14^GT40A7A31C4A0A0C4G1T13G14A4A42A3 rl:i:0 XA:i:42 XB:Z:test +d7500028-dfcc-4404-b636-13edae804c55 272 NZ_CP062160.1 3313878 0 174M2D15M334S * 0 0 * * NM:i:15 ms:i:293 AS:i:292 nn:i:0 tp:A:S cm:i:3 s1:i:58 de:f:0.0737 MD:Z:3T42T4T14C8A6C4G0T0T44T6A20A2G8^CA15 rl:i:0 XA:i:42 XB:Z:test +d7500028-dfcc-4404-b636-13edae804c55 2048 NZ_CP062160.1 4112332 19 2H31M1I13M3D6M1D52M2D16M4I9M3I2M2D50M334H * 0 0 ATTCCTTGTGTATAGCCAGCCATTTTTTACGAGGCACAGCCAAACACCGCGCCTAATACGACAAAAGCCCAGACTTTACAGCCTGGACTTTTCAATTCAAAGAGGAGATGGCTCCCTTTTCCCTGGCATGAAAGCGATGAAATTATTCTTCTTCTGGCTCGTCGTCAACGTCCACTTCCGGAGCGAT * NM:i:20 ms:i:283 AS:i:274 nn:i:0 tp:A:P cm:i:3 s1:i:64 s2:i:0 de:f:0.0591 SA:Z:NZ_CP062160.1,3164467,-,194M2D329S,1,12; MD:Z:32A11^TTT6^C26G23A1^AG27^AG1A48 rl:i:0 XA:i:42 XB:Z:test +60588a89-f191-414e-b444-ad0815b7d9c9 4 * 0 0 * * 0 0 CTGCATTTCATCGACCACCGGCTTGGTCTGCCAGGCGATAAGCTCCAGGCAAATTCAGCCAGTGAGTCGGTGGTACGGCATCACCTGCAATAATCGGATACAGTTCCTCTTCCAGGGGAAGCGGAGCATTTCACCGGGTGGAGGAGGGCGTTGGCAGTACCTTGCTGTATTTTTGAGTTTCGAACGTGTCCATAATGAGGTTGATAAAACAACGGTAAAGCGGGTATTTTCATCAGGTATTTACCCACAGGACCGTTGTCGTTATTTTTATTTATTGTCGTTGCCCAGGGCGGCGAGACTAATACTGGTAGCATTTCTGAGTTTGCAAAATCTTCATTGCGGCTATTCCAGTAATTGCTGGCATTGTTATAAGTCCGCGGCGTGTGGCGATAACCTCATGACTAATTTTTCGGCCAGCAGGGTTGCATTGGTAATGAAACCAGCAATGTATGTTTGCGTTTATCTCTTTTTACCCCAATGCTTCGGCTAATGCTAACGCTATTGTGAAGGAAAATTTTCAGCACTTTTTATTAGTCGTTCATAACAGCGTTTACAGTATTATCGGTGGCTTCTATTAGGGCTGAAACAGCGCATAGGGATGGTTGACCATGTAAGTACATCACCAACTGTCATGTTGAGCATAGGGAACCAGGCGGTAGTGCCGTTGCCTTGTTCATAATACTGTAGCTGACCAACTGGTACCATAGATTAAACATCAATGTGTTGATAGTTTTCAAGTATTTCAAGCGTGACAGGATCTGTCGCGCTACCAGTTTTAACATTTCCAGTTGTGGAATATCGGCGAGCTGGTGCAATCGTTGGTATCTCATGGAGCGCTTCAGCGTTGGCATTAAAATAGCAATTAACGGTTGGCGATAGACTGATATATCTGGTAACACAACGAGTAATTTTGCAATACAGTTATTCAGACTTCCTTTTCAGCCAGCGATAAATGTTAG * rl:i:0 XA:i:42 XB:Z:test +dbd32a76-43d2-40d6-9ccc-610af70a4051 16 NZ_CP062160.1 2846079 60 130S32M3I13M1D9M1I13M4I85M3I16M3I24M1I16M1D86M2I14M3D113M3I1M1I25M5I1M2I191M3I57M2I4M1D30M2I1M2I31M3D59M1D10M2D35M1D31M341S * 0 0 TAAATAAAGTTTGTTTAATGGCTTCGTTTATTCGCCAGGCTGATTAATCACATGCAGGCTATCTGACCGCCAGCACCAAGAATCAGTACATTTTTCATGAAACTTTCCCAGGATTACGCGGCAAGCACATGCCATATCACTATTACGGATTTGAATATGTTTCCCCATTACAGAGTTTCTCAAAGAATCAGAGCTAATACTTCTCGAAATTACTTCAGCCACATAGACTTTGCCATCATAAAGATCCATATGGTTTGCGCCTTCAACAATGTGATAGCGTTTATCCTCGCGCTTTGATGCTCGATCGTACAGCAGGCCGTCACTCATCCATTTGCTCCCCAGCCAGGCTGCTCACATAATCTGTATCGGCTGAGTCAGGTACACTTCCGCCATATGGTAAGCATCATAGGTAATAATCTGGTTAAGGCTGCGCAAAGTAGCGCGTAACCCGGTGCTGGATACTGCGCGCGAGGGGTGTGGTAATATTCCCAGGCCTGACGCAGTTCTTCGTTCGGCGCATCGGACTCCTTCATTGGTGCCAGTGGCATAGTGGCGCATTCTCCGCTGCGTTTCAATATCGCTGGTTCGGGCGTTTGAAATTTTCTGGGGCGTCAACGTATGGTAGGGCATCAACAGATTTCACATTGTTTTCCCAACCATTACGGAACATCGAACCAATATTGACCGCACTAACAGTACCGATGGCCTTGATGCGCGGATCCCGAATTGCAGCATTGGCTGTATATCCTGCACCGGCACAAATTCCCATCGCACCAATTCGGGTATTGTCGACATAACCTGAAAGCGTTGTCAGGTAATCAATCACGGCACTGACGTCTTCAGTACGAATGTATGGGGCGTTTTAACTGACGCGGCTCGCCGCCGCTTTCTTCGTCTTTGATAAGATGCGTCATAAGCAATAGTGACAACTTTTTCCGCCAGTTTTTCGGCATAGGTTCCGGCCGTTTGCTCTTTAACGCCCCCACCTGGTGAGATACCAATTGCCGGATACTGACGGGTTTCATCAAATTTTGAGGGAAATAGATCACTGCAGACAAAGAGATAGGTGCTGAAGCGTTGAAACCTTTGTCCTCTCTTAACCTTAGCAATACGTAACCAAACGTGAAGTGCCGAAAGATAAGCAGCAACAGACGTGACTACGGAATCGACAGCACCCATAGCGTACCGAGCGCCACCAGAAAGATTTTTGCGGATTCATCATCGCGGTACCACAAAAATCACCAATAGAAGCGGATGCCAGGATGCGGCACGTTGCGCAACATCTGAATTGAGGTATCCAGGCAGTCGCTTCACTCCCAGCGACAGCCCGCCAATCAGCCCTTACAGTAGAATTCAATCCCCAGCGATCTCGCCAATTGAAAAGCCAACGAACGCCCGCCAGGAG * NM:i:87 ms:i:1407 AS:i:1380 nn:i:0 tp:A:P cm:i:19 s1:i:360 s2:i:0 de:f:0.0652 MD:Z:14G30^C10G17G14G7G52G9C12T22T3T6C1^G0C2C6C0G88^TGT30C63A5T27T11C0C2T11C10T60G20G0C5T131T3^C21A5A0C33^ATC1C26C11T18^G10^CA13T21^G4A26 rl:i:0 XA:i:42 XB:Z:test