From c3b01e746ea084c3720e8f6ba6eab14f87e900a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Eiras?= Date: Mon, 13 Aug 2018 18:44:31 +0200 Subject: [PATCH] Issue 298: boundary checks for when number of columns in FORMAT != samples And don't crash when FORMAT field names are not valid python identifiers. --- vcf/model.py | 2 +- vcf/parser.py | 2 ++ vcf/test/issue_298.vcf | 7 +++++++ vcf/test/test_vcf.py | 12 ++++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 vcf/test/issue_298.vcf diff --git a/vcf/model.py b/vcf/model.py index 34a4d17..14a5a88 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -683,7 +683,7 @@ def __repr__(self): def make_calldata_tuple(fields): """ Return a namedtuple for a given call format """ - class CallData(collections.namedtuple('calldata', fields)): + class CallData(collections.namedtuple('calldata', fields, rename=True)): __slots__ = () _types = [] diff --git a/vcf/parser.py b/vcf/parser.py index c3c3d08..125b1aa 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -474,6 +474,8 @@ def _parse_samples(self, samples, samp_fmt, site): sampdat = [None] * nfields for i, vals in enumerate(sample.split(':')): + if i >= nfields: + break # short circuit the most common if samp_fmt._fields[i] == 'GT': diff --git a/vcf/test/issue_298.vcf b/vcf/test/issue_298.vcf new file mode 100644 index 0000000..7be57af --- /dev/null +++ b/vcf/test/issue_298.vcf @@ -0,0 +1,7 @@ +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 +chr1 123123123 . A C . . . . 1:2 +chr1 123123123 . A C . . . A 1:2 +chr1 123123123 . A C . . . A:B 1:2 +chr1 123123123 . A C . . . A:B:C 1:2 +chr1 123123123 . A C . . . :: 1:2 +chr1 123123123 . A C . . . -invalid!:B:C 1:2 diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py index f04c8b1..5b7c7db 100644 --- a/vcf/test/test_vcf.py +++ b/vcf/test/test_vcf.py @@ -1716,6 +1716,17 @@ def test_strelka(self): n = next(reader) assert n is not None +class TestIssue298(unittest.TestCase): + def test_issue_298(self): + records = list(vcf.Reader(fh('issue_298.vcf'))) + assert len(records) == 6 + + assert not records[0].samples + assert records[1].samples and tuple(records[1].samples[0].data) == (['1'],), records[1].samples + assert records[2].samples and tuple(records[2].samples[0].data) == (['1'], ['2']), records[2].samples + assert records[3].samples and tuple(records[3].samples[0].data) == (['1'], ['2'], None), records[3].samples + assert records[4].samples and tuple(records[4].samples[0].data) == (['1'], ['2'], None), records[4].samples + assert records[5].samples and tuple(records[5].samples[0].data) == (['1'], ['2'], None), records[5].samples suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutput)) @@ -1753,3 +1764,4 @@ def test_strelka(self): suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUncalledGenotypes)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStrelka)) suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBadInfoFields)) +suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue298))