From c3b01e746ea084c3720e8f6ba6eab14f87e900a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Eiras?= <joao.eiras@gmail.com>
Date: Mon, 13 Aug 2018 18:44:31 +0200
Subject: [PATCH] Issue 298: boundary checks for when number of columns in
 FORMAT != samples

Also, don't crash when FORMAT field names are not valid Python identifiers.
---
 vcf/model.py           |  2 +-
 vcf/parser.py          |  2 ++
 vcf/test/issue_298.vcf |  7 +++++++
 vcf/test/test_vcf.py   | 12 ++++++++++++
 4 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 vcf/test/issue_298.vcf

diff --git a/vcf/model.py b/vcf/model.py
index 34a4d17..14a5a88 100644
--- a/vcf/model.py
+++ b/vcf/model.py
@@ -683,7 +683,7 @@ def __repr__(self):
 def make_calldata_tuple(fields):
     """ Return a namedtuple for a given call format """
 
-    class CallData(collections.namedtuple('calldata', fields)):
+    class CallData(collections.namedtuple('calldata', fields, rename=True)):
         __slots__ = ()
 
         _types = []
diff --git a/vcf/parser.py b/vcf/parser.py
index c3c3d08..125b1aa 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -474,6 +474,8 @@ def _parse_samples(self, samples, samp_fmt, site):
             sampdat = [None] * nfields
 
             for i, vals in enumerate(sample.split(':')):
+                if i >= nfields:
+                    break
 
                 # short circuit the most common
                 if samp_fmt._fields[i] == 'GT':
diff --git a/vcf/test/issue_298.vcf b/vcf/test/issue_298.vcf
new file mode 100644
index 0000000..7be57af
--- /dev/null
+++ b/vcf/test/issue_298.vcf
@@ -0,0 +1,7 @@
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample1
+chr1	123123123	.	A	C	.	.	.	.	1:2
+chr1	123123123	.	A	C	.	.	.	A	1:2
+chr1	123123123	.	A	C	.	.	.	A:B	1:2
+chr1	123123123	.	A	C	.	.	.	A:B:C	1:2
+chr1	123123123	.	A	C	.	.	.	::	1:2
+chr1	123123123	.	A	C	.	.	.	-invalid!:B:C	1:2
diff --git a/vcf/test/test_vcf.py b/vcf/test/test_vcf.py
index f04c8b1..5b7c7db 100644
--- a/vcf/test/test_vcf.py
+++ b/vcf/test/test_vcf.py
@@ -1716,6 +1716,17 @@ def test_strelka(self):
         n = next(reader)
         assert n is not None
 
+class TestIssue298(unittest.TestCase):  # regression test for issue 298, driven by the issue_298.vcf fixture
+    def test_issue_298(self):
+        records = list(vcf.Reader(fh('issue_298.vcf')))
+        assert len(records) == 6
+        # FORMAT columns of the six fixture records, in order: '.', 'A', 'A:B', 'A:B:C', '::', '-invalid!:B:C'; the sample column is always '1:2'.
+        assert not records[0].samples  # FORMAT is '.', so no sample calls are expected
+        assert records[1].samples and tuple(records[1].samples[0].data) == (['1'],), records[1].samples
+        assert records[2].samples and tuple(records[2].samples[0].data) == (['1'], ['2']), records[2].samples
+        assert records[3].samples and tuple(records[3].samples[0].data) == (['1'], ['2'], None), records[3].samples
+        assert records[4].samples and tuple(records[4].samples[0].data) == (['1'], ['2'], None), records[4].samples
+        assert records[5].samples and tuple(records[5].samples[0].data) == (['1'], ['2'], None), records[5].samples
 
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestVcfSpecs))
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutput))
@@ -1753,3 +1764,4 @@ def test_strelka(self):
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUncalledGenotypes))
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStrelka))
 suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestBadInfoFields))
+suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestIssue298))