From 7a1b5c9f5ba22b0c59cece74944f51e0b17b6082 Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Mon, 29 Jul 2024 17:59:20 -0700
Subject: [PATCH 1/9] Parsing the reference object

---
 pipeline/clinvar/clinVarParse.py   |  22 ++++-
 pipeline/clinvar/clinvar_common.py | 129 ++++++++++-------------------
 2 files changed, 63 insertions(+), 88 deletions(-)

diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py
index 26dac400c..e46cf2216 100755
--- a/pipeline/clinvar/clinVarParse.py
+++ b/pipeline/clinvar/clinVarParse.py
@@ -67,6 +67,26 @@ def processSubmission(submissionSet, assembly):
                                      str(ra.condition_value),
                                      ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id),
                                      str(synonyms))))
+                    #print("\t".join((str(hgvs),
+                    #                 oa.submitter,
+                    #                 str(oa.clinicalSignificance),
+                    #                 str(oa.dateLastUpdated),
+                    #                 str(oa.dateSignificanceLastEvaluated),
+                    #                 str(oa.accession),
+                    #                 str(oa.accession_version),
+                    #                 str(oa.id),
+                    #                 str(oa.origin),
+                    #                 str(oa.method),
+                    #                 str(vcf_var).replace('g.', ''),
+                    #                 str(variant.geneSymbol),
+                    #                 str(proteinChange),
+                    #                 str(oa.description),
+                    #                 str(oa.summaryEvidence),
+                    #                 str(oa.reviewStatus),
+                    #                 str(ra.condition_type),
+                    #                 str(ra.condition_value),
+                    #                 ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id),
+                    #                 str(synonyms))))
 
 
 def _bases_only(seq):
@@ -87,7 +107,7 @@ def main():
 
 
     with open(args.clinVarXmlFilename) as inputFile:
-        for event, elem in ET.iterparse(input_fp, events=('start', 'end')):
+        for event, elem in ET.iterparse(inputFile, events=('start', 'end')):
             if event == 'end' and elem.tag == 'VariationArchive':
                 if clinvar.isCurrent(elem):
                     submissionSet = clinvar.clinVarSet(elem)
diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py
index 7014776a1..f13143f83 100644
--- a/pipeline/clinvar/clinvar_common.py
+++ b/pipeline/clinvar/clinvar_common.py
@@ -20,16 +20,18 @@ def textIfPresent(element, field):
     """Return the text associated with a field under the element, or
     None if the field is not present"""
     ff = element.find(field)
-    if ff == None or ff.text == None:
-        return None
-    else:
+    if ff is not None:
         return ff.text
+    elif field in element.attrib:
+        return element.attrib[field]
+    else:
+        return None
 
-
+    
 def processClinicalSignificanceElement(el, obj):
     if el != None:
         obj.reviewStatus = textIfPresent(el, "ReviewStatus")
-        obj.clinicalSignificance = textIfPresent(el, "Description")
+        obj.clinicalSignificance = textIfPresent(el, "Interpretation")
         obj.summaryEvidence = textIfPresent(el, "Comment")
         obj.dateSignificanceLastEvaluated = el.get('DateLastEvaluated', None)
     else:
@@ -39,12 +41,12 @@ def processClinicalSignificanceElement(el, obj):
         obj.dateSignificanceLastEvaluated = None
 
 
-def build_xpath_filter_for_cv_assertions(gene_symbols):
-    symbols_str = [ f'text()="{s}"' for s in gene_symbols]
-    symbols_pred = ' or '.join(symbols_str)
+#def build_xpath_filter_for_cv_assertions(gene_symbols):
+##    symbols_str = [ f'text()="{s}"' for s in gene_symbols] ### HERE
+#    symbols_pred = ' or '.join(symbols_str)
 
     # filter assertion if it contains a Symbol we are interested in
-    return f"ReferenceClinVarAssertion/MeasureSet/Measure/MeasureRelationship/Symbol/ElementValue[({symbols_pred}) and @Type=\"Preferred\"]"
+#    return f"ReferenceClinVarAssertion/MeasureSet/Measure/MeasureRelationship/Symbol/ElementValue[({symbols_pred}) and @Type=\"Preferred\"]"
 
 
 def extractSynonyms(el):
@@ -167,9 +169,9 @@ class variant:
     """The Measure set.  We are interested in the variants specifically,
     but measure sets can be other things as well, such as haplotypes"""
 
-    def __init__(self, element, name, id, debug=False):
+    def __init__(self, element, debug=False):
         self.element = element
-        self.id = id
+        self.id = element.get("AlleleID")
         if debug:
             print("Parsing variant", self.id)
         self.name = name
@@ -182,9 +184,9 @@ def __init__(self, element, name, id, debug=False):
         self.coordinates = extract_genomic_coordinates_from_measure(element)
 
         self.geneSymbol = None
-        symbols = element.findall("MeasureRelationship/Symbol")
+        symbols = element.findall("Gene")
         for symbol in symbols:
-            symbol_val = textIfPresent(symbol, "ElementValue")
+            symbol_val = textIfPresent(symbol, "Symbol")
             if symbol_val.startswith('BRCA'):
                 self.geneSymbol = symbol_val
 
@@ -195,76 +197,22 @@ class referenceAssertion:
 
     def __init__(self, element, debug=False):
         self.element = element
-        self.id = element.get("ID")
+        self.title = element.get("Title")
         if debug:
-            print("Parsing ReferenceClinVarAssertion", self.id)
+            print("Parsing ReferenceClinVarAssertion", self.title)
 
         processClinicalSignificanceElement(element.find(
-            "ClinicalSignificance"), self)
-
-
-        obs = element.find("ObservedIn")
-        if obs == None:
-            self.origin = None
-            self.ethnicity = None
-            self.geographicOrigin = None
-            self.age = None
-            self.gender = None
-            self.familyData = None
-            self.method = None
-        else:
-            sample = obs.find("Sample")
-            if sample != None:
-                self.origin = textIfPresent(sample, "Origin")
-                self.ethnicity = textIfPresent(sample, "Ethnicity")
-                self.geographicOrigin = textIfPresent(sample, "GeographicOrigin")
-                self.age = textIfPresent(sample, "Age")
-                self.gender = textIfPresent(sample, "Gender")
-                self.familyData = textIfPresent(sample, "FamilyData")
-            method = obs.find("Method")
-            if method != None:
-                self.method = textIfPresent(method, "MethodType")
-        self.variant = None
-        self.synonyms = []
-
-        measureSet = element.find("MeasureSet")
-        #if measureSet.get("Type") == "Variant":
-
-        if len(measureSet.findall("Measure")) > 1:
-            logging.warning("Assertion with ID " + str(self.id) + " has multiple measures. Taking first one.")
-        if len(measureSet.findall("Measure")) >= 1:
-            name = measureSet.find("Name")
-            if name == None:
-                variantName = None
-            else:
-                variantName = name.find("ElementValue").text
-            self.variant = variant(measureSet.find("Measure"), variantName,
-                                   measureSet.get("ID"),
-                                   debug=debug)
-
-        self.synonyms = extractSynonyms(element)
+            "Interpretation"), self)
 
         # extract condition
-        self.condition_type = None
         self.condition_value = None
         self.condition_db_id = None
-        traitSet = element.find("TraitSet")
+        traitSet = element.find("InterpretedConditionList")
         if traitSet != None:
-            self.condition_type = traitSet.attrib["Type"]
-            trait = traitSet.find("Trait")
+            trait = traitSet.find("InterpretedCondition")
             if trait != None:
-                names = trait.findall("Name")
-                if names != None and len(names) > 0:
-                    for name in names:
-                        ev = name.find("ElementValue")
-                        if ev != None and ev.attrib["Type"] == "Preferred":
-                            self.condition_value = textIfPresent(name, "ElementValue")
-                        break
-                xrefs = trait.findall("XRef")
-                if xrefs != None and len(xrefs) > 0:
-                    self.condition_db_id = []
-                    for xref in xrefs:
-                        self.condition_db_id.append(xref.attrib["DB"] + "_" + xref.attrib["ID"])
+                self.condition_db_id = trait.attrib["DB"] + "_" + trait.attrib["ID"]
+                self.condition_value = trait.text
 
 
 
@@ -323,24 +271,31 @@ class clinVarSet:
     one or more submissions ("SCV Accessions").
     """
 
-    def __init__(self, element, debug=False):
+    def __init__(self, element, debug=True):
         self.element = element
-        self.id = element.get("ID")
+        self.id = element.get("VariationID")
         if debug:
             print("Parsing ClinVarSet ID", self.id)
-        rcva = element.find("ReferenceClinVarAssertion")
-        if isCurrent(rcva):
-            self.referenceAssertion = referenceAssertion(rcva, debug=debug)
+        #
+        # Look for the RCVAccession object.  There should be exactly one.
+        rcva_found = False
+        for next_rcva in element.iter("RCVAccession"):
+            assert(rcva_found is False)
+            rcva = next_rcva
+            rcva_found = True
+        self.referenceAssertion = referenceAssertion(rcva, debug=debug)
+        sa = element.find("SimpleAllele")
+        #self.variant = variant(sa, debug=debug)
         self.otherAssertions = dict()
 
-        for item in element.findall("ClinVarAssertion"):
-            if isCurrent(item):
-                cva = clinVarAssertion(item)
-                accession = cva.accession
-                self.otherAssertions[accession] = cva
-
-        if self.referenceAssertion.variant:
-            self.referenceAssertion.hgvs_cdna = self.extract_hgvs_cdna(self.referenceAssertion.variant.name, element)
+        #for item in element.findall("ClinVarAssertion"):
+        #    if isCurrent(item):
+        #        cva = clinVarAssertion(item)
+        #        accession = cva.accession
+        #        self.otherAssertions[accession] = cva
+        #
+        #if self.referenceAssertion.variant:
+        #    self.referenceAssertion.hgvs_cdna = self.extract_hgvs_cdna(self.referenceAssertion.variant.name, element)
 
     def extract_hgvs_cdna(self, variant_name, clinvar_set_el):
         """

From 8cf64cf5cfa92bc0c095c229a19b9aec35d39d1f Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Mon, 29 Jul 2024 18:54:58 -0700
Subject: [PATCH 2/9] Set up the interpretation object

---
 pipeline/clinvar/clinVarParse.py   |  7 ++--
 pipeline/clinvar/clinvar_common.py | 62 +++++++++++++++++++++++-------
 2 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py
index e46cf2216..1beeef007 100755
--- a/pipeline/clinvar/clinVarParse.py
+++ b/pipeline/clinvar/clinVarParse.py
@@ -23,6 +23,7 @@ def printHeader():
 
 def processSubmission(submissionSet, assembly):
     ra = submissionSet.referenceAssertion
+    interpretation = submissionSet.interpretation
     variant = submissionSet.variant
 
     if variant is None:
@@ -63,9 +64,9 @@ def processSubmission(submissionSet, assembly):
                                      str(oa.description),
                                      str(oa.summaryEvidence),
                                      str(oa.reviewStatus),
-                                     str(ra.condition_type),
-                                     str(ra.condition_value),
-                                     ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id),
+                                     str(interpretation.condition_type),
+                                     str(interpretation.condition_value),
+                                     ",".join(interpretation.condition_db_id) if isinstance(interpretation.condition_db_id, list) else str(interpretation.condition_db_id),
                                      str(synonyms))))
                     #print("\t".join((str(hgvs),
                     #                 oa.submitter,
diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py
index f13143f83..6b2a3aa5a 100644
--- a/pipeline/clinvar/clinvar_common.py
+++ b/pipeline/clinvar/clinvar_common.py
@@ -27,7 +27,19 @@ def textIfPresent(element, field):
     else:
         return None
 
-    
+def findUniqueElement(name, parent):
+    """Find a child element directly or indirectly underneath this parent
+       element which should occur only once (i.e. there should be no other
+       elements of the same name).  Test this assumption and return
+       the child element"""
+    child_found = False
+    for next_child in parent.iter(name):
+        assert(child_found is False)
+        child = next_child
+        child_found = True
+    return(child)
+
+
 def processClinicalSignificanceElement(el, obj):
     if el != None:
         obj.reviewStatus = textIfPresent(el, "ReviewStatus")
@@ -204,16 +216,36 @@ def __init__(self, element, debug=False):
         processClinicalSignificanceElement(element.find(
             "Interpretation"), self)
 
-        # extract condition
+
+class interpretations:
+    """For gathering data on the trait.  This code expects only one trait"""
+
+    def __init__(self, element, debug=False):
+        self.element = element
+        
+        self.condition_type = None
         self.condition_value = None
         self.condition_db_id = None
-        traitSet = element.find("InterpretedConditionList")
-        if traitSet != None:
-            trait = traitSet.find("InterpretedCondition")
-            if trait != None:
-                self.condition_db_id = trait.attrib["DB"] + "_" + trait.attrib["ID"]
-                self.condition_value = trait.text
-
+        for interpretation in element.iter("Interpretation"):
+            if interpretation.attrib["Type"] == "Clinical significance"]:
+                for trait in interpretation.iter("Trait"):
+                    assert(self.condition_type is None)
+                    self.condition_type = trait.attrib("Type")
+                    for name in trait.iter("Name"):
+                        ev = name.find("ElementValue")
+                        if ev.attrib["Type"] == "Preferred":
+                            assert(self.condition_value is None)
+                            self.condition_value = ev.text
+                    for xref in trait.iter("XRef"):
+                        if self.condition_db_id is None:
+                            self.condition_db_id = (xref.attrib["DB"] + "_"
+                                                    + xref.attrib["ID"])
+                        else:
+                            c_db_id = list()
+                            c_db_id.append(self.condition_db_id)
+                            self.condition_db_id = c_db_id
+                            self.condition_db_id.append(xref.attrib["DB"] + \
+                                                        "_" + xref.attrib["ID"])
 
 
 class clinVarAssertion:
@@ -278,12 +310,14 @@ def __init__(self, element, debug=True):
             print("Parsing ClinVarSet ID", self.id)
         #
         # Look for the RCVAccession object.  There should be exactly one.
-        rcva_found = False
-        for next_rcva in element.iter("RCVAccession"):
-            assert(rcva_found is False)
-            rcva = next_rcva
-            rcva_found = True
+        rcva = findUniqueElement("RCVAccession", element)
         self.referenceAssertion = referenceAssertion(rcva, debug=debug)
+        
+        #
+        # Look for the Interpretations object.  There should be exactly one.
+        interp = findUniqueElement("Interpretations", element)
+        self.interpretations = interpretations(interp, debug=debug)
+        
         sa = element.find("SimpleAllele")
         #self.variant = variant(sa, debug=debug)
         self.otherAssertions = dict()

From f6fc9a90c407d33c2ee94b68803ed1edb3f5c609 Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Mon, 5 Aug 2024 12:54:11 -0700
Subject: [PATCH 3/9] Added HGVS parsing

---
 pipeline/clinvar/clinVarParse.py   |  10 +--
 pipeline/clinvar/clinvar_common.py | 105 ++++++++++++++++-------------
 2 files changed, 62 insertions(+), 53 deletions(-)

diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py
index 1beeef007..fb34fb44c 100755
--- a/pipeline/clinvar/clinVarParse.py
+++ b/pipeline/clinvar/clinVarParse.py
@@ -34,15 +34,11 @@ def processSubmission(submissionSet, assembly):
     debug = False
     for oa in list(submissionSet.otherAssertions.values()):
         if oa.origin != "somatic":
-            hgvs = ra.hgvs_cdna
-
-            proteinChange = None
-            if "HGVS, protein, RefSeq" in variant.attribute:
-                proteinChange = variant.attribute["HGVS, protein, RefSeq"]
+            hgvs = variant.hgvs_cdna
 
             if assembly in variant.coordinates:
 
-                synonyms = MULTI_VALUE_SEP.join(ra.synonyms + oa.synonyms)
+                synonyms = MULTI_VALUE_SEP.join(variant.synonyms + oa.synonyms)
 
                 vcf_var = variant.coordinates[assembly]
 
@@ -60,7 +56,7 @@ def processSubmission(submissionSet, assembly):
                                      str(oa.method),
                                      str(vcf_var).replace('g.', ''),
                                      str(variant.geneSymbol),
-                                     str(proteinChange),
+                                     str(variant.proteinChange),
                                      str(oa.description),
                                      str(oa.summaryEvidence),
                                      str(oa.reviewStatus),
diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py
index 6b2a3aa5a..9c4e6d28b 100644
--- a/pipeline/clinvar/clinvar_common.py
+++ b/pipeline/clinvar/clinvar_common.py
@@ -84,32 +84,25 @@ def extractSynonyms(el):
     return sy + sy_alt
 
 
-def extract_genomic_coordinates_from_measure(meas_el):
+def extract_genomic_coordinates_from_location(loc):
     """
-    meas_el: `xml` module object of a ClinVar `Measure` element
-
+    loc: `xml` module object of a ClinVar `Location` element
     returns: dictionary of assembly (str) to genomic coordinates (VCFVariant object)
     """
-
-    sequence_locations = meas_el.findall('SequenceLocation')
-
     coords = {}
-    for el in sequence_locations:
-        assembly = el.attrib['Assembly'] # GRCh38
-
-        if el.get('referenceAlleleVCF'):
+    for sl in loc.iter('SequenceLocation'):
+        assembly = sl.attrib['Assembly'] # GRCh38
+        if sl.get('referenceAlleleVCF'):
             coords[assembly] = variant_utils.VCFVariant(
-                int(el.get('Chr')),
-                int(el.get('positionVCF')),
-                el.get('referenceAlleleVCF'),
-                el.get('alternateAlleleVCF')
+                int(sl.attrib['Chr']),
+                int(sl.attrib['positionVCF']),
+                sl.attrib['referenceAlleleVCF'],
+                sl.attrib['alternateAlleleVCF']
             )
-
     # if no reference/alternate allele found, compute (assuming genomic coordinates
     # are either present for all assemblies or for none)
-    if not coords:
-        coords = _extract_genomic_coordinates_from_non_genomic_fields(meas_el)
-
+    #if not coords:
+    #    coords = _extract_genomic_coordinates_from_non_genomic_fields(meas_el)
     return coords
 
 
@@ -183,25 +176,43 @@ class variant:
 
     def __init__(self, element, debug=False):
         self.element = element
-        self.id = element.get("AlleleID")
+        self.id = element.attrib["AlleleID"]
         if debug:
             print("Parsing variant", self.id)
-        self.name = name
-
-        self.attribute = dict()
-        for attrs in element.findall("AttributeSet"):
-            for attrib in attrs.findall("Attribute"):
-                self.attribute[attrib.get("Type")] = attrib.text
-
-        self.coordinates = extract_genomic_coordinates_from_measure(element)
-
-        self.geneSymbol = None
-        symbols = element.findall("Gene")
-        for symbol in symbols:
-            symbol_val = textIfPresent(symbol, "Symbol")
-            if symbol_val.startswith('BRCA'):
-                self.geneSymbol = symbol_val
-
+        gene = findUniqueElement("Gene", element)
+        self.geneSymbol = gene.attrib["Symbol"]
+        location = element.find("Location")
+        self.coordinates = extract_genomic_coordinates_from_location(location)
+        self.proteinChange = None
+        pc = element.find("ProteinChange")
+        if pc is not None:
+            self.proteinChange = pc.text
+        self.synonyms = list()
+        self.hgvs_cdna = None
+        self.hgvs_protein = None
+        for hgvs in element.find("HGVSlist").iter("HGVS"):
+            pe = hgvs.find("ProteinExpression")
+            if pe:
+                proteinHgvs = pe.find("Expression").text
+                if debug:
+                    print("protein HGVS", proteinHgvs)
+                self.synonyms.append(proteinHgvs)
+            ne = hgvs.find("NucleotideExpression")
+            if ne:
+                nucleotideHgvs = ne.find("Expression").text
+                if debug:
+                    print("Nucleotide hgvs", nucleotideHgvs)
+                self.synonyms.append(nucleotideHgvs)
+                if "MANESelect" in ne.attrib:
+                    if ne.attrib["MANESelect"] == "true":
+                        self.hgvs_cdna = nucleotideHgvs
+                        if pe:
+                            self.hgvs_protein = proteinHgvs
+                        if debug:
+                            print("MANE Select HGVS cDNA ", nucleotideHgvs,
+                                  "protein", proteinHgvs)
+                
+                
 
 
 class referenceAssertion:
@@ -213,30 +224,30 @@ def __init__(self, element, debug=False):
         if debug:
             print("Parsing ReferenceClinVarAssertion", self.title)
 
-        processClinicalSignificanceElement(element.find(
-            "Interpretation"), self)
+        processClinicalSignificanceElement(element, self)
 
 
 class interpretations:
     """For gathering data on the trait.  This code expects only one trait"""
-
     def __init__(self, element, debug=False):
-        self.element = element
-        
+        self.element = element        
         self.condition_type = None
         self.condition_value = None
         self.condition_db_id = None
         for interpretation in element.iter("Interpretation"):
-            if interpretation.attrib["Type"] == "Clinical significance"]:
+            if interpretation.attrib["Type"] == "Clinical significance":
                 for trait in interpretation.iter("Trait"):
+                    print("found trait")
                     assert(self.condition_type is None)
-                    self.condition_type = trait.attrib("Type")
+                    self.condition_type = trait.attrib["Type"]
                     for name in trait.iter("Name"):
+                        print("Found name")
                         ev = name.find("ElementValue")
                         if ev.attrib["Type"] == "Preferred":
                             assert(self.condition_value is None)
                             self.condition_value = ev.text
                     for xref in trait.iter("XRef"):
+                        print("found xref")
                         if self.condition_db_id is None:
                             self.condition_db_id = (xref.attrib["DB"] + "_"
                                                     + xref.attrib["ID"])
@@ -250,7 +261,7 @@ def __init__(self, element, debug=False):
 
 class clinVarAssertion:
     """Class for representing one submission (i.e. one annotation of a
-    submitted variant"""
+dir(referen    submitted variant"""
 
     def __init__(self, element, debug=False):
         self.element = element
@@ -299,7 +310,8 @@ def __init__(self, element, debug=False):
 class clinVarSet:
     """Container class for a ClinVarSet record, which is a set of submissions
     that were submitted to ClinVar together.  In the ClinVar terminology,
-    each ClinVarSet is one aggregate record ("RCV Accession"), which contains
+    each ClinVarSet is one "SimpleAllele" with one aggregate record 
+   ("RCV Accession"), which contains
     one or more submissions ("SCV Accessions").
     """
 
@@ -308,6 +320,9 @@ def __init__(self, element, debug=True):
         self.id = element.get("VariationID")
         if debug:
             print("Parsing ClinVarSet ID", self.id)
+        sa = element.find("InterpretedRecord/SimpleAllele")
+        self.variant = variant(sa, debug=debug)
+
         #
         # Look for the RCVAccession object.  There should be exactly one.
         rcva = findUniqueElement("RCVAccession", element)
@@ -318,8 +333,6 @@ def __init__(self, element, debug=True):
         interp = findUniqueElement("Interpretations", element)
         self.interpretations = interpretations(interp, debug=debug)
         
-        sa = element.find("SimpleAllele")
-        #self.variant = variant(sa, debug=debug)
         self.otherAssertions = dict()
 
         #for item in element.findall("ClinVarAssertion"):

From 1b3c244c41f56e1a45cf3388c83700d9fa455b02 Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Mon, 5 Aug 2024 20:45:29 -0700
Subject: [PATCH 4/9] Made it through the first VariationRecord object!

---
 pipeline/clinvar/clinVarParse.py   |  17 ++--
 pipeline/clinvar/clinvar_common.py | 121 +++++++++++++----------------
 2 files changed, 61 insertions(+), 77 deletions(-)

diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py
index fb34fb44c..f4b8da221 100755
--- a/pipeline/clinvar/clinVarParse.py
+++ b/pipeline/clinvar/clinVarParse.py
@@ -33,13 +33,10 @@ def processSubmission(submissionSet, assembly):
 
     debug = False
     for oa in list(submissionSet.otherAssertions.values()):
-        if oa.origin != "somatic":
+        if not ("somatic" in oa.origin and length(oa.origin) == 1):
             hgvs = variant.hgvs_cdna
-
             if assembly in variant.coordinates:
-
-                synonyms = MULTI_VALUE_SEP.join(variant.synonyms + oa.synonyms)
-
+                synonyms = MULTI_VALUE_SEP.join(variant.synonyms)
                 vcf_var = variant.coordinates[assembly]
 
                 # Omit the variants that don't have any genomic start coordinate indicated.
@@ -52,12 +49,12 @@ def processSubmission(submissionSet, assembly):
                                      str(oa.accession),
                                      str(oa.accession_version),
                                      str(oa.id),
-                                     str(oa.origin),
-                                     str(oa.method),
-                                     str(vcf_var).replace('g.', ''),
+                                     ",".join(oa.origin),
+                                     ",".join(oa.method),
+                                     str(vcf_var).replace('g.', ''), #change
                                      str(variant.geneSymbol),
                                      str(variant.proteinChange),
-                                     str(oa.description),
+                                     ",".join(oa.description),
                                      str(oa.summaryEvidence),
                                      str(oa.reviewStatus),
                                      str(interpretation.condition_type),
@@ -107,7 +104,7 @@ def main():
         for event, elem in ET.iterparse(inputFile, events=('start', 'end')):
             if event == 'end' and elem.tag == 'VariationArchive':
                 if clinvar.isCurrent(elem):
-                    submissionSet = clinvar.clinVarSet(elem)
+                    submissionSet = clinvar.clinVarSet(elem, debug=True)
                     processSubmission(submissionSet, args.assembly)
                 elem.clear()
 
diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py
index 9c4e6d28b..7084c34a1 100644
--- a/pipeline/clinvar/clinvar_common.py
+++ b/pipeline/clinvar/clinvar_common.py
@@ -40,18 +40,6 @@ def findUniqueElement(name, parent):
     return(child)
 
 
-def processClinicalSignificanceElement(el, obj):
-    if el != None:
-        obj.reviewStatus = textIfPresent(el, "ReviewStatus")
-        obj.clinicalSignificance = textIfPresent(el, "Interpretation")
-        obj.summaryEvidence = textIfPresent(el, "Comment")
-        obj.dateSignificanceLastEvaluated = el.get('DateLastEvaluated', None)
-    else:
-        obj.reviewStatus = None
-        obj.clinicalSignificance = None
-        obj.summaryEvidence = None
-        obj.dateSignificanceLastEvaluated = None
-
 
 #def build_xpath_filter_for_cv_assertions(gene_symbols):
 ##    symbols_str = [ f'text()="{s}"' for s in gene_symbols] ### HERE
@@ -60,7 +48,6 @@ def processClinicalSignificanceElement(el, obj):
     # filter assertion if it contains a Symbol we are interested in
 #    return f"ReferenceClinVarAssertion/MeasureSet/Measure/MeasureRelationship/Symbol/ElementValue[({symbols_pred}) and @Type=\"Preferred\"]"
 
-
 def extractSynonyms(el):
     include_types = {'ProteinChange3LetterCode', 'ProteinChange1LetterCode',
                      'nucleotide change', 'protein change, historical'}
@@ -174,7 +161,7 @@ class variant:
     """The Measure set.  We are interested in the variants specifically,
     but measure sets can be other things as well, such as haplotypes"""
 
-    def __init__(self, element, debug=False):
+    def __init__(self, element, debug=True):
         self.element = element
         self.id = element.attrib["AlleleID"]
         if debug:
@@ -183,13 +170,12 @@ def __init__(self, element, debug=False):
         self.geneSymbol = gene.attrib["Symbol"]
         location = element.find("Location")
         self.coordinates = extract_genomic_coordinates_from_location(location)
+        self.hgvs_cdna = None
         self.proteinChange = None
-        pc = element.find("ProteinChange")
-        if pc is not None:
-            self.proteinChange = pc.text
         self.synonyms = list()
-        self.hgvs_cdna = None
-        self.hgvs_protein = None
+        pc = textIfPresent(element, "ProteinChange")
+        if pc is not None:
+            self.synonyms.append(pc)
         for hgvs in element.find("HGVSlist").iter("HGVS"):
             pe = hgvs.find("ProteinExpression")
             if pe:
@@ -207,7 +193,7 @@ def __init__(self, element, debug=False):
                     if ne.attrib["MANESelect"] == "true":
                         self.hgvs_cdna = nucleotideHgvs
                         if pe:
-                            self.hgvs_protein = proteinHgvs
+                            self.proteinChange = proteinHgvs
                         if debug:
                             print("MANE Select HGVS cDNA ", nucleotideHgvs,
                                   "protein", proteinHgvs)
@@ -223,11 +209,13 @@ def __init__(self, element, debug=False):
         self.title = element.get("Title")
         if debug:
             print("Parsing ReferenceClinVarAssertion", self.title)
-
-        processClinicalSignificanceElement(element, self)
+        self.reviewStatus = element.attrib["ReviewStatus"]
+        self.clinicalSignificance = element.attrib["Interpretation"]
+        self.dateSignificanceLastEvaluated = element.attrib["DateLastEvaluated"]
+        self.summaryDescription = None
 
 
-class interpretations:
+class interpretation:
     """For gathering data on the trait.  This code expects only one trait"""
     def __init__(self, element, debug=False):
         self.element = element        
@@ -237,17 +225,14 @@ def __init__(self, element, debug=False):
         for interpretation in element.iter("Interpretation"):
             if interpretation.attrib["Type"] == "Clinical significance":
                 for trait in interpretation.iter("Trait"):
-                    print("found trait")
                     assert(self.condition_type is None)
                     self.condition_type = trait.attrib["Type"]
                     for name in trait.iter("Name"):
-                        print("Found name")
                         ev = name.find("ElementValue")
                         if ev.attrib["Type"] == "Preferred":
                             assert(self.condition_value is None)
                             self.condition_value = ev.text
                     for xref in trait.iter("XRef"):
-                        print("found xref")
                         if self.condition_db_id is None:
                             self.condition_db_id = (xref.attrib["DB"] + "_"
                                                     + xref.attrib["ID"])
@@ -259,52 +244,57 @@ def __init__(self, element, debug=False):
                                                         "_" + xref.attrib["ID"])
 
 
-class clinVarAssertion:
+class clinicalAssertion:
     """Class for representing one submission (i.e. one annotation of a
 dir(referen    submitted variant"""
 
+    
+    
     def __init__(self, element, debug=False):
         self.element = element
         self.id = element.get("ID")
         if debug:
-            print("Parsing ClinVarAssertion", self.id)
-        cvsd = element.find("ClinVarSubmissionID")
-        if cvsd == None:
-            self.submitter = None
-            self.dateSubmitted = None
-        else:
-            self.submitter = cvsd.get("submitter", default=None)
-            self.dateSubmitted = cvsd.get("submitterDate")
+            print("Parsing ClinicalAssertion", self.id)
+        self.dateSubmitted = element.attrib["SubmissionDate"]
+        self.dateLastUpdated = element.attrib["DateLastUpdated"]
         cva = element.find("ClinVarAccession")
         if cva == None:
             self.accession = None
         else:
-            self.accession = cva.get("Acc", default=None)
-            self.accession_version = cva.get("Version", default=None)
-
-        self.origin = None
-        self.method = None
-        self.description = None
-        oi = element.find("ObservedIn")
-        if oi != None:
+            self.accession = cva.attrib["Accession"]
+            self.accession_version = cva.attrib["Version"]
+            self.submitter = cva.attrib["SubmitterName"]
+        self.reviewStatus = textIfPresent(element, "ReviewStatus")
+        interpretation = element.find("Interpretation")
+        self.dateSignificanceLastEvaluated = interpretation.attrib["DateLastEvaluated"]
+        self.clinicalSignificance = textIfPresent(interpretation,
+                                                  "Description")
+        self.summaryEvidence = textIfPresent(interpretation, "Comment")
+        oil = element.find("ObservedInList")
+        self.origin = list()
+        self.method = list()
+        self.description = list()
+        for oi in oil.iter("ObservedIn"):
             sample = oi.find("Sample")
             if sample != None:
-                self.origin = textIfPresent(sample, "Origin")
+                origin = textIfPresent(sample, "Origin")
+                if not origin in self.origin:
+                    self.origin.append(origin)
             method = oi.find("Method")
             if method != None:
-                self.method = textIfPresent(method, "MethodType")
-            description = oi.find("ObservedData")
-            if description != None:
-                for attr in description.findall("Attribute"):
+                newMethod = textIfPresent(method, "MethodType")
+                if not newMethod in self.method:
+                    self.method.append(newMethod)
+            od = oi.find("ObservedData")
+            if od != None:
+                for attr in od.iter("Attribute"):
                     if attr.attrib["Type"] == 'Description':
-                        self.description = textIfPresent(description, "Attribute")
-
-        processClinicalSignificanceElement(element.find(
-            "ClinicalSignificance"), self)
+                        newDescription = textIfPresent(od, "Attribute")
+                    else:
+                        newDescription = "None"
+                    if not newDescription in self.description:
+                        self.description.append(newDescription)
 
-        self.dateLastUpdated = cva.get("DateUpdated")
-
-        self.synonyms = extractSynonyms(element)
 
 
 class clinVarSet:
@@ -321,29 +311,26 @@ def __init__(self, element, debug=True):
         if debug:
             print("Parsing ClinVarSet ID", self.id)
         sa = element.find("InterpretedRecord/SimpleAllele")
-        self.variant = variant(sa, debug=debug)
+        self.variant = variant(sa, debug=False)
 
         #
         # Look for the RCVAccession object.  There should be exactly one.
         rcva = findUniqueElement("RCVAccession", element)
-        self.referenceAssertion = referenceAssertion(rcva, debug=debug)
+        self.referenceAssertion = referenceAssertion(rcva, debug=False)
         
         #
         # Look for the Interpretations object.  There should be exactly one.
         interp = findUniqueElement("Interpretations", element)
-        self.interpretations = interpretations(interp, debug=debug)
+        self.interpretation = interpretation(interp, debug=debug)
         
         self.otherAssertions = dict()
 
-        #for item in element.findall("ClinVarAssertion"):
-        #    if isCurrent(item):
-        #        cva = clinVarAssertion(item)
-        #        accession = cva.accession
-        #        self.otherAssertions[accession] = cva
-        #
-        #if self.referenceAssertion.variant:
-        #    self.referenceAssertion.hgvs_cdna = self.extract_hgvs_cdna(self.referenceAssertion.variant.name, element)
-
+        for item in element.iter("ClinicalAssertion"):
+            if isCurrent(item):
+                cva = clinicalAssertion(item)
+                accession = cva.accession
+                self.otherAssertions[accession] = cva
+        
     def extract_hgvs_cdna(self, variant_name, clinvar_set_el):
         """
         Finds a HGVS CDNA representation of a variant within a ClinVarSet.

From 6e7cae279dacf9d8ca1ccf3394691d83e8cbf464 Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Tue, 6 Aug 2024 18:35:29 -0700
Subject: [PATCH 5/9] Redid the XML parsing to use text

---
 pipeline/clinvar/filter_clinvar.py | 56 ++++++++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 pipeline/clinvar/filter_clinvar.py

diff --git a/pipeline/clinvar/filter_clinvar.py b/pipeline/clinvar/filter_clinvar.py
old mode 100644
new mode 100755
index bbe9d6749..bdbde29f4
--- a/pipeline/clinvar/filter_clinvar.py
+++ b/pipeline/clinvar/filter_clinvar.py
@@ -1,11 +1,60 @@
 #!/usr/bin/env python
+"""
 
+filter_clinvar.py: Filter the ClinVar VCV Release XML file, and build a subset
+XML file containing the VariationArchive records for the genes of interest.
+
+This file filters the XML by reading the XML file as text, assembling text 
+objects of the VariationArchive records, and then parsing those records as XML
+to see if they specify the target gene symbols.  It's done this way because:
+- reading the whole file into one big ElementTree XML object doesn't work 
+  because it's too large and we run out of memory
+- reading the file iteratively, with ElementTree iterparse hasn't worked, 
+  because the ClinVar archive contains some badly-formatted XML records.
+  By not parsing the XML as XML until we know that we're looking at an object
+  that we want, we can wager that the broken XML records are for other genes.
+"""
 import argparse
 import gzip
+import io
 import xml.etree.ElementTree as ET
 
+def contains_target_gene_symbol(variationArchiveElement, target_symbol_values):
+    root = ET.fromstring(variationArchiveElement)
+    geneList = root.find(".//GeneList")
+    if geneList:
+        for geneElement in geneList.iter("Gene"):
+            if "Symbol" in geneElement.attrib:
+                if geneElement.attrib["Symbol"] in target_symbol_values:
+                    return(True)
+    return(False)
+
+
+
+    
 def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values,
                                chunk_size=1024):
+    in_variation_archive = False
+    in_header = True
+    for line in input_fp:
+        if in_header:
+            output_fp.write(line)
+        if '<VariationArchive' in line:
+            in_variation_archive = True
+            in_header = False
+            variation_archive = io.StringIO()
+        if in_variation_archive:
+            variation_archive.write(line)
+        if '</VariationArchive>' in line:
+            in_variation_archive = False
+            xml_string = variation_archive.getvalue()
+            variation_archive.close()
+            if contains_target_gene_symbol(xml_string,
+                                           target_symbol_values):
+                output_fp.write(xml_string)
+
+
+def old(input_fp, output_fp, target_symbol_values,  chunk_size=1024):
     for event, elem in ET.iterparse(input_fp, events=('start', 'end')):
         if event == 'start' and elem.tag == "ClnVarVariationRelease":
             output_FP.write(ET.tostring(elem, encoding='UTF-8'))
@@ -16,7 +65,6 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values,
                     if gene_element.attrib["Symbol"] in target_symbol_values:
                         output_fp.write(ET.tostring(elem, encoding='UTF-8'))
             elem.clear()
-    output_fp.write("</ClinVarVariationRelease>".encode('UTF-8'))
 
 
 def main():
@@ -26,11 +74,13 @@ def main():
     parser.add_argument('-g', "--gene", action='append')  
     args = parser.parse_args()
     if args.input.endswith('gz'):
-        input_fp = gzip.open(args.input)
+        input_fp = io.TextIOWrapper(gzip.open(args.input, 'rb'),
+                                    encoding='utf-8')
     else:
         input_fp = open(args.input, 'r')
-    output_fp = open(args.output, 'wb')
+    output_fp = open(args.output, 'w')
     filter_xml_for_gene_symbol(input_fp, output_fp, args.gene)
+    output_fp.write("</ClinVarVariationRelease>".encode('UTF-8'))
 
 if __name__ == "__main__":
     main()

From 325c5935deaca22ffa0a39c8f59920d81dc758be Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Wed, 7 Aug 2024 17:17:21 -0700
Subject: [PATCH 6/9] Fixed a bug that led to the first line of the first
 VariationArchive record being included erroneously

---
 pipeline/clinvar/filter_clinvar.py | 31 +++---------------------------
 1 file changed, 3 insertions(+), 28 deletions(-)

diff --git a/pipeline/clinvar/filter_clinvar.py b/pipeline/clinvar/filter_clinvar.py
index bdbde29f4..78addc104 100755
--- a/pipeline/clinvar/filter_clinvar.py
+++ b/pipeline/clinvar/filter_clinvar.py
@@ -1,19 +1,5 @@
 #!/usr/bin/env python
-"""
 
-filter_clinvar.py: Filter the ClinVar VCV Release XML file, and build a subset
-XML file containing the VariationArchive records for the genes of interest.
-
-This file filters the XML by reading the XML file as text, assembling text 
-objects of the VariationArchive records, and then parsing those records as XML
-to see if they specify the target gene symbols.  It's done this way because:
-- reading the whole file into one big ElementTree XML object doesn't work 
-  because it's too large and we run out of memory
-- reading the file iteratively, with ElementTree iterparse hasn't worked, 
-  because the ClinVar archive contains some badly-formatted XML records.
-  By not parsing the XML as XML until we know that we're looking at an object
-  that we want, we can wager that the broken XML records are for other genes.
-"""
 import argparse
 import gzip
 import io
@@ -37,12 +23,12 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values,
     in_variation_archive = False
     in_header = True
     for line in input_fp:
-        if in_header:
-            output_fp.write(line)
         if '<VariationArchive' in line:
             in_variation_archive = True
             in_header = False
             variation_archive = io.StringIO()
+        if in_header:
+            output_fp.write(line)
         if in_variation_archive:
             variation_archive.write(line)
         if '</VariationArchive>' in line:
@@ -54,17 +40,6 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values,
                 output_fp.write(xml_string)
 
 
-def old(input_fp, output_fp, target_symbol_values,  chunk_size=1024):
-    for event, elem in ET.iterparse(input_fp, events=('start', 'end')):
-        if event == 'start' and elem.tag == "ClnVarVariationRelease":
-            output_FP.write(ET.tostring(elem, encoding='UTF-8'))
-        if event == 'end' and elem.tag == 'VariationArchive':
-            gene_element = elem.find('.//Gene')
-            if gene_element is not None:
-                if "Symbol" in gene_element.attrib:
-                    if gene_element.attrib["Symbol"] in target_symbol_values:
-                        output_fp.write(ET.tostring(elem, encoding='UTF-8'))
-            elem.clear()
 
 
 def main():
@@ -80,7 +55,7 @@ def main():
         input_fp = open(args.input, 'r')
     output_fp = open(args.output, 'w')
     filter_xml_for_gene_symbol(input_fp, output_fp, args.gene)
-    output_fp.write("</ClinVarVariationRelease>".encode('UTF-8'))
+    output_fp.write("</ClinVarVariationRelease>")
 
 if __name__ == "__main__":
     main()

From b9f6dd7bcb77e933c7615d17b4857bfea9b599e2 Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Wed, 7 Aug 2024 17:18:30 -0700
Subject: [PATCH 7/9] Parsed the entire ClinVar record for the first time!

---
 pipeline/clinvar/clinVarParse.py   |  15 +--
 pipeline/clinvar/clinvar_common.py | 175 +++++++++++++++++------------
 2 files changed, 111 insertions(+), 79 deletions(-)

diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py
index f4b8da221..c0bb2f16f 100755
--- a/pipeline/clinvar/clinVarParse.py
+++ b/pipeline/clinvar/clinVarParse.py
@@ -23,7 +23,7 @@ def printHeader():
 
 def processSubmission(submissionSet, assembly):
     ra = submissionSet.referenceAssertion
-    interpretation = submissionSet.interpretation
+    classification = submissionSet.classification
     variant = submissionSet.variant
 
     if variant is None:
@@ -33,7 +33,7 @@ def processSubmission(submissionSet, assembly):
 
     debug = False
     for oa in list(submissionSet.otherAssertions.values()):
-        if not ("somatic" in oa.origin and length(oa.origin) == 1):
+        if not ("somatic" in oa.origin and len(oa.origin) == 1):
             hgvs = variant.hgvs_cdna
             if assembly in variant.coordinates:
                 synonyms = MULTI_VALUE_SEP.join(variant.synonyms)
@@ -57,9 +57,9 @@ def processSubmission(submissionSet, assembly):
                                      ",".join(oa.description),
                                      str(oa.summaryEvidence),
                                      str(oa.reviewStatus),
-                                     str(interpretation.condition_type),
-                                     str(interpretation.condition_value),
-                                     ",".join(interpretation.condition_db_id) if isinstance(interpretation.condition_db_id, list) else str(interpretation.condition_db_id),
+                                     str(classification.condition_type),
+                                     str(classification.condition_value),
+                                     ",".join(classification.condition_db_id),
                                      str(synonyms))))
                     #print("\t".join((str(hgvs),
                     #                 oa.submitter,
@@ -104,8 +104,9 @@ def main():
         for event, elem in ET.iterparse(inputFile, events=('start', 'end')):
             if event == 'end' and elem.tag == 'VariationArchive':
                 if clinvar.isCurrent(elem):
-                    submissionSet = clinvar.clinVarSet(elem, debug=True)
-                    processSubmission(submissionSet, args.assembly)
+                    submissionSet = clinvar.variationArchive(elem, debug=True)
+                    if submissionSet.valid:
+                        processSubmission(submissionSet, args.assembly)
                 elem.clear()
 
 if __name__ == "__main__":
diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py
index 7084c34a1..0e425353e 100644
--- a/pipeline/clinvar/clinvar_common.py
+++ b/pipeline/clinvar/clinvar_common.py
@@ -158,45 +158,63 @@ def __init__(self, element, useNone=False, debug=False):
 
 
 class variant:
-    """The Measure set.  We are interested in the variants specifically,
-    but measure sets can be other things as well, such as haplotypes"""
+    """The variant (SimpleAllele) set.  """
 
     def __init__(self, element, debug=True):
         self.element = element
-        self.id = element.attrib["AlleleID"]
+        self.valid = True
+        self.variantType = textIfPresent(element, "VariantType")
+        if self.variantType in ["Deletion", "Duplication", "Insertion"]:
+            self.valid = False
+            return
+        if "AlleleID" in element.attrib:
+            self.id = element.get("AlleleID")
+        else:
+            self.id = None
         if debug:
             print("Parsing variant", self.id)
-        gene = findUniqueElement("Gene", element)
-        self.geneSymbol = gene.attrib["Symbol"]
+        self.geneSymbol = None
+        for gene in element.iter("GeneList/Gene"):
+            geneSymbol = gene.get("Symbol")
+            if self.geneSymbol is None:
+                self.geneSymbol = geneSymbol
+            else:
+                self.geneSymbol = self.geneSymbol + "," + geneSymbol
         location = element.find("Location")
+        if location is None:
+            self.valid = False
+            return
         self.coordinates = extract_genomic_coordinates_from_location(location)
         self.hgvs_cdna = None
         self.proteinChange = None
         self.synonyms = list()
+        spdi = textIfPresent(element, "CanonicalSPDI")
         pc = textIfPresent(element, "ProteinChange")
         if pc is not None:
             self.synonyms.append(pc)
-        for hgvs in element.find("HGVSlist").iter("HGVS"):
-            pe = hgvs.find("ProteinExpression")
-            if pe:
-                proteinHgvs = pe.find("Expression").text
-                if debug:
-                    print("protein HGVS", proteinHgvs)
-                self.synonyms.append(proteinHgvs)
-            ne = hgvs.find("NucleotideExpression")
-            if ne:
-                nucleotideHgvs = ne.find("Expression").text
-                if debug:
-                    print("Nucleotide hgvs", nucleotideHgvs)
-                self.synonyms.append(nucleotideHgvs)
-                if "MANESelect" in ne.attrib:
-                    if ne.attrib["MANESelect"] == "true":
-                        self.hgvs_cdna = nucleotideHgvs
-                        if pe:
-                            self.proteinChange = proteinHgvs
-                        if debug:
-                            print("MANE Select HGVS cDNA ", nucleotideHgvs,
-                                  "protein", proteinHgvs)
+        if element.find("HGVSlist"):
+            for hgvs in element.find("HGVSlist").iter("HGVS"):
+                pe = hgvs.find("ProteinExpression")
+                if pe:
+                    proteinHgvs = pe.find("Expression").text
+                    if debug:
+                        print("protein HGVS", proteinHgvs)
+                    self.synonyms.append(proteinHgvs)
+                ne = hgvs.find("NucleotideExpression")
+                if ne:
+                    nucleotideHgvs = ne.find("Expression").text
+                    if debug:
+                        print("Nucleotide hgvs", nucleotideHgvs)
+                    self.synonyms.append(nucleotideHgvs)
+                    if "MANESelect" in ne.attrib:
+                        if ne.attrib["MANESelect"] == "true":
+                            self.hgvs_cdna = nucleotideHgvs
+                            if pe:
+                                self.proteinChange = proteinHgvs
+                                if debug:
+                                    print("MANE Select HGVS cDNA ",
+                                          nucleotideHgvs,
+                                          "protein", proteinHgvs)
                 
                 
 
@@ -206,42 +224,41 @@ class referenceAssertion:
 
     def __init__(self, element, debug=False):
         self.element = element
+        self.valid = True
         self.title = element.get("Title")
         if debug:
             print("Parsing ReferenceClinVarAssertion", self.title)
-        self.reviewStatus = element.attrib["ReviewStatus"]
-        self.clinicalSignificance = element.attrib["Interpretation"]
-        self.dateSignificanceLastEvaluated = element.attrib["DateLastEvaluated"]
+        gc = element.find(".//GermlineClassification")
+        if gc is None:
+            self.valid = False
+            return
+        self.reviewStatus = textIfPresent(gc, "ReviewStatus")
+        self.clinicalSignificance = textIfPresent(gc, "Description")
+        if self.clinicalSignificance is not None:
+            self.dateSignificanceLastEvaluated = gc.find("Description").get("DateLastEvaluated")
+        else:
+            self.dateSignificanceLastEvaluated = None
         self.summaryDescription = None
 
 
-class interpretation:
+class classification:
     """For gathering data on the trait.  This code expects only one trait"""
     def __init__(self, element, debug=False):
         self.element = element        
         self.condition_type = None
         self.condition_value = None
-        self.condition_db_id = None
-        for interpretation in element.iter("Interpretation"):
-            if interpretation.attrib["Type"] == "Clinical significance":
-                for trait in interpretation.iter("Trait"):
-                    assert(self.condition_type is None)
-                    self.condition_type = trait.attrib["Type"]
-                    for name in trait.iter("Name"):
-                        ev = name.find("ElementValue")
-                        if ev.attrib["Type"] == "Preferred":
-                            assert(self.condition_value is None)
-                            self.condition_value = ev.text
-                    for xref in trait.iter("XRef"):
-                        if self.condition_db_id is None:
-                            self.condition_db_id = (xref.attrib["DB"] + "_"
-                                                    + xref.attrib["ID"])
-                        else:
-                            c_db_id = list()
-                            c_db_id.append(self.condition_db_id)
-                            self.condition_db_id = c_db_id
-                            self.condition_db_id.append(xref.attrib["DB"] + \
-                                                        "_" + xref.attrib["ID"])
+        self.condition_db_id = list()
+        for trait in element.iter("Trait"):
+            self.condition_type = trait.get("Type")
+            for name in trait.iter("Name"):
+                ev = name.find("ElementValue")
+                if ev.get("Type") == "Preferred":
+                    self.condition_value = ev.text
+                for xref in trait.iter("XRef"):
+                    xref_string = xref.get("DB") + "_" + xref.get("ID")
+                    if not xref_string in self.condition_db_id:
+                        self.condition_db_id.append(xref_string)
+
 
 
 class clinicalAssertion:
@@ -255,21 +272,21 @@ def __init__(self, element, debug=False):
         self.id = element.get("ID")
         if debug:
             print("Parsing ClinicalAssertion", self.id)
-        self.dateSubmitted = element.attrib["SubmissionDate"]
-        self.dateLastUpdated = element.attrib["DateLastUpdated"]
+        self.dateSubmitted = element.get("SubmissionDate")
+        self.dateLastUpdated = element.get("DateLastUpdated")
         cva = element.find("ClinVarAccession")
         if cva == None:
             self.accession = None
         else:
-            self.accession = cva.attrib["Accession"]
-            self.accession_version = cva.attrib["Version"]
-            self.submitter = cva.attrib["SubmitterName"]
+            self.accession = cva.get("Accession")
+            self.accession_version = cva.get("Version")
+            self.submitter = cva.get("SubmitterName")
         self.reviewStatus = textIfPresent(element, "ReviewStatus")
-        interpretation = element.find("Interpretation")
-        self.dateSignificanceLastEvaluated = interpretation.attrib["DateLastEvaluated"]
-        self.clinicalSignificance = textIfPresent(interpretation,
-                                                  "Description")
-        self.summaryEvidence = textIfPresent(interpretation, "Comment")
+        classif = element.find("Classification")
+        self.dateSignificanceLastEvaluated = classif.get("DateLastEvaluated")
+        self.clinicalSignificance = textIfPresent(classif,
+                                                  "GermlineClassification")
+        self.summaryEvidence = textIfPresent(classif, "Comment")
         oil = element.find("ObservedInList")
         self.origin = list()
         self.method = list()
@@ -288,7 +305,7 @@ def __init__(self, element, debug=False):
             od = oi.find("ObservedData")
             if od != None:
                 for attr in od.iter("Attribute"):
-                    if attr.attrib["Type"] == 'Description':
+                    if attr.get("Type") == 'Description':
                         newDescription = textIfPresent(od, "Attribute")
                     else:
                         newDescription = "None"
@@ -297,8 +314,9 @@ def __init__(self, element, debug=False):
 
 
 
-class clinVarSet:
-    """Container class for a ClinVarSet record, which is a set of submissions
+class variationArchive:
+    """Container class for a variationArchive record, which is a set of 
+    submissions
     that were submitted to ClinVar together.  In the ClinVar terminology,
     each ClinVarSet is one "SimpleAllele" with one aggregate record 
    ("RCV Accession"), which contains
@@ -310,26 +328,39 @@ def __init__(self, element, debug=True):
         self.id = element.get("VariationID")
         if debug:
             print("Parsing ClinVarSet ID", self.id)
-        sa = element.find("InterpretedRecord/SimpleAllele")
+        self.valid = True
+        sa = element.find("./ClassifiedRecord/SimpleAllele")
+        if sa is None:
+            self.valid = False
+            return
         self.variant = variant(sa, debug=False)
+        if not self.variant.valid:
+            self.valid = False
+            return
 
         #
         # Look for the RCVAccession object.  There should be exactly one.
-        rcva = findUniqueElement("RCVAccession", element)
+        rcva = element.find("./ClassifiedRecord/RCVList/RCVAccession")
+        if rcva is None:
+            self.valid = False
+            return
         self.referenceAssertion = referenceAssertion(rcva, debug=False)
+        if not self.referenceAssertion.valid:
+            self.valid = False
+            return
         
         #
-        # Look for the Interpretations object.  There should be exactly one.
-        interp = findUniqueElement("Interpretations", element)
-        self.interpretation = interpretation(interp, debug=debug)
+        # Look for the GermlineClassification object.
+        cl = element.find("./ClassifiedRecord/Classifications")
+        self.classification = classification(cl, debug=debug)
         
         self.otherAssertions = dict()
 
         for item in element.iter("ClinicalAssertion"):
             if isCurrent(item):
-                cva = clinicalAssertion(item)
-                accession = cva.accession
-                self.otherAssertions[accession] = cva
+                ca = clinicalAssertion(item)
+                accession = ca.accession
+                self.otherAssertions[accession] = ca
         
     def extract_hgvs_cdna(self, variant_name, clinvar_set_el):
         """

From 4a8a9bf72af5377988c6c06e208c5f037185dcbf Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Wed, 21 Aug 2024 18:58:01 -0700
Subject: [PATCH 8/9] fixed the broken code for returning the gene symbol

---
 pipeline/clinvar/clinVarParse.py   |  2 +-
 pipeline/clinvar/clinvar_common.py | 21 ++++++++++-----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py
index c0bb2f16f..ae3e5e8fa 100755
--- a/pipeline/clinvar/clinVarParse.py
+++ b/pipeline/clinvar/clinVarParse.py
@@ -104,7 +104,7 @@ def main():
         for event, elem in ET.iterparse(inputFile, events=('start', 'end')):
             if event == 'end' and elem.tag == 'VariationArchive':
                 if clinvar.isCurrent(elem):
-                    submissionSet = clinvar.variationArchive(elem, debug=True)
+                    submissionSet = clinvar.variationArchive(elem, debug=False)
                     if submissionSet.valid:
                         processSubmission(submissionSet, args.assembly)
                 elem.clear()
diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py
index 0e425353e..a1a9bfa46 100644
--- a/pipeline/clinvar/clinvar_common.py
+++ b/pipeline/clinvar/clinvar_common.py
@@ -164,9 +164,6 @@ def __init__(self, element, debug=True):
         self.element = element
         self.valid = True
         self.variantType = textIfPresent(element, "VariantType")
-        if self.variantType in ["Deletion", "Duplication", "Insertion"]:
-            self.valid = False
-            return
         if "AlleleID" in element.attrib:
             self.id = element.get("AlleleID")
         else:
@@ -174,12 +171,14 @@ def __init__(self, element, debug=True):
         if debug:
             print("Parsing variant", self.id)
         self.geneSymbol = None
-        for gene in element.iter("GeneList/Gene"):
-            geneSymbol = gene.get("Symbol")
-            if self.geneSymbol is None:
-                self.geneSymbol = geneSymbol
-            else:
-                self.geneSymbol = self.geneSymbol + "," + geneSymbol
+        geneList = element.find("GeneList")
+        if geneList is not None:
+            for gene in geneList.iter("Gene"):
+                geneSymbol = gene.get("Symbol")
+                if self.geneSymbol is None:
+                    self.geneSymbol = geneSymbol
+                else:
+                    self.geneSymbol = self.geneSymbol + "," + geneSymbol
         location = element.find("Location")
         if location is None:
             self.valid = False
@@ -281,8 +280,8 @@ def __init__(self, element, debug=False):
             self.accession = cva.get("Accession")
             self.accession_version = cva.get("Version")
             self.submitter = cva.get("SubmitterName")
-        self.reviewStatus = textIfPresent(element, "ReviewStatus")
         classif = element.find("Classification")
+        self.reviewStatus = textIfPresent(classif, "ReviewStatus")
         self.dateSignificanceLastEvaluated = classif.get("DateLastEvaluated")
         self.clinicalSignificance = textIfPresent(classif,
                                                   "GermlineClassification")
@@ -327,7 +326,7 @@ def __init__(self, element, debug=True):
         self.element = element
         self.id = element.get("VariationID")
         if debug:
-            print("Parsing ClinVarSet ID", self.id)
+            print("DEBUG: Parsing ClinVarSet ID", self.id)
         self.valid = True
         sa = element.find("./ClassifiedRecord/SimpleAllele")
         if sa is None:

From 12ac6953d584545abb91f080c616cb34ba934349 Mon Sep 17 00:00:00 2001
From: melissacline <melissa.s.cline@gmail.com>
Date: Wed, 23 Oct 2024 17:11:58 -0700
Subject: [PATCH 9/9] Updated parsing for the new ClinVar structure

---
 pipeline/clinvar/clinVarParse.py   | 41 +++++++----------
 pipeline/clinvar/clinvar_common.py | 70 +++++++++++++++---------------
 pipeline/clinvar/filter_clinvar.py | 22 +++++++---
 pipeline/clinvar/test_clinvar.py   | 22 +++++-----
 4 files changed, 81 insertions(+), 74 deletions(-)

diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py
index ae3e5e8fa..7dc878e69 100755
--- a/pipeline/clinvar/clinVarParse.py
+++ b/pipeline/clinvar/clinVarParse.py
@@ -27,20 +27,26 @@ def processSubmission(submissionSet, assembly):
     variant = submissionSet.variant
 
     if variant is None:
-        logging.warning("No variant information could be extracted for ReferenceClinVarAssertion ID %s %s",
+        logging.warning("Warning","No variant information could be extracted for ReferenceClinVarAssertion ID %s %s",
                      submissionSet.referenceAssertion.id, [c.accession for c in submissionSet.otherAssertions.values()])
         return None
 
+    hgvs = submissionSet.name
     debug = False
     for oa in list(submissionSet.otherAssertions.values()):
-        if not ("somatic" in oa.origin and len(oa.origin) == 1):
-            hgvs = variant.hgvs_cdna
-            if assembly in variant.coordinates:
+        if ("somatic" in oa.origin and len(oa.origin) == 1):
+            logging.warning("HGVS %s because submissions are only somatic in origin", hgvs)
+        else:
+            if not assembly in variant.coordinates:
+                logging.warning("HGVS %s rejected for poor variant coordinate data", hgvs)
+            else:
                 synonyms = MULTI_VALUE_SEP.join(variant.synonyms)
                 vcf_var = variant.coordinates[assembly]
 
                 # Omit the variants that don't have any genomic start coordinate indicated.
-                if vcf_var and _bases_only(vcf_var.ref) and _bases_only(vcf_var.alt):
+                if not (vcf_var and _bases_only(vcf_var.ref) and _bases_only(vcf_var.alt)):
+                    logging.warning("HGVS %s rejected for poor VCF data", hgvs)
+                else:
                     print("\t".join((str(hgvs),
                                      oa.submitter,
                                      str(oa.clinicalSignificance),
@@ -61,26 +67,6 @@ def processSubmission(submissionSet, assembly):
                                      str(classification.condition_value),
                                      ",".join(classification.condition_db_id),
                                      str(synonyms))))
-                    #print("\t".join((str(hgvs),
-                    #                 oa.submitter,
-                    #                 str(oa.clinicalSignificance),
-                    #                 str(oa.dateLastUpdated),
-                    #                 str(oa.dateSignificanceLastEvaluated),
-                    #                 str(oa.accession),
-                    #                 str(oa.accession_version),
-                    #                 str(oa.id),
-                    #                 str(oa.origin),
-                    #                 str(oa.method),
-                    #                 str(vcf_var).replace('g.', ''),
-                    #                 str(variant.geneSymbol),
-                    #                 str(proteinChange),
-                    #                 str(oa.description),
-                    #                 str(oa.summaryEvidence),
-                    #                 str(oa.reviewStatus),
-                    #                 str(ra.condition_type),
-                    #                 str(ra.condition_value),
-                    #                 ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id),
-                    #                 str(synonyms))))
 
 
 def _bases_only(seq):
@@ -107,6 +93,11 @@ def main():
                     submissionSet = clinvar.variationArchive(elem, debug=False)
                     if submissionSet.valid:
                         processSubmission(submissionSet, args.assembly)
+                    else:
+                        if hasattr(submissionSet, 'name'):
+                            logging.warning("Submission %s not valid", submissionSet.name)
+                        else:
+                            logging.warning("Submission not valid, name not available")
                 elem.clear()
 
 if __name__ == "__main__":
diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py
index a1a9bfa46..4008e3ebe 100644
--- a/pipeline/clinvar/clinvar_common.py
+++ b/pipeline/clinvar/clinvar_common.py
@@ -6,6 +6,7 @@
 from common import hgvs_utils, variant_utils
 from hgvs.exceptions import HGVSError
 import hgvs
+import html
 import logging
 
 def isCurrent(element):
@@ -192,14 +193,14 @@ def __init__(self, element, debug=True):
         if pc is not None:
             self.synonyms.append(pc)
         if element.find("HGVSlist"):
-            for hgvs in element.find("HGVSlist").iter("HGVS"):
-                pe = hgvs.find("ProteinExpression")
+            for hgvsObj in element.find("HGVSlist").iter("HGVS"):
+                pe = hgvsObj.find("ProteinExpression")
                 if pe:
                     proteinHgvs = pe.find("Expression").text
                     if debug:
                         print("protein HGVS", proteinHgvs)
                     self.synonyms.append(proteinHgvs)
-                ne = hgvs.find("NucleotideExpression")
+                ne = hgvsObj.find("NucleotideExpression")
                 if ne:
                     nucleotideHgvs = ne.find("Expression").text
                     if debug:
@@ -254,9 +255,10 @@ def __init__(self, element, debug=False):
                 if ev.get("Type") == "Preferred":
                     self.condition_value = ev.text
                 for xref in trait.iter("XRef"):
-                    xref_string = xref.get("DB") + "_" + xref.get("ID")
-                    if not xref_string in self.condition_db_id:
-                        self.condition_db_id.append(xref_string)
+                    if not re.search("Genetic Testing Registry", xref.get("DB")):
+                        xref_string = xref.get("DB") + "_" + xref.get("ID")
+                        if not xref_string in self.condition_db_id:
+                            self.condition_db_id.append(xref_string)
 
 
 
@@ -336,7 +338,14 @@ def __init__(self, element, debug=True):
         if not self.variant.valid:
             self.valid = False
             return
-
+        #
+        # Find an HGVS variant name, either by taking the VariationName object of this element and removing the
+        # gene name in the parens, or if that name doesn't match an HGVS expression, look for a viable HGVS expression
+        # amoung the synonyms
+        fullname = re.sub(r'\([^)]*\)', '', html.unescape(element.get("VariationName")))
+        selected_hgvs = extract_hgvs_cdna(fullname, self.variant.synonyms)
+        self.name = re.sub(r'^\s+|\s+$', '', selected_hgvs)
+                                      
         #
         # Look for the RCVAccession object.  There should be exactly one.
         rcva = element.find("./ClassifiedRecord/RCVList/RCVAccession")
@@ -361,30 +370,23 @@ def __init__(self, element, debug=True):
                 accession = ca.accession
                 self.otherAssertions[accession] = ca
         
-    def extract_hgvs_cdna(self, variant_name, clinvar_set_el):
-        """
-        Finds a HGVS CDNA representation of a variant within a ClinVarSet.
-        If possible, avoid repeat representations using the "[]" synatax, since
-        we are currently not able to handle it further downstream (https://github.com/biocommons/hgvs/issues/113)
-
-        :param variant_name: variant name from title
-        :param clinvar_set_el: clinvar set element
-        :return: HGVS CDNA representation as string
-        """
-        hgvs_cand = re.sub(r"\(" + "(BRCA[1|2])" + r"\)",
-                      "", variant_name.split()[0])
-
-        # only take variants starting with NM_ and not containing []
-        hgvs_cdna_re = r'NM_.*:[^\[]*$'
-
-        if not re.match(hgvs_cdna_re, hgvs_cand):
-            # taking Attribute of 'HGVS', 'HGVS, coding' or 'HGVS, coding, RefSeq' in
-            # both ReferenceClinVarAssertion and ClinVarAssertion's
-            hgvs_candidates = [ev.text for ev in clinvar_set_el.findall('.//Measure/AttributeSet/Attribute')
-                               if ev.attrib['Type'].startswith('HGVS, coding') or ev.attrib['Type'] == 'HGVS']
-
-            filtered = [s for s in hgvs_candidates if re.match(hgvs_cdna_re, s)]
-            if filtered:
-                return filtered[0]
-
-        return hgvs_cand
+def extract_hgvs_cdna(variant_name, synonym_list):
+    """
+    Finds a HGVS CDNA representation of a variant within a ClinVarSet.
+    If possible, avoid repeat representations using the "[]" synatax, since
+    we are currently not able to handle it further downstream (https://github.com/biocommons/hgvs/issues/113)
+
+    :param variant_name: variant name from title
+    :param synonym_list: synonyms extracted from variant object
+    :return: HGVS CDNA representation as string
+    """
+
+    # only take variants starting with NM_ and not containing []
+    hgvs_cdna_re = r'NM_.*:[^\[]*$'
+    if not re.match(hgvs_cdna_re, variant_name):
+        # check for anything in the synonym list that matches an HGVS expression
+        filtered = [s for s in synonym_list if re.match(hgvs_cdna_re, s)]
+        if filtered:
+            return filtered[0]
+
+    return variant_name
diff --git a/pipeline/clinvar/filter_clinvar.py b/pipeline/clinvar/filter_clinvar.py
index 78addc104..5aa0f4b7b 100755
--- a/pipeline/clinvar/filter_clinvar.py
+++ b/pipeline/clinvar/filter_clinvar.py
@@ -3,16 +3,24 @@
 import argparse
 import gzip
 import io
+import re
 import xml.etree.ElementTree as ET
 
-def contains_target_gene_symbol(variationArchiveElement, target_symbol_values):
+def contains_target_gene_symbol(variationArchiveElement,
+                                target_symbol_values, debug=True):
     root = ET.fromstring(variationArchiveElement)
+    if debug:
+        print("testing", root.get("VariationName"))
     geneList = root.find(".//GeneList")
     if geneList:
         for geneElement in geneList.iter("Gene"):
             if "Symbol" in geneElement.attrib:
                 if geneElement.attrib["Symbol"] in target_symbol_values:
+                    if debug:
+                        print("Saving")
                     return(True)
+    if debug:
+        print("rejecting")
     return(False)
 
 
@@ -23,7 +31,9 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values,
     in_variation_archive = False
     in_header = True
     for line in input_fp:
-        if '<VariationArchive' in line:
+        if re.search("NM_000059.4(BRCA2):c.193C>T", line):
+            print("Found it!", line)
+        if re.search('<VariationArchive', line):
             in_variation_archive = True
             in_header = False
             variation_archive = io.StringIO()
@@ -31,12 +41,12 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values,
             output_fp.write(line)
         if in_variation_archive:
             variation_archive.write(line)
-        if '</VariationArchive>' in line:
+        if re.search('</VariationArchive>', line):
             in_variation_archive = False
             xml_string = variation_archive.getvalue()
             variation_archive.close()
             if contains_target_gene_symbol(xml_string,
-                                           target_symbol_values):
+                                           target_symbol_values, debug=True):
                 output_fp.write(xml_string)
 
 
@@ -46,7 +56,9 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('-i', "--input")
     parser.add_argument('-o', "--output")
-    parser.add_argument('-g', "--gene", action='append')  
+    parser.add_argument('-g', "--gene", action='append')
+    parser.add_argument('-d', "--debug", action='store_true')
+    parser.set_defaults(debug=False)
     args = parser.parse_args()
     if args.input.endswith('gz'):
         input_fp = io.TextIOWrapper(gzip.open(args.input, 'rb'),
diff --git a/pipeline/clinvar/test_clinvar.py b/pipeline/clinvar/test_clinvar.py
index 1c25a38f4..f9e998585 100644
--- a/pipeline/clinvar/test_clinvar.py
+++ b/pipeline/clinvar/test_clinvar.py
@@ -5,21 +5,23 @@
 
 
 def test_simple_genomic_coordinate_extraction():
-    sample_measure = """
-      <Measure Type="Deletion" ID="24358">
-        <Name>
-          <ElementValue Type="Preferred">NM_000059.3(BRCA2):c.6591_6592del (p.Glu2198fs)</ElementValue>
-        </Name>
-        <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="13" Accession="NC_000013.11" start="32340946" stop="32340947" display_start="32340946" display_stop="32340947" variantLength="2" positionVCF="32340945" referenceAlleleVCF="CTG" alternateAlleleVCF="C"/>
-        <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="13" Accession="NC_000013.10" start="32915083" stop="32915084" display_start="32915083" display_stop="32915084" variantLength="2" positionVCF="32915082" referenceAlleleVCF="CTG" alternateAlleleVCF="C"/>
-     </Measure>
+    sample_location = """
+      <Location>
+        <CytogeneticLocation>13q13.1</CytogeneticLocation>
+        <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" forDisplay="true" AssemblyStatus="cur\
+rent" Chr="13" Accession="NC_000013.11" start="32345245" stop="32345245" display_start="32345245" display_stop="32345245" var\
+iantLength="1" positionVCF="32345245" referenceAlleleVCF="A" alternateAlleleVCF="G"/>
+        <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="13" Ac\
+cession="NC_000013.10" start="32919382" stop="32919382" display_start="32919382" display_stop="32919382" variantLength="1" po\
+sitionVCF="32919382" referenceAlleleVCF="A" alternateAlleleVCF="G"/>
+      </Location>
     """
 
     measure_el = ET.fromstring(sample_measure)
 
-    genomic_coords = clinvar_common.extract_genomic_coordinates_from_measure(measure_el)
+    genomic_coords = clinvar_common.extract_genomic_coordinates_from_location(location_el)
 
-    assert genomic_coords[hgvs_utils.HgvsWrapper.GRCh38_Assem] == variant_utils.VCFVariant(13, 32340945, "CTG", "C")
+    assert genomic_coords[hgvs_utils.HgvsWrapper.GRCh38_Assem] == variant_utils.VCFVariant(13, 32345245, "A", "G")
 
 
 def test_genomic_coordinate_extraction_from_NM():