From 7a1b5c9f5ba22b0c59cece74944f51e0b17b6082 Mon Sep 17 00:00:00 2001 From: melissacline Date: Mon, 29 Jul 2024 17:59:20 -0700 Subject: [PATCH 1/9] Parsing the reference object --- pipeline/clinvar/clinVarParse.py | 22 ++++- pipeline/clinvar/clinvar_common.py | 129 ++++++++++------------------- 2 files changed, 63 insertions(+), 88 deletions(-) diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py index 26dac400c..e46cf2216 100755 --- a/pipeline/clinvar/clinVarParse.py +++ b/pipeline/clinvar/clinVarParse.py @@ -67,6 +67,26 @@ def processSubmission(submissionSet, assembly): str(ra.condition_value), ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id), str(synonyms)))) + #print("\t".join((str(hgvs), + # oa.submitter, + # str(oa.clinicalSignificance), + # str(oa.dateLastUpdated), + # str(oa.dateSignificanceLastEvaluated), + # str(oa.accession), + # str(oa.accession_version), + # str(oa.id), + # str(oa.origin), + # str(oa.method), + # str(vcf_var).replace('g.', ''), + # str(variant.geneSymbol), + # str(proteinChange), + # str(oa.description), + # str(oa.summaryEvidence), + # str(oa.reviewStatus), + # str(ra.condition_type), + # str(ra.condition_value), + # ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id), + # str(synonyms)))) def _bases_only(seq): @@ -87,7 +107,7 @@ def main(): with open(args.clinVarXmlFilename) as inputFile: - for event, elem in ET.iterparse(input_fp, events=('start', 'end')): + for event, elem in ET.iterparse(inputFile, events=('start', 'end')): if event == 'end' and elem.tag == 'VariationArchive': if clinvar.isCurrent(elem): submissionSet = clinvar.clinVarSet(elem) diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py index 7014776a1..f13143f83 100644 --- a/pipeline/clinvar/clinvar_common.py +++ b/pipeline/clinvar/clinvar_common.py @@ -20,16 +20,18 @@ def textIfPresent(element, field): """Return the text associated with a field under the element, or None if the field is not present""" ff = element.find(field) - if ff == None or ff.text == None: - return None - else: + if ff is not None: return ff.text + elif field in element.attrib: + return element.attrib[field] + else: + return None - + def processClinicalSignificanceElement(el, obj): if el != None: obj.reviewStatus = textIfPresent(el, "ReviewStatus") - obj.clinicalSignificance = textIfPresent(el, "Description") + obj.clinicalSignificance = textIfPresent(el, "Interpretation") obj.summaryEvidence = textIfPresent(el, "Comment") obj.dateSignificanceLastEvaluated = el.get('DateLastEvaluated', None) else: @@ -39,12 +41,12 @@ def processClinicalSignificanceElement(el, obj): obj.dateSignificanceLastEvaluated = None -def build_xpath_filter_for_cv_assertions(gene_symbols): - symbols_str = [ f'text()="{s}"' for s in gene_symbols] - symbols_pred = ' or '.join(symbols_str) +#def build_xpath_filter_for_cv_assertions(gene_symbols): +## symbols_str = [ f'text()="{s}"' for s in gene_symbols] ### HERE +# symbols_pred = ' or '.join(symbols_str) # filter assertion if it contains a Symbol we are interested in - return f"ReferenceClinVarAssertion/MeasureSet/Measure/MeasureRelationship/Symbol/ElementValue[({symbols_pred}) and @Type=\"Preferred\"]" +# return f"ReferenceClinVarAssertion/MeasureSet/Measure/MeasureRelationship/Symbol/ElementValue[({symbols_pred}) and @Type=\"Preferred\"]" def extractSynonyms(el): @@ -167,9 +169,9 @@ class variant: """The Measure set. We are interested in the variants specifically, but measure sets can be other things as well, such as haplotypes""" - def __init__(self, element, name, id, debug=False): + def __init__(self, element, debug=False): self.element = element - self.id = id + self.id = element.get("AlleleID") if debug: print("Parsing variant", self.id) self.name = name @@ -182,9 +184,9 @@ def __init__(self, element, name, id, debug=False): self.coordinates = extract_genomic_coordinates_from_measure(element) self.geneSymbol = None - symbols = element.findall("MeasureRelationship/Symbol") + symbols = element.findall("Gene") for symbol in symbols: - symbol_val = textIfPresent(symbol, "ElementValue") + symbol_val = textIfPresent(symbol, "Symbol") if symbol_val.startswith('BRCA'): self.geneSymbol = symbol_val @@ -195,76 +197,22 @@ class referenceAssertion: def __init__(self, element, debug=False): self.element = element - self.id = element.get("ID") + self.title = element.get("Title") if debug: - print("Parsing ReferenceClinVarAssertion", self.id) + print("Parsing ReferenceClinVarAssertion", self.title) processClinicalSignificanceElement(element.find( - "ClinicalSignificance"), self) - - - obs = element.find("ObservedIn") - if obs == None: - self.origin = None - self.ethnicity = None - self.geographicOrigin = None - self.age = None - self.gender = None - self.familyData = None - self.method = None - else: - sample = obs.find("Sample") - if sample != None: - self.origin = textIfPresent(sample, "Origin") - self.ethnicity = textIfPresent(sample, "Ethnicity") - self.geographicOrigin = textIfPresent(sample, "GeographicOrigin") - self.age = textIfPresent(sample, "Age") - self.gender = textIfPresent(sample, "Gender") - self.familyData = textIfPresent(sample, "FamilyData") - method = obs.find("Method") - if method != None: - self.method = textIfPresent(method, "MethodType") - self.variant = None - self.synonyms = [] - - measureSet = element.find("MeasureSet") - #if measureSet.get("Type") == "Variant": - - if len(measureSet.findall("Measure")) > 1: - logging.warning("Assertion with ID " + str(self.id) + " has multiple measures. Taking first one.") - if len(measureSet.findall("Measure")) >= 1: - name = measureSet.find("Name") - if name == None: - variantName = None - else: - variantName = name.find("ElementValue").text - self.variant = variant(measureSet.find("Measure"), variantName, - measureSet.get("ID"), - debug=debug) - - self.synonyms = extractSynonyms(element) + "Interpretation"), self) # extract condition - self.condition_type = None self.condition_value = None self.condition_db_id = None - traitSet = element.find("TraitSet") + traitSet = element.find("InterpretedConditionList") if traitSet != None: - self.condition_type = traitSet.attrib["Type"] - trait = traitSet.find("Trait") + trait = traitSet.find("InterpretedCondition") if trait != None: - names = trait.findall("Name") - if names != None and len(names) > 0: - for name in names: - ev = name.find("ElementValue") - if ev != None and ev.attrib["Type"] == "Preferred": - self.condition_value = textIfPresent(name, "ElementValue") - break - xrefs = trait.findall("XRef") - if xrefs != None and len(xrefs) > 0: - self.condition_db_id = [] - for xref in xrefs: - self.condition_db_id.append(xref.attrib["DB"] + "_" + xref.attrib["ID"]) + self.condition_db_id = trait.attrib["DB"] + "_" + trait.attrib["ID"] + self.condition_value = trait.text @@ -323,24 +271,31 @@ class clinVarSet: one or more submissions ("SCV Accessions"). """ - def __init__(self, element, debug=False): + def __init__(self, element, debug=True): self.element = element - self.id = element.get("ID") + self.id = element.get("VariationID") if debug: print("Parsing ClinVarSet ID", self.id) - rcva = element.find("ReferenceClinVarAssertion") - if isCurrent(rcva): - self.referenceAssertion = referenceAssertion(rcva, debug=debug) + # + # Look for the RCVAccession object. There should be exactly one. + rcva_found = False + for next_rcva in element.iter("RCVAccession"): + assert(rcva_found is False) + rcva = next_rcva + rcva_found = True + self.referenceAssertion = referenceAssertion(rcva, debug=debug) + sa = element.find("SimpleAllele") + #self.variant = variant(sa, debug=debug) self.otherAssertions = dict() - for item in element.findall("ClinVarAssertion"): - if isCurrent(item): - cva = clinVarAssertion(item) - accession = cva.accession - self.otherAssertions[accession] = cva - - if self.referenceAssertion.variant: - self.referenceAssertion.hgvs_cdna = self.extract_hgvs_cdna(self.referenceAssertion.variant.name, element) + #for item in element.findall("ClinVarAssertion"): + # if isCurrent(item): + # cva = clinVarAssertion(item) + # accession = cva.accession + # self.otherAssertions[accession] = cva + # + #if self.referenceAssertion.variant: + # self.referenceAssertion.hgvs_cdna = self.extract_hgvs_cdna(self.referenceAssertion.variant.name, element) def extract_hgvs_cdna(self, variant_name, clinvar_set_el): """ From 8cf64cf5cfa92bc0c095c229a19b9aec35d39d1f Mon Sep 17 00:00:00 2001 From: melissacline Date: Mon, 29 Jul 2024 18:54:58 -0700 Subject: [PATCH 2/9] Set up the interpretation object --- pipeline/clinvar/clinVarParse.py | 7 ++-- pipeline/clinvar/clinvar_common.py | 62 +++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 17 deletions(-) diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py index e46cf2216..1beeef007 100755 --- a/pipeline/clinvar/clinVarParse.py +++ b/pipeline/clinvar/clinVarParse.py @@ -23,6 +23,7 @@ def printHeader(): def processSubmission(submissionSet, assembly): ra = submissionSet.referenceAssertion + interpretation = submissionSet.interpretation variant = submissionSet.variant if variant is None: @@ -63,9 +64,9 @@ def processSubmission(submissionSet, assembly): str(oa.description), str(oa.summaryEvidence), str(oa.reviewStatus), - str(ra.condition_type), - str(ra.condition_value), - ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id), + str(interpretation.condition_type), + str(interpretation.condition_value), + ",".join(interpretation.condition_db_id) if isinstance(interpretation.condition_db_id, list) else str(interpretation.condition_db_id), str(synonyms)))) #print("\t".join((str(hgvs), # oa.submitter, diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py index f13143f83..6b2a3aa5a 100644 --- a/pipeline/clinvar/clinvar_common.py +++ b/pipeline/clinvar/clinvar_common.py @@ -27,7 +27,19 @@ def textIfPresent(element, field): else: return None - +def findUniqueElement(name, parent): + """Find a child element directly or indirectly underneath this parent + element which should occur only once (i.e. there should be no other + elements of the same name). Test this assumption and return + the child element""" + child_found = False + for next_child in parent.iter(name): + assert(child_found is False) + child = next_child + child_found = True + return(child) + + def processClinicalSignificanceElement(el, obj): if el != None: obj.reviewStatus = textIfPresent(el, "ReviewStatus") @@ -204,16 +216,36 @@ def __init__(self, element, debug=False): processClinicalSignificanceElement(element.find( "Interpretation"), self) - # extract condition + +class interpretations: + """For gathering data on the trait. This code expects only one trait""" + + def __init__(self, element, debug=False): + self.element = element + + self.condition_type = None self.condition_value = None self.condition_db_id = None - traitSet = element.find("InterpretedConditionList") - if traitSet != None: - trait = traitSet.find("InterpretedCondition") - if trait != None: - self.condition_db_id = trait.attrib["DB"] + "_" + trait.attrib["ID"] - self.condition_value = trait.text - + for interpretation in element.iter("Interpretation"): + if interpretation.attrib["Type"] == "Clinical significance"]: + for trait in interpretation.iter("Trait"): + assert(self.condition_type is None) + self.condition_type = trait.attrib("Type") + for name in trait.iter("Name"): + ev = name.find("ElementValue") + if ev.attrib["Type"] == "Preferred": + assert(self.condition_value is None) + self.condition_value = ev.text + for xref in trait.iter("XRef"): + if self.condition_db_id is None: + self.condition_db_id = (xref.attrib["DB"] + "_" + + xref.attrib["ID"]) + else: + c_db_id = list() + c_db_id.append(self.condition_db_id) + self.condition_db_id = c_db_id + self.condition_db_id.append(xref.attrib["DB"] + \ + "_" + xref.attrib["ID"]) class clinVarAssertion: @@ -278,12 +310,14 @@ def __init__(self, element, debug=True): print("Parsing ClinVarSet ID", self.id) # # Look for the RCVAccession object. There should be exactly one. - rcva_found = False - for next_rcva in element.iter("RCVAccession"): - assert(rcva_found is False) - rcva = next_rcva - rcva_found = True + rcva = findUniqueElement("RCVAccession", element) self.referenceAssertion = referenceAssertion(rcva, debug=debug) + + # + # Look for the Interpretations object. There should be exactly one. + interp = findUniqueElement("Interpretations", element) + self.interpretations = interpretations(interp, debug=debug) + sa = element.find("SimpleAllele") #self.variant = variant(sa, debug=debug) self.otherAssertions = dict() From f6fc9a90c407d33c2ee94b68803ed1edb3f5c609 Mon Sep 17 00:00:00 2001 From: melissacline Date: Mon, 5 Aug 2024 12:54:11 -0700 Subject: [PATCH 3/9] Added HGVS parsing --- pipeline/clinvar/clinVarParse.py | 10 +-- pipeline/clinvar/clinvar_common.py | 105 ++++++++++++++++------------- 2 files changed, 62 insertions(+), 53 deletions(-) diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py index 1beeef007..fb34fb44c 100755 --- a/pipeline/clinvar/clinVarParse.py +++ b/pipeline/clinvar/clinVarParse.py @@ -34,15 +34,11 @@ def processSubmission(submissionSet, assembly): debug = False for oa in list(submissionSet.otherAssertions.values()): if oa.origin != "somatic": - hgvs = ra.hgvs_cdna - - proteinChange = None - if "HGVS, protein, RefSeq" in variant.attribute: - proteinChange = variant.attribute["HGVS, protein, RefSeq"] + hgvs = variant.hgvs_cdna if assembly in variant.coordinates: - synonyms = MULTI_VALUE_SEP.join(ra.synonyms + oa.synonyms) + synonyms = MULTI_VALUE_SEP.join(variant.synonyms + oa.synonyms) vcf_var = variant.coordinates[assembly] @@ -60,7 +56,7 @@ def processSubmission(submissionSet, assembly): str(oa.method), str(vcf_var).replace('g.', ''), str(variant.geneSymbol), - str(proteinChange), + str(variant.proteinChange), str(oa.description), str(oa.summaryEvidence), str(oa.reviewStatus), diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py index 6b2a3aa5a..9c4e6d28b 100644 --- a/pipeline/clinvar/clinvar_common.py +++ b/pipeline/clinvar/clinvar_common.py @@ -84,32 +84,25 @@ def extractSynonyms(el): return sy + sy_alt -def extract_genomic_coordinates_from_measure(meas_el): +def extract_genomic_coordinates_from_location(loc): """ - meas_el: `xml` module object of a ClinVar `Measure` element - + loc: `xml` module object of a ClinVar `Location` element returns: dictionary of assembly (str) to genomic coordinates (VCFVariant object) """ - - sequence_locations = meas_el.findall('SequenceLocation') - coords = {} - for el in sequence_locations: - assembly = el.attrib['Assembly'] # GRCh38 - - if el.get('referenceAlleleVCF'): + for sl in loc.iter('SequenceLocation'): + assembly = sl.attrib['Assembly'] # GRCh38 + if sl.get('referenceAlleleVCF'): coords[assembly] = variant_utils.VCFVariant( - int(el.get('Chr')), - int(el.get('positionVCF')), - el.get('referenceAlleleVCF'), - el.get('alternateAlleleVCF') + int(sl.attrib['Chr']), + int(sl.attrib['positionVCF']), + sl.attrib['referenceAlleleVCF'], + sl.attrib['alternateAlleleVCF'] ) - # if no reference/alternate allele found, compute (assuming genomic coordinates # are either present for all assemblies or for none) - if not coords: - coords = _extract_genomic_coordinates_from_non_genomic_fields(meas_el) - + #if not coords: + # coords = _extract_genomic_coordinates_from_non_genomic_fields(meas_el) return coords @@ -183,25 +176,43 @@ class variant: def __init__(self, element, debug=False): self.element = element - self.id = element.get("AlleleID") + self.id = element.attrib["AlleleID"] if debug: print("Parsing variant", self.id) - self.name = name - - self.attribute = dict() - for attrs in element.findall("AttributeSet"): - for attrib in attrs.findall("Attribute"): - self.attribute[attrib.get("Type")] = attrib.text - - self.coordinates = extract_genomic_coordinates_from_measure(element) - - self.geneSymbol = None - symbols = element.findall("Gene") - for symbol in symbols: - symbol_val = textIfPresent(symbol, "Symbol") - if symbol_val.startswith('BRCA'): - self.geneSymbol = symbol_val - + gene = findUniqueElement("Gene", element) + self.geneSymbol = gene.attrib["Symbol"] + location = element.find("Location") + self.coordinates = extract_genomic_coordinates_from_location(location) + self.proteinChange = None + pc = element.find("ProteinChange") + if pc is not None: + self.proteinChange = pc.text + self.synonyms = list() + self.hgvs_cdna = None + self.hgvs_protein = None + for hgvs in element.find("HGVSlist").iter("HGVS"): + pe = hgvs.find("ProteinExpression") + if pe: + proteinHgvs = pe.find("Expression").text + if debug: + print("protein HGVS", proteinHgvs) + self.synonyms.append(proteinHgvs) + ne = hgvs.find("NucleotideExpression") + if ne: + nucleotideHgvs = ne.find("Expression").text + if debug: + print("Nucleotide hgvs", nucleotideHgvs) + self.synonyms.append(nucleotideHgvs) + if "MANESelect" in ne.attrib: + if ne.attrib["MANESelect"] == "true": + self.hgvs_cdna = nucleotideHgvs + if pe: + self.hgvs_protein = proteinHgvs + if debug: + print("MANE Select HGVS cDNA ", nucleotideHgvs, + "protein", proteinHgvs) + + class referenceAssertion: @@ -213,30 +224,30 @@ def __init__(self, element, debug=False): if debug: print("Parsing ReferenceClinVarAssertion", self.title) - processClinicalSignificanceElement(element.find( - "Interpretation"), self) + processClinicalSignificanceElement(element, self) class interpretations: """For gathering data on the trait. This code expects only one trait""" - def __init__(self, element, debug=False): - self.element = element - + self.element = element self.condition_type = None self.condition_value = None self.condition_db_id = None for interpretation in element.iter("Interpretation"): - if interpretation.attrib["Type"] == "Clinical significance"]: + if interpretation.attrib["Type"] == "Clinical significance": for trait in interpretation.iter("Trait"): + print("found trait") assert(self.condition_type is None) - self.condition_type = trait.attrib("Type") + self.condition_type = trait.attrib["Type"] for name in trait.iter("Name"): + print("Found name") ev = name.find("ElementValue") if ev.attrib["Type"] == "Preferred": assert(self.condition_value is None) self.condition_value = ev.text for xref in trait.iter("XRef"): + print("found xref") if self.condition_db_id is None: self.condition_db_id = (xref.attrib["DB"] + "_" + xref.attrib["ID"]) @@ -250,7 +261,7 @@ def __init__(self, element, debug=False): class clinVarAssertion: """Class for representing one submission (i.e. one annotation of a - submitted variant""" +dir(referen submitted variant""" def __init__(self, element, debug=False): self.element = element @@ -299,7 +310,8 @@ def __init__(self, element, debug=False): class clinVarSet: """Container class for a ClinVarSet record, which is a set of submissions that were submitted to ClinVar together. In the ClinVar terminology, - each ClinVarSet is one aggregate record ("RCV Accession"), which contains + each ClinVarSet is one "SimpleAllele" with one aggregate record + ("RCV Accession"), which contains one or more submissions ("SCV Accessions"). """ @@ -308,6 +320,9 @@ def __init__(self, element, debug=True): self.id = element.get("VariationID") if debug: print("Parsing ClinVarSet ID", self.id) + sa = element.find("InterpretedRecord/SimpleAllele") + self.variant = variant(sa, debug=debug) + # # Look for the RCVAccession object. There should be exactly one. rcva = findUniqueElement("RCVAccession", element) @@ -318,8 +333,6 @@ def __init__(self, element, debug=True): interp = findUniqueElement("Interpretations", element) self.interpretations = interpretations(interp, debug=debug) - sa = element.find("SimpleAllele") - #self.variant = variant(sa, debug=debug) self.otherAssertions = dict() #for item in element.findall("ClinVarAssertion"): From 1b3c244c41f56e1a45cf3388c83700d9fa455b02 Mon Sep 17 00:00:00 2001 From: melissacline Date: Mon, 5 Aug 2024 20:45:29 -0700 Subject: [PATCH 4/9] Made it through the first VariationRecord object! --- pipeline/clinvar/clinVarParse.py | 17 ++-- pipeline/clinvar/clinvar_common.py | 121 +++++++++++++---------------- 2 files changed, 61 insertions(+), 77 deletions(-) diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py index fb34fb44c..f4b8da221 100755 --- a/pipeline/clinvar/clinVarParse.py +++ b/pipeline/clinvar/clinVarParse.py @@ -33,13 +33,10 @@ def processSubmission(submissionSet, assembly): debug = False for oa in list(submissionSet.otherAssertions.values()): - if oa.origin != "somatic": + if not ("somatic" in oa.origin and length(oa.origin) == 1): hgvs = variant.hgvs_cdna - if assembly in variant.coordinates: - - synonyms = MULTI_VALUE_SEP.join(variant.synonyms + oa.synonyms) - + synonyms = MULTI_VALUE_SEP.join(variant.synonyms) vcf_var = variant.coordinates[assembly] # Omit the variants that don't have any genomic start coordinate indicated. @@ -52,12 +49,12 @@ def processSubmission(submissionSet, assembly): str(oa.accession), str(oa.accession_version), str(oa.id), - str(oa.origin), - str(oa.method), - str(vcf_var).replace('g.', ''), + ",".join(oa.origin), + ",".join(oa.method), + str(vcf_var).replace('g.', ''), #change str(variant.geneSymbol), str(variant.proteinChange), - str(oa.description), + ",".join(oa.description), str(oa.summaryEvidence), str(oa.reviewStatus), str(interpretation.condition_type), @@ -107,7 +104,7 @@ def main(): for event, elem in ET.iterparse(inputFile, events=('start', 'end')): if event == 'end' and elem.tag == 'VariationArchive': if clinvar.isCurrent(elem): - submissionSet = clinvar.clinVarSet(elem) + submissionSet = clinvar.clinVarSet(elem, debug=True) processSubmission(submissionSet, args.assembly) elem.clear() diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py index 9c4e6d28b..7084c34a1 100644 --- a/pipeline/clinvar/clinvar_common.py +++ b/pipeline/clinvar/clinvar_common.py @@ -40,18 +40,6 @@ def findUniqueElement(name, parent): return(child) -def processClinicalSignificanceElement(el, obj): - if el != None: - obj.reviewStatus = textIfPresent(el, "ReviewStatus") - obj.clinicalSignificance = textIfPresent(el, "Interpretation") - obj.summaryEvidence = textIfPresent(el, "Comment") - obj.dateSignificanceLastEvaluated = el.get('DateLastEvaluated', None) - else: - obj.reviewStatus = None - obj.clinicalSignificance = None - obj.summaryEvidence = None - obj.dateSignificanceLastEvaluated = None - #def build_xpath_filter_for_cv_assertions(gene_symbols): ## symbols_str = [ f'text()="{s}"' for s in gene_symbols] ### HERE @@ -60,7 +48,6 @@ def processClinicalSignificanceElement(el, obj): # filter assertion if it contains a Symbol we are interested in # return f"ReferenceClinVarAssertion/MeasureSet/Measure/MeasureRelationship/Symbol/ElementValue[({symbols_pred}) and @Type=\"Preferred\"]" - def extractSynonyms(el): include_types = {'ProteinChange3LetterCode', 'ProteinChange1LetterCode', 'nucleotide change', 'protein change, historical'} @@ -174,7 +161,7 @@ class variant: """The Measure set. We are interested in the variants specifically, but measure sets can be other things as well, such as haplotypes""" - def __init__(self, element, debug=False): + def __init__(self, element, debug=True): self.element = element self.id = element.attrib["AlleleID"] if debug: @@ -183,13 +170,12 @@ def __init__(self, element, debug=False): self.geneSymbol = gene.attrib["Symbol"] location = element.find("Location") self.coordinates = extract_genomic_coordinates_from_location(location) + self.hgvs_cdna = None self.proteinChange = None - pc = element.find("ProteinChange") - if pc is not None: - self.proteinChange = pc.text self.synonyms = list() - self.hgvs_cdna = None - self.hgvs_protein = None + pc = textIfPresent(element, "ProteinChange") + if pc is not None: + self.synonyms.append(pc) for hgvs in element.find("HGVSlist").iter("HGVS"): pe = hgvs.find("ProteinExpression") if pe: @@ -207,7 +193,7 @@ def __init__(self, element, debug=False): if ne.attrib["MANESelect"] == "true": self.hgvs_cdna = nucleotideHgvs if pe: - self.hgvs_protein = proteinHgvs + self.proteinChange = proteinHgvs if debug: print("MANE Select HGVS cDNA ", nucleotideHgvs, "protein", proteinHgvs) @@ -223,11 +209,13 @@ def __init__(self, element, debug=False): self.title = element.get("Title") if debug: print("Parsing ReferenceClinVarAssertion", self.title) - - processClinicalSignificanceElement(element, self) + self.reviewStatus = element.attrib["ReviewStatus"] + self.clinicalSignificance = element.attrib["Interpretation"] + self.dateSignificanceLastEvaluated = element.attrib["DateLastEvaluated"] + self.summaryDescription = None -class interpretations: +class interpretation: """For gathering data on the trait. This code expects only one trait""" def __init__(self, element, debug=False): self.element = element @@ -237,17 +225,14 @@ def __init__(self, element, debug=False): for interpretation in element.iter("Interpretation"): if interpretation.attrib["Type"] == "Clinical significance": for trait in interpretation.iter("Trait"): - print("found trait") assert(self.condition_type is None) self.condition_type = trait.attrib["Type"] for name in trait.iter("Name"): - print("Found name") ev = name.find("ElementValue") if ev.attrib["Type"] == "Preferred": assert(self.condition_value is None) self.condition_value = ev.text for xref in trait.iter("XRef"): - print("found xref") if self.condition_db_id is None: self.condition_db_id = (xref.attrib["DB"] + "_" + xref.attrib["ID"]) @@ -259,52 +244,57 @@ def __init__(self, element, debug=False): "_" + xref.attrib["ID"]) -class clinVarAssertion: +class clinicalAssertion: """Class for representing one submission (i.e. one annotation of a dir(referen submitted variant""" + + def __init__(self, element, debug=False): self.element = element self.id = element.get("ID") if debug: - print("Parsing ClinVarAssertion", self.id) - cvsd = element.find("ClinVarSubmissionID") - if cvsd == None: - self.submitter = None - self.dateSubmitted = None - else: - self.submitter = cvsd.get("submitter", default=None) - self.dateSubmitted = cvsd.get("submitterDate") + print("Parsing ClinicalAssertion", self.id) + self.dateSubmitted = element.attrib["SubmissionDate"] + self.dateLastUpdated = element.attrib["DateLastUpdated"] cva = element.find("ClinVarAccession") if cva == None: self.accession = None else: - self.accession = cva.get("Acc", default=None) - self.accession_version = cva.get("Version", default=None) - - self.origin = None - self.method = None - self.description = None - oi = element.find("ObservedIn") - if oi != None: + self.accession = cva.attrib["Accession"] + self.accession_version = cva.attrib["Version"] + self.submitter = cva.attrib["SubmitterName"] + self.reviewStatus = textIfPresent(element, "ReviewStatus") + interpretation = element.find("Interpretation") + self.dateSignificanceLastEvaluated = interpretation.attrib["DateLastEvaluated"] + self.clinicalSignificance = textIfPresent(interpretation, + "Description") + self.summaryEvidence = textIfPresent(interpretation, "Comment") + oil = element.find("ObservedInList") + self.origin = list() + self.method = list() + self.description = list() + for oi in oil.iter("ObservedIn"): sample = oi.find("Sample") if sample != None: - self.origin = textIfPresent(sample, "Origin") + origin = textIfPresent(sample, "Origin") + if not origin in self.origin: + self.origin.append(origin) method = oi.find("Method") if method != None: - self.method = textIfPresent(method, "MethodType") - description = oi.find("ObservedData") - if description != None: - for attr in description.findall("Attribute"): + newMethod = textIfPresent(method, "MethodType") + if not newMethod in self.method: + self.method.append(newMethod) + od = oi.find("ObservedData") + if od != None: + for attr in od.iter("Attribute"): if attr.attrib["Type"] == 'Description': - self.description = textIfPresent(description, "Attribute") - - processClinicalSignificanceElement(element.find( - "ClinicalSignificance"), self) + newDescription = textIfPresent(od, "Attribute") + else: + newDescription = "None" + if not newDescription in self.description: + self.description.append(newDescription) - self.dateLastUpdated = cva.get("DateUpdated") - - self.synonyms = extractSynonyms(element) class clinVarSet: @@ -321,29 +311,26 @@ def __init__(self, element, debug=True): if debug: print("Parsing ClinVarSet ID", self.id) sa = element.find("InterpretedRecord/SimpleAllele") - self.variant = variant(sa, debug=debug) + self.variant = variant(sa, debug=False) # # Look for the RCVAccession object. There should be exactly one. rcva = findUniqueElement("RCVAccession", element) - self.referenceAssertion = referenceAssertion(rcva, debug=debug) + self.referenceAssertion = referenceAssertion(rcva, debug=False) # # Look for the Interpretations object. There should be exactly one. interp = findUniqueElement("Interpretations", element) - self.interpretations = interpretations(interp, debug=debug) + self.interpretation = interpretation(interp, debug=debug) self.otherAssertions = dict() - #for item in element.findall("ClinVarAssertion"): - # if isCurrent(item): - # cva = clinVarAssertion(item) - # accession = cva.accession - # self.otherAssertions[accession] = cva - # - #if self.referenceAssertion.variant: - # self.referenceAssertion.hgvs_cdna = self.extract_hgvs_cdna(self.referenceAssertion.variant.name, element) - + for item in element.iter("ClinicalAssertion"): + if isCurrent(item): + cva = clinicalAssertion(item) + accession = cva.accession + self.otherAssertions[accession] = cva + def extract_hgvs_cdna(self, variant_name, clinvar_set_el): """ Finds a HGVS CDNA representation of a variant within a ClinVarSet. From 6e7cae279dacf9d8ca1ccf3394691d83e8cbf464 Mon Sep 17 00:00:00 2001 From: melissacline Date: Tue, 6 Aug 2024 18:35:29 -0700 Subject: [PATCH 5/9] Redid the XML parsing to use text --- pipeline/clinvar/filter_clinvar.py | 56 ++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) mode change 100644 => 100755 pipeline/clinvar/filter_clinvar.py diff --git a/pipeline/clinvar/filter_clinvar.py b/pipeline/clinvar/filter_clinvar.py old mode 100644 new mode 100755 index bbe9d6749..bdbde29f4 --- a/pipeline/clinvar/filter_clinvar.py +++ b/pipeline/clinvar/filter_clinvar.py @@ -1,11 +1,60 @@ #!/usr/bin/env python +""" +filter_clinvar.py: Filter the ClinVar VCV Release XML file, and build a subset +XML file containing the VariationArchive records for the genes of interest. + +This file filters the XML by reading the XML file as text, assembling text +objects of the VariationArchive records, and then parsing those records as XML +to see if they specify the target gene symbols. It's done this way because: +- reading the whole file into one big ElementTree XML object doesn't work + because it's too large and we run out of memory +- reading the file iteratively, with ElementTree iterparse hasn't worked, + because the ClinVar archive contains some badly-formatted XML records. + By not parsing the XML as XML until we know that we're looking at an object + that we want, we can wager that the broken XML records are for other genes. +""" import argparse import gzip +import io import xml.etree.ElementTree as ET +def contains_target_gene_symbol(variationArchiveElement, target_symbol_values): + root = ET.fromstring(variationArchiveElement) + geneList = root.find(".//GeneList") + if geneList: + for geneElement in geneList.iter("Gene"): + if "Symbol" in geneElement.attrib: + if geneElement.attrib["Symbol"] in target_symbol_values: + return(True) + return(False) + + + + def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values, chunk_size=1024): + in_variation_archive = False + in_header = True + for line in input_fp: + if in_header: + output_fp.write(line) + if '' in line: + in_variation_archive = False + xml_string = variation_archive.getvalue() + variation_archive.close() + if contains_target_gene_symbol(xml_string, + target_symbol_values): + output_fp.write(xml_string) + + +def old(input_fp, output_fp, target_symbol_values, chunk_size=1024): for event, elem in ET.iterparse(input_fp, events=('start', 'end')): if event == 'start' and elem.tag == "ClnVarVariationRelease": output_FP.write(ET.tostring(elem, encoding='UTF-8')) @@ -16,7 +65,6 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values, if gene_element.attrib["Symbol"] in target_symbol_values: output_fp.write(ET.tostring(elem, encoding='UTF-8')) elem.clear() - output_fp.write("".encode('UTF-8')) def main(): @@ -26,11 +74,13 @@ def main(): parser.add_argument('-g', "--gene", action='append') args = parser.parse_args() if args.input.endswith('gz'): - input_fp = gzip.open(args.input) + input_fp = io.TextIOWrapper(gzip.open(args.input, 'rb'), + encoding='utf-8') else: input_fp = open(args.input, 'r') - output_fp = open(args.output, 'wb') + output_fp = open(args.output, 'w') filter_xml_for_gene_symbol(input_fp, output_fp, args.gene) + output_fp.write("".encode('UTF-8')) if __name__ == "__main__": main() From 325c5935deaca22ffa0a39c8f59920d81dc758be Mon Sep 17 00:00:00 2001 From: melissacline Date: Wed, 7 Aug 2024 17:17:21 -0700 Subject: [PATCH 6/9] Fixed a bug that led to the first line of the first VariationArchive record being included erroneously --- pipeline/clinvar/filter_clinvar.py | 31 +++--------------------------- 1 file changed, 3 insertions(+), 28 deletions(-) diff --git a/pipeline/clinvar/filter_clinvar.py b/pipeline/clinvar/filter_clinvar.py index bdbde29f4..78addc104 100755 --- a/pipeline/clinvar/filter_clinvar.py +++ b/pipeline/clinvar/filter_clinvar.py @@ -1,19 +1,5 @@ #!/usr/bin/env python -""" -filter_clinvar.py: Filter the ClinVar VCV Release XML file, and build a subset -XML file containing the VariationArchive records for the genes of interest. - -This file filters the XML by reading the XML file as text, assembling text -objects of the VariationArchive records, and then parsing those records as XML -to see if they specify the target gene symbols. It's done this way because: -- reading the whole file into one big ElementTree XML object doesn't work - because it's too large and we run out of memory -- reading the file iteratively, with ElementTree iterparse hasn't worked, - because the ClinVar archive contains some badly-formatted XML records. - By not parsing the XML as XML until we know that we're looking at an object - that we want, we can wager that the broken XML records are for other genes. -""" import argparse import gzip import io @@ -37,12 +23,12 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values, in_variation_archive = False in_header = True for line in input_fp: - if in_header: - output_fp.write(line) if '' in line: @@ -54,17 +40,6 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values, output_fp.write(xml_string) -def old(input_fp, output_fp, target_symbol_values, chunk_size=1024): - for event, elem in ET.iterparse(input_fp, events=('start', 'end')): - if event == 'start' and elem.tag == "ClnVarVariationRelease": - output_FP.write(ET.tostring(elem, encoding='UTF-8')) - if event == 'end' and elem.tag == 'VariationArchive': - gene_element = elem.find('.//Gene') - if gene_element is not None: - if "Symbol" in gene_element.attrib: - if gene_element.attrib["Symbol"] in target_symbol_values: - output_fp.write(ET.tostring(elem, encoding='UTF-8')) - elem.clear() def main(): @@ -80,7 +55,7 @@ def main(): input_fp = open(args.input, 'r') output_fp = open(args.output, 'w') filter_xml_for_gene_symbol(input_fp, output_fp, args.gene) - output_fp.write("".encode('UTF-8')) + output_fp.write("") if __name__ == "__main__": main() From b9f6dd7bcb77e933c7615d17b4857bfea9b599e2 Mon Sep 17 00:00:00 2001 From: melissacline Date: Wed, 7 Aug 2024 17:18:30 -0700 Subject: [PATCH 7/9] Parsed the entire ClinVar record for the first time! --- pipeline/clinvar/clinVarParse.py | 15 +-- pipeline/clinvar/clinvar_common.py | 175 +++++++++++++++++------------ 2 files changed, 111 insertions(+), 79 deletions(-) diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py index f4b8da221..c0bb2f16f 100755 --- a/pipeline/clinvar/clinVarParse.py +++ b/pipeline/clinvar/clinVarParse.py @@ -23,7 +23,7 @@ def printHeader(): def processSubmission(submissionSet, assembly): ra = submissionSet.referenceAssertion - interpretation = submissionSet.interpretation + classification = submissionSet.classification variant = submissionSet.variant if variant is None: @@ -33,7 +33,7 @@ def processSubmission(submissionSet, assembly): debug = False for oa in list(submissionSet.otherAssertions.values()): - if not ("somatic" in oa.origin and length(oa.origin) == 1): + if not ("somatic" in oa.origin and len(oa.origin) == 1): hgvs = variant.hgvs_cdna if assembly in variant.coordinates: synonyms = MULTI_VALUE_SEP.join(variant.synonyms) @@ -57,9 +57,9 @@ def processSubmission(submissionSet, assembly): ",".join(oa.description), str(oa.summaryEvidence), str(oa.reviewStatus), - str(interpretation.condition_type), - str(interpretation.condition_value), - ",".join(interpretation.condition_db_id) if isinstance(interpretation.condition_db_id, list) else str(interpretation.condition_db_id), + str(classification.condition_type), + str(classification.condition_value), + ",".join(classification.condition_db_id), str(synonyms)))) #print("\t".join((str(hgvs), # oa.submitter, @@ -104,8 +104,9 @@ def main(): for event, elem in ET.iterparse(inputFile, events=('start', 'end')): if event == 'end' and elem.tag == 'VariationArchive': if clinvar.isCurrent(elem): - submissionSet = clinvar.clinVarSet(elem, debug=True) - processSubmission(submissionSet, args.assembly) + submissionSet = clinvar.variationArchive(elem, debug=True) + if submissionSet.valid: + processSubmission(submissionSet, args.assembly) elem.clear() if __name__ == "__main__": diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py index 7084c34a1..0e425353e 100644 --- a/pipeline/clinvar/clinvar_common.py +++ b/pipeline/clinvar/clinvar_common.py @@ -158,45 +158,63 @@ def __init__(self, element, useNone=False, debug=False): class variant: - """The Measure set. We are interested in the variants specifically, - but measure sets can be other things as well, such as haplotypes""" + """The variant (SimpleAllele) set. """ def __init__(self, element, debug=True): self.element = element - self.id = element.attrib["AlleleID"] + self.valid = True + self.variantType = textIfPresent(element, "VariantType") + if self.variantType in ["Deletion", "Duplication", "Insertion"]: + self.valid = False + return + if "AlleleID" in element.attrib: + self.id = element.get("AlleleID") + else: + self.id = None if debug: print("Parsing variant", self.id) - gene = findUniqueElement("Gene", element) - self.geneSymbol = gene.attrib["Symbol"] + self.geneSymbol = None + for gene in element.iter("GeneList/Gene"): + geneSymbol = gene.get("Symbol") + if self.geneSymbol is None: + self.geneSymbol = geneSymbol + else: + self.geneSymbol = self.geneSymbol + "," + geneSymbol location = element.find("Location") + if location is None: + self.valid = False + return self.coordinates = extract_genomic_coordinates_from_location(location) self.hgvs_cdna = None self.proteinChange = None self.synonyms = list() + spdi = textIfPresent(element, "CanonicalSPDI") pc = textIfPresent(element, "ProteinChange") if pc is not None: self.synonyms.append(pc) - for hgvs in element.find("HGVSlist").iter("HGVS"): - pe = hgvs.find("ProteinExpression") - if pe: - proteinHgvs = pe.find("Expression").text - if debug: - print("protein HGVS", proteinHgvs) - self.synonyms.append(proteinHgvs) - ne = hgvs.find("NucleotideExpression") - if ne: - nucleotideHgvs = ne.find("Expression").text - if debug: - print("Nucleotide hgvs", nucleotideHgvs) - self.synonyms.append(nucleotideHgvs) - if "MANESelect" in ne.attrib: - if ne.attrib["MANESelect"] == "true": - self.hgvs_cdna = nucleotideHgvs - if pe: - self.proteinChange = proteinHgvs - if debug: - print("MANE Select HGVS cDNA ", nucleotideHgvs, - "protein", proteinHgvs) + if element.find("HGVSlist"): + for hgvs in element.find("HGVSlist").iter("HGVS"): + pe = hgvs.find("ProteinExpression") + if pe: + proteinHgvs = pe.find("Expression").text + if debug: + print("protein HGVS", proteinHgvs) + self.synonyms.append(proteinHgvs) + ne = hgvs.find("NucleotideExpression") + if ne: + nucleotideHgvs = ne.find("Expression").text + if debug: + print("Nucleotide hgvs", nucleotideHgvs) + self.synonyms.append(nucleotideHgvs) + if "MANESelect" in ne.attrib: + if ne.attrib["MANESelect"] == "true": + self.hgvs_cdna = nucleotideHgvs + if pe: + self.proteinChange = proteinHgvs + if debug: + print("MANE Select HGVS cDNA ", + nucleotideHgvs, + "protein", proteinHgvs) @@ -206,42 +224,41 @@ class referenceAssertion: def __init__(self, element, debug=False): self.element = element + self.valid = True self.title = element.get("Title") if debug: print("Parsing ReferenceClinVarAssertion", self.title) - self.reviewStatus = element.attrib["ReviewStatus"] - self.clinicalSignificance = element.attrib["Interpretation"] - self.dateSignificanceLastEvaluated = element.attrib["DateLastEvaluated"] + gc = element.find(".//GermlineClassification") + if gc is None: + self.valid = False + return + self.reviewStatus = textIfPresent(gc, "ReviewStatus") + self.clinicalSignificance = textIfPresent(gc, "Description") + if self.clinicalSignificance is not None: + self.dateSignificanceLastEvaluated = gc.find("Description").get("DateLastEvaluated") + else: + self.dateSignificanceLastEvaluated = None self.summaryDescription = None -class interpretation: +class classification: """For gathering data on the trait. This code expects only one trait""" def __init__(self, element, debug=False): self.element = element self.condition_type = None self.condition_value = None - self.condition_db_id = None - for interpretation in element.iter("Interpretation"): - if interpretation.attrib["Type"] == "Clinical significance": - for trait in interpretation.iter("Trait"): - assert(self.condition_type is None) - self.condition_type = trait.attrib["Type"] - for name in trait.iter("Name"): - ev = name.find("ElementValue") - if ev.attrib["Type"] == "Preferred": - assert(self.condition_value is None) - self.condition_value = ev.text - for xref in trait.iter("XRef"): - if self.condition_db_id is None: - self.condition_db_id = (xref.attrib["DB"] + "_" - + xref.attrib["ID"]) - else: - c_db_id = list() - c_db_id.append(self.condition_db_id) - self.condition_db_id = c_db_id - self.condition_db_id.append(xref.attrib["DB"] + \ - "_" + xref.attrib["ID"]) + self.condition_db_id = list() + for trait in element.iter("Trait"): + self.condition_type = trait.get("Type") + for name in trait.iter("Name"): + ev = name.find("ElementValue") + if ev.get("Type") == "Preferred": + self.condition_value = ev.text + for xref in trait.iter("XRef"): + xref_string = xref.get("DB") + "_" + xref.get("ID") + if not xref_string in self.condition_db_id: + self.condition_db_id.append(xref_string) + class clinicalAssertion: @@ -255,21 +272,21 @@ def __init__(self, element, debug=False): self.id = element.get("ID") if debug: print("Parsing ClinicalAssertion", self.id) - self.dateSubmitted = element.attrib["SubmissionDate"] - self.dateLastUpdated = element.attrib["DateLastUpdated"] + self.dateSubmitted = element.get("SubmissionDate") + self.dateLastUpdated = element.get("DateLastUpdated") cva = element.find("ClinVarAccession") if cva == None: self.accession = None else: - self.accession = cva.attrib["Accession"] - self.accession_version = cva.attrib["Version"] - self.submitter = cva.attrib["SubmitterName"] + self.accession = cva.get("Accession") + self.accession_version = cva.get("Version") + self.submitter = cva.get("SubmitterName") self.reviewStatus = textIfPresent(element, "ReviewStatus") - interpretation = element.find("Interpretation") - self.dateSignificanceLastEvaluated = interpretation.attrib["DateLastEvaluated"] - self.clinicalSignificance = textIfPresent(interpretation, - "Description") - self.summaryEvidence = textIfPresent(interpretation, "Comment") + classif = element.find("Classification") + self.dateSignificanceLastEvaluated = classif.get("DateLastEvaluated") + self.clinicalSignificance = textIfPresent(classif, + "GermlineClassification") + self.summaryEvidence = textIfPresent(classif, "Comment") oil = element.find("ObservedInList") self.origin = list() self.method = list() @@ -288,7 +305,7 @@ def __init__(self, element, debug=False): od = oi.find("ObservedData") if od != None: for attr in od.iter("Attribute"): - if attr.attrib["Type"] == 'Description': + if attr.get("Type") == 'Description': newDescription = textIfPresent(od, "Attribute") else: newDescription = "None" @@ -297,8 +314,9 @@ def __init__(self, element, debug=False): -class clinVarSet: - """Container class for a ClinVarSet record, which is a set of submissions +class variationArchive: + """Container class for a variationArchive record, which is a set of + submissions that were submitted to ClinVar together. In the ClinVar terminology, each ClinVarSet is one "SimpleAllele" with one aggregate record ("RCV Accession"), which contains @@ -310,26 +328,39 @@ def __init__(self, element, debug=True): self.id = element.get("VariationID") if debug: print("Parsing ClinVarSet ID", self.id) - sa = element.find("InterpretedRecord/SimpleAllele") + self.valid = True + sa = element.find("./ClassifiedRecord/SimpleAllele") + if sa is None: + self.valid = False + return self.variant = variant(sa, debug=False) + if not self.variant.valid: + self.valid = False + return # # Look for the RCVAccession object. There should be exactly one. - rcva = findUniqueElement("RCVAccession", element) + rcva = element.find("./ClassifiedRecord/RCVList/RCVAccession") + if rcva is None: + self.valid = False + return self.referenceAssertion = referenceAssertion(rcva, debug=False) + if not self.referenceAssertion.valid: + self.valid = False + return # - # Look for the Interpretations object. There should be exactly one. - interp = findUniqueElement("Interpretations", element) - self.interpretation = interpretation(interp, debug=debug) + # Look for the GermlineClassification object. + cl = element.find("./ClassifiedRecord/Classifications") + self.classification = classification(cl, debug=debug) self.otherAssertions = dict() for item in element.iter("ClinicalAssertion"): if isCurrent(item): - cva = clinicalAssertion(item) - accession = cva.accession - self.otherAssertions[accession] = cva + ca = clinicalAssertion(item) + accession = ca.accession + self.otherAssertions[accession] = ca def extract_hgvs_cdna(self, variant_name, clinvar_set_el): """ From 4a8a9bf72af5377988c6c06e208c5f037185dcbf Mon Sep 17 00:00:00 2001 From: melissacline Date: Wed, 21 Aug 2024 18:58:01 -0700 Subject: [PATCH 8/9] fixed the broken code for returning the gene symbol --- pipeline/clinvar/clinVarParse.py | 2 +- pipeline/clinvar/clinvar_common.py | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py index c0bb2f16f..ae3e5e8fa 100755 --- a/pipeline/clinvar/clinVarParse.py +++ b/pipeline/clinvar/clinVarParse.py @@ -104,7 +104,7 @@ def main(): for event, elem in ET.iterparse(inputFile, events=('start', 'end')): if event == 'end' and elem.tag == 'VariationArchive': if clinvar.isCurrent(elem): - submissionSet = clinvar.variationArchive(elem, debug=True) + submissionSet = clinvar.variationArchive(elem, debug=False) if submissionSet.valid: processSubmission(submissionSet, args.assembly) elem.clear() diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py index 0e425353e..a1a9bfa46 100644 --- a/pipeline/clinvar/clinvar_common.py +++ b/pipeline/clinvar/clinvar_common.py @@ -164,9 +164,6 @@ def __init__(self, element, debug=True): self.element = element self.valid = True self.variantType = textIfPresent(element, "VariantType") - if self.variantType in ["Deletion", "Duplication", "Insertion"]: - self.valid = False - return if "AlleleID" in element.attrib: self.id = element.get("AlleleID") else: @@ -174,12 +171,14 @@ def __init__(self, element, debug=True): if debug: print("Parsing variant", self.id) self.geneSymbol = None - for gene in element.iter("GeneList/Gene"): - geneSymbol = gene.get("Symbol") - if self.geneSymbol is None: - self.geneSymbol = geneSymbol - else: - self.geneSymbol = self.geneSymbol + "," + geneSymbol + geneList = element.find("GeneList") + if geneList is not None: + for gene in geneList.iter("Gene"): + geneSymbol = gene.get("Symbol") + if self.geneSymbol is None: + self.geneSymbol = geneSymbol + else: + self.geneSymbol = self.geneSymbol + "," + geneSymbol location = element.find("Location") if location is None: self.valid = False @@ -281,8 +280,8 @@ def __init__(self, element, debug=False): self.accession = cva.get("Accession") self.accession_version = cva.get("Version") self.submitter = cva.get("SubmitterName") - self.reviewStatus = textIfPresent(element, "ReviewStatus") classif = element.find("Classification") + self.reviewStatus = textIfPresent(classif, "ReviewStatus") self.dateSignificanceLastEvaluated = classif.get("DateLastEvaluated") self.clinicalSignificance = textIfPresent(classif, "GermlineClassification") @@ -327,7 +326,7 @@ def __init__(self, element, debug=True): self.element = element self.id = element.get("VariationID") if debug: - print("Parsing ClinVarSet ID", self.id) + print("DEBUG: Parsing ClinVarSet ID", self.id) self.valid = True sa = element.find("./ClassifiedRecord/SimpleAllele") if sa is None: From 12ac6953d584545abb91f080c616cb34ba934349 Mon Sep 17 00:00:00 2001 From: melissacline Date: Wed, 23 Oct 2024 17:11:58 -0700 Subject: [PATCH 9/9] Updated parsing for the new ClinVar structure --- pipeline/clinvar/clinVarParse.py | 41 +++++++---------- pipeline/clinvar/clinvar_common.py | 70 +++++++++++++++--------------- pipeline/clinvar/filter_clinvar.py | 22 +++++++--- pipeline/clinvar/test_clinvar.py | 22 +++++----- 4 files changed, 81 insertions(+), 74 deletions(-) diff --git a/pipeline/clinvar/clinVarParse.py b/pipeline/clinvar/clinVarParse.py index ae3e5e8fa..7dc878e69 100755 --- a/pipeline/clinvar/clinVarParse.py +++ b/pipeline/clinvar/clinVarParse.py @@ -27,20 +27,26 @@ def processSubmission(submissionSet, assembly): variant = submissionSet.variant if variant is None: - logging.warning("No variant information could be extracted for ReferenceClinVarAssertion ID %s %s", + logging.warning("Warning","No variant information could be extracted for ReferenceClinVarAssertion ID %s %s", submissionSet.referenceAssertion.id, [c.accession for c in submissionSet.otherAssertions.values()]) return None + hgvs = submissionSet.name debug = False for oa in list(submissionSet.otherAssertions.values()): - if not ("somatic" in oa.origin and len(oa.origin) == 1): - hgvs = variant.hgvs_cdna - if assembly in variant.coordinates: + if ("somatic" in oa.origin and len(oa.origin) == 1): + logging.warning("HGVS %s because submissions are only somatic in origin", hgvs) + else: + if not assembly in variant.coordinates: + logging.warning("HGVS %s rejected for poor variant coordinate data", hgvs) + else: synonyms = MULTI_VALUE_SEP.join(variant.synonyms) vcf_var = variant.coordinates[assembly] # Omit the variants that don't have any genomic start coordinate indicated. - if vcf_var and _bases_only(vcf_var.ref) and _bases_only(vcf_var.alt): + if not (vcf_var and _bases_only(vcf_var.ref) and _bases_only(vcf_var.alt)): + logging.warning("HGVS %s rejected for poor VCF data", hgvs) + else: print("\t".join((str(hgvs), oa.submitter, str(oa.clinicalSignificance), @@ -61,26 +67,6 @@ def processSubmission(submissionSet, assembly): str(classification.condition_value), ",".join(classification.condition_db_id), str(synonyms)))) - #print("\t".join((str(hgvs), - # oa.submitter, - # str(oa.clinicalSignificance), - # str(oa.dateLastUpdated), - # str(oa.dateSignificanceLastEvaluated), - # str(oa.accession), - # str(oa.accession_version), - # str(oa.id), - # str(oa.origin), - # str(oa.method), - # str(vcf_var).replace('g.', ''), - # str(variant.geneSymbol), - # str(proteinChange), - # str(oa.description), - # str(oa.summaryEvidence), - # str(oa.reviewStatus), - # str(ra.condition_type), - # str(ra.condition_value), - # ",".join(ra.condition_db_id) if isinstance(ra.condition_db_id, list) else str(ra.condition_db_id), - # str(synonyms)))) def _bases_only(seq): @@ -107,6 +93,11 @@ def main(): submissionSet = clinvar.variationArchive(elem, debug=False) if submissionSet.valid: processSubmission(submissionSet, args.assembly) + else: + if hasattr(submissionSet, 'name'): + logging.warning("Submission %s not valid", submissionSet.name) + else: + logging.warning("Submission not valid, name not available") elem.clear() if __name__ == "__main__": diff --git a/pipeline/clinvar/clinvar_common.py b/pipeline/clinvar/clinvar_common.py index a1a9bfa46..4008e3ebe 100644 --- a/pipeline/clinvar/clinvar_common.py +++ b/pipeline/clinvar/clinvar_common.py @@ -6,6 +6,7 @@ from common import hgvs_utils, variant_utils from hgvs.exceptions import HGVSError import hgvs +import html import logging def isCurrent(element): @@ -192,14 +193,14 @@ def __init__(self, element, debug=True): if pc is not None: self.synonyms.append(pc) if element.find("HGVSlist"): - for hgvs in element.find("HGVSlist").iter("HGVS"): - pe = hgvs.find("ProteinExpression") + for hgvsObj in element.find("HGVSlist").iter("HGVS"): + pe = hgvsObj.find("ProteinExpression") if pe: proteinHgvs = pe.find("Expression").text if debug: print("protein HGVS", proteinHgvs) self.synonyms.append(proteinHgvs) - ne = hgvs.find("NucleotideExpression") + ne = hgvsObj.find("NucleotideExpression") if ne: nucleotideHgvs = ne.find("Expression").text if debug: @@ -254,9 +255,10 @@ def __init__(self, element, debug=False): if ev.get("Type") == "Preferred": self.condition_value = ev.text for xref in trait.iter("XRef"): - xref_string = xref.get("DB") + "_" + xref.get("ID") - if not xref_string in self.condition_db_id: - self.condition_db_id.append(xref_string) + if not re.search("Genetic Testing Registry", xref.get("DB")): + xref_string = xref.get("DB") + "_" + xref.get("ID") + if not xref_string in self.condition_db_id: + self.condition_db_id.append(xref_string) @@ -336,7 +338,14 @@ def __init__(self, element, debug=True): if not self.variant.valid: self.valid = False return - + # + # Find an HGVS variant name, either by taking the VariationName object of this element and removing the + # gene name in the parens, or if that name doesn't match an HGVS expression, look for a viable HGVS expression + # amoung the synonyms + fullname = re.sub(r'\([^)]*\)', '', html.unescape(element.get("VariationName"))) + selected_hgvs = extract_hgvs_cdna(fullname, self.variant.synonyms) + self.name = re.sub(r'^\s+|\s+$', '', selected_hgvs) + # # Look for the RCVAccession object. There should be exactly one. rcva = element.find("./ClassifiedRecord/RCVList/RCVAccession") @@ -361,30 +370,23 @@ def __init__(self, element, debug=True): accession = ca.accession self.otherAssertions[accession] = ca - def extract_hgvs_cdna(self, variant_name, clinvar_set_el): - """ - Finds a HGVS CDNA representation of a variant within a ClinVarSet. - If possible, avoid repeat representations using the "[]" synatax, since - we are currently not able to handle it further downstream (https://github.com/biocommons/hgvs/issues/113) - - :param variant_name: variant name from title - :param clinvar_set_el: clinvar set element - :return: HGVS CDNA representation as string - """ - hgvs_cand = re.sub(r"\(" + "(BRCA[1|2])" + r"\)", - "", variant_name.split()[0]) - - # only take variants starting with NM_ and not containing [] - hgvs_cdna_re = r'NM_.*:[^\[]*$' - - if not re.match(hgvs_cdna_re, hgvs_cand): - # taking Attribute of 'HGVS', 'HGVS, coding' or 'HGVS, coding, RefSeq' in - # both ReferenceClinVarAssertion and ClinVarAssertion's - hgvs_candidates = [ev.text for ev in clinvar_set_el.findall('.//Measure/AttributeSet/Attribute') - if ev.attrib['Type'].startswith('HGVS, coding') or ev.attrib['Type'] == 'HGVS'] - - filtered = [s for s in hgvs_candidates if re.match(hgvs_cdna_re, s)] - if filtered: - return filtered[0] - - return hgvs_cand +def extract_hgvs_cdna(variant_name, synonym_list): + """ + Finds a HGVS CDNA representation of a variant within a ClinVarSet. + If possible, avoid repeat representations using the "[]" synatax, since + we are currently not able to handle it further downstream (https://github.com/biocommons/hgvs/issues/113) + + :param variant_name: variant name from title + :param synonym_list: synonyms extracted from variant object + :return: HGVS CDNA representation as string + """ + + # only take variants starting with NM_ and not containing [] + hgvs_cdna_re = r'NM_.*:[^\[]*$' + if not re.match(hgvs_cdna_re, variant_name): + # check for anything in the synonym list that matches an HGVS expression + filtered = [s for s in synonym_list if re.match(hgvs_cdna_re, s)] + if filtered: + return filtered[0] + + return variant_name diff --git a/pipeline/clinvar/filter_clinvar.py b/pipeline/clinvar/filter_clinvar.py index 78addc104..5aa0f4b7b 100755 --- a/pipeline/clinvar/filter_clinvar.py +++ b/pipeline/clinvar/filter_clinvar.py @@ -3,16 +3,24 @@ import argparse import gzip import io +import re import xml.etree.ElementTree as ET -def contains_target_gene_symbol(variationArchiveElement, target_symbol_values): +def contains_target_gene_symbol(variationArchiveElement, + target_symbol_values, debug=True): root = ET.fromstring(variationArchiveElement) + if debug: + print("testing", root.get("VariationName")) geneList = root.find(".//GeneList") if geneList: for geneElement in geneList.iter("Gene"): if "Symbol" in geneElement.attrib: if geneElement.attrib["Symbol"] in target_symbol_values: + if debug: + print("Saving") return(True) + if debug: + print("rejecting") return(False) @@ -23,7 +31,9 @@ def filter_xml_for_gene_symbol(input_fp, output_fp, target_symbol_values, in_variation_archive = False in_header = True for line in input_fp: - if 'T", line): + print("Found it!", line) + if re.search('' in line: + if re.search('', line): in_variation_archive = False xml_string = variation_archive.getvalue() variation_archive.close() if contains_target_gene_symbol(xml_string, - target_symbol_values): + target_symbol_values, debug=True): output_fp.write(xml_string) @@ -46,7 +56,9 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument('-i', "--input") parser.add_argument('-o', "--output") - parser.add_argument('-g', "--gene", action='append') + parser.add_argument('-g', "--gene", action='append') + parser.add_argument('-d', "--debug", action='store_true') + parser.set_defaults(debug=False) args = parser.parse_args() if args.input.endswith('gz'): input_fp = io.TextIOWrapper(gzip.open(args.input, 'rb'), diff --git a/pipeline/clinvar/test_clinvar.py b/pipeline/clinvar/test_clinvar.py index 1c25a38f4..f9e998585 100644 --- a/pipeline/clinvar/test_clinvar.py +++ b/pipeline/clinvar/test_clinvar.py @@ -5,21 +5,23 @@ def test_simple_genomic_coordinate_extraction(): - sample_measure = """ - - - NM_000059.3(BRCA2):c.6591_6592del (p.Glu2198fs) - - - - + sample_location = """ + + 13q13.1 + + + """ measure_el = ET.fromstring(sample_measure) - genomic_coords = clinvar_common.extract_genomic_coordinates_from_measure(measure_el) + genomic_coords = clinvar_common.extract_genomic_coordinates_from_location(location_el) - assert genomic_coords[hgvs_utils.HgvsWrapper.GRCh38_Assem] == variant_utils.VCFVariant(13, 32340945, "CTG", "C") + assert genomic_coords[hgvs_utils.HgvsWrapper.GRCh38_Assem] == variant_utils.VCFVariant(13, 32345245, "A", "G") def test_genomic_coordinate_extraction_from_NM():