Support for wildcards lost in the newer versions of hgvs?
One of the engineers on my team noticed that he updated hgvs recently, and after the update we somehow lost the ability to parse cDNA variants with indeterminate nucleotides. Was this by design?
(the message was copied from our project slack board, and sorry if the formatting looks funny. The original is here).
From Marc:
here one of the variants I've mentioned on the call where the conversion to protein fails with newer versions of hgvs (newer than 1.3.0 I believe)
print(parser,(url) assembly_mapper)
cdna = parser.parse('NM_000059.3:c.2339C>R')
assembly_mapper.c_to_p(cdna)
gives:
KeyError Traceback (most recent call last)
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/bioutils/sequences.py in translate_cds(seq, full_codons, ter_symbol)
443 try:
--> 444 aa = dna_to_aa1_lut[seq[i:i + 3]]
445 except KeyError:
KeyError: 'TRA'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-31-0172ec91a9d8> in <module>
1 print(parser, assembly_mapper)
2 cdna = parser.parse('NM_000059.3:c.2339C>R')
----> 3 assembly_mapper.c_to_p(cdna)
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/assemblymapper.py in c_to_p(self, var_c)
156
157 def c_to_p(self, var_c):
--> 158 var_out = super(AssemblyMapper, self).c_to_p(var_c)
159 return self._maybe_normalize(var_out)
160
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/variantmapper.py in c_to_p(self, var_c, pro_ac)
402 # currently get list of 1 element loop structure implemented
403 # to handle this, but doesn't really do anything currently.
--> 404 all_alt_data = builder.build_altseq()
405
406 var_ps = []
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/utils/altseqbuilder.py in build_altseq(self)
158
159 try:
--> 160 this_alt_data = type_map[edit_type]()
161 except KeyError:
162 raise NotImplementedError("c to p translation unsupported for {} type {}".format(
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/utils/altseqbuilder.py in _incorporate_delins(self)
235 variant_start_aa = max(int(math.ceil((self._var_c.posedit.pos.start.base) / 3.0)), 1)
236
--> 237 alt_data = AltTranscriptData(
238 seq,
239 cds_start,
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/utils/altseqbuilder.py in __init__(self, seq, cds_start, cds_stop, is_frameshift, variant_start_aa, accession, is_substitution, is_ambiguous)
59 seq_cds = seq[cds_start - 1:]
60 seq_cds = ''.join(seq_cds)
---> 61 seq_aa = translate_cds(seq_cds, full_codons=False, ter_symbol="X")
62 stop_pos = seq_aa[:(cds_stop - cds_start + 1) // 3].rfind("*")
63 if stop_pos == -1:
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/bioutils/sequences.py in translate_cds(seq, full_codons, ter_symbol)
444 aa = dna_to_aa1_lut[seq[i:i + 3]]
445 except KeyError:
--> 446 raise ValueError("Codon {} at position {}..{} is undefined in codon table".format(
447 seq[i:i + 3], i+1, i+3))
448 protein_seq.append(aa)
ValueError: Codon TRA at position 2338..2340 is undefined in codon table
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/bioutils/sequences.py in translate_cds(seq, full_codons, ter_symbol)
443 try:
--> 444 aa = dna_to_aa1_lut[seq[i:i + 3]]
445 except KeyError:
KeyError: 'TRA'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-31-0172ec91a9d8> in <module>
1 print(parser, assembly_mapper)
2 cdna = parser.parse('NM_000059.3:c.2339C>R')
----> 3 assembly_mapper.c_to_p(cdna)
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/assemblymapper.py in c_to_p(self, var_c)
156
157 def c_to_p(self, var_c):
--> 158 var_out = super(AssemblyMapper, self).c_to_p(var_c)
159 return self._maybe_normalize(var_out)
160
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/variantmapper.py in c_to_p(self, var_c, pro_ac)
402 # currently get list of 1 element loop structure implemented
403 # to handle this, but doesn't really do anything currently.
--> 404 all_alt_data = builder.build_altseq()
405
406 var_ps = []
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/utils/altseqbuilder.py in build_altseq(self)
158
159 try:
--> 160 this_alt_data = type_map[edit_type]()
161 except KeyError:
162 raise NotImplementedError("c to p translation unsupported for {} type {}".format(
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/utils/altseqbuilder.py in _incorporate_delins(self)
235 variant_start_aa = max(int(math.ceil((self._var_c.posedit.pos.start.base) / 3.0)), 1)
236
--> 237 alt_data = AltTranscriptData(
238 seq,
239 cds_start,
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/hgvs/utils/altseqbuilder.py in __init__(self, seq, cds_start, cds_stop, is_frameshift, variant_start_aa, accession, is_substitution, is_ambiguous)
59 seq_cds = seq[cds_start - 1:]
60 seq_cds = ''.join(seq_cds)
---> 61 seq_aa = translate_cds(seq_cds, full_codons=False, ter_symbol="X")
62 stop_pos = seq_aa[:(cds_stop - cds_start + 1) // 3].rfind("*")
63 if stop_pos == -1:
~/software/anaconda/envs/brca_pipeline_py3/lib/python3.8/site-packages/bioutils/sequences.py in translate_cds(seq, full_codons, ter_symbol)
444 aa = dna_to_aa1_lut[seq[i:i + 3]]
445 except KeyError:
--> 446 raise ValueError("Codon {} at position {}..{} is undefined in codon table".format(
447 seq[i:i + 3], i+1, i+3))
448 protein_seq.append(aa)
ValueError: Codon TRA at position 2338..2340 is undefined in codon table"
Confirmed. Rats.
#527, released in hgvs 1.4, eliminated the dependency on biopython which was used solely for the translate function.
The replacement translate function doesn't support degenerate codons. The lookup table is here: https://github.com/biocommons/bioutils/blob/master/src/bioutils/sequences.py#L40
See biocommons/bioutils#29.