VariantWorks
VariantWorks copied to clipboard
[I/O] KeyErrors are thrown for info columns not present in a region
When passing a list of INFO fields and reading a VCF region where no variants have an entry for the field, an error is thrown. This happens with the 1000 Genomes VCF and the population-specific allele frequencies such as AFR. I assume it's because the header may be poorly formatted and the expected number does not match the real number.
More graceful behavior would be helpful, and we should probably have a test or two to catch things like this (and to declare whether or not we consider this a bug).
Example read:
onekg = utils.read_vcf_to_pandas("/home/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz", regions=["1:1-1000000"], info_keys=["AA", "AC", "AF", "AMR", "AN"], num_threads=1)
Error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-15-9addf9417f5b> in <module>
1 ## Read in a VCF using the VariantWorks VCFReader
----> 2 onekg = utils.read_vcf_to_pandas("/home/edawson/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz", regions=["1:1-1000000"], info_keys=["AFR"], num_threads=1)
~/sandbox/somatic-vc/somaticdf/utils.py in read_vcf_to_pandas(vcf_file, regions, num_threads, tags, info_keys, filter_keys, format_keys, chunk_size)
343 Read a VCF file into a PANDAS Variant DataFrame.
344 """
--> 345 v = VCFReader(vcf_file, regions=regions, num_threads=num_threads, tags=tags, require_genotype=False, info_keys=info_keys, format_keys=format_keys, filter_keys=filter_keys, chunksize=chunk_size)
346 df = v.dataframe
347 df = set_default_types(df)
~/sandbox/VariantWorks/variantworks/io/vcfio.py in __init__(self, vcf, bams, is_fp, require_genotype, tags, info_keys, filter_keys, format_keys, regions, num_threads, chunksize, sort, unbounded_val_max_cols)
172
173 # Parse the VCF
--> 174 self._parallel_parse_vcf()
175
176 @property
~/sandbox/VariantWorks/variantworks/io/vcfio.py in _parallel_parse_vcf(self)
599 self._info_vcf_keys.append(h['ID'])
600 for k in self._info_vcf_keys:
--> 601 header_number = vcf.get_header_type(k)['Number']
602 self._info_vcf_key_counts[k] = self._get_normalized_count(header_number, 1, len(vcf.samples))
603 self._header_number[k] = header_number
~/anaconda3/envs/rapids16/lib/python3.8/site-packages/cyvcf2/cyvcf2.pyx in cyvcf2.cyvcf2.VCF.get_header_type()
~/anaconda3/envs/rapids16/lib/python3.8/site-packages/cyvcf2/cyvcf2.pyx in cyvcf2.cyvcf2.VCF.get_header_type()
KeyError: b'AFR'
Particularly problematic are the population-specific allele frequencies (AFR, AMR, EUR, EX, etc.). MEINFO throws a different error:
---------------------------------------------------------------------------
RemoteTraceback Traceback (most recent call last)
RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/edawson/anaconda3/envs/rapids16/lib/python3.8/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/edawson/sandbox/VariantWorks/variantworks/io/vcfio.py", line 571, in _parse_vcf_cyvcf
df_list.append(self._create_df(vcf, variant_list, local_variant_idx))
File "/home/edawson/sandbox/VariantWorks/variantworks/io/vcfio.py", line 431, in _create_df
df_dict[df_key + "-" + str(i)].append(val[i])
IndexError: tuple index out of range
"""
The above exception was the direct cause of the following exception:
IndexError Traceback (most recent call last)
<ipython-input-30-508b4cbbd49a> in <module>
1 ## Read in a VCF using the VariantWorks VCFReader
----> 2 onekg = utils.read_vcf_to_pandas("/home/edawson/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5b.20130502.sites.vcf.gz", regions=["1:10000000-15000000"], info_keys=["AA", "AC", "AN", "CIEND", "CIPOS", "CS", "DP", "END", "IMPRECISE", "MC", "MEINFO"], num_threads=1)
~/sandbox/somatic-vc/somaticdf/utils.py in read_vcf_to_pandas(vcf_file, regions, num_threads, tags, info_keys, filter_keys, format_keys, chunk_size)
343 Read a VCF file into a PANDAS Variant DataFrame.
344 """
--> 345 v = VCFReader(vcf_file, regions=regions, num_threads=num_threads, tags=tags, require_genotype=False, info_keys=info_keys, format_keys=format_keys, filter_keys=filter_keys, chunksize=chunk_size)
346 df = v.dataframe
347 df = set_default_types(df)
~/sandbox/VariantWorks/variantworks/io/vcfio.py in __init__(self, vcf, bams, is_fp, require_genotype, tags, info_keys, filter_keys, format_keys, regions, num_threads, chunksize, sort, unbounded_val_max_cols)
172
173 # Parse the VCF
--> 174 self._parallel_parse_vcf()
175
176 @property
~/sandbox/VariantWorks/variantworks/io/vcfio.py in _parallel_parse_vcf(self)
638 df_lists = []
639 func = partial(self._parse_vcf_cyvcf)
--> 640 for df in pool.imap(func, range(self._num_threads)):
641 df_lists.append(df)
642 pool.close()
~/anaconda3/envs/rapids16/lib/python3.8/multiprocessing/pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX
IndexError: tuple index out of range