VariantWorks
VariantWorks copied to clipboard
[I/O] VCFReader crashes when reading from regions that are empty
When reading a VCF and parsing a specific region, the reader can crash if that region contains no variants. This can happen if the region is truly empty or if the tabix index is invalid.
The solution is probably to place a guard around the call to concat
to check if any of the VCF dataframes are not empty:
if any([not df.empty for df in df_list]):
concat(df_list)
and return an empty dataframe if no concat-able dataframes exist.
I've placed a stacktrace here, but it's from a VCF with an invalid tabix index, so it's maybe not the best example.
/home/eric/vcfs/LP6005441-DNA_B01.annotated.nh.vcf.gz
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in
10
11 sgdp_pon = pon.PanelOfNormals()
---> 12 sgdp_pon.from_vcf_list(sgdp_list, merge_columns=["chrom", "start_pos", "end_pos", "ref", "alt"], info_keys=["AC"], regions=["22"], usePandas=False)
~/sandbox/somatic-vc/somaticdf/pon.py in from_vcf_list(self, vcf_list, merge_columns, regions, info_keys, tags, count_variable, usePandas)
109 for i in range(0, len(vcf_list)):
110 print(vcf_list[i])
--> 111 d = read_func(vcf_list[i], regions, tags=tags, info_keys=info_keys)
112
113 if not d.empty:
~/sandbox/somatic-vc/somaticdf/utils.py in read_vcf_to_df(vcf_file, regions, num_threads, tags, info_keys, filter_keys, format_keys, chunk_size)
130 Read a VCF file into a cuDF Variant DataFrame.
131 """
--> 132 ret = cudf.DataFrame(read_vcf_to_pandas(vcf_file, regions=regions, num_threads=num_threads, tags=tags, info_keys=info_keys, filter_keys=filter_keys, format_keys=format_keys, chunk_size=chunk_size))
133 return ret
134
~/sandbox/somatic-vc/somaticdf/utils.py in read_vcf_to_pandas(vcf_file, regions, num_threads, tags, info_keys, filter_keys, format_keys, chunk_size)
137 Read a VCF file into a PANDAS Variant DataFrame.
138 """
--> 139 v = VCFReader(vcf_file, regions=regions, num_threads=num_threads, tags=tags, require_genotype=False, info_keys=info_keys, format_keys=format_keys, filter_keys=filter_keys, chunksize=chunk_size)
140 df = v.dataframe
141 df = set_default_types(df)
~/sandbox/somatic-vc/VariantWorks/variantworks/io/vcfio.py in __init__(self, vcf, bams, is_fp, require_genotype, tags, info_keys, filter_keys, format_keys, regions, num_threads, chunksize, sort, unbounded_val_max_cols)
172
173 # Parse the VCF
--> 174 self._parallel_parse_vcf()
175
176 @property
~/sandbox/somatic-vc/VariantWorks/variantworks/io/vcfio.py in _parallel_parse_vcf(self)
646 # Generate final DataFrame from intermediate DataFrames computed by
647 # individual threads.
--> 648 self._dataframe = pd.concat(df_list, ignore_index=True)
649
650 # Manually set strict data types of specific fields
~/anaconda3/envs/ashg/lib/python3.7/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
279 verify_integrity=verify_integrity,
280 copy=copy,
--> 281 sort=sort,
282 )
283
~/anaconda3/envs/ashg/lib/python3.7/site-packages/pandas/core/reshape/concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
327
328 if len(objs) == 0:
--> 329 raise ValueError("No objects to concatenate")
330
331 if keys is None:
ValueError: No objects to concatenate