scenicplus
scenicplus copied to clipboard
OSError: ZSTD decompression failed: Unknown frame descriptor
Describe the bug
Hi, thank you for creating a nice tool!
I get the following error:
OSError: ZSTD decompression failed: Unknown frame descriptor
when running the following code from the 10x multiome PBMC tutorial:
print(rankings_db)
print(scores_db)
print(motif_annotation)
/data01/SingleCell/Scenicplus/Extradownloads/hg38_screen_v10_clust.regions_vs_motifs.rankings.feather
/data01/SingleCell/Scenicplus/Extradownloads/hg38_screen_v10_clust.regions_vs_motifs.scores.feather
/data01/SingleCell/Scenicplus/Extradownloads/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl
if not os.path.exists(os.path.join(work_dir, 'motifs')):
os.makedirs(os.path.join(work_dir, 'motifs'))
from scenicplus.wrappers.run_pycistarget import run_pycistarget
run_pycistarget(
region_sets = region_sets,
species = 'homo_sapiens',
save_path = os.path.join(work_dir, 'motifs'),
ctx_db_path = rankings_db,
dem_db_path = scores_db,
path_to_motif_annotations = motif_annotation,
run_without_promoters = True,
n_cpu = 1,
_temp_dir = os.path.join(tmp_dir, 'ray_spill'),
annotation_version = 'v10nr_clust',
)
Error output
2023-02-02 13:04:35,949 pycisTarget_wrapper INFO pbmc_tutorial/motifs folder already exists.
2023-02-02 13:04:36,491 pycisTarget_wrapper INFO Loading cisTarget database for topics_otsu
2023-02-02 13:04:36,492 cisTarget INFO Reading cisTarget database
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Input In [91], in <cell line: 2>()
1 from scenicplus.wrappers.run_pycistarget import run_pycistarget
----> 2 run_pycistarget(
3 region_sets = region_sets,
4 species = 'homo_sapiens',
5 save_path = os.path.join(work_dir, 'motifs'),
6 ctx_db_path = rankings_db,
7 dem_db_path = scores_db,
8 path_to_motif_annotations = motif_annotation,
9 run_without_promoters = True,
10 n_cpu = 1,
11 _temp_dir = os.path.join(tmp_dir, 'ray_spill'),
12 annotation_version = 'v10nr_clust',
13 )
File ~/sw/scenicplus/src/scenicplus/wrappers/run_pycistarget.py:182, in run_pycistarget(region_sets, species, save_path, custom_annot, save_partial, ctx_db_path, dem_db_path, run_without_promoters, biomart_host, promoter_space, ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold, dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions, annotation, motif_similarity_fdr, path_to_motif_annotations, annotation_version, n_cpu, _temp_dir, exclude_motifs, exclude_collection, **kwargs)
180 ## CISTARGET
181 regions = region_sets[key]
--> 182 ctx_db = cisTargetDatabase(ctx_db_path, regions)
183 if exclude_motifs is not None:
184 out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pycistarget/motif_enrichment_cistarget.py:67, in cisTargetDatabase.__init__(self, fname, region_sets, name, fraction_overlap)
48 def __init__(self,
49 fname: str,
50 region_sets: Union[Dict[str, pr.PyRanges], pr.PyRanges] = None,
51 name: str = None,
52 fraction_overlap: float = 0.4):
53 """
54 Initialize cisTargetDatabase
55
(...)
65 Minimal overlap between query and regions in the database for the mapping.
66 """
---> 67 self.regions_to_db, self.db_rankings, self.total_regions = self.load_db(fname,
68 region_sets,
69 name,
70 fraction_overlap)
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pycistarget/motif_enrichment_cistarget.py:132, in cisTargetDatabase.load_db(self, fname, region_sets, name, fraction_overlap)
130 target_regions_in_db = [prefix + '__' + x for x in target_regions_in_db]
131 target_regions_in_db = GeneSignature(name=name, gene2weight=target_regions_in_db)
--> 132 db_rankings = db.load(target_regions_in_db)
133 if prefix is not None:
134 db_rankings.columns = [x.split('__')[1] for x in db_rankings.columns]
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/rnkdb.py:132, in FeatherRankingDatabase.load(self, gs)
128 def load(self, gs: GeneSignature) -> pd.DataFrame:
129 # For some genes in the signature there might not be a rank available in the database.
130 gene_set = self.geneset.intersection(set(gs.genes))
--> 132 return self.ct_db.subset_to_pandas(
133 region_or_gene_ids=RegionOrGeneIDs(
134 region_or_gene_ids=gene_set,
135 regions_or_genes_type=self.ct_db.all_region_or_gene_ids.type,
136 )
137 )
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/ctdb.py:789, in CisTargetDatabase.subset_to_pandas(self, region_or_gene_ids, engine)
785 engine = engine if engine else self.engine
787 # Fetch scores or rankings for input region IDs or gene IDs from cisTarget database file for region IDs or
788 # gene IDs which were not prefetched in previous calls.
--> 789 self.prefetch(region_or_gene_ids=region_or_gene_ids, engine=engine, sort=True)
791 if not self.df_cached:
792 raise RuntimeError(
793 f"Prefetch failed to retrieve {self.scores_or_rankings} for "
794 f"{region_or_gene_ids} from cisTarget database "
795 f'"{self.ct_db_filename}".'
796 )
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/ctdb.py:739, in CisTargetDatabase.prefetch(self, region_or_gene_ids, engine, sort)
734 self._prefetch_as_polars_dataframe(
735 region_or_gene_ids=region_or_gene_ids, use_pyarrow=True, sort=sort
736 )
737 elif engine == "pyarrow":
738 # Store prefetched data as pyarrow Table (self.df_cached) and read data with pyarrow's native IPC reader.
--> 739 self._prefetch_as_pyarrow_table(
740 region_or_gene_ids=region_or_gene_ids, sort=sort
741 )
742 else:
743 raise ValueError(
744 f'Unsupported engine "{engine}" for reading cisTarget database.'
745 )
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/ctdb.py:627, in CisTargetDatabase._prefetch_as_pyarrow_table(self, region_or_gene_ids, sort)
618 raise ValueError(
619 f"Not all provided {self.all_region_or_gene_ids.type} are found: {not_found_region_or_gene_ids}"
620 )
622 if not self.df_cached or not isinstance(self.df_cached, pa.Table):
623 # No region IDs or gene IDs scores/rankings where loaded before or cached version was a polars DataFrame.
624
625 # Get all found region IDs or gene IDs columns with scores/rankings and "motifs" or "track" column from
626 # cisTarget Feather file as a pyarrow Table.
--> 627 self.df_cached = pf.read_table(
628 source=self.ct_db_filename,
629 columns=(
630 found_region_or_gene_ids.sort().ids
631 if sort
632 else found_region_or_gene_ids.ids
633 )
634 + (self.all_motif_or_track_ids.type.value,),
635 memory_map=False,
636 use_threads=True,
637 )
639 # Keep track of loaded region IDs or gene IDs scores/rankings.
640 self.region_or_gene_ids_loaded = found_region_or_gene_ids
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pyarrow/feather.py:266, in read_table(source, columns, memory_map, use_threads)
264 table = reader.read_indices(columns)
265 elif all(map(lambda t: t == str, column_types)):
--> 266 table = reader.read_names(columns)
267 else:
268 column_type_names = [t.__name__ for t in column_types]
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pyarrow/_feather.pyx:114, in pyarrow._feather.FeatherReader.read_names()
File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pyarrow/error.pxi:115, in pyarrow.lib.check_status()
OSError: ZSTD decompression failed: Unknown frame descriptor
Version :
- Python: Python 3.8.13
- SCENIC+: 0.1.dev456+g9662363
Could you kindly help me resolve this?
The feather files you downloaded were probably incomplete or corrupted.