scenicplus icon indicating copy to clipboard operation
scenicplus copied to clipboard

OSError: ZSTD decompression failed: Unknown frame descriptor

Open ramadatta opened this issue 2 years ago • 1 comment

Describe the bug

Hi, thank you for creating a nice tool!

I get the following error:

OSError: ZSTD decompression failed: Unknown frame descriptor

when running the following code from the 10x multiome PBMC tutorial:

print(rankings_db)
print(scores_db)
print(motif_annotation) 

/data01/SingleCell/Scenicplus/Extradownloads/hg38_screen_v10_clust.regions_vs_motifs.rankings.feather
/data01/SingleCell/Scenicplus/Extradownloads/hg38_screen_v10_clust.regions_vs_motifs.scores.feather
/data01/SingleCell/Scenicplus/Extradownloads/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl

if not os.path.exists(os.path.join(work_dir, 'motifs')):
    os.makedirs(os.path.join(work_dir, 'motifs'))

from scenicplus.wrappers.run_pycistarget import run_pycistarget
run_pycistarget(
    region_sets = region_sets,
    species = 'homo_sapiens',
    save_path = os.path.join(work_dir, 'motifs'),
    ctx_db_path = rankings_db,
    dem_db_path = scores_db,
    path_to_motif_annotations = motif_annotation,
    run_without_promoters = True,
    n_cpu = 1,
    _temp_dir = os.path.join(tmp_dir, 'ray_spill'),
    annotation_version = 'v10nr_clust',
    )

Error output

2023-02-02 13:04:35,949 pycisTarget_wrapper INFO     pbmc_tutorial/motifs folder already exists.
2023-02-02 13:04:36,491 pycisTarget_wrapper INFO     Loading cisTarget database for topics_otsu
2023-02-02 13:04:36,492 cisTarget    INFO     Reading cisTarget database
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Input In [91], in <cell line: 2>()
      1 from scenicplus.wrappers.run_pycistarget import run_pycistarget
----> 2 run_pycistarget(
      3     region_sets = region_sets,
      4     species = 'homo_sapiens',
      5     save_path = os.path.join(work_dir, 'motifs'),
      6     ctx_db_path = rankings_db,
      7     dem_db_path = scores_db,
      8     path_to_motif_annotations = motif_annotation,
      9     run_without_promoters = True,
     10     n_cpu = 1,
     11     _temp_dir = os.path.join(tmp_dir, 'ray_spill'),
     12     annotation_version = 'v10nr_clust',
     13     )

File ~/sw/scenicplus/src/scenicplus/wrappers/run_pycistarget.py:182, in run_pycistarget(region_sets, species, save_path, custom_annot, save_partial, ctx_db_path, dem_db_path, run_without_promoters, biomart_host, promoter_space, ctx_auc_threshold, ctx_nes_threshold, ctx_rank_threshold, dem_log2fc_thr, dem_motif_hit_thr, dem_max_bg_regions, annotation, motif_similarity_fdr, path_to_motif_annotations, annotation_version, n_cpu, _temp_dir, exclude_motifs, exclude_collection, **kwargs)
    180 ## CISTARGET
    181 regions = region_sets[key]
--> 182 ctx_db = cisTargetDatabase(ctx_db_path, regions)  
    183 if exclude_motifs is not None:
    184     out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist()

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pycistarget/motif_enrichment_cistarget.py:67, in cisTargetDatabase.__init__(self, fname, region_sets, name, fraction_overlap)
     48 def __init__(self, 
     49             fname: str,
     50             region_sets: Union[Dict[str, pr.PyRanges], pr.PyRanges] = None,
     51             name: str = None,
     52             fraction_overlap: float = 0.4):
     53     """
     54     Initialize cisTargetDatabase
     55     
   (...)
     65         Minimal overlap between query and regions in the database for the mapping.     
     66     """
---> 67     self.regions_to_db, self.db_rankings, self.total_regions = self.load_db(fname,
     68                                                       region_sets,
     69                                                       name,
     70                                                       fraction_overlap)

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pycistarget/motif_enrichment_cistarget.py:132, in cisTargetDatabase.load_db(self, fname, region_sets, name, fraction_overlap)
    130     target_regions_in_db = [prefix + '__' + x for x in target_regions_in_db]
    131 target_regions_in_db = GeneSignature(name=name, gene2weight=target_regions_in_db)
--> 132 db_rankings = db.load(target_regions_in_db)
    133 if prefix is not None:
    134     db_rankings.columns = [x.split('__')[1] for x in db_rankings.columns]

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/rnkdb.py:132, in FeatherRankingDatabase.load(self, gs)
    128 def load(self, gs: GeneSignature) -> pd.DataFrame:
    129     # For some genes in the signature there might not be a rank available in the database.
    130     gene_set = self.geneset.intersection(set(gs.genes))
--> 132     return self.ct_db.subset_to_pandas(
    133         region_or_gene_ids=RegionOrGeneIDs(
    134             region_or_gene_ids=gene_set,
    135             regions_or_genes_type=self.ct_db.all_region_or_gene_ids.type,
    136         )
    137     )

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/ctdb.py:789, in CisTargetDatabase.subset_to_pandas(self, region_or_gene_ids, engine)
    785 engine = engine if engine else self.engine
    787 # Fetch scores or rankings for input region IDs or gene IDs from cisTarget database file for region IDs or
    788 # gene IDs which were not prefetched in previous calls.
--> 789 self.prefetch(region_or_gene_ids=region_or_gene_ids, engine=engine, sort=True)
    791 if not self.df_cached:
    792     raise RuntimeError(
    793         f"Prefetch failed to retrieve {self.scores_or_rankings} for "
    794         f"{region_or_gene_ids} from cisTarget database "
    795         f'"{self.ct_db_filename}".'
    796     )

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/ctdb.py:739, in CisTargetDatabase.prefetch(self, region_or_gene_ids, engine, sort)
    734     self._prefetch_as_polars_dataframe(
    735         region_or_gene_ids=region_or_gene_ids, use_pyarrow=True, sort=sort
    736     )
    737 elif engine == "pyarrow":
    738     # Store prefetched data as pyarrow Table (self.df_cached) and read data with pyarrow's native IPC reader.
--> 739     self._prefetch_as_pyarrow_table(
    740         region_or_gene_ids=region_or_gene_ids, sort=sort
    741     )
    742 else:
    743     raise ValueError(
    744         f'Unsupported engine "{engine}" for reading cisTarget database.'
    745     )

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/ctxcore/ctdb.py:627, in CisTargetDatabase._prefetch_as_pyarrow_table(self, region_or_gene_ids, sort)
    618     raise ValueError(
    619         f"Not all provided {self.all_region_or_gene_ids.type} are found: {not_found_region_or_gene_ids}"
    620     )
    622 if not self.df_cached or not isinstance(self.df_cached, pa.Table):
    623     # No region IDs or gene IDs scores/rankings where loaded before or cached version was a polars DataFrame.
    624 
    625     # Get all found region IDs or gene IDs columns with scores/rankings and "motifs" or "track" column from
    626     # cisTarget Feather file as a pyarrow Table.
--> 627     self.df_cached = pf.read_table(
    628         source=self.ct_db_filename,
    629         columns=(
    630             found_region_or_gene_ids.sort().ids
    631             if sort
    632             else found_region_or_gene_ids.ids
    633         )
    634         + (self.all_motif_or_track_ids.type.value,),
    635         memory_map=False,
    636         use_threads=True,
    637     )
    639     # Keep track of loaded region IDs or gene IDs scores/rankings.
    640     self.region_or_gene_ids_loaded = found_region_or_gene_ids

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pyarrow/feather.py:266, in read_table(source, columns, memory_map, use_threads)
    264     table = reader.read_indices(columns)
    265 elif all(map(lambda t: t == str, column_types)):
--> 266     table = reader.read_names(columns)
    267 else:
    268     column_type_names = [t.__name__ for t in column_types]

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pyarrow/_feather.pyx:114, in pyarrow._feather.FeatherReader.read_names()

File ~/anaconda3/envs/scenicplus/lib/python3.8/site-packages/pyarrow/error.pxi:115, in pyarrow.lib.check_status()

OSError: ZSTD decompression failed: Unknown frame descriptor

Version :

  • Python: Python 3.8.13
  • SCENIC+: 0.1.dev456+g9662363

Could you kindly help me work out how this can be resolved?

ramadatta avatar Feb 02 '23 05:02 ramadatta

The feather files you downloaded were probably incomplete or corrupted.

ghuls avatar Jun 20 '23 15:06 ghuls