pySCENIC
prune2df reports an error
Describe the bug
When the program reaches the prune2df step, it fails with the error ArrowInvalid: Not a feather file. I read the previous discussions and thought it might be a problem with the integrity of the database files, but I checked my databases and found no problem.
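(For reference, the kind of integrity check I mean is sketched below; it simply hashes the downloaded database files so the values can be compared against the checksums published alongside them. The glob matches the one used in the script further down.)

import glob
import hashlib
import os

def sha256sum(path, chunk_size=1 << 20):
    # Hash the file in chunks so the large database files never need to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Print a checksum per database file matched by the same glob as below.
for fname in glob.glob("/home/xinzhou/scenic_test/cisTarget_databases/mm9-*.mc9nr.genes_vs_motifs.rankings.feather"):
    print(os.path.basename(fname), sha256sum(fname))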
Steps to reproduce the behavior
- Command run when the error occurred:
import pandas as pd
import numpy as np
import os, glob
import pickle
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell
import seaborn as sns
DATA_FOLDER="/home/xinzhou/scenic_test/"
DATABASE_FOLDER = "/home/xinzhou/scenic_test/cisTarget_databases/"
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.mc9nr.genes_vs_motifs.rankings.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(DATABASE_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl")
MM_TFS_FNAME = os.path.join(DATABASE_FOLDER, 'allTFs_mm.txt')
SC_EXP_FNAME = os.path.join(DATA_FOLDER, "RNA-seq-counts_202207.csv")
ADJACENCIES_FNAME = os.path.join(DATA_FOLDER, "adjacencies.tsv")
MODULES_FNAME = os.path.join(DATA_FOLDER, "modules.p")
MOTIFS_FNAME = os.path.join(DATA_FOLDER, "motifs.csv")
REGULONS_FNAME = os.path.join(DATA_FOLDER, "regulons.p")
N_SAMPLES = 12
ex_matrix = pd.read_csv(SC_EXP_FNAME, sep=',', header=0, index_col=0).T
tf_names = load_tf_names(MM_TFS_FNAME)
db_fnames = glob.glob(DATABASES_GLOB)
def name(fname):
    return os.path.splitext(os.path.basename(fname))[0]
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
# adjancencies = grnboost2(expression_data=ex_matrix, tf_names=tf_names, verbose=True)
# adjancencies.to_csv(ADJACENCIES_FNAME, index=False, sep='\t')
adjancencies = pd.read_csv('/home/xinzhou/scenic_test/adjacencies.tsv', sep='\t')
# modules = list(modules_from_adjacencies(adjancencies, ex_matrix))
# with open(MODULES_FNAME, 'wb') as f:
#     pickle.dump(modules, f)
with open(MODULES_FNAME, 'rb') as f:
    modules = pickle.load(f)
df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
- Error encountered:
ArrowInvalid Traceback (most recent call last)
/tmp/ipykernel_2294546/3264675668.py in <module>
----> 1 df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
/home/miniconda3/envs/scenic/lib/python3.7/site-packages/pyscenic/prune.py in prune2df(rnkdbs, modules, motif_annotations_fname, rank_threshold, auc_threshold, nes_threshold, motif_similarity_fdr, orthologuous_identity_threshold, weighted_recovery, client_or_address, num_workers, module_chunksize, filter_for_annotation)
349 return _distributed_calc(rnkdbs, modules, motif_annotations_fname, transformation_func, aggregation_func,
350 motif_similarity_fdr, orthologuous_identity_threshold, client_or_address,
--> 351 num_workers, module_chunksize)
352
353
/home/miniconda3/envs/scenic/lib/python3.7/site-packages/pyscenic/prune.py in _distributed_calc(rnkdbs, modules, motif_annotations_fname, transform_func, aggregate_func, motif_similarity_fdr, orthologuous_identity_threshold, client_or_address, num_workers, module_chunksize)
298 if client_or_address == "dask_multiprocessing":
299 # ... via multiprocessing.
--> 300 return create_graph().compute(scheduler='processes', num_workers=num_workers if num_workers else cpu_count())
301 else:
302 # ... via dask.distributed framework.
/home/miniconda3/envs/scenic/lib/python3.7/site-packages/dask/base.py in compute(self, **kwargs)
288 dask.base.compute
289 """
--> 290 (result,) = compute(self, traverse=False, **kwargs)
291 return result
292
...
---> 84 raise ArrowInvalid(message)
85 elif status.IsIOError():
86 # Note: OSError constructor is
ArrowInvalid: Not a feather file
Please complete the following information:
- pySCENIC version: 0.11.1
- Installation method: pip
- Run environment: Jupyter notebook
- OS: Ubuntu
- Package versions:
aiohttp==3.8.3
aiosignal==1.3.1
arboreto==0.1.6
async-timeout==4.0.2
asynctest==0.13.0
attrs==22.2.0
backcall==0.2.0
bokeh==2.4.3
boltons==21.0.0
Bottleneck @ file:///home/conda/feedstock_root/build_artifacts/bottleneck_1656803757560/work
certifi==2022.12.7
charset-normalizer==2.1.1
click==8.1.3
cloudpickle==2.2.1
ctxcore==0.2.0
cycler==0.11.0
cytoolz==0.12.1
dask==2022.2.0
debugpy==1.6.6
decorator==5.1.1
dill==0.3.6
distributed==2022.2.0
entrypoints==0.4
fonttools==4.38.0
frozendict==2.3.4
frozenlist==1.3.3
fsspec==2023.1.0
h5py==3.8.0
HeapDict==1.0.1
idna==3.4
importlib-metadata==6.0.0
interlap==0.2.7
ipykernel==6.16.2
ipython==7.34.0
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jupyter_client==7.4.9
jupyter_core==4.12.0
kiwisolver==1.4.4
llvmlite==0.39.1
locket==1.0.0
loompy==3.0.7
MarkupSafe==2.1.2
matplotlib==3.5.3
matplotlib-inline==0.1.6
msgpack==1.0.4
multidict==6.0.4
multiprocessing-on-dill==3.5.0a4
nest-asyncio==1.5.6
networkx==2.6.3
numba==0.56.4
numexpr==2.8.4
numpy==1.21.6
numpy-groupies==0.9.20
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1673482170163/work
pandas==1.3.5
parso==0.8.3
partd==1.3.0
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.4.0
prompt-toolkit==3.0.36
psutil==5.9.4
ptyprocess==0.7.0
pyarrow==0.16.0
Pygments==2.14.0
pynndescent==0.5.8
pyparsing==3.0.9
pyscenic==0.11.1
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1673864280276/work
PyYAML==6.0
pyzmq==25.0.0
requests==2.28.2
scikit-learn==1.0.2
scipy==1.7.3
seaborn==0.12.2
six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work
sortedcontainers==2.4.0
tblib==1.7.0
threadpoolctl==3.1.0
toolz==0.12.0
tornado==6.2
tqdm==4.64.1
traitlets==5.8.1
typing_extensions==4.4.0
umap-learn==0.5.3
urllib3==1.26.14
wcwidth==0.2.6
yarl==1.8.2
zict==2.2.0
zipp==3.11.0
Have you tried using the most recent version of pySCENIC? I would also try checking whether the line dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames] ran correctly. Inspect the dbs object.
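For example, something along these lines (a minimal sketch that reuses db_fnames and dbs from your script and assumes the name attribute on the ranking-database objects; it also reads each file directly with pyarrow, so a broken or unreadable file raises the same ArrowInvalid here):

import pyarrow.feather as feather

print(len(dbs), "ranking databases found")
for fname, db in zip(db_fnames, dbs):
    print(db.name)
    # Reading the file directly with pyarrow shows whether the Feather file
    # itself can be parsed; a bad or incomplete file raises ArrowInvalid here too.
    table = feather.read_table(fname)
    print(" ", table.num_rows, "rows,", table.num_columns, "columns")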
Hello, I have encountered the same problem. May I ask whether you have solved it, and if so, how did you solve it?