scanpy
scanpy copied to clipboard
t-SNE optimization using scikit-learn-intelex
This pull request accelerates t-SNE using the scikit-learn-intelex library, resulting in approximately a 10x runtime improvement for the t-SNE implementation in the package for the given example below. The experiment was run on AWS r7i.24xlarge.
import time
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.cluster import KMeans
import os
import wget
import warnings
warnings.filterwarnings('ignore', 'Expected ')
warnings.simplefilter('ignore')
input_file = "./1M_brain_cells_10X.sparse.h5ad"
if not os.path.exists(input_file):
print('Downloading import file...')
wget.download('https://rapids-single-cell-examples.s3.us-east-2.amazonaws.com/1M_brain_cells_10X.sparse.h5ad',input_file)
# marker genes
MITO_GENE_PREFIX = "mt-" # Prefix for mitochondrial genes to regress out
markers = ["Stmn2", "Hes1", "Olig1"] # Marker genes for visualization
# filtering cells
min_genes_per_cell = 200 # Filter out cells with fewer genes than this expressed
max_genes_per_cell = 6000 # Filter out cells with more genes than this expressed
# filtering genes
min_cells_per_gene = 1 # Filter out genes expressed in fewer cells than this
n_top_genes = 4000 # Number of highly variable genes to retain
# PCA
n_components = 50 # Number of principal components to compute
# t-SNE
tsne_n_pcs = 20 # Number of principal components to use for t-SNE
# k-means
k = 35 # Number of clusters for k-means
# Gene ranking
ranking_n_top_genes = 50 # Number of differential genes to compute for each cluster
# Number of parallel jobs
sc._settings.ScanpyConfig.n_jobs = os.cpu_count()
start=time.time()
tr=time.time()
adata = sc.read(input_file)
adata.var_names_make_unique()
adata.shape
print("Total read time : %s" % (time.time()-tr))
tr=time.time()
# To reduce the number of cells:
USE_FIRST_N_CELLS = 1300000
adata = adata[0:USE_FIRST_N_CELLS]
adata.shape
sc.pp.filter_cells(adata, min_genes=min_genes_per_cell)
sc.pp.filter_cells(adata, max_genes=max_genes_per_cell)
sc.pp.filter_genes(adata, min_cells=min_cells_per_gene)
sc.pp.normalize_total(adata, target_sum=1e4)
print("Total filter and normalize time : %s" % (time.time()-tr))
tr=time.time()
sc.pp.log1p(adata)
print("Total log time : %s" % (time.time()-tr))
# Select highly variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=n_top_genes, flavor = "cell_ranger")
# Retain marker gene expression
for marker in markers:
adata.obs[marker + "_raw"] = adata.X[:, adata.var.index == marker].toarray().ravel()
# Filter matrix to only variable genes
adata = adata[:, adata.var.highly_variable]
ts=time.time()
#Regress out confounding factors (number of counts, mitochondrial gene expression)
mito_genes = adata.var_names.str.startswith(MITO_GENE_PREFIX)
n_counts = np.array(adata.X.sum(axis=1))
adata.obs['percent_mito'] = np.array(np.sum(adata[:, mito_genes].X, axis=1)) / n_counts
adata.obs['n_counts'] = n_counts
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])
print("Total regress out time : %s" % (time.time()-ts))
#scale
ts=time.time()
sc.pp.scale(adata, max_value=10)
print("Total scale time : %s" % (time.time()-ts))
t0=time.time()
sc.tl.pca(adata, n_comps=n_components)
t0=time.time()
sc._settings.ScanpyConfig.n_jobs = os.cpu_count()
sc.tl.tsne(adata, n_pcs=tsne_n_pcs, use_fast_tsne=True)
'''
from sklearn.manifold import TSNE
from scanpy.tools._utils import _choose_representation
X = _choose_representation(adata, n_pcs=tsne_n_pcs)
X_tsne = TSNE().fit_transform(X.astype(np.float32))
adata.obsm['X_tsne'] = X_tsne
'''
print("Tsne time:", time.time()-t0)