Source code for pyrovelocity.cytotrace

__all__ = [
    "census_normalize",
    "remove_zero_mvg",
    "compute_similarity2",
    "compute_similarity1",
    "compute_gcs",
    "convert_to_markov",
    "any",
    "find_nonzero",
    "FNNLSa",
    "nu",
    "eps",
    "regressed",
    "diffused",
    "compare_cytotrace",
    "cytotrace",
    "batch_cytotrace",
    "compare_cytotrace_ncores",
    "cytotrace_ncore",
    "align_diffrate",
    "plot_multiadata",
    "cumulative_boxplot",
    "run_cytotrace",
]

from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Cell
import scvelo as scv
from numpy import ndarray
from scipy.stats import mannwhitneyu
from scipy.stats import rankdata


##from scanorama import correct_scanpy
scv.logging.verbosity = 3
scv.settings.presenter_view = True
scv.set_figure_params("scvelo")
np.random.seed(99)

# Cell
from scipy.sparse import csr_matrix
from scipy.sparse import issparse


[docs]def census_normalize(mat, count): "RNA-seq census normalization to correct cell lysis" if issparse(mat): # x = mat.copy() # x.data = 2**x.data - 1 # gene x cell x = mat.multiply(count).multiply(1.0 / mat.sum(axis=0)).tocsr() return x.log1p() else: x = 2**mat - 1 # gene x cell x = (x * count) / x.sum(axis=0) return np.log2(x + 1)
# Cell
[docs]def remove_zero_mvg(mat): "remove cells not expressing any of the top 1000 variable genes" # gene x cell cell_count = (mat > 0).sum(axis=1) X_topcell = mat[cell_count >= 0.05 * mat.shape[1], :] var = X_topcell.var(axis=1) mean = X_topcell.mean(axis=1) disp = var / mean return X_topcell[ disp >= (np.sort(disp)[::-1])[:1000][-1], : ] # in case less than 1000 genes
# Cell
[docs]def compute_similarity2(O: ndarray, P: ndarray) -> ndarray: "Compute pearson correlation between two matrices O and P using einstein summation" (n, t) = O.shape # n traces of t samples (n_bis, m) = P.shape # n predictions for each of m candidates DO = O - ( np.einsum("nt->t", O, optimize="optimal") / np.double(n) ) # compute O - mean(O) DP = P - ( np.einsum("nm->m", P, optimize="optimal") / np.double(n) ) # compute P - mean(P) cov = np.einsum("nm,nt->mt", DP, DO, optimize="optimal") varP = np.einsum("nm,nm->m", DP, DP, optimize="optimal") varO = np.einsum("nt,nt->t", DO, DO, optimize="optimal") tmp = np.einsum("m,t->mt", varP, varO, optimize="optimal") return cov / np.sqrt(tmp)
# Cell # adapt from https://github.com/ikizhvatov/efficient-columnwise-correlation/blob/master/columnwise_corrcoef_perf.py
[docs]def compute_similarity1(A): "Compute pairwise correlation of all columns in matrices A" n, t = A.shape # genes x samples DO = A - ( np.einsum("nt->t", A, optimize="optimal") / np.double(n) ) # compute O - mean(O) cov = np.einsum("nm,nt->mt", DO, DO, optimize="optimal") varP = np.einsum("nt,nt->t", DO, DO, optimize="optimal") return cov / np.sqrt(np.einsum("m,t->mt", varP, varP, optimize="optimal"))
# Cell
[docs]def compute_gcs(mat, count, top_n_genes=200): "Compute gene set enrichment scores by correlating gene count and gene expression" corrs = compute_similarity2(mat.T, count.reshape(-1, 1))[0, :] # avoid nan, put all nan to -1 corrs[np.isnan(corrs)] = -1 gcs = mat[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0) return gcs
# Cell
[docs]def convert_to_markov(sim): """Convert the Pearson correlation to Markov matrix TODO: use velocity graph to replace this markov matrix """ Ds = np.copy(sim) cutoff = np.mean(Ds) np.fill_diagonal(Ds, 0) Ds[Ds <= 0] = 0 Ds[Ds <= cutoff] = 0 zero_rows = Ds.sum(axis=1) == 0 zero_cols = Ds.sum(axis=0) == 0 Ds[~zero_rows, :] = (Ds[~zero_rows, :].T / Ds[~zero_rows, :].sum(axis=1)).T # tips Ds[:, zero_cols] = 0 return Ds
# Cell # http://xrm.phys.northwestern.edu/research/pdf_papers/1997/bro_chemometrics_1997.pdf # matlab reference: https://www.mathworks.com/matlabcentral/mlc-downloads/downloads/submissions/3388/versions/1/previews/fnnls.m/index.html from numpy import abs from numpy import arange from numpy import argmax from numpy import finfo from numpy import float64 from numpy import int64 from numpy import min from numpy import newaxis from numpy import nonzero from numpy import sum from numpy import zeros from scipy.linalg import solve nu = newaxis import numpy as np # machine epsilon eps = finfo(float64).eps
[docs]def any(a): # assuming a vector, a larger_than_zero = sum(a > 0) if larger_than_zero: return True else: return False
[docs]def find_nonzero(a): # returns indices of nonzero elements in a return nonzero(a)[0]
[docs]def FNNLSa(XtX, Xty, tol=None): """Faster NNLS imported from https://github.com/delnatan/FNNLSa A fast non-negativity-constrained least squares algorithm. Journal of chemometrics """ if tol is None: tol = eps M, N = XtX.shape # initialize passive set, P. Indices where coefficient is >0 P = zeros(N, dtype=int64) # and active set. Indices where coefficient is <=0 Z = arange(N) + 1 # working active set ZZ = arange(N) + 1 # initial solution vector, x x = zeros(N, dtype=float64) # weight vector w = Xty - XtX @ x # iteration counts and parameter it = 0 itmax = 30 * N # MAIN LOOP # continue as long as there are indices within the active set Z # or elements in inner loop active set is larger than 'tolerance' piter = 0 while any(Z) and any(w[ZZ - 1] > tol): piter += 1 t = argmax(w[ZZ - 1]) + 1 # find largest weight t = ZZ[t - 1] P[t - 1] = t # move to passive set Z[t - 1] = 0 # remove from active set PP = find_nonzero(P) + 1 ZZ = find_nonzero(Z) + 1 NZZ = ZZ.shape # compute trial solution, s s = zeros(N, dtype=float64) if len(PP) == 1: s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1] else: s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1]) s[ZZ - 1] = 0.0 # set active coefficients to 0 while any(s[PP - 1] <= tol) and it < itmax: it = it + 1 QQ = find_nonzero((s <= tol) * P) + 1 alpha = min(x[QQ - 1] / (x[QQ - 1] - s[QQ - 1])) x = x + alpha * (s - x) ij = find_nonzero((abs(x) < tol) * (P != 0)) + 1 Z[ij - 1] = ij P[ij - 1] = 0 PP = find_nonzero(P) + 1 ZZ = find_nonzero(Z) + 1 if len(PP) == 1: s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1] else: s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1]) s[ZZ - 1] = 0.0 # assign current solution, s, to x x = s # recompute weights w = Xty - XtX @ x return x, w
# Cell
[docs]def regressed(markov, gcs, solver="fnnls"): """solve markov @ weight = gcs problems, solver: fnnls (default) is faster in larger dataset, e.g., above 20,000 cells nnls is faster in smaller dataset, e.g. less than 5,000 cells """ print(solver) if solver == "fnnls": coef, err = FNNLSa(markov.T @ markov, markov.T @ gcs) elif solver == "jfnnls": # pip install fnnls # https://github.com/jvendrow/fnnls from fnnls import fnnls coef, err = fnnls(markov, gcs) elif solver == "nnls": from scipy.optimize import nnls print((markov > 0).sum()) coef, err = nnls(markov, gcs) # from 1987... elif solver == "lsq_linear": from scipy import sparse from scipy.optimize import lsq_linear # slow as well sol = lsq_linear( sparse.csr_matrix(markov), gcs, bounds=(0, np.inf), lsmr_tol="auto", verbose=1, ) # from 1997... coef = sol.x elif solver == "lasso": from scipy import sparse from sklearn.linear_model import Lasso lasso = Lasso(alpha=1e-7, max_iter=100000, fit_intercept=False, positive=True) den = (markov > 0).sum() / np.prod(markov.shape) print(den) if den >= 0.5: lasso.fit(markov, gcs) else: # lasso.fit(sparse.csr_matrix(markov), gcs) lasso.fit(sparse.coo_matrix(markov), gcs) coef = lasso.coef_ elif solver == "nnlsgd": # somehow not consistent with nnls and fnnls... coef = NNLS_GD(markov.T @ markov, markov.T @ gcs) elif solver == None: # do not fit return gcs else: # tnt-nn raise NotImplementedError # faster nnls return np.dot(markov, coef)
# Cell
[docs]def diffused(markov, gcs, ALPHA=0.9): """Compute diffusion process""" v_prev = np.copy(gcs) v_curr = np.copy(gcs) for i in range(10000): v_prev = np.copy(v_curr) v_curr = ALPHA * (markov.dot(v_curr)) + (1 - ALPHA) * gcs diff = np.mean(np.abs(v_curr - v_prev)) if diff <= 1e-6: break return v_curr
# Cell from scipy.sparse import issparse
[docs]def compare_cytotrace( adata, layer="all", cell_count=10, condition="age", solver="nnls", is_normalized=False, n_cores=4, top_n_genes=200, ): "Main interface of cytotrace reimplementation used for single dataset with multiple conditions" # condition common steps proc_time = time() if not is_normalized: if layer == "all": X = (adata.layers["spliced"] + adata.layers["unspliced"]).toarray() else: try: X = ( adata.layers[layer].toarray() if issparse(adata.layers[layer]) else adata.layers[layer] ) except: X = adata.X.toarray() if issparse(adata.X) else adata.X cells_selected = np.arange(X.shape[0]) genes_selected = np.arange(X.shape[1]) pgenes = (pd.isnull((X > 0).sum(axis=0))) | (X.var(axis=0) == 0) # cell x gene X = X[:, ~pgenes] X = (X.T / X.sum(axis=1)) * 1e6 pqcells = (pd.isnull((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count) X = X[:, ~pqcells] genes_selected = genes_selected[~pgenes] cells_selected = cells_selected[~pqcells] X = np.log2(X + 1) counts = (X > 0).sum(axis=0) mat2 = census_normalize(X, counts) mvg = remove_zero_mvg(mat2) cells_selected = cells_selected[mvg.sum(axis=0) != 0] mat2 = mat2[:, mvg.sum(axis=0) != 0] counts = counts[mvg.sum(axis=0) != 0] # joint two conditions to determine gcs feature # using the same set of genes gcs = compute_gcs(mat2, counts, top_n_genes=top_n_genes) adata.uns["gcs"] = gcs adata.uns["cells_selected"] = cells_selected adata.uns["counts"] = counts adata.uns["mat2"] = mat2 adata.uns["mvg"] = mvg adata.uns["genes_selected"] = genes_selected else: cells_selected = adata.uns["cells_selected"] counts = adata.uns["counts"] mat2 = adata.uns["mat2"] mvg = adata.uns["mvg"] genes_selected = adata.uns["genes_selected"] gcs = adata.uns["gcs"] # for indepent gcs feature # using different set of genes is very variable # gcs = np.zeros(len(cells_selected)) # condition-specific steps for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]): selection = adata.obs.loc[:, condition].values[cells_selected] == cond gcs_cond = gcs[selection] counts_cond = counts[selection] mat2_cond = mat2[:, selection] # compute gcs feature independently for two conditions # this would select different gene set, and make the # cytotrace score incomparable # gcs_cond = compute_gcs(mat2_cond, counts_cond) mvg_cond = mvg[:, selection] trans_time = time() print("preprocessing: %s" % (trans_time - proc_time)) # get transition matrix corr_cond = compute_similarity1(mvg_cond) markov_cond = convert_to_markov(corr_cond) gc_time = time() print("markov: %s" % (gc_time - trans_time)) # get Gene count signature gcs_cond = regressed(markov_cond, gcs_cond, solver=solver) gcs_cond = diffused(markov_cond, gcs_cond) gcs[selection] = gcs_cond # condition specific cytotrace correlated gene sets scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0] score_time = time() print("score time: %s" % (score_time - gc_time)) corrs_cond = compute_similarity2(mat2_cond.T, scores_cond.reshape(-1, 1))[0, :] final_time = time() print("final time: %s" % (final_time - score_time)) adata.var["cytotrace_corrs_%s" % cond] = None adata.var.iloc[ genes_selected, adata.var.columns.get_loc("cytotrace_corrs_%s" % cond) ] = corrs_cond adata.obs["gcs"] = None adata.obs["cytotrace"] = None adata.obs["counts"] = None adata.obs.iloc[ cells_selected, adata.obs.columns.get_loc("gcs") ] = gcs # used for re-rank when integrating multiple-samples adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = ( rankdata(gcs) / gcs.shape[0] ) # cytotrace scores adata.obs.iloc[ cells_selected, adata.obs.columns.get_loc("counts") ] = counts # used for re-rank when integrating multiple-samples
def cytotrace_sparse( adata, layer="raw", cell_count=20, solver="nnls", top_n_features=200, skip_regress=False, ): "optimized version" proc_time = time() if not issparse(adata.layers[layer]): raise NotImplementedError else: X = adata.layers[layer] cells_selected = np.arange(X.shape[0]) features_selected = np.arange(X.shape[1]) n_cells = X.shape[0] feature_mean = (X.sum(0) / n_cells).A1 feature_mean_sq = (X.multiply(X).sum(0) / n_cells).A1 feature_var = (feature_mean_sq - feature_mean**2) * (n_cells / (n_cells - 1)) non_zero_features = (X > 0).sum(axis=0).A1 pfeatures = np.isnan(non_zero_features) | (feature_var == 0) # cell x feature, filter feature X = X[:, ~pfeatures] features_selected = features_selected[~pfeatures] # gene x cell X = X.T X = X.multiply(1.0 / csr_matrix.sum(X, axis=0)) * 1e6 X = X.tocsr() # filter cell, feature x cell feature_count_per_cell = (X > 0).sum(axis=0).A1 pcells = np.isnan(feature_count_per_cell) | (feature_count_per_cell < cell_count) # feature x cell X = X[:, ~pcells] cells_selected = cells_selected[~pcells] feature_count_per_cell = feature_count_per_cell[~pcells] # census normalize, feature x cell census_X = ( X.multiply(feature_count_per_cell) .multiply(1.0 / csr_matrix.sum(X, axis=0)) .log1p() .tocsr() ) # top variable features, feature x cell cell_count_per_feature = (census_X > 0).sum(axis=1).A1 census_X_topcell = census_X[ cell_count_per_feature >= 0.05 * census_X.shape[1], : ].tocsr() n_cells = census_X_topcell.shape[1] feature_mean = (census_X_topcell.sum(1) / n_cells).A1 feature_mean_sq = (census_X_topcell.multiply(census_X_topcell).sum(1) / n_cells).A1 feature_var = (feature_mean_sq - feature_mean**2) * (n_cells / (n_cells - 1)) disp = feature_var / feature_mean mvg = census_X_topcell[ disp >= (np.sort(disp)[::-1])[:1000][-1], : ] # in case less than 1000 features # top 1000 variable features for markov matrix selection = (mvg.sum(axis=0) != 0).A1 mvg = mvg[:, selection] corr = compute_similarity1(mvg.A) markov = convert_to_markov(corr) # filter census output with nonzero cells in top variable features matrix cells_selected = cells_selected[selection] census_X = census_X[:, selection] feature_count_per_cell = feature_count_per_cell[selection] # calculate GC scores corrs = compute_similarity2(census_X.T.A, feature_count_per_cell.reshape(-1, 1))[ 0, : ] # avoid nan, put all nan to -1 corrs[np.isnan(corrs)] = -1 gcs = census_X[np.argsort(corrs)[::-1][:top_n_features], :].mean(axis=0).A1 if not skip_regress: print("regress...") # from scipy.optimize import lsq_linear # # slow as well # sol = lsq_linear(csr_matrix(markov), gcs, bounds=(0, np.inf), lsmr_tol='auto', verbose=1) # from 1997... # coef = sol.x from scipy.optimize import nnls coef, err = nnls(markov, gcs) # from 1987... gcs = np.dot(markov, coef) print(markov.shape, gcs.shape) gcs = diffused(markov, gcs) rank = rankdata(gcs) scores = rank / gcs.shape[0] print(gcs) adata.obs["gcs"] = np.nan adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs cytoGenes = compute_similarity2(census_X.T.A, scores.reshape(-1, 1))[0, :] adata.obs["cytotrace"] = np.nan adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores adata.obs["counts"] = np.nan adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = gcs adata.var["cytotrace"] = False adata.var.iloc[features_selected, adata.var.columns.get_loc("cytotrace")] = True adata.var["cytotrace_corrs"] = np.nan adata.var.iloc[ features_selected, adata.var.columns.get_loc("cytotrace_corrs") ] = np.array(corrs, dtype=np.float32) return { "CytoTRACE": scores, "CytoTRACErank": rank, "GCS": gcs, "Counts": feature_count_per_cell, "exprMatrix": census_X, "cytoGenes": cytoGenes, "gcsGenes": corrs, "filteredCells": np.setdiff1d(np.arange(adata.shape[0]), cells_selected), } # Cell
[docs]def cytotrace(adata, layer="all", cell_count=10, solver="nnls", top_n_genes=200): "Main interface of cytotrace reimplementation used for single dataset with one condition" proc_time = time() if layer == "all": X = (adata.layers["spliced"] + adata.layers["unspliced"]).toarray() # X = (adata.layers['spliced'] + adata.layers['unspliced']) else: try: X = ( adata.layers[layer].toarray() if issparse(adata.layers[layer]) else adata.layers[layer] ) # X = adata.layers[layer] if issparse(adata.layers[layer]) else adata.layers[layer] except: X = adata.X.toarray() if issparse(adata.X) else adata.X # X = adata.X if issparse(adata.X) else adata.X cells_selected = np.arange(X.shape[0]) genes_selected = np.arange(X.shape[1]) pgenes = (pd.isnull((X > 0).sum(axis=0))) | (X.var(axis=0) == 0) # cell x gene X = X[:, ~pgenes] # gene x cell X = (X.T / X.sum(axis=1)) * 1e6 pqcells = (pd.isnull((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count) X = X[:, ~pqcells] genes_selected = genes_selected[~pgenes] cells_selected = cells_selected[~pqcells] X = np.log2(X + 1) counts = (X > 0).sum(axis=0) mat2 = census_normalize(X, counts) mvg = remove_zero_mvg(mat2) selection = mvg.sum(axis=0) != 0 mvg = mvg[:, selection] mat2 = mat2[:, selection] counts = counts[selection] cells_selected = cells_selected[selection] trans_time = time() print("processing: %s" % (trans_time - proc_time)) # get transition matrix corr = compute_similarity1(mvg) markov = convert_to_markov(corr) gc_time = time() print("markov: %s" % (gc_time - trans_time)) # get Gene count signature # gcs = compute_gcs(mat2, counts) # gcs = compute_gcs(mat2, counts, top_n_genes=top_n_genes) corrs = compute_similarity2(mat2.T, counts.reshape(-1, 1))[0, :] # avoid nan, put all nan to -1 corrs[np.isnan(corrs)] = -1 gcs = mat2[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0) gcs_time = time() print("gcs: %s" % (gcs_time - gc_time)) gcs = regressed(markov, gcs, solver=solver) regress_time = time() print("regression: %s" % (regress_time - gcs_time)) gcs = diffused(markov, gcs) rank = rankdata(gcs) scores = rankdata(gcs) / gcs.shape[0] score_time = time() print("diffusion:%s " % (score_time - regress_time)) cyto_corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :] gene_time = time() print("genes:%s" % (gene_time - score_time)) adata.obs["gcs"] = None adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs adata.obs["cytotrace"] = None adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores adata.obs["counts"] = None adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts adata.var["cytotrace"] = None adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace")] = True adata.var["cytotrace_corrs"] = None adata.var.iloc[ genes_selected, adata.var.columns.get_loc("cytotrace_corrs") ] = cyto_corrs return { "CytoTRACE": scores, "CytoTRACErank": rank, "GCS": gcs, "Counts": counts, "exprMatrix": mat2, "cytoGenes": cyto_corrs, "gcsGenes": corrs, "filteredCells": np.setdiff1d(np.arange(adata.shape[0]), cells_selected), }
def visualize(adata, metrics, name="test"): import scanpy as sc import seaborn as sns fig, ax = plt.subplots(2) fig.set_size_inches(8, 6) sc.pl.umap(adata, color="cytotrace", show=False, ax=ax[0]) order = adata.obs.groupby("clusters").median().sort_values(["cytotrace"]).index bplot = sns.boxplot( x="clusters", y="cytotrace", data=adata.obs, order=order, width=0.35, ax=ax[1] ) ax[1].set_xticklabels(order, rotation=40, ha="right") fig.savefig(f"{name}_figure.pdf") # Cell
[docs]def batch_cytotrace(mvg_batch, gcs_batch, solver="jfnnls"): corr_batch = compute_similarity1(mvg_batch) markov_batch = convert_to_markov(corr_batch) gcs_time = time() gcs_batch = regressed(markov_batch, gcs_batch, solver=solver) regress_time = time() print("regression: %s" % (regress_time - gcs_time)) print(markov_batch.shape, gcs_batch.shape) gcs_batch = diffused(markov_batch, gcs_batch) return gcs_batch
# Cell from scipy.sparse import issparse
[docs]def compare_cytotrace_ncores( adata, layer="all", cell_count=10, condition="age", solver="nnls", is_normalized=False, ncores=4, batch_cell=2000, ): "Main interface of cytotrace reimplementation used for single dataset with multiple conditions" # condition common steps proc_time = time() if not is_normalized: if layer == "all": X = adata.layers["spliced"] + adata.layers["unspliced"] # .toarray() else: try: X = ( adata.layers[layer] if issparse(adata.layers[layer]) else adata.layers[layer] ) except: X = adata.X if issparse(adata.X) else adata.X cells_selected = np.arange(X.shape[0]) genes_selected = np.arange(X.shape[1]) from sklearn.utils import sparsefuncs_fast mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X) pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0) # cell x gene X = X[:, ~pgenes] X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr() pqcells = np.array( (np.isnan((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count) )[0] X = X[:, ~pqcells] genes_selected = genes_selected[~pgenes] cells_selected = cells_selected[~pqcells] counts = np.array((X > 0).sum(axis=0))[0] mat2 = census_normalize(X, counts).toarray() mvg = remove_zero_mvg(mat2) # sel = np.array(mvg.sum(axis=0))[0]!=0 sel = mvg.sum(axis=0) != 0 mat2 = mat2[:, sel] counts = counts[sel] cells_selected = cells_selected[sel] # joint two conditions to determine gcs feature # using the same set of genes gcs = compute_gcs(mat2, counts) adata.uns["gcs"] = gcs adata.uns["cells_selected"] = cells_selected adata.uns["counts"] = counts adata.uns["mat2"] = mat2 adata.uns["mvg"] = mvg adata.uns["genes_selected"] = genes_selected else: cells_selected = adata.uns["cells_selected"] counts = adata.uns["counts"] mat2 = adata.uns["mat2"] mvg = adata.uns["mvg"] genes_selected = adata.uns["genes_selected"] gcs = adata.uns["gcs"] # for indepent gcs feature # using different set of genes is very variable # gcs = np.zeros(len(cells_selected)) # parallel part import multiprocessing as mp # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map # condition-specific steps pool = mp.Pool(ncores) for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]): print(cond) selection = adata.obs.loc[:, condition].values[cells_selected] == cond gcs_cond = gcs[selection] counts_cond = counts[selection] mat2_cond = mat2[:, selection] mvg_cond = mvg[:, selection] mvg_gcs_batches = [] for i in range(0, gcs_cond.shape[0], batch_cell): mvg_batch = mvg_cond[:, i : (i + batch_cell)].copy() gcs_batch = gcs_cond[i : i + batch_cell].copy() print(mvg_batch.shape, gcs_batch.shape) mvg_gcs_batches.append([mvg_batch, gcs_batch, solver]) # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map # http://jennguyen1.github.io/nhuyhoa/software/Parallel-Processing.html result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches) result.wait() gcs_batches = result.get() # gcs_batches = result.get() gcs_cond = np.hstack(gcs_batches) trans_time = time() print("preprocessing: %s" % (trans_time - proc_time)) gc_time = time() print("markov: %s" % (gc_time - trans_time)) gcs[selection] = gcs_cond # condition specific cytotrace correlated gene sets scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0] score_time = time() print("score time: %s" % (score_time - gc_time)) corrs_cond = compute_similarity2(mat2_cond.T, scores_cond.reshape(-1, 1))[0, :] final_time = time() print("final time: %s" % (final_time - score_time)) adata.var["cytotrace_corrs_%s" % cond] = None adata.var.iloc[ genes_selected, adata.var.columns.get_loc("cytotrace_corrs_%s" % cond) ] = corrs_cond pool.close() pool.join() adata.obs["gcs"] = None adata.obs["cytotrace"] = None adata.obs["counts"] = None adata.obs.iloc[ cells_selected, adata.obs.columns.get_loc("gcs") ] = gcs # used for re-rank when integrating multiple-samples adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = ( rankdata(gcs) / gcs.shape[0] ) # cytotrace scores adata.obs.iloc[ cells_selected, adata.obs.columns.get_loc("counts") ] = counts # used for re-rank when integrating multiple-samples
# Cell
[docs]def cytotrace_ncore( adata, layer="all", cell_count=10, solver="nnls", ncores=4, batch_cell=3000, shuffle=3, ): "optimized version" proc_time = time() if layer == "all": X = adata.layers["spliced"] + adata.layers["unspliced"] else: try: X = ( adata.layers[layer] if issparse(adata.layers[layer]) else adata.layers[layer] ) except: X = adata.X if issparse(adata.X) else adata.X cells_selected = np.arange(X.shape[0]) genes_selected = np.arange(X.shape[1]) from sklearn.utils import sparsefuncs_fast mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X) pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0) # pgenes = np.isnan(mean * adata.shape[0]) | (var == 0) # cell x gene X = X[:, ~pgenes] # gene x cell # X = (X.T / np.array(X.sum(axis=1))[0]) * 1e6 # sparse operation X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr() pqcells = np.array( (np.isnan((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count) )[0] X = X[:, ~pqcells] genes_selected = genes_selected[~pgenes] cells_selected = cells_selected[~pqcells] counts = np.array((X > 0).sum(axis=0))[0] mat2 = census_normalize(X, counts).toarray() # cell_count = np.array((mat2 > 0).sum(axis=1))[:, 0] # X_topcell = mat2[cell_count >= 0.05 * mat2.shape[1], :] # mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X_topcell) # disp = var / mean # mvg = X_topcell[disp >= (np.sort(disp)[::-1])[:1000][-1], :] mvg = remove_zero_mvg(mat2) # sel = np.array(mvg.sum(axis=0))[0]!=0 sel = mvg.sum(axis=0) != 0 mat2 = mat2[:, sel] counts = counts[sel] cells_selected = cells_selected[sel] # mat2 = mat2.toarray() # mvg = mvg.toarray() print(mvg.shape, mat2.shape) trans_time = time() print("processing: %s" % (trans_time - proc_time)) gcs = compute_gcs(mat2, counts) # get transition matrix # parallel part import multiprocessing as mp pool = mp.Pool(ncores) # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map mvg_gcs_batches = [] for i in range(0, adata.shape[0], batch_cell): mvg_batch = mvg[:, i : (i + batch_cell)].copy() gcs_batch = gcs[i : i + batch_cell].copy() mvg_gcs_batches.append([mvg_batch, gcs_batch, solver]) markov_diffusion_time = time() # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches) pool.close() pool.join() score_time = time() print(score_time - markov_diffusion_time) gcs_batches = result.get() gcs_batches = np.hstack(gcs_batches) scores = rankdata(gcs_batches) / gcs_batches.shape[0] corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :] gene_time = time() print("genes:%s" % (gene_time - score_time)) adata.obs["gcs"] = None adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs adata.obs["cytotrace"] = None adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores adata.obs["counts"] = None adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts adata.var["cytotrace"] = None adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace")] = True adata.var["cytotrace_corrs"] = None adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace_corrs")] = corrs
# Cell
[docs]def align_diffrate( adatas, labels, field="condition", type="A", outfield="cytotrace", ax=None ): "this is used for differentiation rate comparison across samples" scores = [ adatas[0].obs.loc[:, outfield][adatas[0].obs.loc[:, field] == type], adatas[1].obs.loc[:, outfield][adatas[1].obs.loc[:, field] == type], ] if outfield in ["latent_time", "velocity_pseudotime"]: pvalg = mannwhitneyu(scores[0], scores[1], alternative="greater")[1] pvall = mannwhitneyu(scores[0], scores[1], alternative="less")[1] else: pvalg = mannwhitneyu(1 - scores[0], 1 - scores[1], alternative="greater")[1] pvall = mannwhitneyu(1 - scores[0], 1 - scores[1], alternative="less")[1] for s, l in zip(scores, labels): if outfield in ["latent_time", "velocity_pseudotime"]: s = np.sort(s) else: if outfield != "cytotrace": s = np.sort(-s) else: s = np.sort(1 - s) ax.step(np.concatenate([s, s[[-1]]]), np.arange(s.size + 1) / s.size, label=l) ax.scatter(np.concatenate([s, s[[-1]]]), np.arange(s.size + 1) / s.size, s=5) if outfield == "cytotrace": ax.set_xlabel("Differentiation level") else: ax.set_xlabel(outfield) ax.set_ylabel("Cumulative proportion of cells") ax.set_title(type) ax.text( np.percentile(s, 0.6), 0.4, f"{labels[0]} > {labels[1]}: {pvalg:.3E}", ) ax.text( np.percentile(s, 0.6), 0.48, f"{labels[0]} < {labels[1]}: {pvall:.3E}", ) ax.legend()
# Cell
[docs]def plot_multiadata(adatas): for a in adatas: print(a.obs.age[0]) fig, ax = plt.subplots(1, 5) fig.set_size_inches(32, 9) scv.pl.velocity_embedding_grid( a, scale=0.2, color="louvain", show=False, basis="pca", legend_loc="on data", ax=ax[0], title=a.obs.age[0], ) scv.pl.scatter( a, color="velocity_pseudotime", basis="pca", size=80, color_map="gnuplot", ax=ax[1], show=False, title=a.obs.age[0], ) scv.pl.scatter( a, color="latent_time", basis="pca", size=80, color_map="gnuplot", ax=ax[2], show=False, title=a.obs.age[0], ) scv.pl.scatter( a, color="end_points", basis="pca", size=80, color_map="gnuplot", ax=ax[3], show=False, title=a.obs.age[0], ) scv.pl.scatter( a, color="root_cells", basis="pca", size=80, color_map="gnuplot", ax=ax[4], show=False, title=a.obs.age[0], )
# Cell # https://stackoverflow.com/questions/15408371/cumulative-distribution-plots-python
[docs]def cumulative_boxplot(): data = adata_copy[adata_copy.obs.age == "A",].obs.n_counts # evaluate the histogram values, base = np.histogram(data, bins=10) # evaluate the cumulative cumulative = np.cumsum(values) # plot the cumulative function plt.scatter(base[:-1], cumulative / cumulative[-1], c="blue")
# plot the survival function # plt.plot(base[:-1], len(data)-cumulative, c='green')