Source code for pyrovelocity.cytotrace

__all__ = [
    "census_normalize",
    "remove_zero_mvg",
    "compute_similarity2",
    "compute_similarity1",
    "compute_gcs",
    "convert_to_markov",
    "any",
    "find_nonzero",
    "FNNLSa",
    "nu",
    "eps",
    "regressed",
    "diffused",
    "compare_cytotrace",
    "cytotrace",
    "batch_cytotrace",
    "compare_cytotrace_ncores",
    "cytotrace_ncore",
    "align_diffrate",
    "plot_multiadata",
    "cumulative_boxplot",
    "run_cytotrace",
]

from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Cell
import scvelo as scv
from numpy import ndarray
from scipy.stats import mannwhitneyu
from scipy.stats import rankdata


##from scanorama import correct_scanpy
scv.logging.verbosity = 3
scv.settings.presenter_view = True
scv.set_figure_params("scvelo")
np.random.seed(99)

# Cell
from scipy.sparse import csr_matrix
from scipy.sparse import issparse


[docs]def census_normalize(mat, count):
    "RNA-seq census normalization to correct cell lysis"
    if issparse(mat):
        # x = mat.copy()
        # x.data = 2**x.data - 1
        # gene x cell
        x = mat.multiply(count).multiply(1.0 / mat.sum(axis=0)).tocsr()
        return x.log1p()
    else:
        x = 2**mat - 1
        # gene x cell
        x = (x * count) / x.sum(axis=0)
        return np.log2(x + 1)


# Cell
[docs]def remove_zero_mvg(mat):
    "remove cells not expressing any of the top 1000 variable genes"
    # gene x cell
    cell_count = (mat > 0).sum(axis=1)
    X_topcell = mat[cell_count >= 0.05 * mat.shape[1], :]
    var = X_topcell.var(axis=1)
    mean = X_topcell.mean(axis=1)
    disp = var / mean
    return X_topcell[
        disp >= (np.sort(disp)[::-1])[:1000][-1], :
    ]  # in case less than 1000 genes


# Cell
[docs]def compute_similarity2(O: ndarray, P: ndarray) -> ndarray:
    "Compute pearson correlation between two matrices O and P using einstein summation"
    (n, t) = O.shape  # n traces of t samples
    (n_bis, m) = P.shape  # n predictions for each of m candidates

    DO = O - (
        np.einsum("nt->t", O, optimize="optimal") / np.double(n)
    )  # compute O - mean(O)
    DP = P - (
        np.einsum("nm->m", P, optimize="optimal") / np.double(n)
    )  # compute P - mean(P)

    cov = np.einsum("nm,nt->mt", DP, DO, optimize="optimal")

    varP = np.einsum("nm,nm->m", DP, DP, optimize="optimal")
    varO = np.einsum("nt,nt->t", DO, DO, optimize="optimal")
    tmp = np.einsum("m,t->mt", varP, varO, optimize="optimal")
    return cov / np.sqrt(tmp)


# Cell
# adapt from https://github.com/ikizhvatov/efficient-columnwise-correlation/blob/master/columnwise_corrcoef_perf.py
[docs]def compute_similarity1(A):
    "Compute pairwise correlation of all columns in matrices A"
    n, t = A.shape  # genes x samples
    DO = A - (
        np.einsum("nt->t", A, optimize="optimal") / np.double(n)
    )  # compute O - mean(O)
    cov = np.einsum("nm,nt->mt", DO, DO, optimize="optimal")
    varP = np.einsum("nt,nt->t", DO, DO, optimize="optimal")
    return cov / np.sqrt(np.einsum("m,t->mt", varP, varP, optimize="optimal"))


# Cell
[docs]def compute_gcs(mat, count, top_n_genes=200):
    "Compute gene set enrichment scores by correlating gene count and gene expression"
    corrs = compute_similarity2(mat.T, count.reshape(-1, 1))[0, :]
    # avoid nan, put all nan to -1
    corrs[np.isnan(corrs)] = -1
    gcs = mat[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0)
    return gcs


# Cell
[docs]def convert_to_markov(sim):
    """Convert the Pearson correlation to Markov matrix

    TODO: use velocity graph to replace this markov matrix
    """
    Ds = np.copy(sim)

    cutoff = np.mean(Ds)
    np.fill_diagonal(Ds, 0)

    Ds[Ds <= 0] = 0
    Ds[Ds <= cutoff] = 0

    zero_rows = Ds.sum(axis=1) == 0
    zero_cols = Ds.sum(axis=0) == 0

    Ds[~zero_rows, :] = (Ds[~zero_rows, :].T / Ds[~zero_rows, :].sum(axis=1)).T  # tips
    Ds[:, zero_cols] = 0
    return Ds


# Cell
# http://xrm.phys.northwestern.edu/research/pdf_papers/1997/bro_chemometrics_1997.pdf
# matlab reference: https://www.mathworks.com/matlabcentral/mlc-downloads/downloads/submissions/3388/versions/1/previews/fnnls.m/index.html

from numpy import abs
from numpy import arange
from numpy import argmax
from numpy import finfo
from numpy import float64
from numpy import int64
from numpy import min
from numpy import newaxis
from numpy import nonzero
from numpy import sum
from numpy import zeros
from scipy.linalg import solve


nu = newaxis
import numpy as np


# machine epsilon
eps = finfo(float64).eps


[docs]def any(a):
    # assuming a vector, a
    larger_than_zero = sum(a > 0)
    if larger_than_zero:
        return True
    else:
        return False


[docs]def find_nonzero(a):
    # returns indices of nonzero elements in a
    return nonzero(a)[0]


[docs]def FNNLSa(XtX, Xty, tol=None):
    """Faster NNLS imported from https://github.com/delnatan/FNNLSa
    A fast non-negativity-constrained least squares algorithm. Journal of chemometrics
    """

    if tol is None:
        tol = eps

    M, N = XtX.shape
    # initialize passive set, P. Indices where coefficient is >0
    P = zeros(N, dtype=int64)
    # and active set. Indices where coefficient is <=0
    Z = arange(N) + 1
    # working active set
    ZZ = arange(N) + 1
    # initial solution vector, x
    x = zeros(N, dtype=float64)
    # weight vector
    w = Xty - XtX @ x
    # iteration counts and parameter
    it = 0
    itmax = 30 * N
    # MAIN LOOP
    # continue as long as there are indices within the active set Z
    # or elements in inner loop active set is larger than 'tolerance'
    piter = 0
    while any(Z) and any(w[ZZ - 1] > tol):
        piter += 1
        t = argmax(w[ZZ - 1]) + 1  # find largest weight
        t = ZZ[t - 1]
        P[t - 1] = t  # move to passive set
        Z[t - 1] = 0  # remove from active set
        PP = find_nonzero(P) + 1
        ZZ = find_nonzero(Z) + 1
        NZZ = ZZ.shape

        # compute trial solution, s
        s = zeros(N, dtype=float64)

        if len(PP) == 1:
            s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1]
        else:
            s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1])
        s[ZZ - 1] = 0.0  # set active coefficients to 0

        while any(s[PP - 1] <= tol) and it < itmax:
            it = it + 1
            QQ = find_nonzero((s <= tol) * P) + 1
            alpha = min(x[QQ - 1] / (x[QQ - 1] - s[QQ - 1]))
            x = x + alpha * (s - x)
            ij = find_nonzero((abs(x) < tol) * (P != 0)) + 1
            Z[ij - 1] = ij
            P[ij - 1] = 0
            PP = find_nonzero(P) + 1
            ZZ = find_nonzero(Z) + 1
            if len(PP) == 1:
                s[PP - 1] = Xty[PP - 1] / XtX[PP - 1, PP - 1]
            else:
                s[PP - 1] = solve(XtX[PP - 1, PP[:, nu] - 1], Xty[PP - 1])
            s[ZZ - 1] = 0.0
        # assign current solution, s, to x
        x = s
        # recompute weights
        w = Xty - XtX @ x
    return x, w


# Cell
[docs]def regressed(markov, gcs, solver="fnnls"):
    """solve markov @ weight = gcs problems,

    solver: fnnls (default) is faster in larger dataset, e.g., above 20,000 cells
            nnls is faster in smaller dataset, e.g. less than 5,000 cells
    """
    print(solver)
    if solver == "fnnls":
        coef, err = FNNLSa(markov.T @ markov, markov.T @ gcs)
    elif solver == "jfnnls":  # pip install fnnls # https://github.com/jvendrow/fnnls
        from fnnls import fnnls

        coef, err = fnnls(markov, gcs)
    elif solver == "nnls":
        from scipy.optimize import nnls

        print((markov > 0).sum())
        coef, err = nnls(markov, gcs)  # from 1987...
    elif solver == "lsq_linear":
        from scipy import sparse
        from scipy.optimize import lsq_linear

        # slow as well
        sol = lsq_linear(
            sparse.csr_matrix(markov),
            gcs,
            bounds=(0, np.inf),
            lsmr_tol="auto",
            verbose=1,
        )  # from 1997...
        coef = sol.x
    elif solver == "lasso":
        from scipy import sparse
        from sklearn.linear_model import Lasso

        lasso = Lasso(alpha=1e-7, max_iter=100000, fit_intercept=False, positive=True)
        den = (markov > 0).sum() / np.prod(markov.shape)
        print(den)
        if den >= 0.5:
            lasso.fit(markov, gcs)
        else:
            # lasso.fit(sparse.csr_matrix(markov), gcs)
            lasso.fit(sparse.coo_matrix(markov), gcs)
        coef = lasso.coef_
    elif solver == "nnlsgd":  # somehow not consistent with nnls and fnnls...
        coef = NNLS_GD(markov.T @ markov, markov.T @ gcs)
    elif solver == None:  # do not fit
        return gcs
    else:  # tnt-nn
        raise NotImplementedError
    # faster nnls
    return np.dot(markov, coef)


# Cell
[docs]def diffused(markov, gcs, ALPHA=0.9):
    """Compute diffusion process"""
    v_prev = np.copy(gcs)
    v_curr = np.copy(gcs)

    for i in range(10000):
        v_prev = np.copy(v_curr)
        v_curr = ALPHA * (markov.dot(v_curr)) + (1 - ALPHA) * gcs

        diff = np.mean(np.abs(v_curr - v_prev))
        if diff <= 1e-6:
            break
    return v_curr


# Cell
from scipy.sparse import issparse


[docs]def compare_cytotrace(
    adata,
    layer="all",
    cell_count=10,
    condition="age",
    solver="nnls",
    is_normalized=False,
    n_cores=4,
    top_n_genes=200,
):
    "Main interface of cytotrace reimplementation used for single dataset with multiple conditions"
    # condition common steps

    proc_time = time()
    if not is_normalized:
        if layer == "all":
            X = (adata.layers["spliced"] + adata.layers["unspliced"]).toarray()
        else:
            try:
                X = (
                    adata.layers[layer].toarray()
                    if issparse(adata.layers[layer])
                    else adata.layers[layer]
                )
            except:
                X = adata.X.toarray() if issparse(adata.X) else adata.X

        cells_selected = np.arange(X.shape[0])
        genes_selected = np.arange(X.shape[1])
        pgenes = (pd.isnull((X > 0).sum(axis=0))) | (X.var(axis=0) == 0)
        # cell x gene
        X = X[:, ~pgenes]
        X = (X.T / X.sum(axis=1)) * 1e6

        pqcells = (pd.isnull((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
        X = X[:, ~pqcells]

        genes_selected = genes_selected[~pgenes]
        cells_selected = cells_selected[~pqcells]

        X = np.log2(X + 1)
        counts = (X > 0).sum(axis=0)
        mat2 = census_normalize(X, counts)

        mvg = remove_zero_mvg(mat2)
        cells_selected = cells_selected[mvg.sum(axis=0) != 0]

        mat2 = mat2[:, mvg.sum(axis=0) != 0]
        counts = counts[mvg.sum(axis=0) != 0]

        # joint two conditions to determine gcs feature
        # using the same set of genes
        gcs = compute_gcs(mat2, counts, top_n_genes=top_n_genes)
        adata.uns["gcs"] = gcs
        adata.uns["cells_selected"] = cells_selected
        adata.uns["counts"] = counts
        adata.uns["mat2"] = mat2
        adata.uns["mvg"] = mvg
        adata.uns["genes_selected"] = genes_selected
    else:
        cells_selected = adata.uns["cells_selected"]
        counts = adata.uns["counts"]
        mat2 = adata.uns["mat2"]
        mvg = adata.uns["mvg"]
        genes_selected = adata.uns["genes_selected"]
        gcs = adata.uns["gcs"]

    # for indepent gcs feature
    # using different set of genes is very variable
    # gcs = np.zeros(len(cells_selected))

    # condition-specific steps
    for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]):
        selection = adata.obs.loc[:, condition].values[cells_selected] == cond

        gcs_cond = gcs[selection]
        counts_cond = counts[selection]
        mat2_cond = mat2[:, selection]

        # compute gcs feature independently for two conditions
        # this would select different gene set, and make the
        # cytotrace score incomparable
        # gcs_cond = compute_gcs(mat2_cond, counts_cond)

        mvg_cond = mvg[:, selection]

        trans_time = time()
        print("preprocessing: %s" % (trans_time - proc_time))
        # get transition matrix
        corr_cond = compute_similarity1(mvg_cond)
        markov_cond = convert_to_markov(corr_cond)

        gc_time = time()
        print("markov: %s" % (gc_time - trans_time))

        # get Gene count signature
        gcs_cond = regressed(markov_cond, gcs_cond, solver=solver)
        gcs_cond = diffused(markov_cond, gcs_cond)
        gcs[selection] = gcs_cond

        # condition specific cytotrace correlated gene sets
        scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0]
        score_time = time()
        print("score time: %s" % (score_time - gc_time))

        corrs_cond = compute_similarity2(mat2_cond.T, scores_cond.reshape(-1, 1))[0, :]
        final_time = time()
        print("final time: %s" % (final_time - score_time))
        adata.var["cytotrace_corrs_%s" % cond] = None
        adata.var.iloc[
            genes_selected, adata.var.columns.get_loc("cytotrace_corrs_%s" % cond)
        ] = corrs_cond

    adata.obs["gcs"] = None
    adata.obs["cytotrace"] = None
    adata.obs["counts"] = None
    adata.obs.iloc[
        cells_selected, adata.obs.columns.get_loc("gcs")
    ] = gcs  # used for re-rank when integrating multiple-samples
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = (
        rankdata(gcs) / gcs.shape[0]
    )  # cytotrace scores
    adata.obs.iloc[
        cells_selected, adata.obs.columns.get_loc("counts")
    ] = counts  # used for re-rank when integrating multiple-samples


def cytotrace_sparse(
    adata,
    layer="raw",
    cell_count=20,
    solver="nnls",
    top_n_features=200,
    skip_regress=False,
):
    "optimized version"
    proc_time = time()
    if not issparse(adata.layers[layer]):
        raise NotImplementedError
    else:
        X = adata.layers[layer]

    cells_selected = np.arange(X.shape[0])
    features_selected = np.arange(X.shape[1])

    n_cells = X.shape[0]
    feature_mean = (X.sum(0) / n_cells).A1
    feature_mean_sq = (X.multiply(X).sum(0) / n_cells).A1
    feature_var = (feature_mean_sq - feature_mean**2) * (n_cells / (n_cells - 1))
    non_zero_features = (X > 0).sum(axis=0).A1
    pfeatures = np.isnan(non_zero_features) | (feature_var == 0)

    # cell x feature, filter feature
    X = X[:, ~pfeatures]
    features_selected = features_selected[~pfeatures]

    # gene x cell
    X = X.T
    X = X.multiply(1.0 / csr_matrix.sum(X, axis=0)) * 1e6
    X = X.tocsr()

    # filter cell, feature x cell
    feature_count_per_cell = (X > 0).sum(axis=0).A1
    pcells = np.isnan(feature_count_per_cell) | (feature_count_per_cell < cell_count)

    # feature x cell
    X = X[:, ~pcells]
    cells_selected = cells_selected[~pcells]
    feature_count_per_cell = feature_count_per_cell[~pcells]

    # census normalize, feature x cell
    census_X = (
        X.multiply(feature_count_per_cell)
        .multiply(1.0 / csr_matrix.sum(X, axis=0))
        .log1p()
        .tocsr()
    )

    # top variable features, feature x cell
    cell_count_per_feature = (census_X > 0).sum(axis=1).A1
    census_X_topcell = census_X[
        cell_count_per_feature >= 0.05 * census_X.shape[1], :
    ].tocsr()
    n_cells = census_X_topcell.shape[1]
    feature_mean = (census_X_topcell.sum(1) / n_cells).A1
    feature_mean_sq = (census_X_topcell.multiply(census_X_topcell).sum(1) / n_cells).A1
    feature_var = (feature_mean_sq - feature_mean**2) * (n_cells / (n_cells - 1))
    disp = feature_var / feature_mean
    mvg = census_X_topcell[
        disp >= (np.sort(disp)[::-1])[:1000][-1], :
    ]  # in case less than 1000 features

    # top 1000 variable features for markov matrix
    selection = (mvg.sum(axis=0) != 0).A1
    mvg = mvg[:, selection]
    corr = compute_similarity1(mvg.A)
    markov = convert_to_markov(corr)

    # filter census output with nonzero cells in top variable features matrix
    cells_selected = cells_selected[selection]
    census_X = census_X[:, selection]
    feature_count_per_cell = feature_count_per_cell[selection]

    # calculate GC scores
    corrs = compute_similarity2(census_X.T.A, feature_count_per_cell.reshape(-1, 1))[
        0, :
    ]
    # avoid nan, put all nan to -1
    corrs[np.isnan(corrs)] = -1
    gcs = census_X[np.argsort(corrs)[::-1][:top_n_features], :].mean(axis=0).A1

    if not skip_regress:
        print("regress...")
        # from scipy.optimize import lsq_linear
        # # slow as well
        # sol = lsq_linear(csr_matrix(markov), gcs, bounds=(0, np.inf), lsmr_tol='auto', verbose=1)    # from 1997...
        # coef = sol.x

        from scipy.optimize import nnls

        coef, err = nnls(markov, gcs)  # from 1987...
        gcs = np.dot(markov, coef)

    print(markov.shape, gcs.shape)
    gcs = diffused(markov, gcs)
    rank = rankdata(gcs)
    scores = rank / gcs.shape[0]

    print(gcs)
    adata.obs["gcs"] = np.nan
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs

    cytoGenes = compute_similarity2(census_X.T.A, scores.reshape(-1, 1))[0, :]

    adata.obs["cytotrace"] = np.nan
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores

    adata.obs["counts"] = np.nan
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = gcs

    adata.var["cytotrace"] = False
    adata.var.iloc[features_selected, adata.var.columns.get_loc("cytotrace")] = True

    adata.var["cytotrace_corrs"] = np.nan
    adata.var.iloc[
        features_selected, adata.var.columns.get_loc("cytotrace_corrs")
    ] = np.array(corrs, dtype=np.float32)
    return {
        "CytoTRACE": scores,
        "CytoTRACErank": rank,
        "GCS": gcs,
        "Counts": feature_count_per_cell,
        "exprMatrix": census_X,
        "cytoGenes": cytoGenes,
        "gcsGenes": corrs,
        "filteredCells": np.setdiff1d(np.arange(adata.shape[0]), cells_selected),
    }


# Cell
[docs]def cytotrace(adata, layer="all", cell_count=10, solver="nnls", top_n_genes=200):
    "Main interface of cytotrace reimplementation used for single dataset with one condition"
    proc_time = time()
    if layer == "all":
        X = (adata.layers["spliced"] + adata.layers["unspliced"]).toarray()
    #        X = (adata.layers['spliced'] + adata.layers['unspliced'])
    else:
        try:
            X = (
                adata.layers[layer].toarray()
                if issparse(adata.layers[layer])
                else adata.layers[layer]
            )
            # X = adata.layers[layer] if issparse(adata.layers[layer]) else adata.layers[layer]
        except:
            X = adata.X.toarray() if issparse(adata.X) else adata.X
            # X = adata.X if issparse(adata.X) else adata.X

    cells_selected = np.arange(X.shape[0])
    genes_selected = np.arange(X.shape[1])

    pgenes = (pd.isnull((X > 0).sum(axis=0))) | (X.var(axis=0) == 0)
    # cell x gene
    X = X[:, ~pgenes]
    # gene x cell
    X = (X.T / X.sum(axis=1)) * 1e6

    pqcells = (pd.isnull((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
    X = X[:, ~pqcells]

    genes_selected = genes_selected[~pgenes]
    cells_selected = cells_selected[~pqcells]

    X = np.log2(X + 1)
    counts = (X > 0).sum(axis=0)
    mat2 = census_normalize(X, counts)

    mvg = remove_zero_mvg(mat2)
    selection = mvg.sum(axis=0) != 0
    mvg = mvg[:, selection]
    mat2 = mat2[:, selection]
    counts = counts[selection]

    cells_selected = cells_selected[selection]

    trans_time = time()
    print("processing: %s" % (trans_time - proc_time))

    # get transition matrix
    corr = compute_similarity1(mvg)
    markov = convert_to_markov(corr)

    gc_time = time()
    print("markov: %s" % (gc_time - trans_time))

    # get Gene count signature
    # gcs = compute_gcs(mat2, counts)
    # gcs = compute_gcs(mat2, counts, top_n_genes=top_n_genes)

    corrs = compute_similarity2(mat2.T, counts.reshape(-1, 1))[0, :]
    # avoid nan, put all nan to -1
    corrs[np.isnan(corrs)] = -1
    gcs = mat2[np.argsort(corrs)[::-1][:top_n_genes], :].mean(axis=0)

    gcs_time = time()
    print("gcs: %s" % (gcs_time - gc_time))

    gcs = regressed(markov, gcs, solver=solver)
    regress_time = time()
    print("regression: %s" % (regress_time - gcs_time))

    gcs = diffused(markov, gcs)
    rank = rankdata(gcs)
    scores = rankdata(gcs) / gcs.shape[0]
    score_time = time()
    print("diffusion:%s " % (score_time - regress_time))

    cyto_corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :]
    gene_time = time()
    print("genes:%s" % (gene_time - score_time))

    adata.obs["gcs"] = None
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs

    adata.obs["cytotrace"] = None
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores

    adata.obs["counts"] = None
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts

    adata.var["cytotrace"] = None
    adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace")] = True

    adata.var["cytotrace_corrs"] = None
    adata.var.iloc[
        genes_selected, adata.var.columns.get_loc("cytotrace_corrs")
    ] = cyto_corrs
    return {
        "CytoTRACE": scores,
        "CytoTRACErank": rank,
        "GCS": gcs,
        "Counts": counts,
        "exprMatrix": mat2,
        "cytoGenes": cyto_corrs,
        "gcsGenes": corrs,
        "filteredCells": np.setdiff1d(np.arange(adata.shape[0]), cells_selected),
    }


def visualize(adata, metrics, name="test"):
    import scanpy as sc
    import seaborn as sns

    fig, ax = plt.subplots(2)
    fig.set_size_inches(8, 6)
    sc.pl.umap(adata, color="cytotrace", show=False, ax=ax[0])
    order = adata.obs.groupby("clusters").median().sort_values(["cytotrace"]).index
    bplot = sns.boxplot(
        x="clusters", y="cytotrace", data=adata.obs, order=order, width=0.35, ax=ax[1]
    )
    ax[1].set_xticklabels(order, rotation=40, ha="right")
    fig.savefig(f"{name}_figure.pdf")


# Cell
[docs]def batch_cytotrace(mvg_batch, gcs_batch, solver="jfnnls"):
    corr_batch = compute_similarity1(mvg_batch)
    markov_batch = convert_to_markov(corr_batch)
    gcs_time = time()
    gcs_batch = regressed(markov_batch, gcs_batch, solver=solver)
    regress_time = time()
    print("regression: %s" % (regress_time - gcs_time))
    print(markov_batch.shape, gcs_batch.shape)
    gcs_batch = diffused(markov_batch, gcs_batch)
    return gcs_batch


# Cell
from scipy.sparse import issparse


[docs]def compare_cytotrace_ncores(
    adata,
    layer="all",
    cell_count=10,
    condition="age",
    solver="nnls",
    is_normalized=False,
    ncores=4,
    batch_cell=2000,
):
    "Main interface of cytotrace reimplementation used for single dataset with multiple conditions"
    # condition common steps

    proc_time = time()
    if not is_normalized:
        if layer == "all":
            X = adata.layers["spliced"] + adata.layers["unspliced"]  # .toarray()
        else:
            try:
                X = (
                    adata.layers[layer]
                    if issparse(adata.layers[layer])
                    else adata.layers[layer]
                )
            except:
                X = adata.X if issparse(adata.X) else adata.X

        cells_selected = np.arange(X.shape[0])
        genes_selected = np.arange(X.shape[1])

        from sklearn.utils import sparsefuncs_fast

        mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X)

        pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0)

        # cell x gene
        X = X[:, ~pgenes]
        X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr()
        pqcells = np.array(
            (np.isnan((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
        )[0]
        X = X[:, ~pqcells]

        genes_selected = genes_selected[~pgenes]
        cells_selected = cells_selected[~pqcells]

        counts = np.array((X > 0).sum(axis=0))[0]
        mat2 = census_normalize(X, counts).toarray()
        mvg = remove_zero_mvg(mat2)

        # sel = np.array(mvg.sum(axis=0))[0]!=0
        sel = mvg.sum(axis=0) != 0
        mat2 = mat2[:, sel]
        counts = counts[sel]
        cells_selected = cells_selected[sel]

        # joint two conditions to determine gcs feature
        # using the same set of genes
        gcs = compute_gcs(mat2, counts)
        adata.uns["gcs"] = gcs
        adata.uns["cells_selected"] = cells_selected
        adata.uns["counts"] = counts
        adata.uns["mat2"] = mat2
        adata.uns["mvg"] = mvg
        adata.uns["genes_selected"] = genes_selected
    else:
        cells_selected = adata.uns["cells_selected"]
        counts = adata.uns["counts"]
        mat2 = adata.uns["mat2"]
        mvg = adata.uns["mvg"]
        genes_selected = adata.uns["genes_selected"]
        gcs = adata.uns["gcs"]

    # for indepent gcs feature
    # using different set of genes is very variable
    # gcs = np.zeros(len(cells_selected))

    # parallel part
    import multiprocessing as mp

    # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
    # condition-specific steps
    pool = mp.Pool(ncores)

    for cond in np.unique(adata.obs.loc[:, condition].values[cells_selected]):
        print(cond)
        selection = adata.obs.loc[:, condition].values[cells_selected] == cond

        gcs_cond = gcs[selection]
        counts_cond = counts[selection]
        mat2_cond = mat2[:, selection]
        mvg_cond = mvg[:, selection]

        mvg_gcs_batches = []
        for i in range(0, gcs_cond.shape[0], batch_cell):
            mvg_batch = mvg_cond[:, i : (i + batch_cell)].copy()
            gcs_batch = gcs_cond[i : i + batch_cell].copy()
            print(mvg_batch.shape, gcs_batch.shape)
            mvg_gcs_batches.append([mvg_batch, gcs_batch, solver])

        # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
        # http://jennguyen1.github.io/nhuyhoa/software/Parallel-Processing.html
        result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches)
        result.wait()

        gcs_batches = result.get()
        # gcs_batches = result.get()
        gcs_cond = np.hstack(gcs_batches)

        trans_time = time()
        print("preprocessing: %s" % (trans_time - proc_time))

        gc_time = time()
        print("markov: %s" % (gc_time - trans_time))
        gcs[selection] = gcs_cond

        # condition specific cytotrace correlated gene sets
        scores_cond = rankdata(gcs_cond) / gcs_cond.shape[0]
        score_time = time()
        print("score time: %s" % (score_time - gc_time))

        corrs_cond = compute_similarity2(mat2_cond.T, scores_cond.reshape(-1, 1))[0, :]
        final_time = time()
        print("final time: %s" % (final_time - score_time))
        adata.var["cytotrace_corrs_%s" % cond] = None
        adata.var.iloc[
            genes_selected, adata.var.columns.get_loc("cytotrace_corrs_%s" % cond)
        ] = corrs_cond

    pool.close()
    pool.join()

    adata.obs["gcs"] = None
    adata.obs["cytotrace"] = None
    adata.obs["counts"] = None
    adata.obs.iloc[
        cells_selected, adata.obs.columns.get_loc("gcs")
    ] = gcs  # used for re-rank when integrating multiple-samples
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = (
        rankdata(gcs) / gcs.shape[0]
    )  # cytotrace scores
    adata.obs.iloc[
        cells_selected, adata.obs.columns.get_loc("counts")
    ] = counts  # used for re-rank when integrating multiple-samples


# Cell


[docs]def cytotrace_ncore(
    adata,
    layer="all",
    cell_count=10,
    solver="nnls",
    ncores=4,
    batch_cell=3000,
    shuffle=3,
):
    "optimized version"
    proc_time = time()
    if layer == "all":
        X = adata.layers["spliced"] + adata.layers["unspliced"]
    else:
        try:
            X = (
                adata.layers[layer]
                if issparse(adata.layers[layer])
                else adata.layers[layer]
            )
        except:
            X = adata.X if issparse(adata.X) else adata.X

    cells_selected = np.arange(X.shape[0])
    genes_selected = np.arange(X.shape[1])

    from sklearn.utils import sparsefuncs_fast

    mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X)

    pgenes = (np.isnan(np.array((X > 0).sum(axis=0))[0])) | (var == 0)
    # pgenes = np.isnan(mean * adata.shape[0]) | (var == 0)

    # cell x gene
    X = X[:, ~pgenes]
    # gene x cell
    # X = (X.T / np.array(X.sum(axis=1))[0]) * 1e6
    # sparse operation
    X = (X.T).multiply(1.0 / csr_matrix.sum(X.T, axis=0)).tocsr()

    pqcells = np.array(
        (np.isnan((X > 0).sum(axis=0))) | ((X > 0).sum(axis=0) <= cell_count)
    )[0]
    X = X[:, ~pqcells]

    genes_selected = genes_selected[~pgenes]
    cells_selected = cells_selected[~pqcells]

    counts = np.array((X > 0).sum(axis=0))[0]
    mat2 = census_normalize(X, counts).toarray()

    # cell_count = np.array((mat2 > 0).sum(axis=1))[:, 0]
    # X_topcell = mat2[cell_count >= 0.05 * mat2.shape[1], :]

    # mean, var = sparsefuncs_fast.csr_mean_variance_axis0(X_topcell)
    # disp = var / mean
    # mvg = X_topcell[disp >= (np.sort(disp)[::-1])[:1000][-1], :]
    mvg = remove_zero_mvg(mat2)

    # sel = np.array(mvg.sum(axis=0))[0]!=0
    sel = mvg.sum(axis=0) != 0
    mat2 = mat2[:, sel]
    counts = counts[sel]
    cells_selected = cells_selected[sel]

    # mat2 = mat2.toarray()
    # mvg = mvg.toarray()
    print(mvg.shape, mat2.shape)

    trans_time = time()
    print("processing: %s" % (trans_time - proc_time))

    gcs = compute_gcs(mat2, counts)

    # get transition matrix
    # parallel part
    import multiprocessing as mp

    pool = mp.Pool(ncores)

    # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
    mvg_gcs_batches = []
    for i in range(0, adata.shape[0], batch_cell):
        mvg_batch = mvg[:, i : (i + batch_cell)].copy()
        gcs_batch = gcs[i : i + batch_cell].copy()
        mvg_gcs_batches.append([mvg_batch, gcs_batch, solver])

    markov_diffusion_time = time()

    # https://stackoverflow.com/questions/8533318/multiprocessing-pool-when-to-use-apply-apply-async-or-map
    result = pool.starmap_async(batch_cytotrace, mvg_gcs_batches)
    pool.close()
    pool.join()
    score_time = time()
    print(score_time - markov_diffusion_time)
    gcs_batches = result.get()
    gcs_batches = np.hstack(gcs_batches)

    scores = rankdata(gcs_batches) / gcs_batches.shape[0]
    corrs = compute_similarity2(mat2.T, scores.reshape(-1, 1))[0, :]

    gene_time = time()
    print("genes:%s" % (gene_time - score_time))
    adata.obs["gcs"] = None
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("gcs")] = gcs

    adata.obs["cytotrace"] = None
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("cytotrace")] = scores

    adata.obs["counts"] = None
    adata.obs.iloc[cells_selected, adata.obs.columns.get_loc("counts")] = counts

    adata.var["cytotrace"] = None
    adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace")] = True

    adata.var["cytotrace_corrs"] = None
    adata.var.iloc[genes_selected, adata.var.columns.get_loc("cytotrace_corrs")] = corrs


# Cell
[docs]def align_diffrate(
    adatas, labels, field="condition", type="A", outfield="cytotrace", ax=None
):
    "this is used for differentiation rate comparison across samples"
    scores = [
        adatas[0].obs.loc[:, outfield][adatas[0].obs.loc[:, field] == type],
        adatas[1].obs.loc[:, outfield][adatas[1].obs.loc[:, field] == type],
    ]
    if outfield in ["latent_time", "velocity_pseudotime"]:
        pvalg = mannwhitneyu(scores[0], scores[1], alternative="greater")[1]
        pvall = mannwhitneyu(scores[0], scores[1], alternative="less")[1]
    else:
        pvalg = mannwhitneyu(1 - scores[0], 1 - scores[1], alternative="greater")[1]
        pvall = mannwhitneyu(1 - scores[0], 1 - scores[1], alternative="less")[1]

    for s, l in zip(scores, labels):
        if outfield in ["latent_time", "velocity_pseudotime"]:
            s = np.sort(s)
        else:
            if outfield != "cytotrace":
                s = np.sort(-s)
            else:
                s = np.sort(1 - s)
        ax.step(np.concatenate([s, s[[-1]]]), np.arange(s.size + 1) / s.size, label=l)
        ax.scatter(np.concatenate([s, s[[-1]]]), np.arange(s.size + 1) / s.size, s=5)
        if outfield == "cytotrace":
            ax.set_xlabel("Differentiation level")
        else:
            ax.set_xlabel(outfield)
        ax.set_ylabel("Cumulative proportion of cells")
        ax.set_title(type)
    ax.text(
        np.percentile(s, 0.6),
        0.4,
        f"{labels[0]} > {labels[1]}: {pvalg:.3E}",
    )
    ax.text(
        np.percentile(s, 0.6),
        0.48,
        f"{labels[0]} < {labels[1]}: {pvall:.3E}",
    )
    ax.legend()


# Cell
[docs]def plot_multiadata(adatas):
    for a in adatas:
        print(a.obs.age[0])
        fig, ax = plt.subplots(1, 5)
        fig.set_size_inches(32, 9)
        scv.pl.velocity_embedding_grid(
            a,
            scale=0.2,
            color="louvain",
            show=False,
            basis="pca",
            legend_loc="on data",
            ax=ax[0],
            title=a.obs.age[0],
        )
        scv.pl.scatter(
            a,
            color="velocity_pseudotime",
            basis="pca",
            size=80,
            color_map="gnuplot",
            ax=ax[1],
            show=False,
            title=a.obs.age[0],
        )
        scv.pl.scatter(
            a,
            color="latent_time",
            basis="pca",
            size=80,
            color_map="gnuplot",
            ax=ax[2],
            show=False,
            title=a.obs.age[0],
        )
        scv.pl.scatter(
            a,
            color="end_points",
            basis="pca",
            size=80,
            color_map="gnuplot",
            ax=ax[3],
            show=False,
            title=a.obs.age[0],
        )
        scv.pl.scatter(
            a,
            color="root_cells",
            basis="pca",
            size=80,
            color_map="gnuplot",
            ax=ax[4],
            show=False,
            title=a.obs.age[0],
        )


# Cell
# https://stackoverflow.com/questions/15408371/cumulative-distribution-plots-python
[docs]def cumulative_boxplot():
    data = adata_copy[adata_copy.obs.age == "A",].obs.n_counts
    # evaluate the histogram
    values, base = np.histogram(data, bins=10)
    # evaluate the cumulative
    cumulative = np.cumsum(values)
    # plot the cumulative function
    plt.scatter(base[:-1], cumulative / cumulative[-1], c="blue")
    # plot the survival function
    # plt.plot(base[:-1], len(data)-cumulative, c='green')