Hematopoiesis - Initial state identification#

Construct diffusion pseudotime on NeurIPS 2021 hematopoiesis data.

import sys

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import cellrank as cr
import scanpy as sc
import scvelo as scv

from cr2 import running_in_notebook

sys.path.extend(["../../../", "."])
from paths import DATA_DIR, FIG_DIR  # isort: skip  # noqa: E402
Global seed set to 0

General settings#

sc.settings.verbosity = 2
cr.settings.verbosity = 4
scv.settings.verbosity = 3
scv.settings.set_figure_params("scvelo", dpi_save=400, dpi=80, transparent=True, fontsize=20, color_map="viridis")
SAVE_FIGURES = False
if SAVE_FIGURES:
    (FIG_DIR / "pseudotime_kernel" / "hematopoiesis").mkdir(parents=True, exist_ok=True)

FIGURE_FORMAT = "pdf"
(DATA_DIR / "hematopoiesis" / "results").mkdir(parents=True, exist_ok=True)

Constants#

CELLTYPES_TO_KEEP = [
    "HSC",
    "MK/E prog",
    "Proerythroblast",
    "Erythroblast",
    "Normoblast",
    "cDC2",
    "pDC",
    "G/M prog",
    "CD14+ Mono",
]

Data loading#

adata = sc.read(DATA_DIR / "hematopoiesis" / "processed" / "gex_preprocessed.h5ad")
adata
AnnData object with n_obs × n_vars = 67568 × 25629
    obs: 'site', 'donor', 'batch', 'l1_cell_type', 'l2_cell_type'
    var: 'hvg_multiVI'
    uns: 'neighbors', 'umap'
    obsm: 'MultiVI_latent', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'
if running_in_notebook():
    scv.pl.scatter(adata, basis="X_umap", c="l2_cell_type", dpi=200, title="", legend_fontsize=5, legend_fontweight=1)
../../_images/b04c38a58d68bc58eb44260dfc20024a48022f8e0626fcb24281efd1a3d3388d.png

Data preprocessing#

adata = adata[adata.obs["l2_cell_type"].isin(CELLTYPES_TO_KEEP), :].copy()
adata
AnnData object with n_obs × n_vars = 24440 × 25629
    obs: 'site', 'donor', 'batch', 'l1_cell_type', 'l2_cell_type'
    var: 'hvg_multiVI'
    uns: 'neighbors', 'umap', 'l2_cell_type_colors'
    obsm: 'MultiVI_latent', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'
sc.pp.neighbors(adata, use_rep="MultiVI_latent")
sc.tl.umap(adata)
computing neighbors
    finished (0:02:34)
computing UMAP
    finished (0:00:12)
if running_in_notebook():
    scv.pl.scatter(
        adata,
        basis="X_umap",
        c="l2_cell_type",
        dpi=200,
        title="",
        legend_fontsize=10,
        legend_fontweight=5,
    )
../../_images/8a36ae0dc7a0ef5631a9e1f33b7ea45d72fa5575aaa776b6e39aa07da3305873.png

Diffusion pseudotime#

sc.tl.diffmap(adata, n_comps=15)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.99922997 0.9977195  0.9968419  0.9955766  0.9942717
     0.9900949  0.9884704  0.9867782  0.9852537  0.9849594  0.9830871
     0.98229724 0.9809607  0.97756666]
    finished (0:00:02)
if running_in_notebook():
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(
        adata, basis="diffmap", color=["l2_cell_type"], components=["4, 5"], size=25, dpi=100, title="", ax=ax
    )
../../_images/b6a18fd79ef402b090f15509fe174645e925bfd99b4380a852aa5c1286807d38.png
adata.obs["hsc_cluster"] = (
    adata.obs["l2_cell_type"]
    .astype("str")
    .replace(
        {
            "MK/E prog": "nan",
            "Proerythroblast": "nan",
            "Erythroblast": "nan",
            "Normoblast": "nan",
            "cDC2": "nan",
            "pDC": "nan",
            "G/M prog": "nan",
            "CD14+ Mono": "nan",
        }
    )
    .astype("category")
    .cat.reorder_categories(["nan", "HSC"])
    .copy()
)

celltype_colors = dict(zip(adata.obs["l2_cell_type"].cat.categories, adata.uns["l2_cell_type_colors"]))
adata.uns["hsc_cluster_colors"] = ["#dedede", celltype_colors["HSC"]]

if running_in_notebook():
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(
        adata,
        basis="diffmap",
        c=["hsc_cluster"],
        legend_loc="right",
        components=["4, 5"],
        add_outline="HSC",
        title="",
        ax=ax,
    )
../../_images/efe39890a7bb2afc3f93b4cb380e2b5812ae006215d5ce42ae0e50f984ef9e38.png
df = (
    pd.DataFrame(
        {
            "diff_comp": adata.obsm["X_diffmap"][:, 5],
            "cell_type": adata.obs["l2_cell_type"].values,
        }
    )
    .reset_index()
    .rename({"index": "obs_id"}, axis=1)
)
df = df.loc[df["cell_type"] == "HSC", "diff_comp"]
root_idx = df.index[df.argmax()]

set2_cmap = sns.color_palette("Set2").as_hex()
palette = [set2_cmap[-1], set2_cmap[1]]

if running_in_notebook():
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(adata, basis="diffmap", c=root_idx, legend_loc=False, palette=palette, components=["4, 5"], ax=ax)
../../_images/827627a1b9b5f688e7e5124e082734ddca24b42411e3ce8b2e58b5486b40a144.png
adata.uns["iroot"] = root_idx
sc.tl.dpt(adata, n_dcs=6)
computing Diffusion Pseudotime using n_dcs=6
    finished (0:00:00)
if running_in_notebook():
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(adata, basis="umap", color=root_idx, palette=palette, color_map="viridis", size=50, ax=ax)
../../_images/8d96f6a094ab3828a03c0065425d43b6cbd381d88abe34f4c4bf0c33125b8eac.png
if running_in_notebook():
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(adata, basis="umap", color="dpt_pseudotime", title="", color_map="viridis", ax=ax)
../../_images/5a564a2d686d7e0ab39d0b4fb696608d58164c06c47385e5beafe43795677d06.png
if running_in_notebook():
    sns.set_style(style="whitegrid")
    fig, ax = plt.subplots(figsize=(12, 4))
    sc.pl.violin(
        adata,
        keys=["dpt_pseudotime"],
        groupby="l2_cell_type",
        rotation=45,
        title="",
        legend_loc="none",
        order=[
            "HSC",
            "MK/E prog",
            "Proerythroblast",
            "Erythroblast",
            "Normoblast",
            "G/M prog",
            "CD14+ Mono",
            "cDC2",
            "pDC",
        ],
        ax=ax,
    )

    sns.reset_orig()
../../_images/7865cfc4cf87d3cff40f2c18a24619e3280d3e796bc34563f66af5b7c88c510d.png

CellRank#

Kernel#

ptk = cr.kernels.PseudotimeKernel(adata, time_key="dpt_pseudotime").compute_transition_matrix(threshold_scheme="soft")
ptk.transition_matrix = ptk.transition_matrix.T
Computing transition matrix based on pseudotime
    Finish (0:00:14)

Estimator#

estimator = cr.estimators.GPCCA(ptk)
estimator.compute_schur(n_components=20)
estimator.plot_spectrum(real_only=True)
plt.show()
Computing Schur decomposition
Adding `adata.uns['eigendecomposition_fwd']`
       `.schur_vectors`
       `.schur_matrix`
       `.eigendecomposition`
    Finish (0:00:03)
../../_images/ed9a5b5c6df7b27ef3ca39971c5311dd7cb31cf21604517dab83adca85f83b9a.png
estimator.compute_macrostates(1, cluster_key="l2_cell_type")
if running_in_notebook():
    estimator.plot_macrostates(which="all", basis="umap", title="", legend_loc="right", size=100)
    if SAVE_FIGURES:
        fpath = (
            FIG_DIR
            / "pseudotime_kernel"
            / "hematopoiesis"
            / f"umap_colored_by_cr_dpt_macrostates-initial_state.{FIGURE_FORMAT}"
        )
        estimator.plot_macrostates(which="all", basis="umap", title="", legend_loc=False, size=100, save=fpath)
For 1 macrostate, stationary distribution is computed
Computing eigendecomposition of the transition matrix
DEBUG: Computing top `20` eigenvalues of a sparse matrix
DEBUG: Sorting eigenvalues by their real part
Adding `adata.uns['eigendecomposition_fwd']`
       `.eigendecomposition`
    Finish (0:00:06)
DEBUG: Setting the macrostates using macrostates memberships
DEBUG: Raising an exception if there are less than `6` cells.
Adding `.macrostates`
       `.macrostates_memberships`
       `.coarse_T`
       `.coarse_initial_distribution
       `.coarse_stationary_distribution`
       `.schur_vectors`
       `.schur_matrix`
       `.eigendecomposition`
    Finish (0:00:06)
../../_images/f945705506780772183e669cd5d438c4c26adca3e6f29b1f930a5978a0258deb.png