Hematopoiesis - RNA velocity#

Infer RNA velocity on NeurIPS 2021 hematopoiesis data.

import sys

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import mplscience
import seaborn as sns

import cellrank as cr
import scanpy as sc
import scvelo as scv
from anndata import AnnData

from cr2 import get_state_purity, plot_state_purity, running_in_notebook

sys.path.extend(["../../../", "."])
from paths import DATA_DIR, FIG_DIR  # isort: skip  # noqa: E402
Global seed set to 0

General settings#

sc.settings.verbosity = 2
cr.settings.verbosity = 4
scv.settings.verbosity = 3
scv.settings.set_figure_params("scvelo", dpi_save=400, dpi=80, transparent=True, fontsize=20, color_map="viridis")
SAVE_FIGURES = False
if SAVE_FIGURES:
    (FIG_DIR / "pseudotime_kernel" / "hematopoiesis").mkdir(parents=True, exist_ok=True)

FIGURE_FORMAT = "pdf"
(DATA_DIR / "hematopoiesis" / "results").mkdir(parents=True, exist_ok=True)

Constants#

N_JOBS = 8
CELLTYPES_TO_KEEP = [
    "HSC",
    "MK/E prog",
    "Proerythroblast",
    "Erythroblast",
    "Normoblast",
    "cDC2",
    "pDC",
    "G/M prog",
    "CD14+ Mono",
]

Data loading#

adata = sc.read(DATA_DIR / "hematopoiesis" / "processed" / "gex_velocity.h5ad")
adata
AnnData object with n_obs × n_vars = 67568 × 25629
    obs: 'batch', 'site', 'donor', 'l1_cell_type', 'l2_cell_type'
    var: 'hvg_multiVI'
    uns: 'neighbors'
    obsm: 'MultiVI_latent', 'X_umap'
    layers: 'counts', 'spliced', 'unspliced'
    obsp: 'connectivities', 'distances'
if running_in_notebook():
    scv.pl.scatter(adata, basis="X_umap", c="l2_cell_type", dpi=200, title="", legend_fontsize=5, legend_fontweight=1)
../../_images/b04c38a58d68bc58eb44260dfc20024a48022f8e0626fcb24281efd1a3d3388d.png

Data preprocessing#

adata = adata[adata.obs["l2_cell_type"].isin(CELLTYPES_TO_KEEP), :].copy()
adata
AnnData object with n_obs × n_vars = 24440 × 25629
    obs: 'batch', 'site', 'donor', 'l1_cell_type', 'l2_cell_type'
    var: 'hvg_multiVI'
    uns: 'neighbors', 'l2_cell_type_colors'
    obsm: 'MultiVI_latent', 'X_umap'
    layers: 'counts', 'spliced', 'unspliced'
    obsp: 'connectivities', 'distances'
scv.pp.filter_genes(adata, min_shared_counts=20)
scv.pp.normalize_per_cell(adata)
Filtered out 16169 genes that are detected 20 counts (shared).
WARNING: Did not normalize X as it looks processed already. To enforce normalization, set `enforce=True`.
Normalized count data: spliced, unspliced.
sc.pp.neighbors(adata, use_rep="MultiVI_latent")
sc.tl.umap(adata)
computing neighbors
    finished (0:02:27)
computing UMAP
    finished (0:00:10)
if running_in_notebook():
    scv.pl.scatter(
        adata,
        basis="X_umap",
        c="l2_cell_type",
        dpi=200,
        title="",
        legend_fontsize=10,
        legend_fontweight=5,
    )
../../_images/8a36ae0dc7a0ef5631a9e1f33b7ea45d72fa5575aaa776b6e39aa07da3305873.png
scv.pp.moments(adata, n_pcs=None, n_neighbors=None)
computing moments based on connectivities
    finished (0:00:09) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
for gene in ["HBA2", "HBA1", "GYPC", "TFRC", "AKAP13", "ABCB10", "ANK1", "GATA1", "GATA2"]:
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(adata, basis=gene, color="l2_cell_type", frameon=False, ax=ax)

    if SAVE_FIGURES:
        fig.savefig(
            FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"phase_portrait_{gene}.{FIGURE_FORMAT}",
            format=FIGURE_FORMAT,
            transparent=True,
            bbox_inches="tight",
        )
../../_images/5c2455a83bc05f8245575e14a8d82698d52fd5c94ea19cb7cd573129b16ea97d.png ../../_images/cbe54ccdde4530bc589c36a5678caf6bfe1bc7826f26ce1acdb4d5258db2c7cd.png ../../_images/168353a980a161567912f5bed6b7ea1393c64c37f2c63d007240caa99ebed2b2.png ../../_images/9699fa38e7fbcd75a58353cf8049ef29ce60cd001bf0b16da70415fa457a1ead.png ../../_images/96912fc43f4c3ac6abda43a7816c5d320cc0b7d8204880429446050a1d71f125.png ../../_images/1b2a15212dbd5ca0f57ddbb116a290bd18477bf8aa2212e5bcaf2e3a707da8e9.png ../../_images/cea79fac1f732240722ffa8aa563462f2d458bd0c812478f45ac549363909eec.png ../../_images/5dcb711020b2843e48b446229841af8809a6a87c68c013ca0fbe172153cdff33.png ../../_images/85b7781d58fdb5312566b32e044fb1be5b2b1457b12a1dd360f6ee51d9395921.png

RNA velocity inference#

scv.tl.recover_dynamics(adata, n_jobs=N_JOBS)
recovering dynamics (using 8/14 cores)
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
Global seed set to 0
    finished (0:03:30) --> added 
    'fit_pars', fitted parameters for splicing dynamics (adata.var)
scv.tl.velocity(adata, mode="dynamical")
computing velocities
    finished (0:00:18) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)

CellRank analysis#

Kernel#

vk = cr.kernels.VelocityKernel(adata).compute_transition_matrix()
ck = cr.kernels.ConnectivityKernel(adata).compute_transition_matrix()

combined_kernel = 0.8 * vk + 0.2 * ck
Computing transition matrix using `'deterministic'` model
Using `softmax_scale=10.3350`
    Finish (0:00:20)
Computing transition matrix based on `adata.obsp['connectivities']`
DEBUG: Density normalizing the transition matrix
    Finish (0:00:00)
combined_kernel.plot_projection(color="l2_cell_type", recompute=True, basis="X_umap", dpi=200, legend_fontsize=5)

if SAVE_FIGURES:
    fig, ax = plt.subplots(figsize=(6, 4))
    combined_kernel.plot_projection(
        color="l2_cell_type",
        recompute=True,
        basis="X_umap",
        title="",
        legend_loc="none",
        alpha=0.25,
        linewidth=2,
        ax=ax,
    )

    fig.savefig(
        FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"rna_velocity_stream.{FIGURE_FORMAT}",
        format=FIGURE_FORMAT,
        transparent=True,
        bbox_inches="tight",
        dpi=400,
    )
Projecting transition matrix onto `umap`
Adding `adata.obsm['T_fwd_umap']`
    Finish (0:00:07)
../../_images/d8938b05b3e22df180d3a1aa381b8f544a17f9ca427144d72008477e308912e3.png

Estimator#

estimator = cr.estimators.GPCCA(combined_kernel)
estimator.compute_schur(n_components=20)
estimator.plot_spectrum(real_only=True)
Computing Schur decomposition
WARNING: Using `20` components would split a block of complex conjugate eigenvalues. Using `n_components=21`
When computing macrostates, choose a number of states NOT in `[10, 20]`
Adding `adata.uns['eigendecomposition_fwd']`
       `.schur_vectors`
       `.schur_matrix`
       `.eigendecomposition`
    Finish (0:00:02)
../../_images/c196cfed7b4c296a14b197484d400b3151ba2e2e5edba2941208748348445be1.png
estimator.compute_macrostates(n_states=3, cluster_key="l2_cell_type")
estimator.plot_macrostates(which="all", basis="umap", legend_loc="right", title="", size=100)
if SAVE_FIGURES:
    fpath = (
        FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"umap_colored_by_rna_velo_three_macrostates.{FIGURE_FORMAT}"
    )
    estimator.plot_macrostates(which="all", basis="umap", title="", legend_loc=False, size=100, save=fpath)
Computing `3` macrostates
DEBUG: Setting the macrostates using macrostates memberships
DEBUG: Raising an exception if there are less than `6` cells.
Adding `.macrostates`
       `.macrostates_memberships`
       `.coarse_T`
       `.coarse_initial_distribution
       `.coarse_stationary_distribution`
       `.schur_vectors`
       `.schur_matrix`
       `.eigendecomposition`
    Finish (0:00:00)
../../_images/24f7b41b1cfe78534d7c3b7872d4ed1595af3f87cf3d9d35d0e2c7c1204a5f52.png
macrostate_purity = get_state_purity(adata, estimator, states="macrostates", obs_col="l2_cell_type")
print(f"Mean purity: {np.mean(list(macrostate_purity.values()))}")

if running_in_notebook():
    if SAVE_FIGURES:
        fpath = FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"rna_velo_three_macrostate_purity.{FIGURE_FORMAT}"
    else:
        fpath = None

    palette = dict(zip(estimator.macrostates.cat.categories, estimator._macrostates.colors))

    plot_state_purity(macrostate_purity, palette=palette, fpath=fpath, format=FIGURE_FORMAT)
Mean purity: 1.0
../../_images/3308c41b4ebe9115fbf1d16c13213228f9daa7df0230411b15d8fa3522a5038a.png
terminal_states = ["CD14+ Mono", "Normoblast", "cDC2", "pDC"]
cluster_key = "l2_cell_type"

if (DATA_DIR / "hematopoiesis" / "results" / "tsi-vk.csv").is_file():
    tsi_df = pd.read_csv(DATA_DIR / "hematopoiesis" / "results" / "tsi-vk.csv")
    estimator._tsi = AnnData(tsi_df, uns={"terminal_states": terminal_states, "cluster_key": cluster_key})
    tsi_score = estimator.tsi(n_macrostates=17, terminal_states=terminal_states, cluster_key=cluster_key)
else:
    tsi_score = estimator.tsi(n_macrostates=17, terminal_states=terminal_states, cluster_key=cluster_key)
    estimator._tsi.to_df().to_csv(DATA_DIR / "hematopoiesis" / "results" / "tsi-vk.csv", index=False)

print(f"TSI score: {tsi_score:.2f}")
TSI score: 0.77
/vol/storage/miniconda3/envs/cr2-py38/lib/python3.8/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)
# For nice name in figure legend
estimator.kernel.__class__.__name__ = "VelocityKernel"
palette = {"VelocityKernel": "#0173b2", "Optimal identification": "#000000"}

if SAVE_FIGURES:
    fpath = FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"tsi-vk.{FIGURE_FORMAT}"
else:
    fpath = None

with mplscience.style_context():
    sns.set_style(style="whitegrid")
    estimator.plot_tsi(palette=palette, save=fpath)
    plt.show()
../../_images/f34df4898aa240b4cc2e310e2ea96a2bce7a62ab992b10412ae91a41b10c3b1c.png
estimator.set_terminal_states(["pDC", "CD14+ Mono", "Normoblast"])
estimator.plot_macrostates(which="terminal", basis="umap", title="", legend_loc="right", size=100)
if SAVE_FIGURES:
    fpath = (
        FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"umap_colored_by_cr_rna_velo_terminal_states.{FIGURE_FORMAT}"
    )
    estimator.plot_macrostates(which="terminal", basis="umap", title="", legend_loc=False, size=100, save=fpath)
DEBUG: Raising an exception if there are less than `6` cells.
Adding `adata.obs['term_states_fwd']`
       `adata.obs['term_states_fwd_probs']`
       `.terminal_states`
       `.terminal_states_probabilities`
       `.terminal_states_memberships
    Finish`
../../_images/196cb17a3ee56552db9544028860567d3538430cb8e081fbb4603707eb1de9a4.png
estimator.compute_fate_probabilities(tol=1e-7)
if running_in_notebook():
    estimator.plot_fate_probabilities(same_plot=False, basis="X_umap", ncols=2)

if SAVE_FIGURES:
    adata.obs["fate_prob_pDC"] = adata.obsm["lineages_fwd"][:, "pDC"].X.squeeze()
    adata.obs["fate_prob_CD14+Mono"] = adata.obsm["lineages_fwd"][:, "CD14+ Mono"].X.squeeze()
    adata.obs["fate_prob_Normoblast"] = adata.obsm["lineages_fwd"][:, "Normoblast"].X.squeeze()

    for terminal_state in ["pDC", "CD14+Mono", "Normoblast"]:
        fig, ax = plt.subplots(figsize=(6, 4))

        if running_in_notebook():
            scv.pl.scatter(
                adata,
                basis="umap",
                color=f"fate_prob_{terminal_state}",
                cmap="viridis",
                title="",
                colorbar=False,
                ax=ax,
            )

            fig.savefig(
                FIG_DIR
                / "pseudotime_kernel"
                / "hematopoiesis"
                / f"rna_velo_fate_prob_{terminal_state}.{FIGURE_FORMAT}",
                format=FIGURE_FORMAT,
                transparent=True,
                bbox_inches="tight",
            )
Computing fate probabilities
DEBUG: Solving the linear system using `PETSc` solver `'gmres'` on `1` core(s) with no preconditioner and `tol=1e-07`
Adding `adata.obsm['lineages_fwd']`
       `.fate_probabilities`
    Finish (0:00:00)
[0]PETSC ERROR: ------------------------------------------------------------------------
[0]PETSC ERROR: Caught signal number 13 Broken Pipe: Likely while reading or writing to a socket
[0]PETSC ERROR: Try option -start_in_debugger or -on_error_attach_debugger
[0]PETSC ERROR: or see https://petsc.org/release/faq/#valgrind and https://petsc.org/release/faq/
[0]PETSC ERROR: configure using --with-debugging=yes, recompile, link, and run 
[0]PETSC ERROR: to get more information on the crash.
Abort(59) on node 0 (rank 0 in comm 0): application called MPI_Abort(MPI_COMM_WORLD, 59) - process 0
../../_images/53890f35f3ae70aef314b9cd6ad010a2ffc26fd11ffc5a9f65963be101bac491.png
if running_in_notebook():
    if SAVE_FIGURES:
        fpath = f"{FIG_DIR}/pseudotime_kernel/hematopoiesis/umap_colored_by_rna_velo_fate.{FIGURE_FORMAT}"
    else:
        fpath = None
    fig, ax = plt.subplots(figsize=(6, 4))
    estimator.plot_fate_probabilities(
        same_plot=True,
        basis="umap",
        title="",
        legend_loc=False,
        save=fpath,
        ax=ax,
    )
../../_images/1bcb07caa6acbcfbce67583eab0bb46ddc95ebdca0dd4d0825cd5e05abf51b62.png
estimator.compute_macrostates(n_states=20, cluster_key="l2_cell_type")
estimator.plot_macrostates(which="all", basis="umap", title="", legend_loc="right", size=100)
if SAVE_FIGURES:
    fpath = FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"umap_colored_by_rna_velo_20_macrostates.{FIGURE_FORMAT}"
    estimator.plot_macrostates(which="all", basis="umap", title="", legend_loc=False, size=100, save=fpath)
WARNING: Unable to compute macrostates with `n_states=20` because it will split complex conjugate eigenvalues. Using `n_states=21`
Computing `21` macrostates
DEBUG: Setting the macrostates using macrostates memberships
DEBUG: Raising an exception if there are less than `6` cells.
Adding `.macrostates`
       `.macrostates_memberships`
       `.coarse_T`
       `.coarse_initial_distribution
       `.coarse_stationary_distribution`
       `.schur_vectors`
       `.schur_matrix`
       `.eigendecomposition`
    Finish (0:07:44)
../../_images/5aed60f57f0f9c170d701c9571c88356933ae84067fa71d8b22874a72d0849a0.png
macrostate_purity = get_state_purity(adata, estimator, states="macrostates", obs_col="l2_cell_type")
print(f"Mean purity: {np.mean(list(macrostate_purity.values()))}")

if running_in_notebook():
    if SAVE_FIGURES:
        fpath = FIG_DIR / "pseudotime_kernel" / "hematopoiesis" / f"rna_velo_20_macrostate_purity.{FIGURE_FORMAT}"
    else:
        fpath = None

    macrostates_ordered = estimator.macrostates.cat.categories.sort_values()
    palette = dict(zip(estimator.macrostates.cat.categories, estimator._macrostates.colors))

    plot_state_purity(macrostate_purity, order=macrostates_ordered, palette=palette, fpath=fpath, format=FIGURE_FORMAT)
    plt.show()
Mean purity: 0.8984126984126986
../../_images/9e891e1dcf7caf9715c78477b51e6f90ebf84a67305aa2d173dc257851246d66.png