Human hematopoiesis - data preprocessing

Human hematopoiesis - data preprocessing#

Here, we process a human hematopoiesis dataset to infer the underlying dynamics with CellRank in further analyses. This workflow includes the standard preprocessing steps of scRNA-seq data, and the inference of a pseudotime with Palantir.

The corresponding data can be downloaded from here and should be saved in data/bone_marrow/raw/adata.h5ad, i.e.,

mkdir -p ../../data/bone_marrow/raw/
wget https://figshare.com/ndownloader/files/53393684 -O ../../data/bone_marrow/raw/adata.h5ad

Library imports#

import warnings

import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import mplscience

import anndata as ad
import palantir
import scanpy as sc
import scvelo as scv

from crp import DATA_DIR, FIG_DIR

General settings#

warnings.filterwarnings(action="ignore", category=FutureWarning)
sc.settings.verbosity = 2
scv.settings.verbosity = 3
mpl.use("module://matplotlib_inline.backend_inline")
mpl.rcParams["backend"]

plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Arial"]
scv.settings.set_figure_params("scvelo", dpi_save=400, dpi=80, transparent=True, fontsize=20, color_map="viridis")

Constants#

SAVE_FIGURES = False
if SAVE_FIGURES:
    (FIG_DIR / "pseudotimekernel").mkdir(parents=True, exist_ok=True)

FIGURE_FORMAT = "svg"
SAVE_RESULTS = False
if SAVE_RESULTS:
    (DATA_DIR / "bone_marrow" / "processed").mkdir(parents=True, exist_ok=True)

Data loading#

adata = ad.io.read_h5ad(DATA_DIR / "bone_marrow" / "raw" / "adata.h5ad")
adata
AnnData object with n_obs × n_vars = 6881 × 12464
    obs: 'leiden', 'celltype'
    var: 'highly_variable'
    uns: 'celltype_colors'
    obsm: 'X_umap'

Data preprocessing#

sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=1500, subset=True)
normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added
    'n_counts', counts per cell before normalization (adata.obs)
extracting highly variable genes
    finished (0:00:00)
sc.tl.pca(adata, n_comps=50)
sc.pp.neighbors(adata, n_pcs=50, n_neighbors=30)
computing PCA
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished (0:00:02)
sc.tl.umap(adata)
computing UMAP
    finished (0:00:05)
with mplscience.style_context():
    fig, ax = plt.subplots(figsize=(6, 6))
    scv.pl.scatter(adata, basis="umap", color=["celltype"], size=25, dpi=100, title="", ax=ax)
    plt.show()

if SAVE_FIGURES:
    fig, ax = plt.subplots(figsize=(6, 6))
    scv.pl.scatter(adata, basis="umap", color=["celltype"], size=25, dpi=100, title="", legend_loc=False, ax=ax)
    fig.savefig(
        FIG_DIR / "pseudotimekernel" / "umap_colored_by_cell_type.png",
        dpi=400,
        transparent=True,
        bbox_inches="tight",
    )
pc_projection = pd.DataFrame(adata.obsm["X_pca"].copy(), index=adata.obs_names)
# diffusion maps
diff_maps = palantir.utils.run_diffusion_maps(pc_projection, n_components=15)
computing neighbors
    finished (0:00:00)
adata.obsm["X_diffmap"] = diff_maps["EigenVectors"].values.copy()
with mplscience.style_context():
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(adata, basis="diffmap", color=["celltype"], components=["1, 2"], size=25, dpi=100, title="", ax=ax)
    plt.show()
# multiscale space
multiscale_space = palantir.utils.determine_multiscale_space(diff_maps)
root_idx = 3290  # adata.obsm["X_diffmap"][:, 2].argmin()
with mplscience.style_context():
    scv.pl.scatter(adata, basis="diffmap", color=["celltype", root_idx], legend_loc="right", components="1, 2", size=25)
    plt.show()
palantir_res = palantir.core.run_palantir(
    multiscale_space, adata.obs_names[root_idx], use_early_cell_as_start=True, num_waypoints=500
)
Sampling and flocking waypoints...
Time for determining waypoints: 0.000851603349049886 minutes
Determining pseudotime...
Shortest path distances using 30-nearest neighbor graph...
Time for shortest paths: 0.06112736860911051 minutes
Iteratively refining the pseudotime...
Correlation at iteration 1: 0.9996
Correlation at iteration 2: 1.0000
Entropy and branch probabilities...
Markov chain construction...
Identification of terminal states...
Computing fundamental matrix and absorption probabilities...
Project results to all cells...
adata.obs["palantir_pseudotime"] = palantir_res.pseudotime

with mplscience.style_context():
    scv.pl.scatter(
        adata, basis="umap", color=["celltype", "palantir_pseudotime"], legend_loc="right", color_map="viridis", size=25
    )
    plt.show()
if SAVE_RESULTS:
    adata.write_h5ad(DATA_DIR / "bone_marrow" / "processed" / "adata.h5ad")