Intestinal organoid differentiation - Data preprocessing

Intestinal organoid differentiation - Data preprocessing#

Preprocessing of scEU-seq organoid data by

  • Excluding control cells identified via labeling time "dmso"

  • Excluding tuft cells

  • Removing genes with less than 50 counts

  • Normalizing counts

  • Extracting 2000 highly variable features

  • Log1p transforming total counts stored in adata.X

  • Computing PCA

  • Constructing a nearest neighbor graph with 30 neighbors and 30 principal components

  • Computing UMAP embedding

Library imports#

import os
import sys

import pandas as pd

import matplotlib.pyplot as plt

import scanpy as sc
import scvelo as scv

from cr2 import running_in_notebook

sys.path.extend(["../../", "."])
from paths import DATA_DIR, FIG_DIR  # isort: skip  # noqa: E402
Global seed set to 0

General settings#

sc.settings.verbosity = 3
scv.settings.verbosity = 3
scv.settings.set_figure_params("scvelo")
SAVE_FIGURES = True

if SAVE_FIGURES:
    os.makedirs(FIG_DIR / "labeling_kernel", exist_ok=True)

Data loading#

adata = sc.read(DATA_DIR / "sceu_organoid" / "processed" / "raw.h5ad")

adata = adata[adata.obs["labeling_time"] != "dmso", :].copy()
adata = adata[~adata.obs["cell_type"].isin(["Tuft cells"]), :]
adata.obs["labeling_time"] = adata.obs["labeling_time"].astype(float) / 60

adata.obs.drop(["well_id", "batch_id", "log10_gfp", "monocle_branch_id", "monocle_pseudotime"], axis=1, inplace=True)

del (
    adata.layers["labeled_unspliced"],
    adata.layers["labeled_spliced"],
    adata.layers["unlabeled_unspliced"],
    adata.layers["unlabeled_spliced"],
)

adata
AnnData object with n_obs × n_vars = 3452 × 9157
    obs: 'experiment', 'labeling_time', 'cell_type', 'som_cluster_id'
    var: 'ensum_id'
    uns: 'cell_type_colors'
    obsm: 'X_umap_paper'
    layers: 'labeled', 'total', 'unlabeled'
pd.DataFrame(adata.obs[["labeling_time", "experiment"]].groupby("experiment").apply(lambda x: x.value_counts())).rename(
    {0: "value_counts"}, axis=1
).droplevel(level=2).sort_index()
value_counts
experiment labeling_time
Chase 0.00 647
0.75 805
6.00 632
Pulse 2.00 1368
adata.obs["cell_type_merged"] = adata.obs["cell_type"].copy()
adata.obs["cell_type_merged"].replace({"Enteroendocrine cells": "Enteroendocrine progenitors"}, inplace=True)

Data preprocessing#

scv.pp.filter_and_normalize(
    adata, min_counts=50, layers_normalize=["X", "labeled", "unlabeled", "total"], n_top_genes=2000
)
adata
WARNING: Could not find spliced / unspliced counts.
Normalized count data: X, labeled, unlabeled, total.
Extracted 2000 highly variable genes.
Logarithmized X.
AnnData object with n_obs × n_vars = 3452 × 2000
    obs: 'experiment', 'labeling_time', 'cell_type', 'som_cluster_id', 'cell_type_merged', 'initial_size', 'n_counts'
    var: 'ensum_id', 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
    uns: 'cell_type_colors'
    obsm: 'X_umap_paper'
    layers: 'labeled', 'total', 'unlabeled'
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 30
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:05)
sc.tl.umap(adata)
if running_in_notebook():
    scv.pl.scatter(adata, basis="umap", color="cell_type", legend_loc="right")

if SAVE_FIGURES:
    fig, ax = plt.subplots(figsize=(6, 4))
    scv.pl.scatter(adata, basis="umap", color="cell_type", title="", legend_loc=False, show=False, ax=ax)

    fig.savefig(
        FIG_DIR / "labeling_kernel" / "umap_colored_by_cell_type.eps",
        format="eps",
        transparent=True,
        bbox_inches="tight",
    )
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)
../_images/b1b59d9ba788925b340a5d594767dfcbf7c35c2091ed561aad9fdf51d1406a6f.png ../_images/a69efe14dbda6007b9acbd234dddaa3ff3d14f77c650699f77502cb92da69fbf.png
filepath = DATA_DIR / "sceu_organoid" / "processed" / "umap_coords.csv"
pd.DataFrame(adata.obsm["X_umap"], index=adata.obs_names, columns=["umap_1", "umap_2"]).to_csv(filepath)

Data saving#

adata.write(DATA_DIR / "sceu_organoid" / "processed" / "preprocessed.h5ad")