Basic preprocessing and analysis of the zebrafish data#

Notebook preprocesses the zebrafish Smart-seq3 dataset.

Library imports#

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import mplscience

import scanpy as sc
import scvelo as scv
from velovi import preprocess_data

from rgv_tools import DATA_DIR, FIG_DIR
from rgv_tools.preprocessing import filter_genes, set_prior_grn
/home/icb/weixu.wang/miniconda3/envs/dynamo/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_csv from `anndata` is deprecated. Import anndata.io.read_csv instead.
  warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/dynamo/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_excel from `anndata` is deprecated. Import anndata.io.read_excel instead.
  warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/dynamo/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_hdf from `anndata` is deprecated. Import anndata.io.read_hdf instead.
  warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/dynamo/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_loom from `anndata` is deprecated. Import anndata.io.read_loom instead.
  warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/dynamo/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_mtx from `anndata` is deprecated. Import anndata.io.read_mtx instead.
  warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/dynamo/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead.
  warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/dynamo/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_umi_tools from `anndata` is deprecated. Import anndata.io.read_umi_tools instead.
  warnings.warn(msg, FutureWarning)

General settings#

sc.settings.verbosity = 2
scv.settings.verbosity = 3
plt.rcParams["svg.fonttype"] = "none"

Constants#

DATASET = "zebrafish"
SAVE_DATA = True
if SAVE_DATA:
    (DATA_DIR / DATASET / "processed").mkdir(parents=True, exist_ok=True)
SAVE_FIGURES = True
if SAVE_FIGURES:
    (FIG_DIR / DATASET).mkdir(parents=True, exist_ok=True)

Data loading#

adata = sc.read_h5ad(DATA_DIR / DATASET / "raw" / "adata_zebrafish_preprocessed.h5ad")
tfs = pd.read_csv(DATA_DIR / DATASET / "raw" / "zebrafish_tfs.csv", index_col=0).iloc[:, 0].tolist()
prior_net = pd.read_csv(DATA_DIR / DATASET / "raw" / "prior_GRN.csv", index_col=0)
## Only keep necessary TF list
keep_list = pd.read_csv(DATA_DIR / DATASET / "raw" / "keep_tf.csv", sep=";").iloc[:, 0].tolist()
sc.pp.neighbors(adata, n_neighbors=30)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished (0:00:06)
scv.pp.moments(adata, n_pcs=None, n_neighbors=None)
adata
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
AnnData object with n_obs × n_vars = 697 × 8012
    obs: 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts', 'cell_type', 'stage'
    var: 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'gene_count_corr', 'is_tf'
    uns: 'cell_type_colors', 'neighbors'
    obsm: 'X_pca', 'X_umap'
    layers: 'ambiguous', 'matrix', 'spliced', 'unspliced', 'Ms', 'Mu'
    obsp: 'distances', 'connectivities'
scv.pl.umap(adata, color="cell_type", palette=sc.pl.palettes.vega_20, legend_loc="right")
../_images/0306155cae9973a6159e9efb8150465034067fe5e7109311e8c02b5d626826a3.png

Visualize results#

with mplscience.style_context():
    fig, ax = plt.subplots(figsize=(7, 5))
    scv.pl.umap(adata, color="cell_type", ax=ax, palette=sc.pl.palettes.vega_20)

    if SAVE_FIGURES:
        fig.savefig(FIG_DIR / DATASET / "INTRO_figure_all_ct.svg", format="svg", transparent=True, bbox_inches="tight")
    plt.show
../_images/ae87dd4551372694239f8e1381f08d2aee5dd8f0b142c6972408d8ddbd488755.png

Visualize known terminal states#

adata.obs["cell_type2"] = adata.obs["cell_type"].copy()
adata.obs["cell_type2"][
    ~adata.obs["cell_type2"].isin(["NPB_nohox", "mNC_hox34", "Pigment", "mNC_head_mesenchymal", "mNC_arch2"])
] = np.nan
adata.obs["cell_type2"] = adata.obs["cell_type2"].cat.remove_unused_categories()
palette = dict(zip(adata.obs["cell_type"].cat.categories, adata.uns["cell_type_colors"]))
subset_palette = {
    name: color for name, color in palette.items() if name in adata.obs["cell_type2"].cat.categories.tolist()
}
which = "macrostates"
adata.obs[which] = adata.obs["cell_type2"].copy()

state_names = adata.obs[which].cat.categories.tolist()
adata.obs[which] = adata.obs[which].astype(str).astype("category").cat.reorder_categories(["nan"] + state_names)

if which == "macrostates":
    adata.uns[f"{which}_colors"] = ["#dedede"] + list(subset_palette.values())
else:
    adata.uns[f"{which}_colors"] = ["#dedede"] + list(subset_palette.values())
state_names = adata.obs[which].cat.categories.tolist()[1:]


with mplscience.style_context():
    fig, ax = plt.subplots(figsize=(4, 3))
    scv.pl.scatter(
        adata,
        basis="umap",
        c=which,
        add_outline=state_names,
        ax=ax,
        size=60,
    )

    if SAVE_FIGURES:
        fig.savefig(FIG_DIR / DATASET / "INTRO_figure.svg", format="svg", transparent=True, bbox_inches="tight")
    plt.show()
../_images/e665b844451f3f62d2c02d7316810675e95847ffc52ffc386b4f106c8bcd8bd9.png

Preprocessing#

adata = set_prior_grn(adata, prior_net)
velocity_genes = preprocess_data(adata.copy()).var_names.tolist()
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
adata.var["TF"] = np.isin(adata.var_names, tfs)

Select genes that are either part of the transcription factor (TF) list or velocity_genes

## velocity_r2 positive genes
var_mask = np.union1d(list(set(keep_list).intersection(adata.var_names)), velocity_genes)

## Filtering genes, only keep velocity_r2 positive genes and TFs
adata = adata[:, var_mask].copy()
adata = filter_genes(adata)
adata = preprocess_data(adata, filter_on_r2=False)
Number of genes: 1013
Number of genes: 989
Number of genes: 988
# focus on velocity genes to ensure calculation stability
adata.var["velocity_genes"] = adata.var_names.isin(velocity_genes)

Save dataset#

if SAVE_DATA:
    adata.write_h5ad(DATA_DIR / DATASET / "processed" / "adata_preprocessed.h5ad")