Cell cycle data preparation#

Notebook prepares data for inference tasks.

Library imports#

from tqdm import tqdm

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

import anndata as ad
import scanpy as sc
import scvelo as scv
from anndata import AnnData
from velovi import preprocess_data

from rgv_tools import DATA_DIR

Constants#

DATASET = "cell_cycle"
SAVE_DATA = True
if SAVE_DATA:
    (DATA_DIR / DATASET / "processed").mkdir(parents=True, exist_ok=True)

Function definitions#

def prepare_data(adata: AnnData) -> None:
    """Update cell cycle data to include only relevant information in the standard format."""
    adata.X = adata.layers["spliced"].copy()

    for layer in ["ambiguous", "matrix", "spanning"]:
        del adata.layers[layer]
    adata.layers["total"] = adata.layers["unspliced"] + adata.layers["spliced"]

    columns_to_drop = [
        "Well_Plate",
        "plate",
        "MeanGreen530",
        "MeanRed585",
        "initial_size_unspliced",
        "initial_size_spliced",
        "initial_size",
    ]
    adata.obs["phase"] = adata.obs["phase"].astype(str).replace({"N/A": np.nan, "S-ph": "S"}).astype("category")
    adata.obs.drop(columns_to_drop, axis=1, inplace=True)

    adata.var["ensum_id"] = adata.var_names
    adata.var_names = adata.var["name"].values.astype(str)
    adata.var_names_make_unique()
    columns_to_drop = [
        "name",
        "biotype",
        "description",
        "Accession",
        "Chromosome",
        "End",
        "Start",
        "Strand",
        "GeneName",
    ]
    adata.var.drop(columns_to_drop, axis=1, inplace=True)

Data loading#

adata = ad.io.read_h5ad(DATA_DIR / DATASET / "raw" / "adata.h5ad")
adata
AnnData object with n_obs × n_vars = 1146 × 19997
    obs: 'Well_Plate', 'plate', 'phase', 'MeanGreen530', 'MeanRed585', 'fucci_time', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size'
    var: 'name', 'biotype', 'description', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'GeneName'
    layers: 'ambiguous', 'matrix', 'spanning', 'spliced', 'unspliced'

Data processing#

prepare_data(adata=adata)

if SAVE_DATA:
    adata.write(DATA_DIR / DATASET / "processed" / "adata.h5ad")

adata
AnnData object with n_obs × n_vars = 1146 × 19997
    obs: 'phase', 'fucci_time'
    var: 'ensum_id'
    layers: 'spliced', 'unspliced', 'total'
scv.pp.filter_and_normalize(adata, min_counts=10, n_top_genes=2000, log=False)
sc.pp.log1p(adata)
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
scv.pp.moments(adata)
sc.tl.umap(adata)

adata = preprocess_data(adata)
adata
Filtered out 4748 genes that are detected 10 counts (spliced).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
computing moments based on connectivities
    finished (0:00:00) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
AnnData object with n_obs × n_vars = 1146 × 395
    obs: 'phase', 'fucci_time', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts'
    var: 'ensum_id', 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes'
    uns: 'log1p', 'pca', 'neighbors', 'umap', 'velocity_params'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'spliced', 'unspliced', 'total', 'Ms', 'Mu', 'velocity'
    obsp: 'distances', 'connectivities'
true_skeleton = pd.DataFrame(np.zeros((adata.n_vars, adata.n_vars)), index=adata.var_names, columns=adata.var_names)
for fname in tqdm((DATA_DIR / DATASET / "raw" / "tf_list_5k").iterdir()):
    regulator = fname.stem
    targets = pd.read_csv(fname, delimiter="\t")["Target_genes"].tolist()
    targets = list(adata.var_names.intersection(targets))

    if len(targets) > 50:
        true_skeleton.loc[regulator, targets] = 1

adata.varm["true_skeleton"] = csr_matrix(true_skeleton.values)
26it [00:02, 11.83it/s]
if SAVE_DATA:
    adata.write(DATA_DIR / DATASET / "processed" / "adata_processed.h5ad")