Cell cycle data preparation#
This notebook prepares the cell cycle dataset for downstream inference tasks.
Library imports#
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import anndata as ad
import scanpy as sc
import scvelo as scv
from anndata import AnnData
from velovi import preprocess_data
from rgv_tools import DATA_DIR
Constants#
# Dataset sub-directory (below DATA_DIR) that this notebook reads from and writes to.
DATASET = "cell_cycle"

# Switch controlling whether processed objects are written to disk.
SAVE_DATA = True

# Ensure the output directory exists before any cell tries to write into it.
if SAVE_DATA:
    (DATA_DIR / DATASET / "processed").mkdir(exist_ok=True, parents=True)
Function definitions#
def prepare_data(adata: AnnData) -> None:
    """Reduce the raw cell cycle object to the standard layout, modifying it in place.

    The spliced counts become ``adata.X``; the ``ambiguous``, ``matrix`` and ``spanning``
    layers are removed and a ``total`` layer (unspliced + spliced) is added.
    ``obs["phase"]`` is recoded (``"N/A"`` becomes missing, ``"S-ph"`` becomes ``"S"``)
    and stored as a categorical. The original ``var_names`` are kept in
    ``var["ensum_id"]`` and the entries of ``var["name"]`` become the new ``var_names``,
    made unique. The annotation columns listed in the body are dropped from ``obs`` and
    ``var``.

    Parameters
    ----------
    adata
        Raw annotated data containing the layers and the ``obs``/``var`` columns
        referenced in the body; a missing layer or column raises ``KeyError``.

    Returns
    -------
    None. ``adata`` is updated in place.
    """
    unused_layers = ("ambiguous", "matrix", "spanning")
    unused_obs_columns = [
        "Well_Plate",
        "plate",
        "MeanGreen530",
        "MeanRed585",
        "initial_size_unspliced",
        "initial_size_spliced",
        "initial_size",
    ]
    unused_var_columns = [
        "name",
        "biotype",
        "description",
        "Accession",
        "Chromosome",
        "End",
        "Start",
        "Strand",
        "GeneName",
    ]

    # Layers: expose spliced counts as X, discard unused layers, add total counts.
    adata.X = adata.layers["spliced"].copy()
    for layer_name in unused_layers:
        del adata.layers[layer_name]
    adata.layers["total"] = adata.layers["unspliced"] + adata.layers["spliced"]

    # Cell annotations: harmonise the phase labels, then drop the unused columns.
    phase_labels = adata.obs["phase"].astype(str)
    phase_labels = phase_labels.replace({"N/A": np.nan, "S-ph": "S"})
    adata.obs["phase"] = phase_labels.astype("category")
    adata.obs.drop(columns=unused_obs_columns, inplace=True)

    # Gene annotations: keep the original identifiers before re-indexing by gene name.
    # NOTE(review): "ensum_id" looks like a typo of "ensembl_id"; the key is kept as is
    # because other code may read it.
    adata.var["ensum_id"] = adata.var_names
    adata.var_names = adata.var["name"].values.astype(str)
    adata.var_names_make_unique()
    adata.var.drop(columns=unused_var_columns, inplace=True)
Data loading#
# Load the raw AnnData object stored under DATA_DIR / DATASET / "raw".
adata = ad.io.read_h5ad(DATA_DIR / DATASET / "raw" / "adata.h5ad")
# Display a summary of the loaded object.
adata
AnnData object with n_obs × n_vars = 1146 × 19997
obs: 'Well_Plate', 'plate', 'phase', 'MeanGreen530', 'MeanRed585', 'fucci_time', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size'
var: 'name', 'biotype', 'description', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'GeneName'
layers: 'ambiguous', 'matrix', 'spanning', 'spliced', 'unspliced'
Data processing#
# Standardise the raw object in place (layers, obs and var annotations).
prepare_data(adata)

# Persist the cleaned object before any gene filtering or normalization is applied.
if SAVE_DATA:
    adata.write(DATA_DIR / DATASET / "processed" / "adata.h5ad")

adata
AnnData object with n_obs × n_vars = 1146 × 19997
obs: 'phase', 'fucci_time'
var: 'ensum_id'
layers: 'spliced', 'unspliced', 'total'
# Gene filtering (min_counts=10), normalization and selection of the 2000 top genes;
# log=False leaves the log-transform to the explicit log1p call below.
scv.pp.filter_and_normalize(adata, min_counts=10, n_top_genes=2000, log=False)
sc.pp.log1p(adata)
# PCA, then a neighbor graph with 30 neighbors computed on 30 principal components.
sc.tl.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=30)
# Per the logged output, this adds the 'Ms' and 'Mu' moment layers based on the connectivities.
scv.pp.moments(adata)
sc.tl.umap(adata)
# velovi preprocessing. NOTE(review): the printed output shows velocities being computed and the
# object shrinking to 395 genes at this step; confirm the exact filtering criterion against velovi.
adata = preprocess_data(adata)
adata
Filtered out 4748 genes that are detected 10 counts (spliced).
Normalized count data: X, spliced, unspliced.
Extracted 2000 highly variable genes.
computing moments based on connectivities
finished (0:00:00) --> added
'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
computing velocities
finished (0:00:00) --> added
'velocity', velocity vectors for each individual cell (adata.layers)
AnnData object with n_obs × n_vars = 1146 × 395
obs: 'phase', 'fucci_time', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts'
var: 'ensum_id', 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes'
uns: 'log1p', 'pca', 'neighbors', 'umap', 'velocity_params'
obsm: 'X_pca', 'X_umap'
varm: 'PCs'
layers: 'spliced', 'unspliced', 'total', 'Ms', 'Mu', 'velocity'
obsp: 'distances', 'connectivities'
# Ground-truth regulatory skeleton over the retained genes: rows are indexed by regulator,
# columns by target gene; an entry of 1 marks a listed regulator -> target pair.
true_skeleton = pd.DataFrame(np.zeros((adata.n_vars, adata.n_vars)), index=adata.var_names, columns=adata.var_names)
for fname in tqdm((DATA_DIR / DATASET / "raw" / "tf_list_5k").iterdir()):
    # The regulator is taken from the file name (without extension).
    regulator = fname.stem
    # A regulator that is not among the retained genes has no row in the skeleton. Assigning
    # to it through `.loc` would append a new row (pandas "setting with enlargement"), so the
    # matrix would no longer be n_vars x n_vars as required for `adata.varm`. Skip it instead.
    if regulator not in adata.var_names:
        continue
    targets = pd.read_csv(fname, delimiter="\t")["Target_genes"].tolist()
    # Restrict the listed targets to genes present in `adata`.
    targets = list(adata.var_names.intersection(targets))
    # Only regulators with more than 50 retained targets are recorded.
    if len(targets) > 50:
        true_skeleton.loc[regulator, targets] = 1

# Store the skeleton as a sparse gene-by-gene matrix alongside the gene annotations.
adata.varm["true_skeleton"] = csr_matrix(true_skeleton.values)
26it [00:02, 11.83it/s]
# Persist the fully preprocessed object, including the ground-truth skeleton stored in varm.
if SAVE_DATA:
    adata.write(DATA_DIR / DATASET / "processed" / "adata_processed.h5ad")