scVelo benchmark on cell cycle data#
Notebook benchmarks velocity, latent time inference, and cross boundary correctness using scVelo on cell cycle data.
Library imports#
import numpy as np
import pandas as pd
import anndata as ad
import scvelo as scv
from cellrank.kernels import VelocityKernel
from rgv_tools import DATA_DIR
from rgv_tools.benchmarking import get_time_correlation
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_csv from `anndata` is deprecated. Import anndata.io.read_csv instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_excel from `anndata` is deprecated. Import anndata.io.read_excel instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_hdf from `anndata` is deprecated. Import anndata.io.read_hdf instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_loom from `anndata` is deprecated. Import anndata.io.read_loom instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_mtx from `anndata` is deprecated. Import anndata.io.read_mtx instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_umi_tools from `anndata` is deprecated. Import anndata.io.read_umi_tools instead.
warnings.warn(msg, FutureWarning)
General settings#
scv.settings.verbosity = 3
Constants#
DATASET = "cell_cycle"
STATE_TRANSITIONS = [("G1", "S"), ("S", "G2M")]
SAVE_DATA = True
if SAVE_DATA:
(DATA_DIR / DATASET / "results").mkdir(parents=True, exist_ok=True)
Data loading#
adata = ad.io.read_h5ad(DATA_DIR / DATASET / "processed" / "adata_processed.h5ad")
adata
AnnData object with n_obs × n_vars = 1146 × 395
obs: 'phase', 'fucci_time', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'n_counts'
var: 'ensum_id', 'gene_count_corr', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes'
uns: 'log1p', 'neighbors', 'pca', 'umap', 'velocity_params'
obsm: 'X_pca', 'X_umap'
varm: 'PCs', 'true_skeleton'
layers: 'Ms', 'Mu', 'spliced', 'total', 'unspliced', 'velocity'
obsp: 'connectivities', 'distances'
Velocity pipeline#
# Parameter inference
scv.tl.recover_dynamics(adata, fit_scaling=False, var_names=adata.var_names, n_jobs=1)
recovering dynamics (using 1/112 cores)
finished (0:00:47) --> added
'fit_pars', fitted parameters for splicing dynamics (adata.var)
# Velocity inferene
adata.var["fit_scaling"] = 1.0
scv.tl.velocity(adata, mode="dynamical", min_likelihood=-np.inf, min_r2=None)
computing velocities
finished (0:00:00) --> added
'velocity', velocity vectors for each individual cell (adata.layers)
scv.tl.velocity_graph(adata, vkey="velocity", n_jobs=1)
scv.tl.velocity_confidence(adata, vkey="velocity")
computing velocity graph (using 1/112 cores)
finished (0:00:00) --> added
'velocity_graph', sparse matrix with cosine correlations (adata.uns)
--> added 'velocity_length' (adata.obs)
--> added 'velocity_confidence' (adata.obs)
--> added 'velocity_confidence_transition' (adata.obs)
time_correlation = [
get_time_correlation(ground_truth=adata.obs["fucci_time"], estimated=adata.layers["fit_t"].mean(axis=1))
]
time_correlation
[0.2557183701088282]
Cross-boundary correctness#
vk = VelocityKernel(adata).compute_transition_matrix()
cluster_key = "phase"
rep = "X_pca"
score_df = []
for source, target in STATE_TRANSITIONS:
cbc = vk.cbc(source=source, target=target, cluster_key=cluster_key, rep=rep)
score_df.append(
pd.DataFrame(
{
"State transition": [f"{source} - {target}"] * len(cbc),
"CBC": cbc,
}
)
)
score_df = pd.concat(score_df)
np.mean(score_df["CBC"])
0.815740452774211
Data saving#
if SAVE_DATA:
pd.DataFrame({"time": time_correlation}, index=adata.obs_names).to_parquet(
path=DATA_DIR / DATASET / "results" / "scvelo_correlation.parquet"
)
adata.obs[["velocity_confidence"]].to_parquet(path=DATA_DIR / DATASET / "results" / "scvelo_confidence.parquet")
score_df.to_parquet(path=DATA_DIR / DATASET / "results" / "scvelo_cbc.parquet")