Run other GRN inference methods on mHSPC dataset#
Library imports#
from itertools import permutations, product
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import celloracle as co
import scanpy as sc
from arboreto.algo import grnboost2
from rgv_tools import DATA_DIR
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_csv from `anndata` is deprecated. Import anndata.io.read_csv instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_excel from `anndata` is deprecated. Import anndata.io.read_excel instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_hdf from `anndata` is deprecated. Import anndata.io.read_hdf instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_loom from `anndata` is deprecated. Import anndata.io.read_loom instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_mtx from `anndata` is deprecated. Import anndata.io.read_mtx instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_text from `anndata` is deprecated. Import anndata.io.read_text instead.
warnings.warn(msg, FutureWarning)
/home/icb/weixu.wang/miniconda3/envs/regvelo_test/lib/python3.10/site-packages/anndata/utils.py:429: FutureWarning: Importing read_umi_tools from `anndata` is deprecated. Import anndata.io.read_umi_tools instead.
warnings.warn(msg, FutureWarning)
General setting#
%matplotlib inline
Constants#
DATASET = "mHSPC"
SAVE_DATA = True
if SAVE_DATA:
(DATA_DIR / DATASET / "results").mkdir(parents=True, exist_ok=True)
Functions definations#
We followed the GRN benchmark workflow provided by BEELINE, please check Murali-group/Beeline
def unsigned(true_edges: pd.DataFrame, pred_edges: pd.DataFrame, type: str = "alledges") -> tuple[float, float, float]:
"""Compare true vs predicted edges (unsigned) and compute precision/recall metrics.
Returns
-------
tuple: (eprec, erec, eprec_ratio)
"""
true_edges_copy = true_edges.copy()
pred_edges_copy = pred_edges.copy()
# Drop self-edges and duplicates
true_edges_copy = true_edges_copy.loc[(true_edges_copy["Gene1"] != true_edges_copy["Gene2"])]
true_edges_copy.drop_duplicates(keep="first", inplace=True)
true_edges_copy.reset_index(drop=True, inplace=True)
pred_edges_copy = pred_edges_copy.loc[(pred_edges_copy["Gene1"] != pred_edges_copy["Gene2"])]
pred_edges_copy.drop_duplicates(keep="first", inplace=True)
pred_edges_copy.reset_index(drop=True, inplace=True)
# Get a list of all possible TF to gene interactions
unique_nodes = np.unique(true_edges_copy.loc[:, ["Gene1", "Gene2"]])
possible_edges_all = set(product(set(true_edges_copy.Gene1), set(unique_nodes)))
# Get a list of all possible interactions
possible_edges_no_self = set(permutations(unique_nodes, r=2))
# Find intersection of above lists to ignore self edges
possible_edges = possible_edges_all.intersection(possible_edges_no_self)
true_edges_dict = {"|".join(p): 0 for p in possible_edges}
true_edges_str = true_edges_copy["Gene1"] + "|" + true_edges_copy["Gene2"]
true_edges_str = true_edges_str[true_edges_str.isin(true_edges_dict)]
n_edges = len(true_edges_str)
pred_edges_copy["Edges"] = pred_edges_copy["Gene1"] + "|" + pred_edges_copy["Gene2"]
pred_edges_copy = pred_edges_copy[pred_edges_copy["Edges"].isin(true_edges_dict)]
pred_edges_copy.copy()
if not pred_edges_copy.shape[0] == 0:
pred_edges_copy.loc[:, "EdgeWeight"] = pred_edges_copy.EdgeWeight.round(6).abs()
pred_edges_copy.sort_values(by="EdgeWeight", ascending=False, inplace=True)
maxk = min(pred_edges_copy.shape[0], n_edges)
edge_weight_topk = pred_edges_copy.iloc[maxk - 1].EdgeWeight
nnz_min = np.nanmin(pred_edges_copy.EdgeWeight.replace(0, np.nan).values)
best_val = max(nnz_min, edge_weight_topk)
newDF = pred_edges_copy.loc[(pred_edges_copy["EdgeWeight"] >= best_val)]
rank = set(newDF["Gene1"] + "|" + newDF["Gene2"])
intersectionSet = rank.intersection(true_edges_str)
eprec = len(intersectionSet) / len(rank)
erec = len(intersectionSet) / len(true_edges_str)
random_eprec = n_edges / len(true_edges_dict)
eprec_ratio = eprec / random_eprec
else:
eprec = 1.0
erec = 1.0
eprec_ratio = 1.0
print("EPR: " + str(eprec_ratio))
return eprec, erec, eprec_ratio
def calculate_auroc(inferred_scores_df: pd.DataFrame, ground_truth_df: pd.DataFrame) -> float:
"""Calculate AUROC comparing inferred edge scores against ground truth.
Returns
-------
float: AUROC score.
"""
ground_truth_set = set(zip(ground_truth_df["Gene1"], ground_truth_df["Gene2"]))
inferred_scores_df["label"] = inferred_scores_df.apply(
lambda row: (row["Gene1"], row["Gene2"]) in ground_truth_set, axis=1
).astype(int)
y_true = inferred_scores_df["label"]
y_scores = inferred_scores_df["EdgeWeight"]
auroc = roc_auc_score(y_true, y_scores)
return auroc
Data loading#
adata = sc.read_h5ad(DATA_DIR / DATASET / "processed" / "mHSC_ExpressionData.h5ad")
gt = pd.read_csv(DATA_DIR / DATASET / "raw" / "mHSC-ChIP-seq-network.csv")
TF = pd.read_csv(DATA_DIR / DATASET / "raw" / "mouse-tfs.csv")
TF = [i[0].upper() + i[1:].lower() for i in TF["TF"].tolist()]
TF = np.array(TF)[[i in adata.var_names for i in TF]]
gt["Gene1"] = [i[0].upper() + i[1:].lower() for i in gt["Gene1"].tolist()]
gt["Gene2"] = [i[0].upper() + i[1:].lower() for i in gt["Gene2"].tolist()]
gt = gt.loc[[i in TF for i in gt["Gene1"]], :]
gt = gt.loc[[i in adata.var_names for i in gt["Gene2"]], :]
gt
| Gene1 | Gene2 | Score | |
|---|---|---|---|
| 24829 | Bcl11b | Ccna2 | 2546.0 |
| 24913 | Bcl11b | Edem1 | 2532.0 |
| 24928 | Bcl11b | Tmem229b | 2023.0 |
| 25015 | Bcl11b | Clptm1l | 1598.0 |
| 25037 | Bcl11b | Cyld | 2156.0 |
| ... | ... | ... | ... |
| 942678 | Stat3 | Abcg3 | 101.0 |
| 942681 | Stat3 | Il10 | 607.0 |
| 942686 | Stat3 | Arg1 | 499.0 |
| 942696 | Stat3 | Mmp9 | 445.0 |
| 942698 | Stat3 | Cd209a | 437.0 |
8940 rows × 3 columns
Pearson correlation#
grn_estimate = adata.to_df(layer="Ms").corr().values
np.fill_diagonal(grn_estimate, 0)
grn_estimate = np.abs(grn_estimate)
grn_estimate = pd.DataFrame(grn_estimate, index=adata.var_names.tolist(), columns=adata.var_names.tolist())
grn_estimate = grn_estimate.loc[:, TF].copy()
grn = pd.DataFrame(grn_estimate.stack()).reset_index()
grn.columns = ["Gene2", "Gene1", "EdgeWeight"]
result = grn[["Gene1", "Gene2", "EdgeWeight"]].sort_values(by="EdgeWeight", ascending=False).reset_index(drop=True)
result.shape
(94170, 3)
_, _, epr_cor = unsigned(gt, result)
EPR: 1.1085224655249315
auc_cor = calculate_auroc(result, gt)
auc_cor
0.5660399003249793
GRNBoost2#
network = grnboost2(expression_data=adata.to_df(layer="Ms"), tf_names=TF.tolist())
grn_estimate = pd.pivot(network, index="target", columns="TF").fillna(0).values.T
network.columns = ["Gene1", "Gene2", "EdgeWeight"]
_, _, epr_gbt2 = unsigned(gt, network)
EPR: 1.0147529772351342
auc_gbt2 = calculate_auroc(network, gt)
auc_gbt2
0.5339281564361597
Using CellOracle#
base_grn = np.ones((len(TF), adata.n_vars))
base_grn = pd.DataFrame(base_grn, index=TF, columns=adata.var_names)
base_grn["peak_id"] = ["peak_" + i for i in TF]
base_grn["gene_short_name"] = TF
base_grn = base_grn[["peak_id", "gene_short_name"] + adata.var_names.to_list()]
net = co.Net(gene_expression_matrix=adata.to_df(layer="Ms"), TFinfo_matrix=base_grn, verbose=False)
net.fit_All_genes(bagging_number=100, alpha=1, verbose=False)
net.updateLinkList(verbose=False)
network = net.linkList[["source", "target", "coef_mean"]]
network.columns = ["Gene1", "Gene2", "EdgeWeight"]
network["EdgeWeight"] = np.abs(network["EdgeWeight"])
_, _, epr_co = unsigned(gt, network)
EPR: 1.1963372909401349
auc_co = calculate_auroc(network, gt)
auc_co
0.55718676439274
Results#
result_df = pd.DataFrame(
{
"EPR": [epr_cor, epr_gbt2, epr_co],
"AUC": [auc_cor, auc_gbt2, auc_co],
"Method": ["Corr", "GRNBoost2", "CellOracle"],
}
)
if SAVE_DATA:
result_df.to_csv(DATA_DIR / DATASET / "results" / "GRN_benchmark.csv")