import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',
'AMI_Louvain','AMI_kmeans','AMI_HC',
'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])
workdir = './output/'
path_fm = os.path.join(workdir,'feature_matrices/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
os.system('mkdir -p '+path_clusters)
os.system('mkdir -p '+path_metrics)
0
metadata = pd.read_csv('./input/metadata.tsv',sep='\t',index_col=0)
num_clusters = len(np.unique(metadata['label']))
print(num_clusters)
10
files = [x for x in os.listdir(path_fm) if x.startswith('FM')]
len(files)
17
files
['FM_ChromVAR_buenrostro2018_motifs.rds', 'FM_ChromVAR_buenrostro2018_kmers.rds', 'FM_cisTopic_buenrostro2018.rds', 'FM_SnapATAC_buenrostro2018.rds', 'FM_SCRAT_buenrostro2018_motifs.rds', 'FM_BROCKMAN_buenrostro2018.rds', 'FM_Cusanovich2018_buenrostro2018.rds', 'FM_Control_buenrostro2018.rds', 'FM_GeneScoring_buenrostro2018.rds', 'FM_Scasat_buenrostro2018.rds', 'FM_scABC_buenrostro2018.rds', 'FM_Cicero_buenrostro2018.rds', 'FM_ChromVAR_buenrostro2018_kmers_pca.rds', 'FM_ChromVAR_buenrostro2018_motifs_pca.rds', 'FM_GeneScoring_buenrostro2018_pca.rds', 'FM_Cicero_buenrostro2018_pca.rds', 'FM_SCRAT_buenrostro2018_pca.rds']
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
this_step = 0
this_min = float(range_min)
this_max = float(range_max)
while this_step < max_steps:
print('step ' + str(this_step))
this_resolution = this_min + ((this_max-this_min)/2)
sc.tl.louvain(adata,resolution=this_resolution)
this_clusters = adata.obs['louvain'].nunique()
print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
if this_clusters > n_cluster:
this_max = this_resolution
elif this_clusters < n_cluster:
this_min = this_resolution
else:
return(this_resolution, adata)
this_step += 1
print('Cannot find the number of clusters')
print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))
for file in files:
file_split = file.split('_')
method = file_split[1]
dataset = file_split[2].split('.')[0]
if(len(file_split)>3):
method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
print(method)
pandas2ri.activate()
readRDS = robjects.r['readRDS']
df_rds = readRDS(os.path.join(path_fm,file))
fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))
fm_mat.fillna(0,inplace=True)
fm_mat.columns = metadata.index
adata = sc.AnnData(fm_mat.T)
adata.var_names_make_unique()
adata.obs = metadata.loc[adata.obs.index,]
df_metrics.loc[method,] = ""
#Louvain
sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
# sc.tl.louvain(adata)
getNClusters(adata,n_cluster=num_clusters)
#kmeans
kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
#hierachical clustering
hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')
#clustering metrics
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')
ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')
ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]
df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]
df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc]
adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')
ChromVAR_motifs
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 9 at resolution 1.5 step 1 got 12 at resolution 2.25 step 2 got 10 at resolution 1.875 ChromVAR_kmers
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 9 at resolution 1.5 step 1 got 14 at resolution 2.25 step 2 got 13 at resolution 1.875 step 3 got 10 at resolution 1.6875 cisTopic
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 18 at resolution 1.5 step 1 got 13 at resolution 0.75 step 2 got 10 at resolution 0.375 SnapATAC
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 22 at resolution 1.5 step 1 got 16 at resolution 0.75 step 2 got 15 at resolution 0.375 step 3 got 10 at resolution 0.1875 SCRAT_motifs
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 17 at resolution 1.5 step 1 got 9 at resolution 0.75 step 2 got 14 at resolution 1.125 step 3 got 11 at resolution 0.9375 step 4 got 10 at resolution 0.84375 BROCKMAN
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 18 at resolution 1.5 step 1 got 12 at resolution 0.75 step 2 got 7 at resolution 0.375 step 3 got 10 at resolution 0.5625 Cusanovich2018
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 15 at resolution 1.5 step 1 got 11 at resolution 0.75 step 2 got 8 at resolution 0.375 step 3 got 9 at resolution 0.5625 step 4 got 10 at resolution 0.65625 Control
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 19 at resolution 1.5 step 1 got 12 at resolution 0.75 step 2 got 5 at resolution 0.375 step 3 got 10 at resolution 0.5625 GeneScoring
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 Scasat
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 20 at resolution 1.5 step 1 got 13 at resolution 0.75 step 2 got 8 at resolution 0.375 step 3 got 12 at resolution 0.5625 step 4 got 12 at resolution 0.46875 step 5 got 12 at resolution 0.421875 step 6 got 8 at resolution 0.3984375 step 7 got 10 at resolution 0.41015625 scABC
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 19 at resolution 1.5 step 1 got 4 at resolution 0.75 step 2 got 7 at resolution 1.125 step 3 got 14 at resolution 1.3125 step 4 got 11 at resolution 1.21875 step 5 got 9 at resolution 1.171875 step 6 got 10 at resolution 1.1953125 Cicero
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 76 at resolution 1.5 step 1 got 1 at resolution 0.75 step 2 got 11 at resolution 1.125 step 3 got 5 at resolution 0.9375 step 4 got 9 at resolution 1.03125 step 5 got 13 at resolution 1.078125 step 6 got 8 at resolution 1.0546875 step 7 got 9 at resolution 1.06640625 step 8 got 9 at resolution 1.072265625 step 9 got 10 at resolution 1.0751953125 ChromVAR_kmers_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 11 at resolution 1.5 step 1 got 8 at resolution 0.75 step 2 got 9 at resolution 1.125 step 3 got 11 at resolution 1.3125 step 4 got 10 at resolution 1.21875 ChromVAR_motifs_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 9 at resolution 1.5 step 1 got 13 at resolution 2.25 step 2 got 11 at resolution 1.875 step 3 got 11 at resolution 1.6875 step 4 got 11 at resolution 1.59375 step 5 got 11 at resolution 1.546875 step 6 got 11 at resolution 1.5234375 step 7 got 10 at resolution 1.51171875 GeneScoring_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 16 at resolution 1.5 step 1 got 7 at resolution 0.75 step 2 got 12 at resolution 1.125 step 3 got 10 at resolution 0.9375 Cicero_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 SCRAT_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 18 at resolution 1.5 step 1 got 9 at resolution 0.75 step 2 got 15 at resolution 1.125 step 3 got 13 at resolution 0.9375 step 4 got 13 at resolution 0.84375 step 5 got 9 at resolution 0.796875 step 6 got 12 at resolution 0.8203125 step 7 got 10 at resolution 0.80859375
df_metrics.to_csv(path_metrics+'clustering_scores.csv')
df_metrics
ARI_Louvain | ARI_kmeans | ARI_HC | AMI_Louvain | AMI_kmeans | AMI_HC | Homogeneity_Louvain | Homogeneity_kmeans | Homogeneity_HC | |
---|---|---|---|---|---|---|---|---|---|
ChromVAR_motifs | 0.344884 | 0.243458 | 0.307777 | 0.516044 | 0.415984 | 0.432023 | 0.53329 | 0.414872 | 0.407445 |
ChromVAR_kmers | 0.389598 | 0.279133 | 0.221587 | 0.544125 | 0.428086 | 0.362537 | 0.555826 | 0.400741 | 0.307758 |
cisTopic | 0.641239 | 0.498477 | 0.51701 | 0.719416 | 0.657523 | 0.661236 | 0.7252 | 0.681288 | 0.682697 |
SnapATAC | 0.420341 | 0.353299 | 0.443056 | 0.684499 | 0.605017 | 0.689081 | 0.652815 | 0.589645 | 0.645974 |
SCRAT_motifs | 0.0515874 | 0.040599 | 0.0380492 | 0.140008 | 0.101852 | 0.104561 | 0.154075 | 0.115026 | 0.116999 |
BROCKMAN | 0.115213 | 0.0329884 | 0.0353507 | 0.261593 | 0.103987 | 0.119018 | 0.274468 | 0.115736 | 0.131517 |
Cusanovich2018 | 0.533643 | -0.00194164 | -0.00194164 | 0.663841 | -0.00278841 | -0.00278841 | 0.680403 | 0.00302188 | 0.00302188 |
Control | 0.182422 | 0.0137586 | 0.0344167 | 0.372463 | 0.024063 | 0.0449997 | 0.384145 | 0.0242291 | 0.044414 |
GeneScoring | 0.0417257 | 0.0208223 | 0.0244348 | 0.112021 | 0.0315973 | 0.0401868 | 0.121847 | 0.0349462 | 0.0413666 |
Scasat | 0.320218 | 0.1597 | 0.18861 | 0.519518 | 0.300951 | 0.383886 | 0.538066 | 0.32015 | 0.397076 |
scABC | 0.0450337 | 0.0127084 | 0.0248051 | 0.12754 | 0.0239257 | 0.0433246 | 0.132366 | 0.024874 | 0.043172 |
Cicero | 0.09669 | -0.00325062 | -0.00215327 | 0.172726 | 0.00262284 | -0.00253554 | 0.180195 | 0.00641933 | 0.00335946 |
ChromVAR_kmers_pca | 0.443234 | 0.262985 | 0.270338 | 0.56954 | 0.420358 | 0.434644 | 0.590846 | 0.377882 | 0.386469 |
ChromVAR_motifs_pca | 0.308443 | 0.256185 | 0.294866 | 0.516459 | 0.410229 | 0.43911 | 0.534353 | 0.407546 | 0.412825 |
GeneScoring_pca | 0.0306601 | 0.024873 | 0.0135162 | 0.0951604 | 0.0362062 | 0.0298817 | 0.107238 | 0.0395731 | 0.0322586 |
Cicero_pca | 0.211067 | -0.00194164 | -0.00215327 | 0.37366 | -0.00278841 | -0.00253554 | 0.397431 | 0.00302188 | 0.00335946 |
SCRAT_pca | 0.0441418 | 0.0392368 | 0.0433118 | 0.114929 | 0.0989591 | 0.115728 | 0.126979 | 0.112535 | 0.129509 |