import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',
'AMI_Louvain','AMI_kmeans','AMI_HC',
'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])
workdir = './output/'
path_fm = os.path.join(workdir,'feature_matrices/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
os.system('mkdir -p '+path_clusters)
os.system('mkdir -p '+path_metrics)
0
metadata = pd.read_csv('./input/metadata.tsv',sep='\t',index_col=0)
num_clusters = len(np.unique(metadata['label']))
files = [x for x in os.listdir(path_fm) if x.startswith('FM')]
len(files)
17
files
['FM_Control_Erynoisyp2.rds', 'FM_BROCKMAN_Erynoisyp2.rds', 'FM_Cusanovich2018_Erynoisyp2.rds', 'FM_cisTopic_Erynoisyp2.rds', 'FM_chromVAR_Erynoisyp2_kmers.rds', 'FM_chromVAR_Erynoisyp2_motifs.rds', 'FM_chromVAR_Erynoisyp2_kmers_pca.rds', 'FM_chromVAR_Erynoisyp2_motifs_pca.rds', 'FM_GeneScoring_Erynoisyp2.rds', 'FM_GeneScoring_Erynoisyp2_pca.rds', 'FM_Cicero_Erynoisyp2.rds', 'FM_Cicero_Erynoisyp2_pca.rds', 'FM_SnapATAC_Erynoisyp2.rds', 'FM_Scasat_Erynoisyp2.rds', 'FM_scABC_Erynoisyp2.rds', 'FM_SCRAT_Erynoisyp2.rds', 'FM_SCRAT_Erynoisyp2_pca.rds']
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
this_step = 0
this_min = float(range_min)
this_max = float(range_max)
while this_step < max_steps:
print('step ' + str(this_step))
this_resolution = this_min + ((this_max-this_min)/2)
sc.tl.louvain(adata,resolution=this_resolution)
this_clusters = adata.obs['louvain'].nunique()
print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
if this_clusters > n_cluster:
this_max = this_resolution
elif this_clusters < n_cluster:
this_min = this_resolution
else:
return(this_resolution, adata)
this_step += 1
print('Cannot find the number of clusters')
print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))
for file in files:
file_split = file.split('_')
method = file_split[1]
dataset = file_split[2].split('.')[0]
if(len(file_split)>3):
method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
print(method)
pandas2ri.activate()
readRDS = robjects.r['readRDS']
df_rds = readRDS(os.path.join(path_fm,file))
fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))
fm_mat.columns = metadata.index
adata = sc.AnnData(fm_mat.T)
adata.var_names_make_unique()
adata.obs = metadata.loc[adata.obs.index,]
df_metrics.loc[method,] = ""
#Louvain
sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
# sc.tl.louvain(adata)
getNClusters(adata,n_cluster=num_clusters)
#kmeans
kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
#hierachical clustering
hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')
#clustering metrics
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')
ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')
ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]
df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]
df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc]
adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')
Control
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 9 at resolution 1.5 step 1 got 14 at resolution 2.25 step 2 got 10 at resolution 1.875 step 3 got 13 at resolution 2.0625 step 4 got 13 at resolution 1.96875 step 5 got 11 at resolution 1.921875 step 6 got 12 at resolution 1.9453125 BROCKMAN
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 11 at resolution 1.5 step 1 got 13 at resolution 2.25 step 2 got 11 at resolution 1.875 step 3 got 12 at resolution 2.0625 Cusanovich2018
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 step 1 got 13 at resolution 2.25 step 2 got 11 at resolution 1.875 step 3 got 11 at resolution 2.0625 step 4 got 12 at resolution 2.15625 cisTopic
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 step 1 got 10 at resolution 2.25 step 2 got 10 at resolution 2.625 step 3 got 12 at resolution 2.8125 chromVAR_kmers
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 6 at resolution 1.5 step 1 got 10 at resolution 2.25 step 2 got 15 at resolution 2.625 step 3 got 12 at resolution 2.4375 chromVAR_motifs
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 8 at resolution 1.5 step 1 got 14 at resolution 2.25 step 2 got 10 at resolution 1.875 step 3 got 11 at resolution 2.0625 step 4 got 12 at resolution 2.15625 chromVAR_kmers_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 8 at resolution 1.5 step 1 got 10 at resolution 2.25 step 2 got 13 at resolution 2.625 step 3 got 11 at resolution 2.4375 step 4 got 11 at resolution 2.53125 step 5 got 11 at resolution 2.578125 step 6 got 12 at resolution 2.6015625 chromVAR_motifs_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 7 at resolution 1.5 step 1 got 14 at resolution 2.25 step 2 got 9 at resolution 1.875 step 3 got 13 at resolution 2.0625 step 4 got 11 at resolution 1.96875 step 5 got 11 at resolution 2.015625 step 6 got 12 at resolution 2.0390625 GeneScoring
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 32 at resolution 1.5 step 1 got 3 at resolution 0.75 step 2 got 12 at resolution 1.125 GeneScoring_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 13 at resolution 1.5 step 1 got 6 at resolution 0.75 step 2 got 8 at resolution 1.125 step 3 got 10 at resolution 1.3125 step 4 got 10 at resolution 1.40625 step 5 got 13 at resolution 1.453125 step 6 got 12 at resolution 1.4296875 Cicero
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 36 at resolution 1.5 step 1 got 1 at resolution 0.75 step 2 got 17 at resolution 1.125 step 3 got 7 at resolution 0.9375 step 4 got 12 at resolution 1.03125 Cicero_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 step 1 got 17 at resolution 2.25 step 2 got 16 at resolution 1.875 step 3 got 12 at resolution 1.6875 SnapATAC
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 11 at resolution 1.5 step 1 got 12 at resolution 2.25 Scasat
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 8 at resolution 1.5 step 1 got 13 at resolution 2.25 step 2 got 9 at resolution 1.875 step 3 got 12 at resolution 2.0625 scABC
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 5 at resolution 1.5 step 1 got 14 at resolution 2.25 step 2 got 8 at resolution 1.875 step 3 got 9 at resolution 2.0625 step 4 got 13 at resolution 2.15625 step 5 got 10 at resolution 2.109375 step 6 got 11 at resolution 2.1328125 step 7 got 13 at resolution 2.14453125 step 8 got 13 at resolution 2.138671875 step 9 got 13 at resolution 2.1357421875 step 10 got 11 at resolution 2.13427734375 step 11 got 11 at resolution 2.135009765625 step 12 got 13 at resolution 2.1353759765625 step 13 got 13 at resolution 2.13519287109375 step 14 got 11 at resolution 2.135101318359375 step 15 got 11 at resolution 2.1351470947265625 step 16 got 13 at resolution 2.1351699829101562 step 17 got 11 at resolution 2.1351585388183594 step 18 got 13 at resolution 2.135164260864258 step 19 got 13 at resolution 2.1351613998413086 Cannot find the number of clusters Clustering solution from last iteration is used:13 at resolution 2.1351613998413086 SCRAT
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 9 at resolution 1.5 step 1 got 11 at resolution 2.25 step 2 got 11 at resolution 2.625 step 3 got 12 at resolution 2.8125 SCRAT_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 step 1 got 12 at resolution 2.25
df_metrics.to_csv(path_metrics+'clustering_scores.csv')
df_metrics
ARI_Louvain | ARI_kmeans | ARI_HC | AMI_Louvain | AMI_kmeans | AMI_HC | Homogeneity_Louvain | Homogeneity_kmeans | Homogeneity_HC | |
---|---|---|---|---|---|---|---|---|---|
Control | 0.69301 | 0.642899 | 0.674561 | 0.827431 | 0.796641 | 0.814805 | 0.823204 | 0.80017 | 0.807588 |
BROCKMAN | 0.751053 | 0.666687 | 0.730374 | 0.852433 | 0.818222 | 0.841523 | 0.854386 | 0.811593 | 0.838416 |
Cusanovich2018 | 0.79698 | 0.658592 | 0.631806 | 0.883279 | 0.808403 | 0.835781 | 0.8761 | 0.795551 | 0.791579 |
cisTopic | 0.82162 | 0.836987 | 0.816515 | 0.897695 | 0.901182 | 0.892743 | 0.897675 | 0.903026 | 0.894268 |
chromVAR_kmers | 0.570839 | 0.542063 | 0.522348 | 0.718845 | 0.711779 | 0.682572 | 0.707286 | 0.712658 | 0.677166 |
chromVAR_motifs | 0.301088 | 0.291413 | 0.267198 | 0.493316 | 0.514334 | 0.465885 | 0.494241 | 0.521394 | 0.470202 |
chromVAR_kmers_pca | 0.590316 | 0.589465 | 0.542553 | 0.730808 | 0.7627 | 0.699934 | 0.725754 | 0.760691 | 0.696847 |
chromVAR_motifs_pca | 0.298686 | 0.313789 | 0.254616 | 0.489115 | 0.509886 | 0.467367 | 0.492538 | 0.518993 | 0.468481 |
GeneScoring | 0.00324063 | 0.268966 | 0.242634 | 0.00840918 | 0.468074 | 0.403775 | 0.0290629 | 0.389774 | 0.386797 |
GeneScoring_pca | 0.208526 | 0.229001 | 0.220807 | 0.41626 | 0.421416 | 0.414228 | 0.422811 | 0.41651 | 0.409035 |
Cicero | 0.00225731 | 0.272 | 0.36459 | 0.00591386 | 0.508134 | 0.573279 | 0.0266793 | 0.412684 | 0.527064 |
Cicero_pca | 0.398698 | 0.389485 | 0.380573 | 0.574955 | 0.567591 | 0.564743 | 0.565314 | 0.561142 | 0.537896 |
SnapATAC | 0.889732 | 0.89642 | 0.871528 | 0.940682 | 0.93841 | 0.92393 | 0.937749 | 0.939642 | 0.924514 |
Scasat | 0.672968 | 0.678968 | 0.651872 | 0.812718 | 0.81316 | 0.800134 | 0.806857 | 0.809412 | 0.793816 |
scABC | 0.467422 | 0.31324 | 0.528099 | 0.649262 | 0.547904 | 0.701177 | 0.603884 | 0.480919 | 0.687006 |
SCRAT | 0.652963 | 0.594275 | 0.612153 | 0.79089 | 0.771565 | 0.781908 | 0.79099 | 0.76992 | 0.775793 |
SCRAT_pca | 0.662029 | 0.592204 | 0.602312 | 0.795031 | 0.769902 | 0.779481 | 0.789606 | 0.768044 | 0.776074 |