import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',
'AMI_Louvain','AMI_kmeans','AMI_HC',
'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])
workdir = './output/'
path_fm = os.path.join(workdir,'feature_matrices/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
os.system('mkdir -p '+path_clusters)
os.system('mkdir -p '+path_metrics)
0
metadata = pd.read_csv('./input/metadata.tsv',sep='\t',index_col=0)
num_clusters = len(np.unique(metadata['label']))
files = [x for x in os.listdir(path_fm) if x.startswith('FM')]
len(files)
17
files
['FM_Control_BMcov1000.rds', 'FM_BROCKMAN_BMcov1000.rds', 'FM_Cusanovich2018_BMcov1000.rds', 'FM_cisTopic_BMcov1000.rds', 'FM_chromVAR_BMcov1000_kmers.rds', 'FM_chromVAR_BMcov1000_motifs.rds', 'FM_chromVAR_BMcov1000_kmers_pca.rds', 'FM_chromVAR_BMcov1000_motifs_pca.rds', 'FM_GeneScoring_BMcov1000.rds', 'FM_GeneScoring_BMcov1000_pca.rds', 'FM_Cicero_BMcov1000.rds', 'FM_Cicero_BMcov1000_pca.rds', 'FM_SnapATAC_BMcov1000.rds', 'FM_Scasat_BMcov1000.rds', 'FM_scABC_BMcov1000.rds', 'FM_SCRAT_BMcov1000.rds', 'FM_SCRAT_BMcov1000_pca.rds']
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
this_step = 0
this_min = float(range_min)
this_max = float(range_max)
while this_step < max_steps:
print('step ' + str(this_step))
this_resolution = this_min + ((this_max-this_min)/2)
sc.tl.louvain(adata,resolution=this_resolution)
this_clusters = adata.obs['louvain'].nunique()
print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
if this_clusters > n_cluster:
this_max = this_resolution
elif this_clusters < n_cluster:
this_min = this_resolution
else:
return(this_resolution, adata)
this_step += 1
print('Cannot find the number of clusters')
print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))
for file in files:
file_split = file.split('_')
method = file_split[1]
dataset = file_split[2].split('.')[0]
if(len(file_split)>3):
method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
print(method)
pandas2ri.activate()
readRDS = robjects.r['readRDS']
df_rds = readRDS(os.path.join(path_fm,file))
fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))
fm_mat.columns = metadata.index
adata = sc.AnnData(fm_mat.T)
adata.var_names_make_unique()
adata.obs = metadata.loc[adata.obs.index,]
df_metrics.loc[method,] = ""
#Louvain
sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
# sc.tl.louvain(adata)
getNClusters(adata,n_cluster=num_clusters)
#kmeans
kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
#hierachical clustering
hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')
#clustering metrics
#adjusted rank index
ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#adjusted mutual information
ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')
ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')
ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#homogeneity
homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])
df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]
df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]
df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc]
adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')
Control
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 8 at resolution 1.5 step 1 got 4 at resolution 0.75 step 2 got 4 at resolution 1.125 step 3 got 6 at resolution 1.3125 BROCKMAN
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 step 1 got 6 at resolution 0.75 Cusanovich2018
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 6 at resolution 1.5 cisTopic
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 8 at resolution 1.5 step 1 got 4 at resolution 0.75 step 2 got 6 at resolution 1.125 chromVAR_kmers
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 7 at resolution 1.5 step 1 got 3 at resolution 0.75 step 2 got 4 at resolution 1.125 step 3 got 4 at resolution 1.3125 step 4 got 4 at resolution 1.40625 step 5 got 5 at resolution 1.453125 step 6 got 4 at resolution 1.4765625 step 7 got 5 at resolution 1.48828125 step 8 got 5 at resolution 1.494140625 step 9 got 5 at resolution 1.4970703125 step 10 got 5 at resolution 1.49853515625 step 11 got 7 at resolution 1.499267578125 step 12 got 5 at resolution 1.4989013671875 step 13 got 7 at resolution 1.49908447265625 step 14 got 5 at resolution 1.498992919921875 step 15 got 7 at resolution 1.4990386962890625 step 16 got 7 at resolution 1.4990158081054688 step 17 got 7 at resolution 1.4990043640136719 step 18 got 7 at resolution 1.4989986419677734 step 19 got 5 at resolution 1.4989957809448242 Cannot find the number of clusters Clustering solution from last iteration is used:5 at resolution 1.4989957809448242 chromVAR_motifs
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 7 at resolution 1.5 step 1 got 3 at resolution 0.75 step 2 got 4 at resolution 1.125 step 3 got 7 at resolution 1.3125 step 4 got 5 at resolution 1.21875 step 5 got 6 at resolution 1.265625 chromVAR_kmers_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 7 at resolution 1.5 step 1 got 3 at resolution 0.75 step 2 got 4 at resolution 1.125 step 3 got 4 at resolution 1.3125 step 4 got 6 at resolution 1.40625 chromVAR_motifs_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 8 at resolution 1.5 step 1 got 3 at resolution 0.75 step 2 got 5 at resolution 1.125 step 3 got 7 at resolution 1.3125 step 4 got 6 at resolution 1.21875 GeneScoring
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 30 at resolution 1.5 step 1 got 3 at resolution 0.75 step 2 got 14 at resolution 1.125 step 3 got 5 at resolution 0.9375 step 4 got 9 at resolution 1.03125 step 5 got 7 at resolution 0.984375 step 6 got 7 at resolution 0.9609375 step 7 got 7 at resolution 0.94921875 step 8 got 6 at resolution 0.943359375 GeneScoring_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 13 at resolution 1.5 step 1 got 11 at resolution 0.75 step 2 got 7 at resolution 0.375 step 3 got 7 at resolution 0.1875 step 4 got 4 at resolution 0.09375 step 5 got 5 at resolution 0.140625 step 6 got 6 at resolution 0.1640625 Cicero
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 37 at resolution 1.5 step 1 got 1 at resolution 0.75 step 2 got 19 at resolution 1.125 step 3 got 8 at resolution 0.9375 step 4 got 2 at resolution 0.84375 step 5 got 4 at resolution 0.890625 step 6 got 4 at resolution 0.9140625 step 7 got 6 at resolution 0.92578125 Cicero_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 12 at resolution 1.5 step 1 got 3 at resolution 0.75 step 2 got 8 at resolution 1.125 step 3 got 5 at resolution 0.9375 step 4 got 7 at resolution 1.03125 step 5 got 6 at resolution 0.984375 SnapATAC
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 6 at resolution 1.5 Scasat
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 7 at resolution 1.5 step 1 got 4 at resolution 0.75 step 2 got 4 at resolution 1.125 step 3 got 6 at resolution 1.3125 scABC
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 29 at resolution 1.5 step 1 got 2 at resolution 0.75 step 2 got 10 at resolution 1.125 step 3 got 3 at resolution 0.9375 step 4 got 5 at resolution 1.03125 step 5 got 8 at resolution 1.078125 step 6 got 8 at resolution 1.0546875 step 7 got 6 at resolution 1.04296875 SCRAT
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 7 at resolution 1.5 step 1 got 4 at resolution 0.75 step 2 got 5 at resolution 1.125 step 3 got 5 at resolution 1.3125 step 4 got 7 at resolution 1.40625 step 5 got 6 at resolution 1.359375 SCRAT_pca
/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order. res = PandasDataFrame.from_items(items)
step 0 got 10 at resolution 1.5 step 1 got 6 at resolution 0.75
df_metrics.to_csv(path_metrics+'clustering_scores.csv')
df_metrics
ARI_Louvain | ARI_kmeans | ARI_HC | AMI_Louvain | AMI_kmeans | AMI_HC | Homogeneity_Louvain | Homogeneity_kmeans | Homogeneity_HC | |
---|---|---|---|---|---|---|---|---|---|
Control | 0.5928 | 0.569088 | 0.588754 | 0.714715 | 0.718895 | 0.723575 | 0.695935 | 0.701095 | 0.699836 |
BROCKMAN | 0.54574 | 0.500141 | 0.499691 | 0.682654 | 0.642841 | 0.65918 | 0.67829 | 0.641288 | 0.645486 |
Cusanovich2018 | 0.943846 | 0.745106 | 0.758834 | 0.941653 | 0.851342 | 0.854749 | 0.941908 | 0.825158 | 0.813651 |
cisTopic | 0.530953 | 0.529333 | 0.49712 | 0.645415 | 0.643513 | 0.629724 | 0.639389 | 0.644356 | 0.619491 |
chromVAR_kmers | 0.467463 | 0.430748 | 0.417099 | 0.610438 | 0.577779 | 0.540338 | 0.555603 | 0.577083 | 0.537387 |
chromVAR_motifs | 0.3906 | 0.369871 | 0.290375 | 0.520555 | 0.510661 | 0.43916 | 0.519439 | 0.512255 | 0.432029 |
chromVAR_kmers_pca | 0.458582 | 0.492114 | 0.436538 | 0.612727 | 0.622779 | 0.568588 | 0.602843 | 0.624776 | 0.563519 |
chromVAR_motifs_pca | 0.390955 | 0.367727 | 0.298085 | 0.524297 | 0.48803 | 0.427891 | 0.522027 | 0.489588 | 0.417594 |
GeneScoring | 0.00267149 | 0.111998 | 0.103837 | 0.00570611 | 0.20693 | 0.179649 | 0.011055 | 0.1803 | 0.149194 |
GeneScoring_pca | 0.09707 | 0.135782 | 0.106418 | 0.117303 | 0.222123 | 0.200712 | 0.113637 | 0.205834 | 0.179901 |
Cicero | 0.00687868 | 0.142307 | 0.397534 | 0.0111747 | 0.389042 | 0.589552 | 0.0168549 | 0.249484 | 0.48291 |
Cicero_pca | 0.448862 | 0.443354 | 0.423914 | 0.534053 | 0.566373 | 0.562634 | 0.522034 | 0.547409 | 0.50774 |
SnapATAC | 0.804994 | 0.860076 | 0.770361 | 0.84069 | 0.874407 | 0.816767 | 0.841453 | 0.875022 | 0.816121 |
Scasat | 0.613989 | 0.634201 | 0.590497 | 0.724743 | 0.74057 | 0.708525 | 0.715335 | 0.741861 | 0.705696 |
scABC | 0.20426 | 0.448982 | 0.465818 | 0.247493 | 0.583171 | 0.568088 | 0.229539 | 0.529003 | 0.531906 |
SCRAT | 0.444511 | 0.448028 | 0.457745 | 0.610931 | 0.581328 | 0.59356 | 0.596725 | 0.582187 | 0.592494 |
SCRAT_pca | 0.469748 | 0.405095 | 0.461812 | 0.606473 | 0.585137 | 0.597519 | 0.601129 | 0.571914 | 0.591323 |