Notebook

In [1]:

import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:

df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',
                                   'AMI_Louvain','AMI_kmeans','AMI_HC',
                                   'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])

In [3]:

workdir = './output/'
path_fm = os.path.join(workdir,'feature_matrices/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
os.system('mkdir -p '+path_clusters)
os.system('mkdir -p '+path_metrics)

Out[3]:

In [4]:

metadata = pd.read_csv('./input/metadata.tsv',sep='\t',index_col=0)
num_clusters = len(np.unique(metadata['label']))

In [5]:

files = [x for x in os.listdir(path_fm) if x.startswith('FM')]
len(files)

Out[5]:

In [6]:

files

Out[6]:

['FM_Control_BMcov1000.rds',
 'FM_BROCKMAN_BMcov1000.rds',
 'FM_Cusanovich2018_BMcov1000.rds',
 'FM_cisTopic_BMcov1000.rds',
 'FM_chromVAR_BMcov1000_kmers.rds',
 'FM_chromVAR_BMcov1000_motifs.rds',
 'FM_chromVAR_BMcov1000_kmers_pca.rds',
 'FM_chromVAR_BMcov1000_motifs_pca.rds',
 'FM_GeneScoring_BMcov1000.rds',
 'FM_GeneScoring_BMcov1000_pca.rds',
 'FM_Cicero_BMcov1000.rds',
 'FM_Cicero_BMcov1000_pca.rds',
 'FM_SnapATAC_BMcov1000.rds',
 'FM_Scasat_BMcov1000.rds',
 'FM_scABC_BMcov1000.rds',
 'FM_SCRAT_BMcov1000.rds',
 'FM_SCRAT_BMcov1000_pca.rds']

In [7]:

def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        this_resolution = this_min + ((this_max-this_min)/2)
        sc.tl.louvain(adata,resolution=this_resolution)
        this_clusters = adata.obs['louvain'].nunique()
        
        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
        
        if this_clusters > n_cluster:
            this_max = this_resolution
        elif this_clusters < n_cluster:
            this_min = this_resolution
        else:
            return(this_resolution, adata)
        this_step += 1
    
    print('Cannot find the number of clusters')
    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))

In [8]:

for file in files:
    file_split = file.split('_')
    method = file_split[1]
    dataset = file_split[2].split('.')[0]
    if(len(file_split)>3):
        method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
    print(method)

    pandas2ri.activate()
    readRDS = robjects.r['readRDS']
    df_rds = readRDS(os.path.join(path_fm,file))
    fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))
    fm_mat.columns = metadata.index
    
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = metadata.loc[adata.obs.index,]
    df_metrics.loc[method,] = ""
    #Louvain
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
#     sc.tl.louvain(adata)
    getNClusters(adata,n_cluster=num_clusters)
    #kmeans
    kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    #hierachical clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')
    #clustering metrics
    
    #adjusted rank index
    ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
    ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
    ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
    #adjusted mutual information
    ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')
    ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')   
    ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
    #homogeneity
    homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
    homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
    homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])

    df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]
    df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]
    df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc] 
    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

Control

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 8 at resolution 1.5
step 1
got 4 at resolution 0.75
step 2
got 4 at resolution 1.125
step 3
got 6 at resolution 1.3125
BROCKMAN

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 10 at resolution 1.5
step 1
got 6 at resolution 0.75
Cusanovich2018

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 6 at resolution 1.5
cisTopic

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 8 at resolution 1.5
step 1
got 4 at resolution 0.75
step 2
got 6 at resolution 1.125
chromVAR_kmers

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 7 at resolution 1.5
step 1
got 3 at resolution 0.75
step 2
got 4 at resolution 1.125
step 3
got 4 at resolution 1.3125
step 4
got 4 at resolution 1.40625
step 5
got 5 at resolution 1.453125
step 6
got 4 at resolution 1.4765625
step 7
got 5 at resolution 1.48828125
step 8
got 5 at resolution 1.494140625
step 9
got 5 at resolution 1.4970703125
step 10
got 5 at resolution 1.49853515625
step 11
got 7 at resolution 1.499267578125
step 12
got 5 at resolution 1.4989013671875
step 13
got 7 at resolution 1.49908447265625
step 14
got 5 at resolution 1.498992919921875
step 15
got 7 at resolution 1.4990386962890625
step 16
got 7 at resolution 1.4990158081054688
step 17
got 7 at resolution 1.4990043640136719
step 18
got 7 at resolution 1.4989986419677734
step 19
got 5 at resolution 1.4989957809448242
Cannot find the number of clusters
Clustering solution from last iteration is used:5 at resolution 1.4989957809448242
chromVAR_motifs

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 7 at resolution 1.5
step 1
got 3 at resolution 0.75
step 2
got 4 at resolution 1.125
step 3
got 7 at resolution 1.3125
step 4
got 5 at resolution 1.21875
step 5
got 6 at resolution 1.265625
chromVAR_kmers_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 7 at resolution 1.5
step 1
got 3 at resolution 0.75
step 2
got 4 at resolution 1.125
step 3
got 4 at resolution 1.3125
step 4
got 6 at resolution 1.40625
chromVAR_motifs_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 8 at resolution 1.5
step 1
got 3 at resolution 0.75
step 2
got 5 at resolution 1.125
step 3
got 7 at resolution 1.3125
step 4
got 6 at resolution 1.21875
GeneScoring

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 30 at resolution 1.5
step 1
got 3 at resolution 0.75
step 2
got 14 at resolution 1.125
step 3
got 5 at resolution 0.9375
step 4
got 9 at resolution 1.03125
step 5
got 7 at resolution 0.984375
step 6
got 7 at resolution 0.9609375
step 7
got 7 at resolution 0.94921875
step 8
got 6 at resolution 0.943359375
GeneScoring_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 13 at resolution 1.5
step 1
got 11 at resolution 0.75
step 2
got 7 at resolution 0.375
step 3
got 7 at resolution 0.1875
step 4
got 4 at resolution 0.09375
step 5
got 5 at resolution 0.140625
step 6
got 6 at resolution 0.1640625
Cicero

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 37 at resolution 1.5
step 1
got 1 at resolution 0.75
step 2
got 19 at resolution 1.125
step 3
got 8 at resolution 0.9375
step 4
got 2 at resolution 0.84375
step 5
got 4 at resolution 0.890625
step 6
got 4 at resolution 0.9140625
step 7
got 6 at resolution 0.92578125
Cicero_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 12 at resolution 1.5
step 1
got 3 at resolution 0.75
step 2
got 8 at resolution 1.125
step 3
got 5 at resolution 0.9375
step 4
got 7 at resolution 1.03125
step 5
got 6 at resolution 0.984375
SnapATAC

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 6 at resolution 1.5
Scasat

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 7 at resolution 1.5
step 1
got 4 at resolution 0.75
step 2
got 4 at resolution 1.125
step 3
got 6 at resolution 1.3125
scABC

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 29 at resolution 1.5
step 1
got 2 at resolution 0.75
step 2
got 10 at resolution 1.125
step 3
got 3 at resolution 0.9375
step 4
got 5 at resolution 1.03125
step 5
got 8 at resolution 1.078125
step 6
got 8 at resolution 1.0546875
step 7
got 6 at resolution 1.04296875
SCRAT

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 7 at resolution 1.5
step 1
got 4 at resolution 0.75
step 2
got 5 at resolution 1.125
step 3
got 5 at resolution 1.3125
step 4
got 7 at resolution 1.40625
step 5
got 6 at resolution 1.359375
SCRAT_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 10 at resolution 1.5
step 1
got 6 at resolution 0.75

In [9]:

df_metrics.to_csv(path_metrics+'clustering_scores.csv')

In [10]:

df_metrics

Out[10]:

	ARI_Louvain	ARI_kmeans	ARI_HC	AMI_Louvain	AMI_kmeans	AMI_HC	Homogeneity_Louvain	Homogeneity_kmeans	Homogeneity_HC
Control	0.5928	0.569088	0.588754	0.714715	0.718895	0.723575	0.695935	0.701095	0.699836
BROCKMAN	0.54574	0.500141	0.499691	0.682654	0.642841	0.65918	0.67829	0.641288	0.645486
Cusanovich2018	0.943846	0.745106	0.758834	0.941653	0.851342	0.854749	0.941908	0.825158	0.813651
cisTopic	0.530953	0.529333	0.49712	0.645415	0.643513	0.629724	0.639389	0.644356	0.619491
chromVAR_kmers	0.467463	0.430748	0.417099	0.610438	0.577779	0.540338	0.555603	0.577083	0.537387
chromVAR_motifs	0.3906	0.369871	0.290375	0.520555	0.510661	0.43916	0.519439	0.512255	0.432029
chromVAR_kmers_pca	0.458582	0.492114	0.436538	0.612727	0.622779	0.568588	0.602843	0.624776	0.563519
chromVAR_motifs_pca	0.390955	0.367727	0.298085	0.524297	0.48803	0.427891	0.522027	0.489588	0.417594
GeneScoring	0.00267149	0.111998	0.103837	0.00570611	0.20693	0.179649	0.011055	0.1803	0.149194
GeneScoring_pca	0.09707	0.135782	0.106418	0.117303	0.222123	0.200712	0.113637	0.205834	0.179901
Cicero	0.00687868	0.142307	0.397534	0.0111747	0.389042	0.589552	0.0168549	0.249484	0.48291
Cicero_pca	0.448862	0.443354	0.423914	0.534053	0.566373	0.562634	0.522034	0.547409	0.50774
SnapATAC	0.804994	0.860076	0.770361	0.84069	0.874407	0.816767	0.841453	0.875022	0.816121
Scasat	0.613989	0.634201	0.590497	0.724743	0.74057	0.708525	0.715335	0.741861	0.705696
scABC	0.20426	0.448982	0.465818	0.247493	0.583171	0.568088	0.229539	0.529003	0.531906
SCRAT	0.444511	0.448028	0.457745	0.610931	0.581328	0.59356	0.596725	0.582187	0.592494
SCRAT_pca	0.469748	0.405095	0.461812	0.606473	0.585137	0.597519	0.601129	0.571914	0.591323

In [ ]: