In [1]:

import pandas as pd
import numpy as np
import scanpy as sc
from scipy import stats
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:

# df_metrics = pd.DataFrame(columns=['ARI_Louvain','ARI_kmeans','ARI_HC',
#                                    'AMI_Louvain','AMI_kmeans','AMI_HC',
#                                    'Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC'])

In [3]:

workdir = './output/'
path_fm = os.path.join(workdir,'feature_matrices/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
os.system('mkdir -p '+path_clusters)
os.system('mkdir -p '+path_metrics)

Out[3]:

In [4]:

metadata = pd.read_csv('./input/metadata.tsv',sep='\t',index_col=0)
num_clusters = len(np.unique(metadata['label']))
print(num_clusters)

In [5]:

files = [x for x in os.listdir(path_fm) if x.startswith('FM')]
len(files)

Out[5]:

In [6]:

files

Out[6]:

['FM_ChromVAR_10xpbmc5k_motifs.rds',
 'FM_ChromVAR_10xpbmc5k_kmers.rds',
 'FM_cisTopic_10xpbmc5k.rds',
 'FM_SnapATAC_10xpbmc5k.rds',
 'FM_BROCKMAN_10xpbmc5k.rds',
 'FM_SCRAT_10xpbmc5k_motifs.rds',
 'FM_Cusanovich2018_10xpbmc5k.rds',
 'FM_Control_10xpbmc5k.rds',
 'FM_GeneScoring_10xpbmc5k.rds',
 'FM_Scasat_10xpbmc5k.rds',
 'FM_scABC_10xpbmc5k.rds',
 'FM_Cicero_10xpbmc5k.rds',
 'FM_ChromVAR_10xpbmc_kmers_pca.rds',
 'FM_ChromVAR_10xpbmc_motifs_pca.rds',
 'FM_GeneScoring_10xpbmc_pca.rds',
 'FM_Cicero_10xpbmc_pca.rds',
 'FM_SCRAT_10xpbmc_pca.rds']

In [7]:

def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        this_resolution = this_min + ((this_max-this_min)/2)
        sc.tl.louvain(adata,resolution=this_resolution)
        this_clusters = adata.obs['louvain'].nunique()
        
        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))
        
        if this_clusters > n_cluster:
            this_max = this_resolution
        elif this_clusters < n_cluster:
            this_min = this_resolution
        else:
            return(this_resolution, adata)
        this_step += 1
    
    print('Cannot find the number of clusters')
    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))

In [9]:

for file in files:
    file_split = file.split('_')
    method = file_split[1]
    dataset = file_split[2].split('.')[0]
    if(len(file_split)>3):
        method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
    print(method)

    pandas2ri.activate()
    readRDS = robjects.r['readRDS']
    df_rds = readRDS(os.path.join(path_fm,file))
    fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))
    fm_mat.fillna(0,inplace=True)
    fm_mat.columns = metadata.index
    
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = metadata.loc[adata.obs.index,]
#     df_metrics.loc[method,] = ""
    #Louvain
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
#     sc.tl.louvain(adata)
    getNClusters(adata,n_cluster=num_clusters)
    #kmeans
    kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    #hierachical clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')
#     #clustering metrics
    
#     #adjusted rank index
#     ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
#     ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
#     ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
#     #adjusted mutual information
#     ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')
#     ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')   
#     ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
#     #homogeneity
#     homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
#     homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
#     homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])

#     df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]
#     df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]
#     df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc] 
    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

ChromVAR_motifs
step 0
got 8 at resolution 1.5
ChromVAR_kmers

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 6 at resolution 1.5
step 1
got 17 at resolution 2.25
step 2
got 9 at resolution 1.875
step 3
got 6 at resolution 1.6875
step 4
got 7 at resolution 1.78125
step 5
got 8 at resolution 1.828125
cisTopic

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 17 at resolution 1.5
step 1
got 14 at resolution 0.75
step 2
got 9 at resolution 0.375
step 3
got 6 at resolution 0.1875
step 4
got 8 at resolution 0.28125
SnapATAC

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 19 at resolution 1.5
step 1
got 14 at resolution 0.75
step 2
got 11 at resolution 0.375
step 3
got 6 at resolution 0.1875
step 4
got 8 at resolution 0.28125
BROCKMAN

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 20 at resolution 1.5
step 1
got 13 at resolution 0.75
step 2
got 8 at resolution 0.375
SCRAT_motifs

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 19 at resolution 1.5
step 1
got 14 at resolution 0.75
step 2
got 8 at resolution 0.375
Cusanovich2018

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 17 at resolution 1.5
step 1
got 11 at resolution 0.75
step 2
got 8 at resolution 0.375
Control

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 22 at resolution 1.5
step 1
got 13 at resolution 0.75
step 2
got 7 at resolution 0.375
step 3
got 10 at resolution 0.5625
step 4
got 9 at resolution 0.46875
step 5
got 8 at resolution 0.421875
GeneScoring

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 8 at resolution 1.5
Scasat

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 25 at resolution 1.5
step 1
got 17 at resolution 0.75
step 2
got 11 at resolution 0.375
step 3
got 6 at resolution 0.1875
step 4
got 9 at resolution 0.28125
step 5
got 6 at resolution 0.234375
step 6
got 6 at resolution 0.2578125
step 7
got 8 at resolution 0.26953125
scABC

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 10 at resolution 1.5
step 1
got 5 at resolution 0.75
step 2
got 7 at resolution 1.125
step 3
got 10 at resolution 1.3125
step 4
got 8 at resolution 1.21875
Cicero

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 8 at resolution 1.5
ChromVAR_kmers_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 10 at resolution 1.5
step 1
got 6 at resolution 0.75
step 2
got 8 at resolution 1.125
ChromVAR_motifs_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 9 at resolution 1.5
step 1
got 6 at resolution 0.75
step 2
got 7 at resolution 1.125
step 3
got 7 at resolution 1.3125
step 4
got 8 at resolution 1.40625
GeneScoring_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 15 at resolution 1.5
step 1
got 9 at resolution 0.75
step 2
got 7 at resolution 0.375
step 3
got 7 at resolution 0.5625
step 4
got 7 at resolution 0.65625
step 5
got 7 at resolution 0.703125
step 6
got 9 at resolution 0.7265625
step 7
got 7 at resolution 0.71484375
step 8
got 8 at resolution 0.720703125
Cicero_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 12 at resolution 1.5
step 1
got 5 at resolution 0.75
step 2
got 11 at resolution 1.125
step 3
got 7 at resolution 0.9375
step 4
got 10 at resolution 1.03125
step 5
got 7 at resolution 0.984375
step 6
got 9 at resolution 1.0078125
step 7
got 8 at resolution 0.99609375
SCRAT_pca

/data/pinello/SHARED_SOFTWARE/anaconda3/envs/ATACseq_clustering/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:191: FutureWarning: from_items is deprecated. Please use DataFrame.from_dict(dict(items), ...) instead. DataFrame.from_dict(OrderedDict(items)) may be used to preserve the key order.
  res = PandasDataFrame.from_items(items)

step 0
got 23 at resolution 1.5
step 1
got 16 at resolution 0.75
step 2
got 11 at resolution 0.375
step 3
got 6 at resolution 0.1875
step 4
got 9 at resolution 0.28125
step 5
got 7 at resolution 0.234375
step 6
got 8 at resolution 0.2578125

In [ ]:

In [14]:

def residual_average_gini_index(gene_scores,folder_clusters,
                                housekeeping_genes,marker_genes,
                                min_cells_per_cluster=10):

    #Subset from the main matrix the housekeeping genes and marker genes
    df_matrix_housekeeping=gene_scores.loc[gene_scores.index.intersection(housekeeping_genes),]
    df_matrix_marker=gene_scores.loc[gene_scores.index.intersection(marker_genes),]
    
    #Define a function to compute the Gini score
    def gini(list_of_values):
        sorted_list = sorted(list_of_values)
        height, area = 0, 0
        for value in sorted_list:
            height += value
            area += height - value / 2.
            fair_area = height * len(list_of_values) / 2.
        return (fair_area - area) / fair_area
    
    #Function to calculate Gini value for all the genes
    def calculate_gini(df_matrix, gene_name,clustering_info):
        return gini(get_avg_per_cluster(df_matrix,gene_name,clustering_info,use_log2=False))

    #Function to calculate Gini value for all the genes
    def calculate_gini_values(df_matrix,clustering_info):
        gini_values=[]
        for gene_name in df_matrix.index:
            gini_values.append(calculate_gini(df_matrix, gene_name,clustering_info))
        return gini_values
    
    #Write a function to compute delta difference of the average accessibility in Marker vs Housekeeping and Kolmogorov Smirnov test
    def score_clustering_solution(df_matrix_marker,df_matrix_housekeeping,clustering_info):
        gini_values_housekeeping=calculate_gini_values(df_matrix_housekeeping,clustering_info)
        gini_values_marker=calculate_gini_values(df_matrix_marker,clustering_info)
        statistic,p_value=stats.ks_2samp(gini_values_marker,gini_values_housekeeping)
        
        return  np.mean(gini_values_marker), np.mean(gini_values_housekeeping),np.mean(gini_values_marker)-np.mean(gini_values_housekeeping), statistic,p_value

    #Function to compute the average accessibility value per cluster
    def get_avg_per_cluster(df_matrix, gene_name, clustering_info,use_log2=False):
        N_clusters=len(clustering_info.index.unique())
        avg_per_cluster=np.zeros(N_clusters)
        for idx,idx_cluster in enumerate(sorted(np.unique(clustering_info.index.unique()))):
            if use_log2:
                values_cluster=df_matrix.loc[gene_name,clustering_info.loc[idx_cluster,:].values.flatten()].apply(lambda x:np.log2(x+1))
            else:
                values_cluster=df_matrix.loc[gene_name,clustering_info.loc[idx_cluster,:].values.flatten()]
           
            avg_per_cluster[idx]=values_cluster.mean()
            if avg_per_cluster[idx]>0:
                  avg_per_cluster[idx]=avg_per_cluster[idx]#/values_cluster.std()

        return avg_per_cluster
    

    #Run the method for all the clustering solutions
    
    df_metrics = pd.DataFrame(columns=['Method','Clustering','Gini_Marker_Genes','Gini_Housekeeping_Genes','Difference','KS_statistics','p-value'])    
    
    for clusters_filename in os.listdir(folder_clusters):     
        method = '_'.join(clusters_filename.split('_')[:-1])
        print(method)
        df_clusters = pd.read_csv(os.path.join(folder_clusters,clusters_filename),sep='\t',index_col=0)
        for clustering_method in df_clusters.columns:
            clustering_info = pd.DataFrame(df_clusters[clustering_method])
            clustering_info['Barcode'] = clustering_info.index
            clustering_info=clustering_info.set_index(clustering_method)
            
            #REMOVE CLUSTERS WITH FEW CELLS
            cluster_sizes=pd.value_counts(clustering_info.index)
            clustering_info=clustering_info.loc[cluster_sizes[cluster_sizes>min_cells_per_cluster].index.values,:]


            mean_gini_marker,mean_gini_housekeeping,mean_gini_difference,statistics,p_value=score_clustering_solution(df_matrix_marker,df_matrix_housekeeping,clustering_info)

            df_metrics = df_metrics.append({'Method': method,'Clustering':clustering_method,
                               'Gini_Marker_Genes':mean_gini_marker,'Gini_Housekeeping_Genes':mean_gini_housekeeping,
                               'Difference':mean_gini_difference,'KS_statistics':statistics,'p-value':p_value},
                              ignore_index=True)  
        
    return df_metrics

In [15]:

gene_scores = pd.read_csv('./run_methods/GeneScoring/FM_GeneScoring_10xpbmc5k.tsv',
                         sep = '\t',index_col=0)

In [16]:

gene_scores.head()

Out[16]:

	AAACGAAAGCGCAATG-1	AAACGAAAGGGTATCG-1	AAACGAAAGTAACATG-1	AAACGAAAGTTACACC-1	AAACGAACAGAGATGC-1	AAACGAACATGCTATG-1	AAACGAAGTGCATCAT-1	AAACGAAGTGGACGAT-1	AAACGAAGTGGCCTCA-1	AAACGAATCAGTGTAC-1	...	TTTGGTTGTCAGAAAT-1	TTTGGTTGTTGTATCG-1	TTTGGTTTCAGTGGTT-1	TTTGTGTAGGAAACTT-1	TTTGTGTCAAGCCTTA-1	TTTGTGTCACTCAAGT-1	TTTGTGTCACTGGGCT-1	TTTGTGTGTACGCAAG-1	TTTGTGTGTCTGCGCA-1	TTTGTGTTCAACTTGG-1
A1BG	0.086201	1.775694	3.591624	0.212587	0.040847	0.004313	0.002157	0.176512	0.000518	0.007131	...	0.041391	3.588490	1.822400	0.045678	5.371366	0.002675	2.454377	1.984945	1.732547	1.730391
A1BG-AS1	0.458201	0.478351	1.365454	0.930858	0.018763	0.023040	0.011713	0.943315	0.000097	0.038674	...	0.011158	1.345947	0.922146	0.042287	2.255818	0.012003	3.702715	1.393866	0.444438	0.432918
A2LD1	0.000363	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
A2M	0.000000	0.054960	0.000665	0.000000	2.205809	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.007216	2.258926	0.000000	0.000819	0.000000	0.001228	0.000665	0.000000	0.000000
A2M-AS1	0.000000	0.051265	0.000621	0.000000	2.057506	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.006731	2.107052	0.000000	0.000764	0.000000	0.001146	0.000621	0.000000	0.000000

5 rows × 5335 columns

In [17]:

#https://www.tau.ac.il/~elieis/Housekeeping_genes.html List of Housekeeping genes
housekeeping_genes=['ACTB','ALDOA','GAPDH','PGK1','LDHA','RPS27A','RPL19','RPL11','NONO','ARHGDIA','RPL32','RPS18','HSPCB',
    'C1orf43','CHMP2A','EMC7','GPI','PSMB2,''PSMB4','RAB7A','REEP5','SNRPD3','VCP','VPS29']

#List of Marker Genes
marker_genes=['CD209', 'ENG', 'FOXP3', 'CD34', 'BATF3', 'S100A12', 'THBD','CD3D', 'THY1', 'CD8A', 'CD8B', 'CD14', 'PROM1', 'IL2RA', 'FCGR3A',
'IL3RA', 'FCGR1A', 'CD19', 'IL7R', 'CD79A', 'MS4A1', 'NCAM1','CD3E', 'CD3G', 'KIT', 'CD1C', 'CD68', 'CD4']

In [18]:

folder_clusters = './output/clusters/'

In [19]:

df_metrics = residual_average_gini_index(gene_scores,folder_clusters,
                                housekeeping_genes,marker_genes,
                                min_cells_per_cluster=10)

ChromVAR_motifs
ChromVAR_kmers
cisTopic
SnapATAC
BROCKMAN
SCRAT_motifs
Cusanovich2018
Control
GeneScoring
Scasat
scABC
Cicero
ChromVAR_kmers_pca
ChromVAR_motifs_pca
GeneScoring_pca
Cicero_pca
SCRAT_pca

In [ ]:

In [20]:

df_metrics.to_csv(path_metrics+'clustering_RAGI_scores.csv')

In [21]:

df_metrics

Out[21]:

	Method	Clustering	Gini_Marker_Genes	Gini_Housekeeping_Genes	Difference	KS_statistics	p-value
0	ChromVAR_motifs	louvain	0.329794	0.202917	0.126878	0.535714	0.001101
1	ChromVAR_motifs	kmeans	0.343745	0.237399	0.106345	0.464286	0.007129
2	ChromVAR_motifs	hc	0.381168	0.290041	0.091127	0.500000	0.002897
3	ChromVAR_kmers	louvain	0.309164	0.178850	0.130315	0.535714	0.001101
4	ChromVAR_kmers	kmeans	0.340088	0.263170	0.076918	0.488095	0.003940
5	ChromVAR_kmers	hc	0.319202	0.241089	0.078113	0.523810	0.001532
6	cisTopic	louvain	0.369119	0.254688	0.114431	0.571429	0.000392
7	cisTopic	kmeans	0.417319	0.316060	0.101259	0.500000	0.002897
8	cisTopic	hc	0.468534	0.362493	0.106041	0.500000	0.002897
9	SnapATAC	louvain	0.332478	0.195798	0.136680	0.452381	0.009484
10	SnapATAC	kmeans	0.361506	0.217868	0.143637	0.571429	0.000392
11	SnapATAC	hc	0.371438	0.231429	0.140009	0.523810	0.001532
12	BROCKMAN	louvain	0.346999	0.234125	0.112874	0.476190	0.005320
13	BROCKMAN	kmeans	0.327789	0.262820	0.064969	0.392857	0.035346
14	BROCKMAN	hc	0.324061	0.255878	0.068183	0.464286	0.007129
15	SCRAT_motifs	louvain	0.347878	0.293645	0.054233	0.476190	0.005320
16	SCRAT_motifs	kmeans	0.287355	0.234507	0.052848	0.392857	0.035346
17	SCRAT_motifs	hc	0.315229	0.263967	0.051262	0.416667	0.021353
18	Cusanovich2018	louvain	0.388897	0.280957	0.107940	0.464286	0.007129
19	Cusanovich2018	kmeans	0.254027	0.177305	0.076722	0.464286	0.007129
20	Cusanovich2018	hc	0.225468	0.141125	0.084343	0.464286	0.007129
21	Control	louvain	0.443971	0.358864	0.085107	0.464286	0.007129
22	Control	kmeans	0.456480	0.453863	0.002617	0.273810	0.280812
23	Control	hc	0.471765	0.467423	0.004342	0.238095	0.448886
24	GeneScoring	louvain	0.460154	0.389402	0.070752	0.428571	0.016413
25	GeneScoring	kmeans	0.450160	0.471894	-0.021734	0.297619	0.197109
26	GeneScoring	hc	0.503348	0.501473	0.001875	0.285714	0.236193
27	Scasat	louvain	0.472445	0.374599	0.097846	0.500000	0.002897
28	Scasat	kmeans	0.497129	0.421512	0.075617	0.428571	0.016413
29	Scasat	hc	0.477655	0.399098	0.078557	0.464286	0.007129
30	scABC	louvain	0.461698	0.441462	0.020237	0.297619	0.197109
31	scABC	kmeans	0.458673	0.454737	0.003937	0.238095	0.448886
32	scABC	hc	0.425721	0.396330	0.029391	0.321429	0.134155
33	Cicero	louvain	0.354993	0.249562	0.105431	0.547619	0.000786
34	Cicero	kmeans	0.000000	0.000000	0.000000	0.000000	1.000000
35	Cicero	hc	0.264151	0.203767	0.060383	0.357143	0.071204
36	ChromVAR_kmers_pca	louvain	0.352224	0.232139	0.120086	0.511905	0.002114
37	ChromVAR_kmers_pca	kmeans	0.287473	0.227681	0.059792	0.488095	0.003940
38	ChromVAR_kmers_pca	hc	0.322012	0.246821	0.075191	0.464286	0.007129
39	ChromVAR_motifs_pca	louvain	0.333697	0.205199	0.128498	0.535714	0.001101
40	ChromVAR_motifs_pca	kmeans	0.396433	0.301102	0.095331	0.428571	0.016413
41	ChromVAR_motifs_pca	hc	0.397484	0.301730	0.095755	0.440476	0.012522
42	GeneScoring_pca	louvain	0.517697	0.503202	0.014495	0.261905	0.331171
43	GeneScoring_pca	kmeans	0.488294	0.499929	-0.011635	0.333333	0.109433
44	GeneScoring_pca	hc	0.469333	0.473566	-0.004233	0.285714	0.236193
45	Cicero_pca	louvain	0.470227	0.356027	0.114200	0.488095	0.003940
46	Cicero_pca	kmeans	0.262221	0.197641	0.064580	0.357143	0.071204
47	Cicero_pca	hc	0.250496	0.172051	0.078446	0.464286	0.007129
48	SCRAT_pca	louvain	0.392071	0.320236	0.071835	0.464286	0.007129
49	SCRAT_pca	kmeans	0.287531	0.234164	0.053367	0.392857	0.035346
50	SCRAT_pca	hc	0.278123	0.232857	0.045266	0.404762	0.027574

In [ ]: