#!/usr/bin/env python
# coding: utf-8

# # Performance timings data generation
#
# We need to generate data comparing the performance of the reference implementation of HDBSCAN against various historical versions of the hdbscan library. We need to do this over a range of dataset sizes so we can get an idea of scaling, and we also need to consider a variety of dimensions. To get all this done we'll need some handy modules: sklearn.datasets to generate fake data for clustering; numpy and pandas for easy manipulation of vectors and dataframes of results; and subprocess and time so we can fork off and time the Java reference implementation.

# In[1]:

import sklearn.datasets
import numpy as np
import pandas as pd
import subprocess
import time


# Now we need a function to time the reference implementation. We can do external timing using the time module, and the Java program also reports internal timings, which we can parse out and later aggregate into a dataframe. In practice this is just a matter of using subprocess and the appropriate command-line parameters for the reference code.

# In[2]:

def get_reference_timings(data, filename='tmp_data.csv',
                          jarfile='/Users/leland/Source/HDBSCAN_Star/HDBSCAN_Star.jar',
                          min_points=5, min_cluster_size=5):
    # Create the required csv file
    pd.DataFrame(data).to_csv(filename, header=False, index=False)

    # Run the clustering via a subprocess call and grab the output as it
    # has timing information to be parsed; decode so the output splits
    # into lines under Python 3 as well
    start_time = time.time()
    internal_timing = subprocess.check_output(
        ['java', '-jar', jarfile,
         'file={}'.format(filename),
         'minPts={}'.format(min_points),
         'minClSize={}'.format(min_cluster_size),
         'compact=true']).decode('utf-8')
    time_taken = time.time() - start_time

    # Parse internal timing info into a pandas series for later use
    result_dict = {}
    for line in internal_timing.split('\n'):
        if ':' in line:
            key, value = line.split(':')
            key = key.replace(' (ms)', '')
            key = key.replace('Time to ', '')
            key = key.replace('Overall ', '')
            result_dict[key] = int(value)
    internal_timing = pd.Series(result_dict)

    return time_taken, internal_timing


# With that in hand we can run the reference code over a range of dimensions and dataset sizes and aggregate the results together in indexed pandas series and dataframes.

# In[3]:

internal_timing = {}
external_timing = {}

for dataset_dimension in (2, 5, 10, 25, 50):
    for dataset_size in np.arange(1, 17) * 8000:
        data, _ = sklearn.datasets.make_blobs(dataset_size,
                                              n_features=dataset_dimension,
                                              centers=dataset_dimension)
        (external_timing[(dataset_dimension, dataset_size)],
         internal_timing[(dataset_dimension, dataset_size)]) = get_reference_timings(data)

internal_timing_df = pd.DataFrame(internal_timing).T
external_timing_series = pd.Series(external_timing)


# Now it is just a matter of saving these off to disk for later use.

# In[4]:

internal_timing_df.to_csv('reference_impl_internal_timings.csv')


# In[5]:

external_timing_series.to_csv('reference_impl_external_timings.csv')
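# Since the analysis happens in a separate notebook, it is worth noting how these csv files read back in: the (dimension, size) tuple keys become a two-level index. A minimal sketch of reloading them follows; the variable names here are mine, and exactly whether Series.to_csv writes a header row varies between pandas versions.

# In[ ]:

# Rebuild the (dimension, size) MultiIndex from the first two csv columns.
reloaded_internal = pd.read_csv('reference_impl_internal_timings.csv',
                                index_col=[0, 1])

# The external timings were written from a Series; pandas of this vintage
# writes no header row, hence header=None -- drop it if your version does.
reloaded_external = pd.read_csv('reference_impl_external_timings.csv',
                                index_col=[0, 1], header=None).iloc[:, 0]
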
# Now we need to build up hdbscan timings, preferably over a range of hdbscan versions, to show how the performance of the code has evolved (and improved!). To do this I pulled down historical versions and fudged them so that they exist in different namespaces and can live side by side. We can import them all like so ...

# In[6]:

import hdbscan01
import hdbscan02
import hdbscan03
import hdbscan04
import hdbscan05
import hdbscan


# Now we simply go through each version and run it over a range of dimensions and dataset sizes (capped at smaller sizes for the early versions, which were memory constrained).

# In[7]:

hdbscan01_timings = {}
for dataset_dimension in (2, 5, 10, 25, 50):
    for dataset_size in np.arange(1, 17) * 2000:
        data, _ = sklearn.datasets.make_blobs(dataset_size,
                                              n_features=dataset_dimension,
                                              centers=dataset_dimension)
        start_time = time.time()
        hdbscan01.HDBSCAN().fit(data)
        time_taken = time.time() - start_time
        hdbscan01_timings[(dataset_dimension, dataset_size)] = time_taken

hdbscan01_timings_series = pd.Series(hdbscan01_timings)
hdbscan01_timings_series.to_csv('hdbscan01_timings.csv')


# In[8]:

hdbscan02_timings = {}
for dataset_dimension in (2, 5, 10, 25, 50):
    for dataset_size in np.arange(1, 17) * 2000:
        data, _ = sklearn.datasets.make_blobs(dataset_size,
                                              n_features=dataset_dimension,
                                              centers=dataset_dimension)
        start_time = time.time()
        hdbscan02.HDBSCAN().fit(data)
        time_taken = time.time() - start_time
        hdbscan02_timings[(dataset_dimension, dataset_size)] = time_taken

hdbscan02_timings_series = pd.Series(hdbscan02_timings)
hdbscan02_timings_series.to_csv('hdbscan02_timings.csv')


# In[9]:

hdbscan03_timings = {}
for dataset_dimension in (2, 5, 10, 25, 50):
    for dataset_size in np.arange(1, 17) * 4000:
        data, _ = sklearn.datasets.make_blobs(dataset_size,
                                              n_features=dataset_dimension,
                                              centers=dataset_dimension)
        start_time = time.time()
        hdbscan03.HDBSCAN().fit(data)
        time_taken = time.time() - start_time
        hdbscan03_timings[(dataset_dimension, dataset_size)] = time_taken

hdbscan03_timings_series = pd.Series(hdbscan03_timings)
hdbscan03_timings_series.to_csv('hdbscan03_timings.csv')


# In[10]:

hdbscan04_timings = {}
for dataset_dimension in (2, 5, 10, 25, 50):
    for dataset_size in np.arange(1, 17) * 8000:
        data, _ = sklearn.datasets.make_blobs(dataset_size,
                                              n_features=dataset_dimension,
                                              centers=dataset_dimension)
        start_time = time.time()
        hdbscan04.HDBSCAN().fit(data)
        time_taken = time.time() - start_time
        hdbscan04_timings[(dataset_dimension, dataset_size)] = time_taken

hdbscan04_timings_series = pd.Series(hdbscan04_timings)
hdbscan04_timings_series.to_csv('hdbscan04_timings.csv')


# In[11]:

hdbscan05_timings = {}
for dataset_dimension in (2, 5, 10, 25, 50):
    for dataset_size in np.arange(1, 17) * 8000:
        data, _ = sklearn.datasets.make_blobs(dataset_size,
                                              n_features=dataset_dimension,
                                              centers=dataset_dimension)
        start_time = time.time()
        hdbscan05.HDBSCAN().fit(data)
        time_taken = time.time() - start_time
        hdbscan05_timings[(dataset_dimension, dataset_size)] = time_taken

hdbscan05_timings_series = pd.Series(hdbscan05_timings)
hdbscan05_timings_series.to_csv('hdbscan05_timings.csv')


# Finally we can run the current code (soon to be version 0.6).

# In[12]:

hdbscan06_timings = {}
for dataset_dimension in (2, 5, 10, 25, 50):
    for dataset_size in np.arange(1, 17) * 8000:
        data, _ = sklearn.datasets.make_blobs(dataset_size,
                                              n_features=dataset_dimension,
                                              centers=dataset_dimension)
        start_time = time.time()
        hdbscan.HDBSCAN().fit(data)
        time_taken = time.time() - start_time
        hdbscan06_timings[(dataset_dimension, dataset_size)] = time_taken

hdbscan06_timings_series = pd.Series(hdbscan06_timings)
hdbscan06_timings_series.to_csv('hdbscan06_timings.csv')


# And we will save the analysis of all of this for another (rather more text-heavy) notebook.
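
# As an aside: the per-version timing cells above are deliberately near-identical. If you want to rerun these benchmarks, the loop factors naturally into a helper along the following lines -- a sketch only, assuming the imports above; the time_hdbscan_version name and its parameters are mine, not part of the original notebook.

# In[ ]:

def time_hdbscan_version(module, size_step,
                         dimensions=(2, 5, 10, 25, 50), csv_name=None):
    # size_step is the dataset-size increment used in the cells above
    # (2000 for the earliest versions, 4000 or 8000 for later ones);
    # returns a Series keyed by (dimension, size) tuples.
    timings = {}
    for dataset_dimension in dimensions:
        for dataset_size in np.arange(1, 17) * size_step:
            data, _ = sklearn.datasets.make_blobs(dataset_size,
                                                  n_features=dataset_dimension,
                                                  centers=dataset_dimension)
            start_time = time.time()
            module.HDBSCAN().fit(data)
            timings[(dataset_dimension, dataset_size)] = time.time() - start_time
    result = pd.Series(timings)
    if csv_name is not None:
        result.to_csv(csv_name)
    return result


# For example:
# hdbscan05_timings_series = time_hdbscan_version(hdbscan05, 8000,
#                                                 csv_name='hdbscan05_timings.csv')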