# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
from datascienceutils import clusteringModels as cm
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook(bokeh.resources.INLINE)
import os
files = os.listdir('../data')
files.remove('merged_all_subjects.txt')
files.remove('data.tar.gz')
dataframes = dict()
for i, file in enumerate(files):
print(file)
dataframes[i] = pd.read_csv(os.path.join('../data', file), delimiter='\t', names=["nSeq", "DI", "CH1", "CH2"])
dataframes[i].to_csv('../data/subject_%s.csv'%str(i+1), index=False)
sub3.txt sub7.txt sub6.txt sub10.txt sub9.txt sub1.txt sub4.txt sub5.txt sub8.txt sub2.txt
irisDf= dataframes[0]
dataframes[0].describe()
nSeq | DI | CH1 | CH2 | |
---|---|---|---|---|
count | 309405.000000 | 309405.0 | 309405.000000 | 309405.000000 |
mean | 154702.000000 | 0.0 | 5489.008041 | 27332.859569 |
std | 89317.674357 | 0.0 | 1542.025794 | 11106.054750 |
min | 0.000000 | 0.0 | 2042.000000 | 197.000000 |
25% | 77351.000000 | 0.0 | 4520.000000 | 20161.000000 |
50% | 154702.000000 | 0.0 | 5294.000000 | 25109.000000 |
75% | 232053.000000 | 0.0 | 6380.000000 | 34112.000000 |
max | 309404.000000 | 0.0 | 10020.000000 | 65373.000000 |
irisDf.head()
nSeq | DI | CH1 | CH2 | |
---|---|---|---|---|
0 | 0 | 0 | 3641 | 29204 |
1 | 1 | 0 | 3490 | 28052 |
2 | 2 | 0 | 3599 | 27581 |
3 | 3 | 0 | 3506 | 27141 |
4 | 4 | 0 | 3708 | 26668 |
irisDf.var()
nSeq 7.977647e+09 DI 0.000000e+00 CH1 2.377844e+06 CH2 1.233445e+08 dtype: float64
irisDf.skew()
nSeq -4.004077e-16 DI 0.000000e+00 CH1 4.731694e-01 CH2 5.802260e-01 dtype: float64
irisDf.corr()
nSeq | DI | CH1 | CH2 | |
---|---|---|---|---|
nSeq | 1.000000 | NaN | 0.775749 | -0.229334 |
DI | NaN | NaN | NaN | NaN |
CH1 | 0.775749 | NaN | 1.000000 | -0.189159 |
CH2 | -0.229334 | NaN | -0.189159 | 1.000000 |
## Hmm... Looks like there's no correlation between CH1 and CH2.. surprising or may be not.. may be there's a ground - line relationship btw them
## also interestingly fairly strong correlation between nSeq and CH1... since nSeq is time, perhaps there's a continuous increase in CH1 signal
## throughout the time the subject wore it??
irisDf.select_dtypes(include=[np.number]).columns
Index(['nSeq', 'DI', 'CH1', 'CH2'], dtype='object')
#analyze.correlation_analyze(irisDf, 'nSeq', 'CH1')
for i, df in dataframes.items():
print("Subject no: %s"%str(i+1))
plotter.show(analyze.dist_analyze(df, 'CH1'))
plotter.show(analyze.dist_analyze(irisDf, 'CH1', is_normal=False,))
Variance of CH1 2377843.54997 Skewness of CH1 0.473169421278 Kolmogrov - Smirnov test with distribution norm KstestResult(statistic=1.0, pvalue=0.0)
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
## Interesting.. the channel measurements are almost normal.. a bit skewed... and there's a close second mode
## Wonder if the 2nd mode is likely to be an indicator of epileptic propensity.. (Should compare against normal subjects)
## The test statistic says it is resoundingly from a normal distribution (from [here](https://en.wikipedia.org/wiki/Anderson%E2%80%93Darling_test#Test_for_normality))
plotter.show(analyze.dist_analyze(irisDf, 'nSeq', 'CH1'))
Variance of nSeq 7977646952.5 Skewness of nSeq 1.11474584517e-15 Kolmogrov - Smirnov test with distribution norm KstestResult(statistic=0.99998355329930155, pvalue=0.0) Anderson-Darling normality test on nSeq Statistic: 3440 p-value: 0
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
kde = KernelDensity(kernel='tophat').fit(dataframes[0]['CH1'])
X = dataframes[0]['CH1']
log_dens = kde.score_samples(X)
fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
ax[1, 0].fill(X[:, 0], np.exp(log_dens), fc='#AAAAFF')
ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density")
# Gaussian KDE
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
log_dens = kde.score_samples(X_plot)
ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
ax[1, 1].text(-3.5, 0.31, "Gaussian Kernel Density")
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample. DeprecationWarning) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample. DeprecationWarning)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-16-6616fd9fb3dc> in <module>() 6 7 fig, ax = plt.subplots(2, 2, sharex=True, sharey=True) ----> 8 ax[1, 0].fill(X[:, 0], np.exp(log_dens), fc='#AAAAFF') 9 ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density") 10 /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/core/series.py in __getitem__(self, key) 622 key = check_bool_indexer(self.index, key) 623 --> 624 return self._get_with(key) 625 626 def _get_with(self, key): /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/core/series.py in _get_with(self, key) 635 if isinstance(key, tuple): 636 try: --> 637 return self._get_values_tuple(key) 638 except: 639 if len(key) == 1: /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/core/series.py in _get_values_tuple(self, key) 680 681 if not isinstance(self.index, MultiIndex): --> 682 raise ValueError('Can only tuple-index with a MultiIndex') 683 684 # If key is contained, would have returned by now ValueError: Can only tuple-index with a MultiIndex