# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
# Route Bokeh output to the notebook so plots render inline.
output_notebook()
# Load the dataset into a DataFrame.
# NOTE(review): the path is absolute and user-specific; the recorded run of
# this cell raised FileNotFoundError for it. Consider a configurable or
# relative path.
# NOTE(review): the name `irisDf` is misleading -- the columns displayed by
# the later cells (interaction, direction, correspondent_id, datetime,
# call_duration, antenna_id) look like call records, not the iris dataset.
irisDf = pd.read_csv('/home/anand/DataScientist/data/metadata.csv')
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) <ipython-input-4-53544694da68> in <module>() ----> 1 irisDf = pd.read_csv('/home/anand/DataScientist/data/metadata.csv') /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision) 653 skip_blank_lines=skip_blank_lines) 654 --> 655 return _read(filepath_or_buffer, kwds) 656 657 parser_f.__name__ = name /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds) 403 404 # Create the parser. 
--> 405 parser = TextFileReader(filepath_or_buffer, **kwds) 406 407 if chunksize or iterator: /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds) 762 self.options['has_index_names'] = kwds['has_index_names'] 763 --> 764 self._make_engine(self.engine) 765 766 def close(self): /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py in _make_engine(self, engine) 983 def _make_engine(self, engine='c'): 984 if engine == 'c': --> 985 self._engine = CParserWrapper(self.f, **self.options) 986 else: 987 if engine == 'python': /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds) 1603 kwds['allow_leading_cols'] = self.index_col is not False 1604 -> 1605 self._reader = parsers.TextReader(src, **kwds) 1606 1607 # XXX pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas/_libs/parsers.c:4209)() pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas/_libs/parsers.c:8873)() FileNotFoundError: File b'/home/anand/DataScientist/data/metadata.csv' does not exist
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
# NOTE(review): in the recorded output the quartiles are NaN and NumPy emits a
# RuntimeWarning from its percentile code -- presumably because call_duration
# contains missing values (TODO confirm). antenna_id has count 0, i.e. no
# non-missing values at all.
irisDf.describe()
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/numpy/lib/function_base.py:4116: RuntimeWarning: Invalid value encountered in percentile interpolation=interpolation)
call_duration | antenna_id | |
---|---|---|
count | 342.000000 | 0.0 |
mean | 228.997076 | NaN |
std | 441.944354 | NaN |
min | 1.000000 | NaN |
25% | NaN | NaN |
50% | NaN | NaN |
75% | NaN | NaN |
max | 3072.000000 | NaN |
# Preview the first rows to check column names and spot missing values.
irisDf.head()
interaction | direction | correspondent_id | datetime | call_duration | antenna_id | |
---|---|---|---|---|---|---|
0 | call | out | +109535800433 | 2016-10-16 07:34:52 | NaN | NaN |
1 | call | in | +919486957832 | 2016-10-16 12:49:40 | 15.0 | NaN |
2 | call | in | +914430811100 | 2016-10-16 14:35:26 | 101.0 | NaN |
3 | call | out | +918903421807 | 2016-10-16 14:37:47 | NaN | NaN |
4 | call | out | +914634221807 | 2016-10-16 14:38:30 | 77.0 | NaN |
# Per-column sample variance.
# numeric_only=True makes the restriction to numeric columns explicit: the
# frame also holds string columns (interaction, direction, correspondent_id,
# datetime), and recent pandas versions raise on them instead of silently
# dropping them. The result for the numeric columns is unchanged.
irisDf.var(numeric_only=True)
call_duration 195314.812308 antenna_id NaN dtype: float64
# Per-column skewness.
# numeric_only=True makes the restriction to numeric columns explicit: the
# frame also holds string columns, and recent pandas versions raise on them
# instead of silently dropping them. The result for the numeric columns is
# unchanged.
irisDf.skew(numeric_only=True)
call_duration 3.96145 antenna_id NaN dtype: float64
# Pairwise correlation matrix of the numeric columns.
# The numeric columns are selected explicitly rather than relying on corr()
# to drop the string columns itself, which recent pandas versions no longer
# do. select_dtypes works on both old and new pandas.
irisDf.select_dtypes(include=[np.number]).corr()
call_duration | antenna_id | |
---|---|---|
call_duration | 1.0 | NaN |
antenna_id | NaN | NaN |
# Names of the columns pandas treats as numeric ('number' is the string
# selector equivalent to np.number).
irisDf.select_dtypes(include='number').columns
Index(['call_duration', 'antenna_id'], dtype='object')
# Project helper from datascienceutils. Per the recorded output it reports the
# pandas correlation and covariance matrices of the numeric columns.
# NOTE(review): antenna_id is entirely missing, so every entry involving it is
# NaN; consider dropping that column before this analysis.
analyze.correlation_analyze(irisDf)
# Correlation btw Numerical Columns
# Pandas correlation coefficients matrix call_duration antenna_id call_duration 1.0 NaN antenna_id NaN NaN # Pandas co-variance coefficients matrix call_duration antenna_id call_duration 195314.812308 NaN antenna_id NaN NaN
# Distribution analysis over the whole frame (project helper). Per the
# recorded output it prints variance and skewness for each numeric column and
# skips pie charts for correspondent_id and datetime because they have too
# many distinct values.
analyze.dist_analyze(irisDf)
Variance of call_duration 195314.812308 Skewness of call_duration 3.96144995214 Variance of antenna_id nan Skewness of antenna_id nan Too many categorise for col: correspondent_id can't plot pie-chart Too many categorise for col: datetime can't plot pie-chart
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
# Same distribution analysis, restricted to the call_duration column; the
# recorded output reports variance and skewness for that column only.
analyze.dist_analyze(irisDf, 'call_duration')
Variance of call_duration 195314.812308 Skewness of call_duration 3.96144995214
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle warnings.warn("Path marker shapes currently not handled, defaulting to Circle") /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the axes property. A removal date has not been set. stacklevel=1) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected warnings.warn("Path marker sizes support is limited and may not display as expected")
# Regression analysis via the project helper. Per the recorded traceback the
# signature is regression_analyze(df, col1, col2, ...), with col1 used as the
# feature column and col2 as the target, so this fits datetime from
# call_duration.
# NOTE(review): the recorded run fails with "ValueError: Input contains NaN"
# raised by scikit-learn's finite-value check on the feature matrix;
# call_duration contains NaN (see the head() output). Rows with missing
# call_duration presumably need to be dropped or imputed first -- TODO confirm.
# NOTE(review): datetime is displayed as a timestamp string; verify that the
# helper converts it to a numeric target, and that the feature/target order is
# the intended one.
analyze.regression_analyze(irisDf, 'call_duration', 'datetime')
1752 1752
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-14-d72a4ec37f79> in <module>() ----> 1 analyze.regression_analyze(irisDf, 'call_duration', 'datetime') /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.1.10-py3.5.egg/datascienceutils/analyze.py in regression_analyze(df, col1, col2, trainsize, non_linear) 159 target = new_df[col2] 160 models = [ --> 161 pm.train(new_df, target, column=col1, modelType='LinearRegression'), 162 pm.train(new_df, target, column=col1, modelType='RidgeRegression'), 163 pm.train(new_df, target, column=col1, modelType='RidgeRegressionCV'), /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/datascienceutils-1.1.10-py3.5.egg/datascienceutils/predictiveModels.py in train(dataframe, target, modelType, column, **kwargs) 30 source = dataframe[column].reshape((len(target), 1)) 31 print(len(source), len(target)) ---> 32 model.fit(source, target) 33 else: 34 model.fit(dataframe, target) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/linear_model/base.py in fit(self, X, y, sample_weight) 510 n_jobs_ = self.n_jobs 511 X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], --> 512 y_numeric=True, multi_output=True) 513 514 if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1: /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator) 519 X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite, 520 ensure_2d, allow_nd, ensure_min_samples, --> 521 ensure_min_features, warn_on_dtype, estimator) 522 if multi_output: 523 y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False, 
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 405 % (array.ndim, estimator_name)) 406 if force_all_finite: --> 407 _assert_all_finite(array) 408 409 shape_repr = _shape_repr(array.shape) /home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/sklearn/utils/validation.py in _assert_all_finite(X) 56 and not np.isfinite(X).all()): 57 raise ValueError("Input contains NaN, infinity" ---> 58 " or a value too large for %r." % X.dtype) 59 60 ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Re-display the first rows of the frame after the failed regression call.
irisDf.head()