In [1]:

%load_ext autoreload
%autoreload 2

In [2]:

import sys
sys.path.append("..")

In [3]:

from optimus import Optimus

C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
  from collections import Callable
C:\Users\argenisleon\Anaconda3\lib\site-packages\statsmodels\base\wrapper.py:100: DeprecationWarning: `formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly
  defaults=argspec[3])

    You are using PySparkling of version 2.4.10, but your PySpark is of
    version 2.3.1. Please make sure Spark and PySparkling versions are compatible.

In [4]:

# op = Optimus("spark")
# df = op.load.csv("data/crime.csv")

In [5]:

op = Optimus("dask")

In [6]:

df = op.load.csv("data/crime.csv", charset="ISO-8859–1")

In [10]:

df = op.load.json("data/foo.json", multiline=True)

In [20]:

df = op.load.parquet("data/foo.parquet")

In [7]:

df = op.load.excel("data/titanic3.xls")

Ds data/titanic3.xls
data/titanic3.xls
data/titanic3.xls

In [8]:

df.ext.display()

Viewing 10 of 1309 rows / 1309 columns

3 partition(s)

pclass 1 (int64) not nullable	survived 2 (int64) not nullable	name 3 (object) not nullable	sex 4 (object) not nullable	age 5 (float64) not nullable	sibsp 6 (int64) not nullable	parch 7 (int64) not nullable	ticket 8 (object) not nullable	fare 9 (float64) not nullable	cabin 10 (object) not nullable	embarked 11 (object) not nullable	boat 12 (object) not nullable	body 13 (float64) not nullable	home.dest 14 (object) not nullable
1	1	Allen,⋅Miss.⋅Elisabeth⋅Walton	female	29.0	0	0	24160	211.3375	B5	S	2	nan	St⋅Louis,⋅MO
1	1	Allison,⋅Master.⋅Hudson⋅Trevor	male	0.9167	1	2	113781	151.55	C22⋅C26	S	11	nan	Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
1	0	Allison,⋅Miss.⋅Helen⋅Loraine	female	2.0	1	2	113781	151.55	C22⋅C26	S	nan	nan	Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
1	0	Allison,⋅Mr.⋅Hudson⋅Joshua⋅Creighton	male	30.0	1	2	113781	151.55	C22⋅C26	S	nan	135.0	Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
1	0	Allison,⋅Mrs.⋅Hudson⋅J⋅C⋅(Bessie⋅Waldo⋅Daniels)	female	25.0	1	2	113781	151.55	C22⋅C26	S	nan	nan	Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
1	1	Anderson,⋅Mr.⋅Harry	male	48.0	0	0	19952	26.55	E12	S	3	nan	New⋅York,⋅NY
1	1	Andrews,⋅Miss.⋅Kornelia⋅Theodosia	female	63.0	1	0	13502	77.9583	D7	S	10	nan	Hudson,⋅NY
1	0	Andrews,⋅Mr.⋅Thomas⋅Jr	male	39.0	0	0	112050	0.0	A36	S	nan	nan	Belfast,⋅NI
1	1	Appleton,⋅Mrs.⋅Edward⋅Dale⋅(Charlotte⋅Lamson)	female	53.0	2	0	11769	51.4792	C101	S	D	nan	Bayside,⋅Queens,⋅NY
1	0	Artagaveytia,⋅Mr.⋅Ramon	male	71.0	0	0	PC⋅17609	49.5042	nan	C	nan	22.0	Montevideo,⋅Uruguay
1	0	Astor,⋅Col.⋅John⋅Jacob	male	47.0	1	0	PC⋅17757	227.525	C62⋅C64	C	nan	124.0	New⋅York,⋅NY

Viewing 10 of 1309 rows / 1309 columns

3 partition(s)

In [ ]:

df = op.load.avro("data/foo.avro")

In [18]:

df.ext.display()

Viewing 10 of 19 rows / 19 columns

1 partition(s)

id 1 (int32) not nullable	firstName 2 (object) not nullable	lastName 3 (object) not nullable	billingId 4 (int32) not nullable	product 5 (object) not nullable	price 6 (int32) not nullable	birth 7 (object) not nullable	dummyCol 8 (object) not nullable
1	Luis	Alvarez$$%!	123	Cake	10	1980/07/07	never
2	André	Ampère	423	piza	8	1950/07/08	gonna
3	NiELS	Böhr//((%%	551	pizza	8	1990/07/09	give
4	PAUL	dirac$	521	pizza	8	1954/07/10	you
5	Albert	Einstein	634	pizza	8	1990/07/11	up
6	Galileo	⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI	672	arepa	5	1930/08/12	never
7	CaRL	Ga%%%uss	323	taco	3	1970/07/13	gonna
8	David	H$$$ilbert	624	taaaccoo	3	1950/07/14	let
9	Johannes	KEPLER	735	taco	3	1920/04/22	you
10	JaMES	M$$ax%%well	875	taco	3	1923/03/12	down
11	Isaac	Newton	992	pasta	9	1999/02/15	never⋅

Viewing 10 of 19 rows / 19 columns

1 partition(s)

In [13]:

%%time
df.min().compute()

Wall time: 5.85 s

Out[13]:

INCIDENT_NUMBER                                           142052550
OFFENSE_CODE                                                    111
OFFENSE_CODE_GROUP                               Aggravated Assault
OFFENSE_DESCRIPTION    A&B HANDS, FEET, ETC.  - MED. ATTENTION REQ.
REPORTING_AREA                                                     
OCCURRED_ON_DATE                                2015-06-15 00:00:00
YEAR                                                           2015
MONTH                                                             1
DAY_OF_WEEK                                                  Friday
HOUR                                                              0
Lat                                                              -1
Long                                                       -71.1787
Location                                 (-1.00000000, -1.00000000)
dtype: object

In [11]:

%%time
df.cols.min("*")

[('min', Dask Series Structure:
npartitions=1
DAY_OF_WEEK    object
YEAR              ...
dtype: object
Dask Name: dataframe-min-agg, 6 tasks)]
Wall time: 5.9 s

Out[11]:

{'min': {'INCIDENT_NUMBER': '142052550',
  'OFFENSE_CODE': 111,
  'OFFENSE_CODE_GROUP': 'Aggravated Assault',
  'OFFENSE_DESCRIPTION': 'A&B HANDS, FEET, ETC.  - MED. ATTENTION REQ.',
  'REPORTING_AREA': ' ',
  'OCCURRED_ON_DATE': '2015-06-15 00:00:00',
  'YEAR': 2015,
  'MONTH': 1,
  'DAY_OF_WEEK': 'Friday',
  'HOUR': 0,
  'Lat': -1.0,
  'Long': -71.17867378,
  'Location': '(-1.00000000, -1.00000000)'}}

In [35]:

df.cols.schema_dtype()

Out[35]:

[numpy.object_,
 numpy.int64,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.int64,
 numpy.int64,
 numpy.object_,
 numpy.int64,
 numpy.object_,
 numpy.object_,
 numpy.float64,
 numpy.float64,
 numpy.object_]

In [50]:

%%time
df.min().compute()

Wall time: 5.49 s

Out[50]:

INCIDENT_NUMBER                                           142052550
OFFENSE_CODE                                                    111
OFFENSE_CODE_GROUP                               Aggravated Assault
OFFENSE_DESCRIPTION    A&B HANDS, FEET, ETC.  - MED. ATTENTION REQ.
REPORTING_AREA                                                     
OCCURRED_ON_DATE                                2015-06-15 00:00:00
YEAR                                                           2015
MONTH                                                             1
DAY_OF_WEEK                                                  Friday
HOUR                                                              0
Lat                                                              -1
Long                                                       -71.1787
Location                                 (-1.00000000, -1.00000000)
dtype: object

In [14]:

df.cols.min("*")

Out[14]:

{'min': {'INCIDENT_NUMBER': '142052550',
  'OFFENSE_CODE': 111,
  'OFFENSE_CODE_GROUP': 'Aggravated Assault',
  'OFFENSE_DESCRIPTION': 'A&B HANDS, FEET, ETC.  - MED. ATTENTION REQ.',
  'REPORTING_AREA': ' ',
  'OCCURRED_ON_DATE': '2015-06-15 00:00:00',
  'YEAR': 2015,
  'MONTH': 1,
  'DAY_OF_WEEK': 'Friday',
  'HOUR': 0,
  'Lat': -1.0,
  'Long': -71.17867378,
  'Location': '(-1.00000000, -1.00000000)'}}

In [9]:

%%time
df.cols.kurt("*")

Wall time: 16.5 s

Out[9]:

{'Long': nan,
 'MONTH': 1.9528853000207569,
 'HOUR': 2.3994270670757927,
 'Lat': nan,
 'YEAR': 1.9469652465192953,
 'OFFENSE_CODE': 1.5838041686767816}

In [ ]:

In [14]:

%%time
df.cols.min("*")

Wall time: 5.91 s

Out[14]:

{'min': {'INCIDENT_NUMBER': '142052550',
  'OFFENSE_CODE': 111,
  'OFFENSE_CODE_GROUP': 'Aggravated Assault',
  'OFFENSE_DESCRIPTION': 'A&B HANDS, FEET, ETC.  - MED. ATTENTION REQ.',
  'REPORTING_AREA': ' ',
  'OCCURRED_ON_DATE': '2015-06-15 00:00:00',
  'YEAR': 2015,
  'MONTH': 1,
  'DAY_OF_WEEK': 'Friday',
  'HOUR': 0,
  'Lat': -1.0,
  'Long': -71.17867378,
  'Location': '(-1.00000000, -1.00000000)'}}

In [ ]:

In [17]:

%%time
from optimus.profiler.profiler import Profiler
p= Profiler()
p.run(df)

['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long', 'Location']

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<timed exec> in <module>

~\Documents\Optimus\optimus\helpers\decorators.py in timed(*args, **kw)
      8     def timed(*args, **kw):
      9         start_time = timeit.default_timer()
---> 10         f = method(*args, **kw)
     11         _time = round(timeit.default_timer() - start_time, 2)
     12         logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=_time))

~\Documents\Optimus\optimus\profiler\profiler.py in run(self, df, columns, buckets, infer, relative_error, approx_count, mismatch, advanced_stats)
     72         columns = parse_columns(df, columns)
     73         output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict",
---> 74                               mismatch=mismatch, advanced_stats=advanced_stats)
     75 
     76         # Load jinja

~\Documents\Optimus\optimus\profiler\profiler.py in dataset(self, df, columns, buckets, infer, relative_error, approx_count, sample, stats, format, mismatch, advanced_stats)
    325                 self.cols_count = cols_count = len(df.columns)
    326                 updated_columns = self.columns_stats(df, cols_to_profile, buckets, infer, relative_error, approx_count,
--> 327                                                      mismatch, advanced_stats)
    328 
    329                 output_columns = update_dict(output_columns, updated_columns)

~\Documents\Optimus\optimus\profiler\profiler.py in columns_stats(self, df, columns, buckets, infer, relative_error, approx_count, mismatch, advanced_stats)
    434 
    435         # Aggregation
--> 436         stats = self.columns_agg(df, columns, buckets, relative_error, approx_count, advanced_stats)
    437 
    438         # Calculate Frequency

~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(self, df, columns, buckets, relative_error, approx_count, advanced_stats)
    490                          df.functions.sum, df.functions.variance, df.functions.zeros_agg]
    491                 print(cols)
--> 492                 exprs.extend(df.cols.create_exprs(cols, funcs))
    493 
    494                 # TODO: None in basic calculation

~\Documents\Optimus\optimus\dask\columns.py in create_exprs(columns, funcs, *args)
    119                                 exprs[col_name].update(func(col_name, args)(df))
    120                             else:
--> 121                                 exprs[col_name] = func(col_name, args)(df)
    122 
    123             result = {}

~\Documents\Optimus\optimus\dask\functions.py in _kurtoris(serie)
    134         def kurtosis(col_name, args):
    135             def _kurtoris(serie):
--> 136                 result = {"kurtosis": float(stats.kurtosis(serie[col_name]))}
    137                 return result
    138 

~\Anaconda3\lib\site-packages\dask\array\stats.py in kurtosis(a, axis, fisher, bias, nan_policy)
    227     olderr = np.seterr(all='ignore')
    228     try:
--> 229         vals = da.where(zero, 0, m4 / m2**2.0)
    230     finally:
    231         np.seterr(**olderr)

TypeError: unsupported operand type(s) for ** or pow(): 'Array' and 'float'

In [6]:

df.rows.count()

Out[6]:

In [7]:

df.cols.min(["OFFENSE_CODE","YEAR"])

Out[7]:

{'YEAR': {'min': 2015}, 'OFFENSE_CODE': {'min': 111}}

In [8]:

%%time
df.cols.range(["OFFENSE_CODE","YEAR"])

Wall time: 7.52 s

Out[8]:

{'YEAR': {'min': 2015, 'max': 2018}, 'OFFENSE_CODE': {'min': 111, 'max': 3831}}

In [9]:

%%time
df.cols.hist(["OFFENSE_CODE","YEAR"])

Wall time: 12 s

Out[9]:

{'YEAR': {'hist': [{'count': 53388, 'lower': 2015.0, 'upper': 2015.15},
   {'count': 0, 'lower': 2015.15, 'upper': 2015.3},
   {'count': 0, 'lower': 2015.3, 'upper': 2015.45},
   {'count': 0, 'lower': 2015.45, 'upper': 2015.6},
   {'count': 0, 'lower': 2015.6, 'upper': 2015.75},
   {'count': 0, 'lower': 2015.75, 'upper': 2015.9},
   {'count': 99114, 'lower': 2015.9, 'upper': 2016.05},
   {'count': 0, 'lower': 2016.05, 'upper': 2016.2},
   {'count': 0, 'lower': 2016.2, 'upper': 2016.35},
   {'count': 0, 'lower': 2016.35, 'upper': 2016.5},
   {'count': 0, 'lower': 2016.5, 'upper': 2016.65},
   {'count': 0, 'lower': 2016.65, 'upper': 2016.8},
   {'count': 0, 'lower': 2016.8, 'upper': 2016.95},
   {'count': 100886, 'lower': 2016.95, 'upper': 2017.1},
   {'count': 0, 'lower': 2017.1, 'upper': 2017.25},
   {'count': 0, 'lower': 2017.25, 'upper': 2017.4},
   {'count': 0, 'lower': 2017.4, 'upper': 2017.55},
   {'count': 0, 'lower': 2017.55, 'upper': 2017.7},
   {'count': 0, 'lower': 2017.7, 'upper': 2017.85},
   {'count': 65685, 'lower': 2017.85, 'upper': 2018.0}]},
 'OFFENSE_CODE': {'hist': [{'count': 169, 'lower': 111.0, 'upper': 297.0},
   {'count': 12431, 'lower': 297.0, 'upper': 483.0},
   {'count': 44188, 'lower': 483.0, 'upper': 669.0},
   {'count': 21795, 'lower': 669.0, 'upper': 855.0},
   {'count': 1548, 'lower': 855.0, 'upper': 1041.0},
   {'count': 9272, 'lower': 1041.0, 'upper': 1227.0},
   {'count': 16609, 'lower': 1227.0, 'upper': 1413.0},
   {'count': 2185, 'lower': 1413.0, 'upper': 1599.0},
   {'count': 216, 'lower': 1599.0, 'upper': 1785.0},
   {'count': 16536, 'lower': 1785.0, 'upper': 1971.0},
   {'count': 2759, 'lower': 1971.0, 'upper': 2157.0},
   {'count': 256, 'lower': 2157.0, 'upper': 2343.0},
   {'count': 2655, 'lower': 2343.0, 'upper': 2529.0},
   {'count': 20908, 'lower': 2529.0, 'upper': 2715.0},
   {'count': 2894, 'lower': 2715.0, 'upper': 2901.0},
   {'count': 29658, 'lower': 2901.0, 'upper': 3087.0},
   {'count': 63012, 'lower': 3087.0, 'upper': 3273.0},
   {'count': 25653, 'lower': 3273.0, 'upper': 3459.0},
   {'count': 9197, 'lower': 3459.0, 'upper': 3645.0},
   {'count': 37132, 'lower': 3645.0, 'upper': 3831.0}]}}

In [10]:

df.cols.percentile(["OFFENSE_CODE","YEAR"], values =[0.5,0.9])

Out[10]:

{'YEAR': {'percentile': {0.5: 2017.0, 0.9: 2018.0}},
 'OFFENSE_CODE': {'percentile': {0.5: 2907.0, 0.9: 3802.0}}}

In [19]:

import numpy as np
a ={'hist_agg': {'hist': np.array([  169, 12431, 44188, 21795,  1548,  9272, 16609,  2185,   216,
          16536,  2759,   256,  2655, 20908,  2894, 29658, 63012, 25653,
           9197, 37132]),
   'bins': np.array([ 111.,  297.,  483.,  669.,  855., 1041., 1227., 1413., 1599.,
          1785., 1971., 2157., 2343., 2529., 2715., 2901., 3087., 3273.,
          3459., 3645., 3831.])}}

In [44]:

x = a["hist_agg"]["hist"]
y = a["hist_agg"]["bins"]

for idx, v in enumerate(y):
    if idx <len(y)-1:
        print({"count":x[idx],"lower":y[idx], "upper":y[idx+1]})

{'count': 169, 'lower': 111.0, 'upper': 297.0}
{'count': 12431, 'lower': 297.0, 'upper': 483.0}
{'count': 44188, 'lower': 483.0, 'upper': 669.0}
{'count': 21795, 'lower': 669.0, 'upper': 855.0}
{'count': 1548, 'lower': 855.0, 'upper': 1041.0}
{'count': 9272, 'lower': 1041.0, 'upper': 1227.0}
{'count': 16609, 'lower': 1227.0, 'upper': 1413.0}
{'count': 2185, 'lower': 1413.0, 'upper': 1599.0}
{'count': 216, 'lower': 1599.0, 'upper': 1785.0}
{'count': 16536, 'lower': 1785.0, 'upper': 1971.0}
{'count': 2759, 'lower': 1971.0, 'upper': 2157.0}
{'count': 256, 'lower': 2157.0, 'upper': 2343.0}
{'count': 2655, 'lower': 2343.0, 'upper': 2529.0}
{'count': 20908, 'lower': 2529.0, 'upper': 2715.0}
{'count': 2894, 'lower': 2715.0, 'upper': 2901.0}
{'count': 29658, 'lower': 2901.0, 'upper': 3087.0}
{'count': 63012, 'lower': 3087.0, 'upper': 3273.0}
{'count': 25653, 'lower': 3273.0, 'upper': 3459.0}
{'count': 9197, 'lower': 3459.0, 'upper': 3645.0}
{'count': 37132, 'lower': 3645.0, 'upper': 3831.0}

In [62]:

%%time
import dask.array as da
# x = da.random.normal(10, 0.1, size=(100000,), chunks=(1000,))
h, bins = da.histogram(df["OFFENSE_CODE"], bins=30, range=[111, 3831])
print(h.compute(), bins)

[  169  3799  8632  7406 41698 16879    94  5868  4858  1455 15415  1924
   216 11327  5209  2185   830     0  2611    44 20908     0  8989 23563
 63012 14054 11599  8724   473 37132] [ 111.  235.  359.  483.  607.  731.  855.  979. 1103. 1227. 1351. 1475.
 1599. 1723. 1847. 1971. 2095. 2219. 2343. 2467. 2591. 2715. 2839. 2963.
 3087. 3211. 3335. 3459. 3583. 3707. 3831.]
Wall time: 1.26 s

In [ ]:

In [66]:

%%time
df.cols.hist(["OFFENSE_CODE"])

Wall time: 3.11 s

Out[66]:

{'OFFENSE_CODE': {'hist_agg': array([  169, 12431, 44188, 21795,  1548,  9272, 16609,  2185,   216,
         16536,  2759,   256,  2655, 20908,  2894, 29658, 63012, 25653,
          9197, 37132], dtype=int64)}}

In [15]:

da.histogram(serie[col_name], bins=bins, range=[min_max["min"], min_max["max"]])

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-f7c9ec5b4f51> in <module>
----> 1 da.histogram(serie[col_name], bins=bins, range=[min_max["min"], min_max["max"]])

NameError: name 'da' is not defined

In [17]:

print(df.cols.test_agg("OFFENSE_CODE"))

(<dask.dataframe.groupby.Aggregation object at 0x000001FEF04D0550>,)

In [38]:

print(res)

(<dask.dataframe.groupby.Aggregation object at 0x000001FEEA135FD0>,)

In [9]:

%%time
df.cols.percentile(["YEAR","OFFENSE_CODE"], values=[0.5,0.95])

Wall time: 1.2 s

Out[9]:

{'YEAR': {'percentile': {'percentile': {0.5: 2017.0, 0.95: 2018.0}}},
 'OFFENSE_CODE': {'percentile': {'percentile': {0.5: 2907.0, 0.95: 3831.0}}}}

In [161]:

custom_mean = dd.Aggregation(
    name='custom_mean',
    chunk=lambda s: (s.count(), s.sum()),
    agg=lambda count, sum: (count.sum(), sum.sum()),
    finalize=lambda count, sum: sum / count,
)  # doctest: +SKIP

df.groupby('g').agg(custom_mean)  # doctest: +SKIP

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-161-72873dc9eb19> in <module>
----> 1 custom_mean = dd.Aggregation(
      2     name='custom_mean',
      3     chunk=lambda s: (s.count(), s.sum()),
      4     agg=lambda count, sum: (count.sum(), sum.sum()),
      5     finalize=lambda count, sum: sum / count,

NameError: name 'dd' is not defined

In [118]:

a[0][1]["percentile"].names

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-118-df5250a9f6f7> in <module>
----> 1 a[0][1]["percentile"].names

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5065             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5066                 return self[name]
-> 5067             return object.__getattribute__(self, name)
   5068 
   5069     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'names'

In [28]:

df.cols.names()

Out[28]:

['INCIDENT_NUMBER',
 'OFFENSE_CODE',
 'OFFENSE_CODE_GROUP',
 'OFFENSE_DESCRIPTION',
 'DISTRICT',
 'REPORTING_AREA',
 'SHOOTING',
 'OCCURRED_ON_DATE',
 'YEAR',
 'MONTH',
 'DAY_OF_WEEK',
 'HOUR',
 'UCR_PART',
 'STREET',
 'Lat',
 'Long',
 'Location']

In [64]:

# df.cols.percentile("*")
{"percentile": {str(i): j for i, j in df[["INCIDENT_NUMBER","OFFENSE_CODE"]].quantile(0.5).iteritems()}}

Out[64]:

{'percentile': {'OFFENSE_CODE': 2907.0}}

In [63]:

columns = ["YEAR","OFFENSE_CODE"]
values= [0.5,0.9]
result ={}
for index, row in df[columns].quantile(values).iterrows():
    for i, c in enumerate(columns):
        result.setdefault(c,{})
        result[c].setdefault("percentile",{})
        result[c]["percentile"][index] = row[i]

print(r)

{'YEAR': {'percentile': {0.5: 2017.0, 0.9: 2018.0}}, 'OFFENSE_CODE': {'percentile': {0.5: 2907.0, 0.9: 3802.0}}}

In [7]:

# %reset_selective?

In [23]:

for i in df.cols.names():
    df.cols.kurtosis(i)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-23-8251d58bc6f0> in <module>
      1 for i in df.cols.names():
----> 2     df.cols.kurtosis(i)

AttributeError: 'Cols' object has no attribute 'kurtosis'

In [105]:

df.cols.min("*")

Out[105]:

{'INCIDENT_NUMBER': {'min': '142052550'},
 'OFFENSE_CODE': {'min': 111},
 'OFFENSE_CODE_GROUP': {'min': 'Aggravated Assault'},
 'OFFENSE_DESCRIPTION': {'min': 'A&B HANDS, FEET, ETC.  - MED. ATTENTION REQ.'},
 'REPORTING_AREA': {'min': ' '},
 'OCCURRED_ON_DATE': {'min': '2015-06-15 00:00:00'},
 'YEAR': {'min': 2015},
 'MONTH': {'min': 1},
 'DAY_OF_WEEK': {'min': 'Friday'},
 'HOUR': {'min': 0},
 'Lat': {'min': -1.0},
 'Long': {'min': -71.17867378},
 'Location': {'min': '(-1.00000000, -1.00000000)'}}

In [41]:

%%time
a = df.min().compute()
print(type(a))
# print({k:{"min":v} for k,v in a.items()}    )

<class 'pandas.core.series.Series'>
Wall time: 2.53 s

In [23]:

df.cols.lower("INCIDENT_NUMBER").ext.display()

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (int64) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (int64) not nullable	MONTH 10 (int64) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (int64) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (float64) not nullable	Long 16 (float64) not nullable	Location 17 (object) not nullable
i182070945	619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
i182070943	1402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
i182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
i182070940	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
i182070938	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
i182070936	3820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
i182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
i182070932	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
i182070931	301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
i182070929	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)
i182070928	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C6	913	nan	2018-09-03⋅19:55:00	2018	9	Monday	19	Part⋅Three	OCEAN⋅VIEW⋅DR	42.31957856	-71.04032766	(42.31957856,⋅-71.04032766)

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

In [24]:

df.cols.upper("OFFENSE_CODE_GROUP").ext.display()

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (int64) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (int64) not nullable	MONTH 10 (int64) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (int64) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (float64) not nullable	Long 16 (float64) not nullable	Location 17 (object) not nullable
I182070945	619	LARCENY	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	VANDALISM	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	TOWED	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	INVESTIGATE⋅PROPERTY	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	INVESTIGATE⋅PROPERTY	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	MOTOR⋅VEHICLE⋅ACCIDENT⋅RESPONSE	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	AUTO⋅THEFT	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
I182070932	3301	VERBAL⋅DISPUTES	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	ROBBERY	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	VERBAL⋅DISPUTES	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)
I182070928	3301	VERBAL⋅DISPUTES	VERBAL⋅DISPUTE	C6	913	nan	2018-09-03⋅19:55:00	2018	9	Monday	19	Part⋅Three	OCEAN⋅VIEW⋅DR	42.31957856	-71.04032766	(42.31957856,⋅-71.04032766)

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

In [24]:

df.head()

Out[24]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	LAT	LONG	LOCATION
0	I182070945	619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.357791	-71.139371	(42.35779134, -71.13937053)
1	I182070943	1402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.306821	-71.060300	(42.30682138, -71.06030035)
2	I182070941	3410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.346589	-71.072429	(42.34658879, -71.07242943)
3	I182070940	3114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.334182	-71.078664	(42.33418175, -71.07866441)
4	I182070938	3114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.275365	-71.090361	(42.27536542, -71.09036101)

In [16]:

df.cols.trim("OFFENSE_CODE_GROUP").ext.display()

OFFENSE_CODE_GROUP OFFENSE_CODE_GROUP None <class 'str'>

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (int64) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (int64) not nullable	MONTH 10 (int64) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (int64) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (float64) not nullable	Long 16 (float64) not nullable	Location 17 (object) not nullable
I182070945	619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
I182070932	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)
I182070928	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C6	913	nan	2018-09-03⋅19:55:00	2018	9	Monday	19	Part⋅Three	OCEAN⋅VIEW⋅DR	42.31957856	-71.04032766	(42.31957856,⋅-71.04032766)

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

In [29]:

# def polar(data):
#     data=scale(sid.polarity_scores(data.tweet)['compound'])
#     return data

# hola_df.map_partitions(clean_text, meta=df)
def _lower(text, args):
    a = args[0]
    return text[a].str.lower()

def apply(df,cols, func):
    kwargs = {cols:df[[cols]].map_partitions(_lower, args=(cols,1), meta=(cols,str))}
    return df.assign(**kwargs)
    

apply(df, "INCIDENT_NUMBER", _lower).head()
# df["INCIDENT_NUMBER"].map_partitions(_lower, meta=("INCIDENT_NUMBER",str)).head()

Out[29]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	LAT	LONG	LOCATION
0	i182070945	619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.357791	-71.139371	(42.35779134, -71.13937053)
1	i182070943	1402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.306821	-71.060300	(42.30682138, -71.06030035)
2	i182070941	3410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.346589	-71.072429	(42.34658879, -71.07242943)
3	i182070940	3114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.334182	-71.078664	(42.33418175, -71.07866441)
4	i182070938	3114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.275365	-71.090361	(42.27536542, -71.09036101)

In [26]:

df1["hola"] = hola_df["INCIDENT_NUMBER"].replace(20,0)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-07556533af62> in <module>
----> 1 df1["hola"] = hola_df["INCIDENT_NUMBER"].replace(20,0)

NameError: name 'hola_df' is not defined

In [9]:

df1 =  df.cols.replace("*",["a","b","c"],"")

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-8f2979ed9daa> in <module>
----> 1 df1 =  df.cols.replace("*",["a","b","c"],"")

AttributeError: 'Cols' object has no attribute 'replace'

In [48]:

df1.ext.display()

Viewing 10 of 319073 rows / 17 columns

8 partition(s)

INCIDENT_NUMBER 1 (string) nullable	OFFENSE_CODE 2 (string) nullable	OFFENSE_CODE_GROUP 3 (string) nullable	OFFENSE_DESCRIPTION 4 (string) nullable	DISTRICT 5 (string) nullable	REPORTING_AREA 6 (string) nullable	SHOOTING 7 (string) nullable	OCCURRED_ON_DATE 8 (timestamp) nullable	YEAR 9 (string) nullable	MONTH 10 (string) nullable	DAY_OF_WEEK 11 (string) nullable	HOUR 12 (string) nullable	UCR_PART 13 (string) nullable	STREET 14 (string) nullable	Lat 15 (string) nullable	Long 16 (string) nullable	Location 17 (string) nullable
I182070945	619	Lreny	LARCENY⋅ALL⋅OTHERS	D14	808	None	2018-09-02⋅13:00:00	2018	9	Sundy	13	Prt⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	Vndlism	VANDALISM	C11	347	None	2018-08-21⋅00:00:00	2018	8	Tuesdy	0	Prt⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	None	2018-09-03⋅19:27:00	2018	9	Mondy	19	Prt⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	Investigte⋅Property	INVESTIGATE⋅PROPERTY	D4	272	None	2018-09-03⋅21:16:00	2018	9	Mondy	21	Prt⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	Investigte⋅Property	INVESTIGATE⋅PROPERTY	B3	421	None	2018-09-03⋅21:05:00	2018	9	Mondy	21	Prt⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	Motor⋅Vehile⋅Aident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	None	2018-09-03⋅21:09:00	2018	9	Mondy	21	Prt⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	None	2018-09-03⋅21:25:00	2018	9	Mondy	21	Prt⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
I182070932	3301	Verl⋅Disputes	VERBAL⋅DISPUTE	B2	584	None	2018-09-03⋅20:39:37	2018	9	Mondy	20	Prt⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	Roery	ROBBERY⋅-⋅STREET	C6	177	None	2018-09-03⋅20:48:00	2018	9	Mondy	20	Prt⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	Verl⋅Disputes	VERBAL⋅DISPUTE	C11	364	None	2018-09-03⋅20:38:00	2018	9	Mondy	20	Prt⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)

Viewing 10 of 319073 rows / 17 columns

8 partition(s)

In [7]:

df.cols.count_by_dtypes("OFFENSE_CODE")

Out[7]:

{'OFFENSE_CODE': {'int': 319073, 'float': 0}}

In [7]:

df.cols.percentile("OFFENSE_CODE",[0.5])

invalid escape sequence \d

[('OFFENSE_CODE', {'percentile': {'0.5': 2907.0}})]

Out[7]:

{'OFFENSE_CODE': {'percentile': {'0.5': 2907.0}}}

In [27]:

df.cols.median("*")

Out[27]:

{'OFFENSE_CODE': 0.5    2907.0
 Name: OFFENSE_CODE, dtype: float64, 'YEAR': 0.5    2017.0
 Name: YEAR, dtype: float64, 'MONTH': 0.5    7.0
 Name: MONTH, dtype: float64, 'HOUR': 0.5    14.0
 Name: HOUR, dtype: float64, 'Lat': 0.5    42.325538
 Name: Lat, dtype: float64, 'Long': 0.5   -71.077524
 Name: Long, dtype: float64}

In [28]:

df.cols.hist("OFFENSE_CODE")

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-28-bf27b0e7abaa> in <module>
----> 1 df.cols.hist("OFFENSE_CODE")

AttributeError: 'Cols' object has no attribute 'hist'

In [10]:

df.ext.display()

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (int64) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (int64) not nullable	MONTH 10 (int64) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (int64) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (float64) not nullable	Long 16 (float64) not nullable	Location 17 (object) not nullable
I182070945	619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
I182070932	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)
I182070928	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C6	913	nan	2018-09-03⋅19:55:00	2018	9	Monday	19	Part⋅Three	OCEAN⋅VIEW⋅DR	42.31957856	-71.04032766	(42.31957856,⋅-71.04032766)

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

In [11]:

df.cols.dtypes("INCIDENT_NUMBER")

Out[11]:

{'INCIDENT_NUMBER': 'object'}

In [8]:

df.head()

invalid escape sequence \d

Out[8]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
0	I182070945	619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.357791	-71.139371	(42.35779134, -71.13937053)
1	I182070943	1402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.306821	-71.060300	(42.30682138, -71.06030035)
2	I182070941	3410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.346589	-71.072429	(42.34658879, -71.07242943)
3	I182070940	3114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.334182	-71.078664	(42.33418175, -71.07866441)
4	I182070938	3114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.275365	-71.090361	(42.27536542, -71.09036101)

In [9]:

%%time
df["INCIDENT_NUMBER"].min().compute()

Wall time: 1.19 s

Out[9]:

'142052550'

In [30]:

%%time
df.cols.min("*")

[('OFFENSE_CODE', {'min': dd.Scalar<series-..., dtype=int64>}), ('YEAR', {'min': dd.Scalar<series-..., dtype=int64>}), ('MONTH', {'min': dd.Scalar<series-..., dtype=int64>}), ('HOUR', {'min': dd.Scalar<series-..., dtype=int64>}), ('LAT', {'min': dd.Scalar<series-..., dtype=float64>}), ('LONG', {'min': dd.Scalar<series-..., dtype=float64>})]

distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.44 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.46 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.49 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.52 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.54 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.57 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Worker is at 80% memory usage. Pausing worker.  Process memory: 1.52 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.52 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Worker is at 76% memory usage. Resuming worker. Process memory: 1.53 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.53 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.56 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.59 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Worker is at 80% memory usage. Pausing worker.  Process memory: 1.61 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.61 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.64 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.67 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.63 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.  Perhaps some other process is leaking memory?  Process memory: 1.65 GB -- Worker memory limit: 2.00 GB
distributed.worker - WARNING - Worker is at 61% memory usage. Resuming worker. Process memory: 1.22 GB -- Worker memory limit: 2.00 GB

Wall time: 5.78 s

Out[30]:

{'OFFENSE_CODE': {'min': 111},
 'YEAR': {'min': 2015},
 'MONTH': {'min': 1},
 'HOUR': {'min': 0},
 'LAT': {'min': -1.0},
 'LONG': {'min': -71.17867378}}

In [99]:

!pip install graphviz

Collecting graphviz
  Downloading https://files.pythonhosted.org/packages/94/cd/7b37f2b658995033879719e1ea4c9f171bf7a14c16b79220bd19f9eda3fe/graphviz-0.13-py2.py3-none-any.whl
Installing collected packages: graphviz
Successfully installed graphviz-0.13

In [15]:

df.min().visualize()

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
~\Anaconda3\lib\site-packages\graphviz\backend.py in run(cmd, input, capture_output, check, quiet, **kwargs)
    157     try:
--> 158         proc = subprocess.Popen(cmd, startupinfo=get_startupinfo(), **kwargs)
    159     except OSError as e:

~\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
    768                                 errread, errwrite,
--> 769                                 restore_signals, start_new_session)
    770         except:

~\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
   1171                                          os.fspath(cwd) if cwd is not None else None,
-> 1172                                          startupinfo)
   1173             finally:

FileNotFoundError: [WinError 2] The system cannot find the file specified

During handling of the above exception, another exception occurred:

ExecutableNotFound                        Traceback (most recent call last)
<ipython-input-15-e7fadf3d4654> in <module>
----> 1 df.min().visualize()

~\Anaconda3\lib\site-packages\dask\base.py in visualize(self, filename, format, optimize_graph, **kwargs)
     86         """
     87         return visualize(self, filename=filename, format=format,
---> 88                          optimize_graph=optimize_graph, **kwargs)
     89 
     90     def persist(self, **kwargs):

~\Anaconda3\lib\site-packages\dask\base.py in visualize(*args, **kwargs)
    479         raise NotImplementedError("Unknown value color=%s" % color)
    480 
--> 481     return dot_graph(dsk, filename=filename, **kwargs)
    482 
    483 

~\Anaconda3\lib\site-packages\dask\dot.py in dot_graph(dsk, filename, format, **kwargs)
    254         format = 'png'
    255 
--> 256     data = g.pipe(format=format)
    257     if not data:
    258         raise RuntimeError("Graphviz failed to properly produce an image. "

~\Anaconda3\lib\site-packages\graphviz\files.py in pipe(self, format, renderer, formatter, quiet)
    136         out = backend.pipe(self._engine, format, data,
    137                            renderer=renderer, formatter=formatter,
--> 138                            quiet=quiet)
    139 
    140         return out

~\Anaconda3\lib\site-packages\graphviz\backend.py in pipe(engine, format, data, renderer, formatter, quiet)
    226     """
    227     cmd, _ = command(engine, format, None, renderer, formatter)
--> 228     out, _ = run(cmd, input=data, capture_output=True, check=True, quiet=quiet)
    229     return out
    230 

~\Anaconda3\lib\site-packages\graphviz\backend.py in run(cmd, input, capture_output, check, quiet, **kwargs)
    159     except OSError as e:
    160         if e.errno == errno.ENOENT:
--> 161             raise ExecutableNotFound(cmd)
    162         else:
    163             raise

ExecutableNotFound: failed to execute ['dot', '-Tpng'], make sure the Graphviz executables are on your systems' PATH

In [95]:

df.cols.create_exprs("*",df.functions.min)

[('OFFENSE_CODE', {'min': dd.Scalar<series-..., dtype=int64>}), ('YEAR', {'min': dd.Scalar<series-..., dtype=int64>}), ('MONTH', {'min': dd.Scalar<series-..., dtype=int64>}), ('HOUR', {'min': dd.Scalar<series-..., dtype=int64>}), ('Lat', {'min': dd.Scalar<series-..., dtype=float64>}), ('Long', {'min': dd.Scalar<series-..., dtype=float64>})]

Out[95]:

[('OFFENSE_CODE', {'min': dd.Scalar<series-..., dtype=int64>}),
 ('YEAR', {'min': dd.Scalar<series-..., dtype=int64>}),
 ('MONTH', {'min': dd.Scalar<series-..., dtype=int64>}),
 ('HOUR', {'min': dd.Scalar<series-..., dtype=int64>}),
 ('Lat', {'min': dd.Scalar<series-..., dtype=float64>}),
 ('Long', {'min': dd.Scalar<series-..., dtype=float64>})]

In [82]:

df.cols.dtypes()

Out[82]:

{'INCIDENT_NUMBER': 'object',
 'OFFENSE_CODE': 'int64',
 'OFFENSE_CODE_GROUP': 'object',
 'OFFENSE_DESCRIPTION': 'object',
 'DISTRICT': 'object',
 'REPORTING_AREA': 'object',
 'SHOOTING': 'object',
 'OCCURRED_ON_DATE': 'object',
 'YEAR': 'int64',
 'MONTH': 'int64',
 'DAY_OF_WEEK': 'object',
 'HOUR': 'int64',
 'UCR_PART': 'object',
 'STREET': 'object',
 'Lat': 'float64',
 'Long': 'float64',
 'Location': 'object'}

In [24]:

for c in df.cols.names():
    print(c)
    df.cols.min(c)

INCIDENT_NUMBER
OFFENSE_CODE
OFFENSE_CODE_GROUP
OFFENSE_DESCRIPTION
DISTRICT

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in f(values, axis, skipna, **kwds)
    126                 else:
--> 127                     result = alt(values, axis=axis, skipna=skipna, **kwds)
    128             except Exception:

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in reduction(values, axis, skipna, mask)
    741         else:
--> 742             result = getattr(values, meth)(axis)
    743 

~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _amin(a, axis, out, keepdims, initial)
     31           initial=_NoValue):
---> 32     return umr_minimum(a, axis, None, out, keepdims, initial)
     33 

TypeError: '<=' not supported between instances of 'str' and 'float'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-24-8c2153824c92> in <module>
      1 for c in df.cols.names():
      2     print(c)
----> 3     df.cols.min(c)

~\Documents\Optimus\optimus\dask\columns.py in min(columns)
    463             :return:
    464             """
--> 465             return Cols.agg_exprs(columns, self.functions.min)
    466 
    467     return Cols()

~\Documents\Optimus\optimus\dask\columns.py in agg_exprs(columns, funcs, *args)
    326             """
    327             # print(args)
--> 328             return Cols.exec_agg(Cols.create_exprs(columns, funcs, *args))
    329 
    330         @staticmethod

~\Documents\Optimus\optimus\dask\columns.py in exec_agg(exprs)
    364             if is_list_of_futures(agg_list):
    365                 for agg_element in agg_list:
--> 366                     agg_result.append(agg_element.result())
    367             else:
    368                 agg_result = agg_list[0]

~\Anaconda3\lib\site-packages\distributed\client.py in result(self, timeout)
    193                                   raiseit=False)
    194         if self.status == 'error':
--> 195             six.reraise(*result)
    196         elif self.status == 'cancelled':
    197             raise result

~\Anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
    690                 value = tp()
    691             if value.__traceback__ is not tb:
--> 692                 raise value.with_traceback(tb)
    693             raise value
    694         finally:

~\Anaconda3\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs)
     91     def apply(func, args, kwargs=None):
     92         if kwargs:
---> 93             return func(*args, **kwargs)
     94         else:
     95             return func(*args)

~\Anaconda3\lib\site-packages\dask\dataframe\core.py in _reduction_chunk(x, aca_chunk, **kwargs)
   4314 
   4315 def _reduction_chunk(x, aca_chunk=None, **kwargs):
-> 4316     o = aca_chunk(x, **kwargs)
   4317     # Return a dataframe so that the concatenated version is also a dataframe
   4318     return o.to_frame().T if isinstance(o, pd.Series) else o

~\Anaconda3\lib\site-packages\dask\utils.py in __call__(self, obj, *args, **kwargs)
    692 
    693     def __call__(self, obj, *args, **kwargs):
--> 694         return getattr(obj, self.method)(*args, **kwargs)
    695 
    696     def __reduce__(self):

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in stat_func(self, axis, skipna, level, numeric_only, **kwargs)
  10954                                       skipna=skipna)
  10955         return self._reduce(f, name, axis=axis, skipna=skipna,
> 10956                             numeric_only=numeric_only)
  10957 
  10958     return set_function_name(stat_func, name, cls)

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   3628                                           'numeric_only.'.format(name))
   3629             with np.errstate(all='ignore'):
-> 3630                 return op(delegate, skipna=skipna, **kwds)
   3631 
   3632         # TODO(EA) dispatch to Index

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in f(values, axis, skipna, **kwds)
    128             except Exception:
    129                 try:
--> 130                     result = alt(values, axis=axis, skipna=skipna, **kwds)
    131                 except ValueError as e:
    132                     # we want to transform an object array

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in reduction(values, axis, skipna, mask)
    740                 result = np.nan
    741         else:
--> 742             result = getattr(values, meth)(axis)
    743 
    744         result = _wrap_results(result, dtype, fill_value)

~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _amin(a, axis, out, keepdims, initial)
     30 def _amin(a, axis=None, out=None, keepdims=False,
     31           initial=_NoValue):
---> 32     return umr_minimum(a, axis, None, out, keepdims, initial)
     33 
     34 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,

TypeError: '<=' not supported between instances of 'str' and 'float'

In [90]:

import dask.array as da

In [96]:

df

In [100]:

da.histogram(df["INCIDENT_NUMBER"],bins=10, range=[9, 11])

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-100-0b0a8087ee90> in <module>
----> 1 da.histogram(df["INCIDENT_NUMBER"],bins=10, range=[9, 11]).compute()

AttributeError: 'tuple' object has no attribute 'compute'

In [ ]:

df.sum().compute()

In [14]:

%%time
from optimus.profiler.profiler import Profiler
p = Profiler()
p.run(df, "*")

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<timed exec> in <module>

~\Documents\Optimus\optimus\helpers\decorators.py in timed(*args, **kw)
      8     def timed(*args, **kw):
      9         start_time = timeit.default_timer()
---> 10         f = method(*args, **kw)
     11         _time = round(timeit.default_timer() - start_time, 2)
     12         logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=_time))

~\Documents\Optimus\optimus\profiler\profiler.py in run(self, df, columns, buckets, infer, relative_error, approx_count)
    118         # df.ext.set_meta({"initialized": True})
    119 
--> 120         output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict")
    121 
    122         # Load jinja

~\Documents\Optimus\optimus\profiler\profiler.py in dataset(self, df, columns, buckets, infer, relative_error, approx_count, sample, stats, format)
    245         # if ["drop", "rename"] not in trans and self.already_run is False:
    246         if stats is True:
--> 247             output_columns = self.columns_stats(df, columns, buckets, infer, relative_error, approx_count)
    248 
    249         assign(output_columns, "name", df.ext.get_name(), dict)

~\Documents\Optimus\optimus\profiler\profiler.py in columns_stats(self, df, columns, buckets, infer, relative_error, approx_count)
    330 
    331         # Aggregation
--> 332         stats = Profiler.columns_agg(df, columns, buckets, relative_error, approx_count)
    333 
    334         # Calculate Frequency

~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(df, columns, buckets, relative_error, approx_count)
    375                      df.functions.kurtosis, df.functions.mean, df.functions.skewness, df.functions.sum,
    376                      df.functions.variance, df.functions.zeros_agg]
--> 377             exprs.extend(df.cols.create_exprs(cols, funcs))
    378 
    379             # TODO: None in basic calculation

~\Documents\Optimus\optimus\dask\columns.py in create_exprs(columns, funcs, *args)
    620                                 exprs[col_name].update(func(col_name, args)(self))
    621                             else:
--> 622                                 exprs[col_name] = func(col_name, args)(self)
    623 
    624             result = {}

~\Documents\Optimus\optimus\dask\functions.py in _kurtoris(serie)
    127         def kurtosis(col_name, args):
    128             def _kurtoris(serie):
--> 129                 result = {"kurtosis": float(stats.kurtosis(serie[col_name]))}
    130                 return result
    131 

~\Anaconda3\lib\site-packages\dask\array\stats.py in kurtosis(a, axis, fisher, bias, nan_policy)
    227     olderr = np.seterr(all='ignore')
    228     try:
--> 229         vals = da.where(zero, 0, m4 / m2**2.0)
    230     finally:
    231         np.seterr(**olderr)

TypeError: unsupported operand type(s) for ** or pow(): 'Array' and 'float'

In [16]:

df.ext.display(10)

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (int64) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (int64) not nullable	MONTH 10 (int64) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (int64) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (float64) not nullable	Long 16 (float64) not nullable	Location 17 (object) not nullable
I182070945	619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
I182070932	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)
I182070928	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C6	913	nan	2018-09-03⋅19:55:00	2018	9	Monday	19	Part⋅Three	OCEAN⋅VIEW⋅DR	42.31957856	-71.04032766	(42.31957856,⋅-71.04032766)

Viewing 10 of 319073 rows / 319073 columns

1 partition(s)

In [ ]:

df.rows.to_list("OFFENSE_CODE")
# type(df["OFFENSE_CODE"])
# type(df.cols.select("OFFENSE_CODE"))

# a = [v for v in df["OFFENSE_CODE"].iteritems()]

# df[["OFFENSE_CODE"]].iteritems()

In [74]:

df.cols.select("OFFENSE_CODE")

Out[74]:

Dask DataFrame Structure:

	OFFENSE_CODE
npartitions=1
	int64
	...

Dask Name: getitem, 4 tasks

In [56]:

df.sample(1).head()

sample does not support the number of sampled items parameter, 'n'. Please use the 'frac' parameter instead.

Out[56]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
186823	I162085248	3006	Medical Assistance	SICK/INJURED/MEDICAL - PERSON	B2	314	NaN	2016-10-18 07:24:00	2016	10	Tuesday	7	Part Three	HOWLAND ST	42.314448	-71.089934	(42.31444840, -71.08993418)
65160	I182001199	1102	Fraud	FRAUD - FALSE PRETENSE / SCHEME	E18	526	NaN	2017-11-01 10:00:00	2017	11	Wednesday	10	Part Two	CLIFFORD ST	42.234543	-71.130915	(42.23454320, -71.13091490)
277221	I152095488	361	Robbery	ROBBERY - OTHER	D4	272	NaN	2015-11-17 14:52:00	2015	11	Tuesday	14	Part One	WASHINGTON ST	42.334831	-71.079041	(42.33483110, -71.07904134)
44193	I182023593	3805	Motor Vehicle Accident Response	M/V ACCIDENT - POLICE VEHICLE	D4	171	NaN	2018-03-30 09:49:00	2018	3	Friday	9	Part Three	ALBANY ST	42.334288	-71.072395	(42.33428841, -71.07239518)
13308	I182056371	3115	Investigate Person	INVESTIGATE PERSON	C11	250	NaN	2018-07-17 20:18:00	2018	7	Tuesday	20	Part Three	SAVIN HILL AVE	42.312482	-71.048290	(42.31248165, -71.04829028)

In [29]:

df.cols.schema_dtype()

Out[29]:

[numpy.object_,
 numpy.int64,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.object_,
 numpy.int64,
 numpy.int64,
 numpy.object_,
 numpy.int64,
 numpy.object_,
 numpy.object_,
 numpy.float64,
 numpy.float64,
 numpy.object_]

In [28]:

import numpy as np
np.dtype(df["OFFENSE_CODE"]).type

Out[28]:

numpy.int64

In [77]:

df.schema

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-77-2830e85e5da4> in <module>
----> 1 df.schema

~\Anaconda3\lib\site-packages\dask\dataframe\core.py in __getattr__(self, key)
   2531             return new_dd_object(merge(self.dask, dsk), name,
   2532                                  meta, self.divisions)
-> 2533         raise AttributeError("'DataFrame' object has no attribute %r" % key)
   2534 
   2535     def __dir__(self):

AttributeError: 'DataFrame' object has no attribute 'schema'

In [7]:

%%time
df["INCIDENT_NUMBER"].value_counts().nlargest(5).compute()

Wall time: 2.51 s

Out[7]:

I162030584    13
I152080623    11
I172013170    10
I182065208    10
I172096394    10
Name: INCIDENT_NUMBER, dtype: int64

In [14]:

df["INCIDENT_NUMBER"].nunique().compute()

Out[14]:

In [6]:

df.npartitions

Out[6]:

In [7]:

df = df.repartition(npartitions=20)

In [8]:

df.npartitions

Out[8]:

In [73]:

%%time
df["INCIDENT_NUMBER"].value_counts().nlargest(5).compute()

Wall time: 8.07 s

Out[73]:

I162030584    13
I152080623    11
I172013170    10
I182065208    10
I172096394    10
Name: INCIDENT_NUMBER, dtype: int64

In [16]:

df["INCIDENT_NUMBER"].unique().count().compute()

Out[16]:

In [6]:

df.head()

Out[6]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
0	I182070945	619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.357791	-71.139371	(42.35779134, -71.13937053)
1	I182070943	1402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.306821	-71.060300	(42.30682138, -71.06030035)
2	I182070941	3410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.346589	-71.072429	(42.34658879, -71.07242943)
3	I182070940	3114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.334182	-71.078664	(42.33418175, -71.07866441)
4	I182070938	3114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.275365	-71.090361	(42.27536542, -71.09036101)

In [87]:

import dask.dataframe as dd

In [7]:

def count_uniques_agg(col_name, args=True):
    estimate = args[0]
    def count_uniques_agg_(x):
        print(estimate)
        if estimate is True:
            result = {"count_uniques_agg": x[col_name].nunique_approx()}
        else:
            result = {"count_uniques_agg": x[col_name].nunique()}
        return result
    return count_uniques_agg_

In [98]:

df.rows.count()

Out[98]:

In [39]:

import dask.array as da

In [224]:

def hist_agg(col_name, args):
    df = args[0]
    bins = args[1]
    range = args[2]
    
    def hist_agg_(x):
        return {"hist_agg":list(da.histogram(x[col_name], bins=bins, range=range)[1]) }
    return hist_agg_

In [254]:

def percentile_agg(col_name, args):
    values = args[0]

    def _percentile(x):

        return {"percentile_agg": {i:j for i,j in x[col_name].quantile(values).iteritems()}}

    return _percentile

In [259]:

%%time
import dask.dataframe as dd
# k, s, m = [stats.kurtosis(x), stats.skew(x), stats.moment(x, 5)]
# agg_func=["min","max"]
# agg_func=[percentile_agg(0.1)(df),"min","max", count_uniques_agg("INCIDENT_NUMBER",False)(df)]
# agg_func=[percentile_agg(0.1)(df),"min","max"]
agg_func=[hist_agg]

# agg_func=[df.functions.count_na_agg]

exprs = df.cols.create_exprs(["OFFENSE_CODE","YEAR"], agg_func, df, 10,[1,5])

# print(exprs)
print(dd.compute(exprs))

([('OFFENSE_CODE', {'hist_agg': [1.0, 1.4, 1.8, 2.2, 2.6, 3.0, 3.4000000000000004, 3.8000000000000003, 4.2, 4.6, 5.0]}), ('YEAR', {'hist_agg': [1.0, 1.4, 1.8, 2.2, 2.6, 3.0, 3.4000000000000004, 3.8000000000000003, 4.2, 4.6, 5.0]})],)
Wall time: 4 ms

In [233]:

Out[233]:

dask.dataframe.core.DataFrame

In [221]:

list(r["OFFENSE_CODE"]["hist_agg"][1])

Out[221]:

[4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5]

In [33]:

for i in x:
    print(i[0],i[1])

a {'min': 111}

In [124]:

print(df.cols.exec_agg(exprs))

{'OFFENSE_CODE': {'kurtosis': 1.5838041686767816, 'skew': array(-0.40994654), 'zeros': 15106, 'variance': 1404901.8183066112}, 'HOUR': {'kurtosis': 2.3994270670757927, 'skew': array(-0.4834518), 'zeros': 15106, 'variance': 39.61701992983649}}

In [161]:

import dask.dataframe as dd

In [159]:

df.min()

Out[159]:

Dask Series Structure:
npartitions=1
DAY_OF_WEEK    object
YEAR              ...
dtype: object
Dask Name: dataframe-min-agg, 5 tasks

In [158]:

df.functions.percentile_agg(0.1)(df)

Out[158]:

Dask Series Structure:
npartitions=1
HOUR    float64
YEAR        ...
Name: 0.1, dtype: float64
Dask Name: quantiles-concat, 28 tasks

In [150]:

dd.compute(df.min())

Out[150]:

(INCIDENT_NUMBER                                           142052550
 OFFENSE_CODE                                                    111
 OFFENSE_CODE_GROUP                               Aggravated Assault
 OFFENSE_DESCRIPTION    A&B HANDS, FEET, ETC.  - MED. ATTENTION REQ.
 REPORTING_AREA                                                     
 OCCURRED_ON_DATE                                2015-06-15 00:00:00
 YEAR                                                           2015
 MONTH                                                             1
 DAY_OF_WEEK                                                  Friday
 HOUR                                                              0
 Lat                                                              -1
 Long                                                       -71.1787
 Location                                 (-1.00000000, -1.00000000)
 dtype: object,)

In [13]:

import dask.dataframe as dd

In [74]:

def frequency(col_name):
    def frequency_(x):
        print(x)
        return 
    return frequency_

In [79]:

df.cols.frequency("INCIDENT_NUMBER")

Out[79]:

({'I162030584': 13,
  'I152080623': 11,
  'I172013170': 10,
  'I182065208': 10,
  'I172096394': 10},)

In [76]:

print(dd.compute(df[col_name].value_counts().nlargest(5)))

(<function frequency.<locals>.frequency_ at 0x00000202489E19D8>,)

In [17]:

%%time
dd.compute(df["OFFENSE_CODE"].min(), df["INCIDENT_NUMBER"].value_counts().nlargest(5), df["OFFENSE_CODE"].value_counts().nlargest(5), df["OFFENSE_CODE"].nunique())

Wall time: 2.01 s

Out[17]:

(111, I162030584    13
 I152080623    11
 I172013170    10
 I182065208    10
 I172096394    10
 Name: INCIDENT_NUMBER, dtype: int64, 3006    18783
 3115    18754
 3831    16323
 1402    15154
 802     14799
 Name: OFFENSE_CODE, dtype: int64, 222)

In [118]:

%%time
dd.compute(df["OFFENSE_CODE"].nunique())

Wall time: 1.26 s

Out[118]:

(222,)

In [27]:

## https://distributed.dask.org/en/latest/web.html

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-27-bc0913579ac8> in <module>
----> 1 cluster.scheduler.processing

NameError: name 'cluster' is not defined

In [56]:

# Another approach
# from dask.array import stats
# x = da.random.beta(1, 1, size=(1000,), chunks=10)
# k, s, m = [stats.kurtosis(x), stats.skew(x), stats.moment(x, 5)]
# dask.compute(k, s, m)

(1.7612340817172787, -0.064073498030693302, -0.00054523780628304799)

Out[56]:

(1.7612340817172787, -0.0640734980306933, -0.000545237806283048)

In [288]:

import pandas as pd
data = [['tom', 10], ['nick', 15], ['juli', 0],['argenis', 10]]

# Create the pandas DataFrame 
df = pd.DataFrame(data, columns = ['Name', 'Age']) 
df.cols.count_by_dtypes("*")

ddf = dd.from_pandas(df, npartitions=2)

In [9]:

df.reset_index(drop=True)

Out[9]:

	Name	Age
0	tom	10
1	nick	15
2	juli	0
3	argenis	10

In [20]:

# df[df == 0].count(axis=0)
# Speedy https://stackoverflow.com/questions/35277075/python-pandas-counting-the-occurrences-of-a-specific-value
def zeros(series):
    return (series.values==0).sum()

zeros(df["Age"])

Out[20]:

In [21]:

def percentile_agg(n):
    def percentile_(x):
        return x.quantile(n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

# percentile_agg(values=[0.1, 0.5])

In [28]:

print(ddf)

Dask DataFrame Structure:
                 Name    Age
npartitions=2               
0              object  int64
2                 ...    ...
3                 ...    ...
Dask Name: from_pandas, 2 tasks

In [56]:

dd.compute(ddf.min(), ddf.max())

Out[56]:

(Name    argenis
 Age           0
 dtype: object, Name    tom
 Age      15
 dtype: object)

In [103]:

ddf.agg(
    {
#         # find the min, max, and sum of the duration column
#         'Name': [min, max, sum],
#          # find the number of network type entries
#         'Age': "count",
        # min, first, and number of unique dates per group
        'Age': [min, 'max', 'nunique', zeros , percentile_agg(1)]
    }
).to_dict()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-103-9d1e9f74206e> in <module>
----> 1 df.agg(
      2     {
      3 #         # find the min, max, and sum of the duration column
      4 #         'Name': [min, max, sum],
      5 #          # find the number of network type entries

~\Anaconda3\lib\site-packages\dask\dataframe\core.py in __getattr__(self, key)
   2531             return new_dd_object(merge(self.dask, dsk), name,
   2532                                  meta, self.divisions)
-> 2533         raise AttributeError("'DataFrame' object has no attribute %r" % key)
   2534 
   2535     def __dir__(self):

AttributeError: 'DataFrame' object has no attribute 'agg'

In [351]:

df

Out[351]:

	Name	Age
0	tom	10
1	nick	15
2	juli	0
3	argenis	10

In [266]:

def agg_fn(x):
    result = pd.Series(
        dict(
            C = x.quantile(1.0), # int           
        )
    )
    print(result)
    return result

In [267]:

a=df[['Age']]

In [268]:

a.apply(agg_fn)

C    15.0
dtype: float64
C    15.0
dtype: float64

Out[268]:

	Age
C	15.0

In [284]:

a.apply(agg_fn)

C    15.0
dtype: float64
C    15.0
dtype: float64

Out[284]:

	Age
C	15.0

In [269]:

print(df)

      Name  Age
0      tom   10
1     nick   15
2     juli    0
3  argenis   10

In [ ]:

#https://stackoverflow.com/questions/46080171/constructing-mode-and-corresponding-count-functions-using-custom-aggregation-fun

In [31]:

def chunk(s):
    """
    The function applied to the
    individual partition (map)
    """
    return s.apply(lambda x: list(set(x)))

def agg(s):
    """
    The function whic will aggrgate
    the result from all the partitions(reduce)
    """
    s = s._selected_obj
    return s.groupby(level=list(range(s.index.nlevels))).sum()

def finalize(s):
    """
    The optional functional that will be
    applied to the result of the agg_tu functions
    """
    return s.apply(lambda x: len(set(x)))

tunique = dd.Aggregation('tunique', chunk, agg, finalize)

In [42]:

df.groupby(['INCIDENT_NUMBER']).agg({'INCIDENT_NUMBER': tunique}).compute()

Out[42]:

	INCIDENT_NUMBER
INCIDENT_NUMBER
142052550	1
I010370257-00	1
I030217815-08	1
I050310906-00	1
I060168073-00	1
I080542626-00	1
I090317057-00	1
I090321958-00	1
I100033064-00	1
I100222105-02	1
I100340225-00	1
I100636670-00	1
I110177502-00	1
I110261417-00	1
I110372326-00	1
I110551302-00	1
I110611058-00	1
I110694557-00	1
I120069826-00	1
I120189428-00	1
I120201612-00	1
I120260724-01	1
I120283195-00	1
I120470733-00	1
I120595668-00	1
I120719309-00	1
I120720047-00	1
I130007264-01	1
I130031413-00	1
I130041200-00	1
...	...
I182070901	1
I182070903	1
I182070904	1
I182070905	1
I182070906	1
I182070908	1
I182070909	1
I182070910	1
I182070911	1
I182070913	1
I182070915	1
I182070917	1
I182070918	1
I182070919	1
I182070920	1
I182070921	1
I182070922	1
I182070923	1
I182070927	1
I182070928	1
I182070929	1
I182070931	1
I182070932	1
I182070933	1
I182070936	1
I182070938	1
I182070940	1
I182070941	1
I182070943	1
I182070945	1

282517 rows × 1 columns

distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/10 remote=inproc://192.168.0.9/34804/1>
distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/11 remote=inproc://192.168.0.9/34804/1>
distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/12 remote=inproc://192.168.0.9/34804/1>
distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/13 remote=inproc://192.168.0.9/34804/1>
distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/14 remote=inproc://192.168.0.9/34804/1>

In [109]:

df.apply({'g0': unique}, axis=1).compute()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-109-0d3e0320a332> in <module>
----> 1 df.apply({'g0': unique}, axis=1).compute()

NameError: name 'unique' is not defined

In [15]:

from abc import abstractmethod, ABC


class AbstractCols(ABC):    
    @abstractmethod
    def min(self, columns):
        pass

In [16]:

class BaseCols(AbstractCols):
    def __init__(self, functions):
        self.functions = functions
                
    @staticmethod
    # @abstractmethod
    def get_agg_function():
        return 3
    
    def min(self, columns):
        return BaseCols.agg_exprs(columns, self.functions)
    
    @staticmethod
    def agg_exprs(columns, agg):
        return agg

In [21]:

functions={"hola1"}

class DaskCols(BaseCols):
    def __init__(self):
        super().__init__(functions)
    
c = DaskCols()

In [22]:

print(c.min("cols"))

{'hola1'}

In [ ]: