In [1]:

%load_ext autoreload
%autoreload 2

In [2]:

import sys
sys.path.append("..")

In [3]:

from optimus import Optimus

In [4]:

# import sys,os,os.path
# os.environ
def xray(df, col_name):
    for i in df[col_name]:
        print(i, type(i))
        

In [25]:

op = Optimus("dask", threads_per_worker=8, n_workers=1, comm=True)

preview_df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv", n_rows=35).ext.cache() 

Open Bumblebee: https://app.hi-bumblebee.com

If you really care about privacy get your keys in bumblebee.ini and put them here

C:\Users\argenisleon\Anaconda3\lib\site-packages\distributed\dashboard\core.py:79: UserWarning: 
Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.
  warnings.warn("\n" + msg)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 35 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [29]:

import fastnumbers
fastnumbers.fast_float(None)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-29-8e1775e35bdf> in <module>
      1 import fastnumbers
----> 2 fastnumbers.fast_float(None)

TypeError: float() argument must be a string or a number, not 'NoneType'

In [27]:

f = df.cols.set(value='df["ticket_price"]*df["discount"]', where='df["ticket_price"]!=None', output_cols="ticket_price").ext.cache()
_output = df.ext.profile(columns="*", output="json")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [17]:

_output = {**preview_df.ext.to_json("*"), "meta": preview_df.meta.get() if (preview_df.meta and preview_df.meta.get) else {} } 

_output = preview_df.ext.profile(columns="*", output="json")
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv").ext.cache()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.profile(columns="*", output="json")
_output = df.ext.set_buffer("*")
_output = df.ext.buffer_window("*", 0, 17).ext.to_json("*")

print(_output)

_output = df.ext.set_buffer("*")
_output = df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]', where='df["ticket_price"]!=None', output_cols="new ticket_price").ext.to_json("*")
_df_profile = df.ext.buffer_window("*").cols.set(value='df["ticket_price"]', where='df["ticket_price"]!=None', output_cols="new ticket_price")
_output = { "profile": _df_profile.ext.profile(["new ticket_price"], output="json")}

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

{'sample': {'columns': [{'title': 'customer_id'}, {'title': 'transactoin_date'}, {'title': 'ticket_price'}, {'title': 'discount'}, {'title': 'product'}, {'title': 'info'}], 'value': [['0f345kjh345oiuy345', '2010/08/19', '29.99', nan, 'platinum', '1'], ['0fju234978rfjkhsdf', '2012/01/05', '29.99', nan, 'platinum', '2'], ['0f34ruiy23e78y2r', '2009/08/11', '29.99', nan, 'platinum', '3'], ['0fue298y2r23r23r2', '2010/08/19', '29.99', nan, 'platinum', '4'], ['0f345kjh345oiuy346', '2010/08/19', '29.99', nan, 'platinum', '5'], ['0fju234978rfjkhsdf', '2010/08/19', '9.99', '5%', 'basic', '6'], ['0f34ruiy23e78y2r', '2010/08/19', '9.99', nan, 'basic', '7'], ['0fue298y2r23r23r3', '2010/08/19', '9.99', nan, 'basic', '8'], ['0f345kjh345oiuy347', '2010/08/19', '9.99', nan, 'basic', '9'], ['0fju234978rfjkhsdf', '2011/08/11', '9.99', '5%', 'basic', '10'], ['0f34ruiy23e78y2r', '2015/08/09', '9.99', '5%', 'basic', '10'], ['0fue298y2r23r23r4', nan, '9.91', nan, 'basic', '10'], ['0f345kjh345oiuy348', nan, '9.99', nan, 'basic', '10'], ['0fju234978rfjkhsdf', nan, '14.99', '5%', 'deluxe', '10'], ['0f34ruiy23e78y2r', nan, '14.99', nan, 'deluxe', '10'], ['0fue298y2r23r23r5', nan, '14.99', '5%', 'deluxe', '10'], ['0f345kjh345oiuy349', nan, '14.99', nan, 'deluxe', 'a']]}}
pdf     ticket_price
0          29.99
1          29.99
2          29.99
3          29.99
4          29.99
5           9.99
6           9.99
7           9.99
8           9.99
9           9.99
10          9.99
11          9.91
12          9.99
13         14.99
14         14.99
15         14.99
16         14.99
a 0     29.99
1     29.99
2     29.99
3     29.99
4     29.99
5      9.99
6      9.99
7      9.99
8      9.99
9      9.99
10     9.99
11     9.91
12     9.99
13    14.99
14    14.99
15    14.99
16    14.99
Name: new ticket_price, dtype: float64
pdf     ticket_price
0          29.99
1          29.99
2          29.99
3          29.99
4          29.99
5           9.99
6           9.99
7           9.99
8           9.99
9           9.99
10          9.99
11          9.91
12          9.99
13         14.99
14         14.99
15         14.99
16         14.99
a 0     29.99
1     29.99
2     29.99
3     29.99
4     29.99
5      9.99
6      9.99
7      9.99
8      9.99
9      9.99
10     9.99
11     9.91
12     9.99
13    14.99
14    14.99
15    14.99
16    14.99
Name: new ticket_price, dtype: float64
dtype int
{'new ticket_price': {'match': 17, 'missing': 0, 'mismatch': 17}}

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [13]:

df.compute().ext.head()

Out[13]:

	customer_id	transactoin_date	ticket_price	discount	product	info
0	0f345kjh345oiuy345	2010/08/19	29.99	NaN	platinum	1
1	0fju234978rfjkhsdf	2012/01/05	29.99	NaN	platinum	2
2	0f34ruiy23e78y2r	2009/08/11	29.99	NaN	platinum	3
3	0fue298y2r23r23r2	2010/08/19	29.99	NaN	platinum	4
4	0f345kjh345oiuy346	2010/08/19	29.99	NaN	platinum	5
5	0fju234978rfjkhsdf	2010/08/19	9.99	5%	basic	6
6	0f34ruiy23e78y2r	2010/08/19	9.99	NaN	basic	7
7	0fue298y2r23r23r3	2010/08/19	9.99	NaN	basic	8
8	0f345kjh345oiuy347	2010/08/19	9.99	NaN	basic	9
9	0fju234978rfjkhsdf	2011/08/11	9.99	5%	basic	10

In [24]:

_output = df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]*df["discount"]', where='df["ticket_price"]!=None', output_cols="new ticket_price").ext.to_json("*")

pdf     ticket_price
0          29.99
1          29.99
2          29.99
3          29.99
4          29.99
5           9.99
6           9.99
7           9.99
8           9.99
9           9.99
10          9.99
11          9.91
12          9.99
13         14.99
14         14.99
15         14.99
16         14.99
a 0     899.4001
1     899.4001
2     899.4001
3     899.4001
4     899.4001
5      99.8001
6      99.8001
7      99.8001
8      99.8001
9      99.8001
10     99.8001
11     98.2081
12     99.8001
13    224.7001
14    224.7001
15    224.7001
16    224.7001
Name: new ticket_price, dtype: float64

In [ ]:

df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]*df["ticket_price"]',output_cols="new 1")

In [ ]:

In [5]:

from optimus import Optimus
op = Optimus("dask", n_workers=1, threads_per_worker=8, processes=False, memory_limit="3G", comm=True)

Open Bumblebee: https://app.hi-bumblebee.com

If you really care about privacy get your keys in bumblebee.ini and put them here

In [6]:

%%time
df = op.load.file("data/dataset-transactions.csv").ext.cache()
# df = df.ext.optimize()
df = df.ext.repartition(1).ext.cache()
df.ext.display(20)

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Viewing 17 of 17 rows / 6 columns

1 partition(s)

customer_id 1 (object) not nullable	transactoin_date 2 (object) not nullable	ticket_price 3 (object) not nullable	discount 4 (object) not nullable	product 5 (object) not nullable	info 6 (object) not nullable
0f345kjh345oiuy345	2010/08/19	29.99	nan	platinum	1
0fju234978rfjkhsdf	2012/01/05	29.99	nan	platinum	2
0f34ruiy23e78y2r	2009/08/11	29.99	nan	platinum	3
0fue298y2r23r23r2	2010/08/19	29.99	nan	platinum	4
0f345kjh345oiuy346	2010/08/19	29.99	nan	platinum	5
0fju234978rfjkhsdf	2010/08/19	9.99	5%	basic	6
0f34ruiy23e78y2r	2010/08/19	9.99	nan	basic	7
0fue298y2r23r23r3	2010/08/19	9.99	nan	basic	8
0f345kjh345oiuy347	2010/08/19	9.99	nan	basic	9
0fju234978rfjkhsdf	2011/08/11	9.99	5%	basic	10
0f34ruiy23e78y2r	2015/08/09	9.99	5%	basic	10
0fue298y2r23r23r4	nan	9.91	nan	basic	10
0f345kjh345oiuy348	nan	9.99	nan	basic	10
0fju234978rfjkhsdf	nan	14.99	5%	deluxe	10
0f34ruiy23e78y2r	nan	14.99	nan	deluxe	10
0fue298y2r23r23r5	nan	14.99	5%	deluxe	10
0f345kjh345oiuy349	nan	14.99	nan	deluxe	a

Viewing 17 of 17 rows / 6 columns

1 partition(s) <class 'dask.dataframe.core.DataFrame'>

Wall time: 163 ms

In [7]:

# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
# df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()
# # result = df.ext.profile(columns=columns, bins=bins, output=output, flush=flush, size=size)
    
# print(cols_and_inferred_dtype)

In [8]:

# df.cols.infer_profiler_dtypes("*")
p = df.ext.profile("*", flush=True)

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

AAAA {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\r\n', 'quotechar': '"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 16.185, 'count': 0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}}
BBBB {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\r\n', 'quotechar': '"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {'actions': [{'profiler_dtype': 'customer_id'}, {'profiler_dtype': 'transactoin_date'}, {'profiler_dtype': 'ticket_price'}, {'profiler_dtype': 'discount'}, {'profiler_dtype': 'product'}, {'profiler_dtype': 'info'}]}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object', 'profiler_dtype': 'date'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 16.185, 'count': 0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object', 'profiler_dtype': 'decimal'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object', 'profiler_dtype': 'int'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}}

In [9]:

df.meta.get()["profile"]["columns"]["customer_id"]

Out[9]:

{'stats': {'mismatch': 0,
  'missing': 0,
  'match': 17,
  'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4},
   {'value': '0f34ruiy23e78y2r', 'count': 4},
   {'value': '0fue298y2r23r23r5', 'count': 1},
   {'value': '0fue298y2r23r23r4', 'count': 1},
   {'value': '0fue298y2r23r23r3', 'count': 1},
   {'value': '0fue298y2r23r23r2', 'count': 1},
   {'value': '0f345kjh345oiuy349', 'count': 1},
   {'value': '0f345kjh345oiuy348', 'count': 1},
   {'value': '0f345kjh345oiuy347', 'count': 1},
   {'value': '0f345kjh345oiuy346', 'count': 1},
   {'value': '0f345kjh345oiuy345', 'count': 1}],
  'count_uniques': 11},
 'dtype': 'object',
 'profiler_dtype': 'string'}

In [9]:

# df, p = df.ext.cast_and_profile("*")

In [10]:

assert(p["columns"]["ticket_price"]["stats"]["match"]==17)
assert(p["columns"]["ticket_price"]["stats"]["mismatch"]==0)
assert(p["columns"]["ticket_price"]["stats"]["missing"]==0)

In [11]:

df.cols.infer_profiler_dtypes("*")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Out[11]:

{'customer_id': 'string',
 'transactoin_date': 'date',
 'ticket_price': 'decimal',
 'discount': 'string',
 'product': 'string',
 'info': 'int'}

In [12]:

df = df.cols.replace("discount", search=["%"], replace_by="", search_by="chars", ignore_case=True)

In [13]:

df.compute()

Out[13]:

	customer_id	transactoin_date	ticket_price	discount	product	info
0	0f345kjh345oiuy345	2010/08/19	29.99	NaN	platinum	1
1	0fju234978rfjkhsdf	2012/01/05	29.99	NaN	platinum	2
2	0f34ruiy23e78y2r	2009/08/11	29.99	NaN	platinum	3
3	0fue298y2r23r23r2	2010/08/19	29.99	NaN	platinum	4
4	0f345kjh345oiuy346	2010/08/19	29.99	NaN	platinum	5
5	0fju234978rfjkhsdf	2010/08/19	9.99	5	basic	6
6	0f34ruiy23e78y2r	2010/08/19	9.99	NaN	basic	7
7	0fue298y2r23r23r3	2010/08/19	9.99	NaN	basic	8
8	0f345kjh345oiuy347	2010/08/19	9.99	NaN	basic	9
9	0fju234978rfjkhsdf	2011/08/11	9.99	5	basic	10
10	0f34ruiy23e78y2r	2015/08/09	9.99	5	basic	10
11	0fue298y2r23r23r4	NaN	9.91	NaN	basic	10
12	0f345kjh345oiuy348	NaN	9.99	NaN	basic	10
13	0fju234978rfjkhsdf	NaN	14.99	5	deluxe	10
14	0f34ruiy23e78y2r	NaN	14.99	NaN	deluxe	10
15	0fue298y2r23r23r5	NaN	14.99	5	deluxe	10
16	0f345kjh345oiuy349	NaN	14.99	NaN	deluxe	a

In [14]:

# df.cols.infer_profiler_dtypes("*")
from optimus.infer import Infer
df.ext.head("*",50).applymap(Infer.parse_pandas)

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 50 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Out[14]:

	customer_id	transactoin_date	ticket_price	discount	product	info
0	string	date	decimal	null	string	int
1	string	date	decimal	null	string	int
2	string	date	decimal	null	string	int
3	string	date	decimal	null	string	int
4	string	date	decimal	null	string	int
5	string	date	decimal	int	string	int
6	string	date	decimal	null	string	int
7	string	date	decimal	null	string	int
8	string	date	decimal	null	string	int
9	string	date	decimal	int	string	int
10	string	date	decimal	int	string	int
11	string	null	decimal	null	string	int
12	string	null	decimal	null	string	int
13	string	null	decimal	int	string	int
14	string	null	decimal	null	string	int
15	string	null	decimal	int	string	int
16	string	null	decimal	null	string	string

In [15]:

xray(df, "discount")

nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
5 <class 'str'>
nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
5 <class 'str'>
5 <class 'str'>
nan <class 'float'>
nan <class 'float'>
5 <class 'str'>
nan <class 'float'>
5 <class 'str'>
nan <class 'float'>

In [16]:

p = df.ext.profile("discount")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [17]:

df.cols.infer_profiler_dtypes("*")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Out[17]:

{'customer_id': 'string',
 'transactoin_date': 'date',
 'ticket_price': 'decimal',
 'discount': 'int',
 'product': 'string',
 'info': 'int'}

In [18]:

# df["transactoin_date"].compute()

In [19]:

df.compute()

Out[19]:

	customer_id	transactoin_date	ticket_price	discount	product	info
0	0f345kjh345oiuy345	2010/08/19	29.99	NaN	platinum	1
1	0fju234978rfjkhsdf	2012/01/05	29.99	NaN	platinum	2
2	0f34ruiy23e78y2r	2009/08/11	29.99	NaN	platinum	3
3	0fue298y2r23r23r2	2010/08/19	29.99	NaN	platinum	4
4	0f345kjh345oiuy346	2010/08/19	29.99	NaN	platinum	5
5	0fju234978rfjkhsdf	2010/08/19	9.99	5	basic	6
6	0f34ruiy23e78y2r	2010/08/19	9.99	NaN	basic	7
7	0fue298y2r23r23r3	2010/08/19	9.99	NaN	basic	8
8	0f345kjh345oiuy347	2010/08/19	9.99	NaN	basic	9
9	0fju234978rfjkhsdf	2011/08/11	9.99	5	basic	10
10	0f34ruiy23e78y2r	2015/08/09	9.99	5	basic	10
11	0fue298y2r23r23r4	NaN	9.91	NaN	basic	10
12	0f345kjh345oiuy348	NaN	9.99	NaN	basic	10
13	0fju234978rfjkhsdf	NaN	14.99	5	deluxe	10
14	0f34ruiy23e78y2r	NaN	14.99	NaN	deluxe	10
15	0fue298y2r23r23r5	NaN	14.99	5	deluxe	10
16	0f345kjh345oiuy349	NaN	14.99	NaN	deluxe	a

In [20]:

xray(df, "discount")

nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
5 <class 'str'>
nan <class 'float'>
nan <class 'float'>
nan <class 'float'>
5 <class 'str'>
5 <class 'str'>
nan <class 'float'>
nan <class 'float'>
5 <class 'str'>
nan <class 'float'>
5 <class 'str'>
nan <class 'float'>

In [21]:

df = df.cols.unnest("transactoin_date", separator="/", splits=2, output_cols="transactoin_date").ext.cache()

In [26]:

_output = df.ext.profile(columns="*", output="json", flush=True)

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [27]:

df.meta.get()["profile"]["columns"]["customer_id"]

Out[27]:

{'stats': {'mismatch': 0,
  'missing': 0,
  'match': 17,
  'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4},
   {'value': '0f34ruiy23e78y2r', 'count': 4},
   {'value': '0fue298y2r23r23r5', 'count': 1},
   {'value': '0fue298y2r23r23r4', 'count': 1},
   {'value': '0fue298y2r23r23r3', 'count': 1},
   {'value': '0fue298y2r23r23r2', 'count': 1},
   {'value': '0f345kjh345oiuy349', 'count': 1},
   {'value': '0f345kjh345oiuy348', 'count': 1},
   {'value': '0f345kjh345oiuy347', 'count': 1},
   {'value': '0f345kjh345oiuy346', 'count': 1},
   {'value': '0f345kjh345oiuy345', 'count': 1}],
  'count_uniques': 11},
 'dtype': 'object',
 'profiler_dtype': 'string'}

In [28]:

df.cols.infer_profiler_dtypes("*")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Out[28]:

{'customer_id': 'string',
 'transactoin_date': 'date',
 'transactoin_date_0': 'int',
 'transactoin_date_1': 'int',
 'ticket_price': 'decimal',
 'discount': 'int',
 'product': 'string',
 'info': 'int'}

In [29]:

df.compute()

Out[29]:

	customer_id	transactoin_date	transactoin_date_0	transactoin_date_1	ticket_price	discount	product	info
0	0f345kjh345oiuy345	2010/08/19	2010	08	29.99	NaN	platinum	1
1	0fju234978rfjkhsdf	2012/01/05	2012	01	29.99	NaN	platinum	2
2	0f34ruiy23e78y2r	2009/08/11	2009	08	29.99	NaN	platinum	3
3	0fue298y2r23r23r2	2010/08/19	2010	08	29.99	NaN	platinum	4
4	0f345kjh345oiuy346	2010/08/19	2010	08	29.99	NaN	platinum	5
5	0fju234978rfjkhsdf	2010/08/19	2010	08	9.99	5	basic	6
6	0f34ruiy23e78y2r	2010/08/19	2010	08	9.99	NaN	basic	7
7	0fue298y2r23r23r3	2010/08/19	2010	08	9.99	NaN	basic	8
8	0f345kjh345oiuy347	2010/08/19	2010	08	9.99	NaN	basic	9
9	0fju234978rfjkhsdf	2011/08/11	2011	08	9.99	5	basic	10
10	0f34ruiy23e78y2r	2015/08/09	2015	08	9.99	5	basic	10
11	0fue298y2r23r23r4	NaN	nan	None	9.91	NaN	basic	10
12	0f345kjh345oiuy348	NaN	nan	None	9.99	NaN	basic	10
13	0fju234978rfjkhsdf	NaN	nan	None	14.99	5	deluxe	10
14	0f34ruiy23e78y2r	NaN	nan	None	14.99	NaN	deluxe	10
15	0fue298y2r23r23r5	NaN	nan	None	14.99	5	deluxe	10
16	0f345kjh345oiuy349	NaN	nan	None	14.99	NaN	deluxe	a

In [30]:

df.meta.get()["profile"]["columns"]["customer_id"]

Out[30]:

{'stats': {'mismatch': 0,
  'missing': 0,
  'match': 17,
  'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4},
   {'value': '0f34ruiy23e78y2r', 'count': 4},
   {'value': '0fue298y2r23r23r5', 'count': 1},
   {'value': '0fue298y2r23r23r4', 'count': 1},
   {'value': '0fue298y2r23r23r3', 'count': 1},
   {'value': '0fue298y2r23r23r2', 'count': 1},
   {'value': '0f345kjh345oiuy349', 'count': 1},
   {'value': '0f345kjh345oiuy348', 'count': 1},
   {'value': '0f345kjh345oiuy347', 'count': 1},
   {'value': '0f345kjh345oiuy346', 'count': 1},
   {'value': '0f345kjh345oiuy345', 'count': 1}],
  'count_uniques': 11},
 'dtype': 'object',
 'profiler_dtype': 'string'}

In [10]:

df1 = df.cols.set(value='df["ticket_price"]*df["discount"]*10', output_cols="new ticket_price")

cols df["ticket_price"]*df["discount"]*10
columns ['ticket_price', 'discount']
cols None

In [11]:

df.cols.names()

Out[11]:

['customer_id',
 'transactoin_date',
 'ticket_price',
 'discount',
 'product',
 'info']

In [12]:

df1.compute()

Out[12]:

	customer_id	transactoin_date	ticket_price	discount	product	info	new ticket_price
0	0f345kjh345oiuy345	2010/08/19	29.99	NaN	platinum	1	NaN
1	0fju234978rfjkhsdf	2012/01/05	29.99	NaN	platinum	2	NaN
2	0f34ruiy23e78y2r	2009/08/11	29.99	NaN	platinum	3	NaN
3	0fue298y2r23r23r2	2010/08/19	29.99	NaN	platinum	4	NaN
4	0f345kjh345oiuy346	2010/08/19	29.99	NaN	platinum	5	NaN
5	0fju234978rfjkhsdf	2010/08/19	9.99	5%	basic	6	NaN
6	0f34ruiy23e78y2r	2010/08/19	9.99	NaN	basic	7	NaN
7	0fue298y2r23r23r3	2010/08/19	9.99	NaN	basic	8	NaN
8	0f345kjh345oiuy347	2010/08/19	9.99	NaN	basic	9	NaN
9	0fju234978rfjkhsdf	2011/08/11	9.99	5%	basic	10	NaN
10	0f34ruiy23e78y2r	2015/08/09	9.99	5%	basic	10	NaN
11	0fue298y2r23r23r4	NaN	9.91	NaN	basic	10	NaN
12	0f345kjh345oiuy348	NaN	9.99	NaN	basic	10	NaN
13	0fju234978rfjkhsdf	NaN	14.99	5%	deluxe	10	NaN
14	0f34ruiy23e78y2r	NaN	14.99	NaN	deluxe	10	NaN
15	0fue298y2r23r23r5	NaN	14.99	5%	deluxe	10	NaN
16	0f345kjh345oiuy349	NaN	14.99	NaN	deluxe	a	NaN

In [ ]:

df.cols.set(value='mask["ticket_price"]+mask["transactoin_date"]', where='df["ticket_price"]!=None', output_cols="new ticket_price_2").compute()

In [26]:

df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590599684769.csv").ext.cache()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.cast_and_profile(columns="*", output="json")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

dtype string
dtype date
dtype decimal
dtype string
dtype string
dtype int
dtypedtype string
dtype date
dtype decimal
dtype string
dtype dtype string
dtype string
string
dtype int
dtype string
dtype datedtype string

dtype decimal
dtype string
dtype string date
dtype decimal
dtype string
dtype date
dtype decimaldtype string
dtype datedtype
dtypedtypedtype date
dtype decimal
dtype string
dtypedtype date
dtype decimal
  
dtype string

dtype decimal
dtype string
 int
 stringdtypestring
string
dtype string
dtype date

dtype int
 string
dtype int
dtypedtypedtype string
dtype string
dtype int
dtype decimal
dtype string
dtype string
dtype int
  int
string
dtype int

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [20]:

unmasked = "DD/MM/YY"
masked = unmasked[1:4]

In [19]:

masked

Out[19]:

'ABB'

In [12]:

print(_output)

{"columns": {"customer_id": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object"}, "transactoin_date": {"profiler_dtype": "date", "stats": {"mismatch": 11, "missing": 6, "match": 0, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}, "dtype": "object"}, "ticket_price": {"profiler_dtype": "decimal", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 9.91, "upper": 10.5375, "count": 0}, {"lower": 10.5375, "upper": 11.165, "count": 0}, {"lower": 11.165, "upper": 11.7925, "count": 0}, {"lower": 11.7925, "upper": 12.42, "count": 0}, {"lower": 12.42, "upper": 13.0475, "count": 0}, {"lower": 13.0475, "upper": 13.675, "count": 0}, {"lower": 13.675, "upper": 14.3025, "count": 0}, {"lower": 14.3025, "upper": 14.93, "count": 0}, {"lower": 14.93, "upper": 15.5575, "count": 3}, {"lower": 15.5575, "upper": 16.185, "count": 0}, {"lower": 16.185, "upper": 16.8125, "count": 0}, {"lower": 16.8125, "upper": 17.439999999999998, "count": 0}, {"lower": 17.439999999999998, "upper": 18.0675, "count": 0}, {"lower": 18.0675, "upper": 18.695, "count": 0}, {"lower": 18.695, "upper": 19.322499999999998, "count": 0}, {"lower": 19.322499999999998, "upper": 19.95, "count": 0}, {"lower": 19.95, "upper": 20.5775, "count": 0}, {"lower": 20.5775, "upper": 21.205, "count": 0}, {"lower": 21.205, "upper": 21.8325, "count": 0}, {"lower": 21.8325, "upper": 22.46, "count": 0}, {"lower": 22.46, "upper": 23.0875, "count": 0}, {"lower": 23.0875, "upper": 23.715, "count": 0}, {"lower": 23.715, "upper": 24.3425, "count": 0}, {"lower": 24.3425, "upper": 24.97, "count": 0}, {"lower": 24.97, "upper": 25.597499999999997, "count": 0}, {"lower": 25.597499999999997, "upper": 26.224999999999998, "count": 0}, {"lower": 26.224999999999998, "upper": 26.8525, "count": 0}, {"lower": 26.8525, "upper": 27.48, "count": 0}, {"lower": 27.48, "upper": 28.107499999999998, "count": 0}, {"lower": 28.107499999999998, "upper": 28.735, "count": 0}, {"lower": 28.735, "upper": 29.362499999999997, "count": 0}, {"lower": 29.362499999999997, "upper": 29.99, "count": 0}], "count_uniques": 4}, "dtype": "object"}, "discount": {"profiler_dtype": "object", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "nan", "count": 12}, {"value": "5%", "count": 5}], "count_uniques": 2}, "dtype": "object"}, "product": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object"}, "info": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "object"}}, "name": null, "file_name": "tmpk28b8e9o.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object"], "total_count_dtypes": 1, "missing_count": 0, "p_missing": 0.0}}

In [7]:

# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
# print(cols_and_inferred_dtype)
# df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()

In [8]:

df.compute()

Out[8]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
0	I182070945	00619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.35779134	-71.13937053	(42.35779134, -71.13937053)
1	I182070943	01402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.30682138	-71.06030035	(42.30682138, -71.06030035)
2	I182070941	03410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.34658879	-71.07242943	(42.34658879, -71.07242943)
3	I182070940	03114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.33418175	-71.07866441	(42.33418175, -71.07866441)
4	I182070938	03114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.27536542	-71.09036101	(42.27536542, -71.09036101)
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
319068	I050310906-00	03125	Warrant Arrests	WARRANT ARREST	D4	285	NaN	2016-06-05 17:25:00	2016	6	Sunday	17	Part Three	COVENTRY ST	42.33695098	-71.08574813	(42.33695098, -71.08574813)
319069	I030217815-08	00111	Homicide	MURDER, NON-NEGLIGIENT MANSLAUGHTER	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part One	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319070	I030217815-08	03125	Warrant Arrests	WARRANT ARREST	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part Three	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319071	I010370257-00	03125	Warrant Arrests	WARRANT ARREST	E13	569	NaN	2016-05-31 19:35:00	2016	5	Tuesday	19	Part Three	NEW WASHINGTON ST	42.30233307	-71.11156487	(42.30233307, -71.11156487)
319072	142052550	03125	Warrant Arrests	WARRANT ARREST	D4	903	NaN	2015-06-22 00:12:00	2015	6	Monday	0	Part Three	WASHINGTON ST	42.33383935	-71.08029038	(42.33383935, -71.08029038)

319073 rows × 17 columns

In [9]:

%%time
df.cols.cast(columns = {"INCIDENT_NUMBER":"int"}).compute()

Wall time: 589 ms

Out[9]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
0	NaN	00619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.35779134	-71.13937053	(42.35779134, -71.13937053)
1	NaN	01402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.30682138	-71.06030035	(42.30682138, -71.06030035)
2	NaN	03410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.34658879	-71.07242943	(42.34658879, -71.07242943)
3	NaN	03114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.33418175	-71.07866441	(42.33418175, -71.07866441)
4	NaN	03114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.27536542	-71.09036101	(42.27536542, -71.09036101)
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
319068	NaN	03125	Warrant Arrests	WARRANT ARREST	D4	285	NaN	2016-06-05 17:25:00	2016	6	Sunday	17	Part Three	COVENTRY ST	42.33695098	-71.08574813	(42.33695098, -71.08574813)
319069	NaN	00111	Homicide	MURDER, NON-NEGLIGIENT MANSLAUGHTER	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part One	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319070	NaN	03125	Warrant Arrests	WARRANT ARREST	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part Three	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319071	NaN	03125	Warrant Arrests	WARRANT ARREST	E13	569	NaN	2016-05-31 19:35:00	2016	5	Tuesday	19	Part Three	NEW WASHINGTON ST	42.30233307	-71.11156487	(42.30233307, -71.11156487)
319072	142052550	03125	Warrant Arrests	WARRANT ARREST	D4	903	NaN	2015-06-22 00:12:00	2015	6	Monday	0	Part Three	WASHINGTON ST	42.33383935	-71.08029038	(42.33383935, -71.08029038)

319073 rows × 17 columns

In [18]:

%%time
df.ext.cast_and_profile(columns="*", flush=True, output="json")

Wall time: 9.43 s

Out[18]:

'{"columns": {"INCIDENT_NUMBER": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "I162030584", "count": 13}, {"value": "I152080623", "count": 11}, {"value": "I172013170", "count": 10}, {"value": "I182065208", "count": 10}, {"value": "I172096394", "count": 10}, {"value": "I162071327", "count": 9}, {"value": "I162001871", "count": 9}, {"value": "I172056883", "count": 9}, {"value": "I172022524", "count": 9}, {"value": "I172054429", "count": 9}, {"value": "I162098170", "count": 9}, {"value": "I162078338", "count": 8}, {"value": "I162074826", "count": 8}, {"value": "I162090278", "count": 8}, {"value": "I172069723", "count": 8}, {"value": "I162022140", "count": 8}, {"value": "I130041200-00", "count": 8}, {"value": "I162087224", "count": 8}, {"value": "I152076465", "count": 8}, {"value": "I162056703", "count": 8}, {"value": "I162064331", "count": 8}, {"value": "I162082917", "count": 8}, {"value": "I152105431", "count": 8}, {"value": "I172053616", "count": 8}, {"value": "I152101399", "count": 7}, {"value": "I152095733", "count": 7}, {"value": "I162054378", "count": 7}, {"value": "I152067057", "count": 7}, {"value": "I152096998", "count": 7}, {"value": "I172018004", "count": 7}, {"value": "I162083089", "count": 7}, {"value": "I162045680", "count": 7}, {"value": "I152071480", "count": 7}], "count_uniques": 282517}, "dtype": "object"}, "OFFENSE_CODE": {"profiler_dtype": "zip_code", "stats": {"mismatch": 251, "missing": 0, "match": 318822, "frequency": [{"value": "03006", "count": 18783}, {"value": "03115", "count": 18746}, {"value": "03831", "count": 16323}, {"value": "01402", "count": 15152}, {"value": "00802", "count": 14791}, {"value": "03301", "count": 13099}, {"value": "03410", "count": 11287}, {"value": "03114", "count": 11123}, {"value": "00617", "count": 9069}, {"value": "02647", "count": 9039}, {"value": "00614", "count": 8893}, {"value": "03201", "count": 8892}, {"value": "03125", "count": 8389}, {"value": "00613", "count": 7949}, {"value": "03802", "count": 6557}, {"value": "00619", "count": 5963}, {"value": "03803", "count": 5131}, {"value": "00413", "count": 4886}, {"value": "01102", "count": 4413}, {"value": "03502", "count": 4365}, {"value": "02629", "count": 4007}, {"value": "03501", "count": 3766}, {"value": "03207", "count": 3698}, {"value": "00724", "count": 3629}, {"value": "02610", "count": 3245}, {"value": "01106", "count": 3147}, {"value": "00301", "count": 3056}, {"value": "03801", "count": 2925}, {"value": "00423", "count": 2910}, {"value": "02900", "count": 2894}, {"value": "02907", "count": 2616}, {"value": "00520", "count": 2585}, {"value": "01849", "count": 2584}], "count_uniques": 263}, "dtype": "object"}, "OFFENSE_CODE_GROUP": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "Motor Vehicle Accident Response", "count": 37132}, {"value": "Larceny", "count": 25935}, {"value": "Medical Assistance", "count": 23540}, {"value": "Investigate Person", "count": 18750}, {"value": "Other", "count": 18075}, {"value": "Drug Violation", "count": 16548}, {"value": "Simple Assault", "count": 15826}, {"value": "Vandalism", "count": 15415}, {"value": "Verbal Disputes", "count": 13099}, {"value": "Towed", "count": 11287}, {"value": "Investigate Property", "count": 11124}, {"value": "Larceny From Motor Vehicle", "count": 10847}, {"value": "Property Lost", "count": 9751}, {"value": "Warrant Arrests", "count": 8407}, {"value": "Aggravated Assault", "count": 7807}, {"value": "Violations", "count": 6095}, {"value": "Fraud", "count": 5829}, {"value": "Residential Burglary", "count": 5606}, {"value": "Missing Person Located", "count": 4958}, {"value": "Auto Theft", "count": 4851}, {"value": "Robbery", "count": 4624}, {"value": "Harassment", "count": 4007}, {"value": "Property Found", "count": 3925}, {"value": "Missing Person Reported", "count": 3797}, {"value": "Confidence Games", "count": 3147}, {"value": "Police Service Incidents", "count": 2781}, {"value": "Disorderly Conduct", "count": 2611}, {"value": "Fire Related Reports", "count": 1920}, {"value": "Firearm Violations", "count": 1777}, {"value": "License Violation", "count": 1701}, {"value": "Restraining Order Violations", "count": 1607}, {"value": "Recovered Stolen Property", "count": 1455}, {"value": "Counterfeiting", "count": 1454}], "count_uniques": 67}, "dtype": "object"}, "OFFENSE_DESCRIPTION": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "SICK/INJURED/MEDICAL - PERSON", "count": 18783}, {"value": "INVESTIGATE PERSON", "count": 18754}, {"value": "M/V - LEAVING SCENE - PROPERTY DAMAGE", "count": 16323}, {"value": "VANDALISM", "count": 15154}, {"value": "ASSAULT SIMPLE - BATTERY", "count": 14791}, {"value": "VERBAL DISPUTE", "count": 13099}, {"value": "TOWED MOTOR VEHICLE", "count": 11287}, {"value": "INVESTIGATE PROPERTY", "count": 11124}, {"value": "LARCENY THEFT FROM BUILDING", "count": 9069}, {"value": "THREATS TO DO BODILY HARM", "count": 9042}, {"value": "LARCENY THEFT FROM MV - NON-ACCESSORY", "count": 8893}, {"value": "PROPERTY - LOST", "count": 8893}, {"value": "WARRANT ARREST", "count": 8407}, {"value": "LARCENY SHOPLIFTING", "count": 7949}, {"value": "M/V ACCIDENT - PROPERTY \\u00a0DAMAGE", "count": 6557}, {"value": "LARCENY ALL OTHERS", "count": 5963}, {"value": "M/V ACCIDENT - PERSONAL INJURY", "count": 5131}, {"value": "ASSAULT - AGGRAVATED - BATTERY", "count": 4886}, {"value": "FRAUD - FALSE PRETENSE / SCHEME", "count": 4413}, {"value": "MISSING PERSON - LOCATED", "count": 4365}, {"value": "HARASSMENT", "count": 4007}, {"value": "MISSING PERSON", "count": 3766}, {"value": "PROPERTY - FOUND", "count": 3698}, {"value": "AUTO THEFT", "count": 3630}, {"value": "TRESPASSING", "count": 3254}, {"value": "FRAUD - CREDIT CARD / ATM FRAUD", "count": 3147}, {"value": "ROBBERY - STREET", "count": 3056}, {"value": "M/V ACCIDENT - OTHER", "count": 2925}, {"value": "ASSAULT - AGGRAVATED", "count": 2910}, {"value": "VAL - VIOLATION OF AUTO LAW - OTHER", "count": 2894}, {"value": "VAL - OPERATING AFTER REV/SUSP.", "count": 2618}, {"value": "DRUGS - POSS CLASS B - COCAINE, ETC.", "count": 2591}, {"value": "BURGLARY - RESIDENTIAL - FORCE", "count": 2585}], "count_uniques": 244}, "dtype": "object"}, "DISTRICT": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 1765, "match": 317308, "frequency": [{"value": "B2", "count": 49945}, {"value": "C11", "count": 42530}, {"value": "D4", "count": 41915}, {"value": "A1", "count": 35717}, {"value": "B3", "count": 35442}, {"value": "C6", "count": 23460}, {"value": "D14", "count": 20127}, {"value": "E13", "count": 17536}, {"value": "E18", "count": 17348}, {"value": "A7", "count": 13544}, {"value": "E5", "count": 13239}, {"value": "A15", "count": 6505}], "count_uniques": 12}, "dtype": "object"}, "REPORTING_AREA": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 20250, "match": 298823, "hist": [{"lower": 0.0, "upper": 30.0625, "count": 1502}, {"lower": 30.0625, "upper": 60.125, "count": 740}, {"lower": 60.125, "upper": 90.1875, "count": 1376}, {"lower": 90.1875, "upper": 120.25, "count": 2618}, {"lower": 120.25, "upper": 150.3125, "count": 2175}, {"lower": 150.3125, "upper": 180.375, "count": 2034}, {"lower": 180.375, "upper": 210.4375, "count": 1349}, {"lower": 210.4375, "upper": 240.5, "count": 1116}, {"lower": 240.5, "upper": 270.5625, "count": 1501}, {"lower": 270.5625, "upper": 300.625, "count": 1968}, {"lower": 300.625, "upper": 330.6875, "count": 2684}, {"lower": 330.6875, "upper": 360.75, "count": 2336}, {"lower": 360.75, "upper": 390.8125, "count": 939}, {"lower": 390.8125, "upper": 420.875, "count": 1051}, {"lower": 420.875, "upper": 450.9375, "count": 2021}, {"lower": 450.9375, "upper": 481.0, "count": 1848}, {"lower": 481.0, "upper": 511.0625, "count": 996}, {"lower": 511.0625, "upper": 541.125, "count": 804}, {"lower": 541.125, "upper": 571.1875, "count": 897}, {"lower": 571.1875, "upper": 601.25, "count": 1146}, {"lower": 601.25, "upper": 631.3125, "count": 1610}, {"lower": 631.3125, "upper": 661.375, "count": 537}, {"lower": 661.375, "upper": 691.4375, "count": 423}, {"lower": 691.4375, "upper": 721.5, "count": 277}, {"lower": 721.5, "upper": 751.5625, "count": 337}, {"lower": 751.5625, "upper": 781.625, "count": 678}, {"lower": 781.625, "upper": 811.6875, "count": 1466}, {"lower": 811.6875, "upper": 841.75, "count": 455}, {"lower": 841.75, "upper": 871.8125, "count": 0}, {"lower": 871.8125, "upper": 901.875, "count": 245}, {"lower": 901.875, "upper": 931.9375, "count": 1546}, {"lower": 931.9375, "upper": 962.0, "count": 1210}], "count_uniques": 878}, "dtype": "object"}, "SHOOTING": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "nan", "count": 318054}, {"value": "Y", "count": 1019}], "count_uniques": 2}, "dtype": "object"}, "OCCURRED_ON_DATE": {"profiler_dtype": "date", "stats": {"mismatch": 319073, "missing": 0, "match": 0, "frequency": [{"value": "2017-06-01 00:00:00+00:00", "count": 29}, {"value": "2015-07-01 00:00:00+00:00", "count": 27}, {"value": "2016-08-01 00:00:00+00:00", "count": 27}, {"value": "2015-06-18 05:00:00+00:00", "count": 22}, {"value": "2017-08-01 00:00:00+00:00", "count": 22}, {"value": "2017-01-01 00:00:00+00:00", "count": 21}, {"value": "2016-04-01 00:00:00+00:00", "count": 20}, {"value": "2017-05-01 00:00:00+00:00", "count": 20}, {"value": "2015-12-07 11:38:00+00:00", "count": 20}, {"value": "2017-04-01 00:00:00+00:00", "count": 19}, {"value": "2016-09-01 00:00:00+00:00", "count": 19}, {"value": "2018-01-01 00:00:00+00:00", "count": 18}, {"value": "2016-11-01 00:00:00+00:00", "count": 18}, {"value": "2018-06-04 12:40:00+00:00", "count": 18}, {"value": "2015-12-01 00:00:00+00:00", "count": 18}, {"value": "2017-07-05 00:00:00+00:00", "count": 18}, {"value": "2017-11-01 00:00:00+00:00", "count": 17}, {"value": "2018-02-01 00:00:00+00:00", "count": 16}, {"value": "2016-02-01 00:00:00+00:00", "count": 16}, {"value": "2018-07-19 00:00:00+00:00", "count": 16}, {"value": "2017-03-01 00:00:00+00:00", "count": 15}, {"value": "2018-03-07 06:00:00+00:00", "count": 15}, {"value": "2018-08-18 08:45:00+00:00", "count": 15}, {"value": "2017-10-01 00:00:00+00:00", "count": 15}, {"value": "2016-12-15 06:00:00+00:00", "count": 14}, {"value": "2015-06-20 00:00:00+00:00", "count": 14}, {"value": "2016-06-24 17:30:00+00:00", "count": 14}, {"value": "2016-11-10 16:00:00+00:00", "count": 13}, {"value": "2018-04-02 00:00:00+00:00", "count": 13}, {"value": "2015-12-22 20:25:00+00:00", "count": 13}, {"value": "2018-01-03 18:00:00+00:00", "count": 13}, {"value": "2015-07-17 00:00:00+00:00", "count": 13}, {"value": "2016-04-20 11:07:00+00:00", "count": 13}], "count_uniques": 233229}, "dtype": "object"}, "YEAR": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 2015.0, "upper": 2015.09375, "count": 39649}, {"lower": 2015.09375, "upper": 2015.1875, "count": 0}, {"lower": 2015.1875, "upper": 2015.28125, "count": 0}, {"lower": 2015.28125, "upper": 2015.375, "count": 0}, {"lower": 2015.375, "upper": 2015.46875, "count": 0}, {"lower": 2015.46875, "upper": 2015.5625, "count": 0}, {"lower": 2015.5625, "upper": 2015.65625, "count": 0}, {"lower": 2015.65625, "upper": 2015.75, "count": 0}, {"lower": 2015.75, "upper": 2015.84375, "count": 0}, {"lower": 2015.84375, "upper": 2015.9375, "count": 0}, {"lower": 2015.9375, "upper": 2016.03125, "count": 184}, {"lower": 2016.03125, "upper": 2016.125, "count": 0}, {"lower": 2016.125, "upper": 2016.21875, "count": 0}, {"lower": 2016.21875, "upper": 2016.3125, "count": 0}, {"lower": 2016.3125, "upper": 2016.40625, "count": 0}, {"lower": 2016.40625, "upper": 2016.5, "count": 0}, {"lower": 2016.5, "upper": 2016.59375, "count": 0}, {"lower": 2016.59375, "upper": 2016.6875, "count": 0}, {"lower": 2016.6875, "upper": 2016.78125, "count": 0}, {"lower": 2016.78125, "upper": 2016.875, "count": 0}, {"lower": 2016.875, "upper": 2016.96875, "count": 0}, {"lower": 2016.96875, "upper": 2017.0625, "count": 39}, {"lower": 2017.0625, "upper": 2017.15625, "count": 0}, {"lower": 2017.15625, "upper": 2017.25, "count": 0}, {"lower": 2017.25, "upper": 2017.34375, "count": 0}, {"lower": 2017.34375, "upper": 2017.4375, "count": 0}, {"lower": 2017.4375, "upper": 2017.53125, "count": 0}, {"lower": 2017.53125, "upper": 2017.625, "count": 0}, {"lower": 2017.625, "upper": 2017.71875, "count": 0}, {"lower": 2017.71875, "upper": 2017.8125, "count": 0}, {"lower": 2017.8125, "upper": 2017.90625, "count": 0}, {"lower": 2017.90625, "upper": 2018.0, "count": 13}], "count_uniques": 4}, "dtype": "object"}, "MONTH": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 1.0, "upper": 1.34375, "count": 55}, {"lower": 1.34375, "upper": 1.6875, "count": 0}, {"lower": 1.6875, "upper": 2.03125, "count": 28}, {"lower": 2.03125, "upper": 2.375, "count": 0}, {"lower": 2.375, "upper": 2.71875, "count": 0}, {"lower": 2.71875, "upper": 3.0625, "count": 31}, {"lower": 3.0625, "upper": 3.40625, "count": 0}, {"lower": 3.40625, "upper": 3.75, "count": 0}, {"lower": 3.75, "upper": 4.09375, "count": 30}, {"lower": 4.09375, "upper": 4.4375, "count": 0}, {"lower": 4.4375, "upper": 4.78125, "count": 0}, {"lower": 4.78125, "upper": 5.125, "count": 16}, {"lower": 5.125, "upper": 5.46875, "count": 0}, {"lower": 5.46875, "upper": 5.8125, "count": 0}, {"lower": 5.8125, "upper": 6.15625, "count": 4184}, {"lower": 6.15625, "upper": 6.5, "count": 0}, {"lower": 6.5, "upper": 6.84375, "count": 0}, {"lower": 6.84375, "upper": 7.1875, "count": 8293}, {"lower": 7.1875, "upper": 7.53125, "count": 0}, {"lower": 7.53125, "upper": 7.875, "count": 0}, {"lower": 7.875, "upper": 8.21875, "count": 8298}, {"lower": 8.21875, "upper": 8.5625, "count": 0}, {"lower": 8.5625, "upper": 8.90625, "count": 0}, {"lower": 8.90625, "upper": 9.25, "count": 8334}, {"lower": 9.25, "upper": 9.59375, "count": 0}, {"lower": 9.59375, "upper": 9.9375, "count": 0}, {"lower": 9.9375, "upper": 10.28125, "count": 8151}, {"lower": 10.28125, "upper": 10.625, "count": 0}, {"lower": 10.625, "upper": 10.96875, "count": 0}, {"lower": 10.96875, "upper": 11.3125, "count": 2392}, {"lower": 11.3125, "upper": 11.65625, "count": 0}, {"lower": 11.65625, "upper": 12.0, "count": 73}], "count_uniques": 12}, "dtype": "object"}, "DAY_OF_WEEK": {"profiler_dtype": "date", "stats": {"mismatch": 319073, "missing": 0, "match": 0, "frequency": [{"value": "Friday", "count": 48495}, {"value": "Wednesday", "count": 46729}, {"value": "Thursday", "count": 46656}, {"value": "Tuesday", "count": 46383}, {"value": "Monday", "count": 45679}, {"value": "Saturday", "count": 44818}, {"value": "Sunday", "count": 40313}], "count_uniques": 7}, "dtype": "object"}, "HOUR": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 0.0, "upper": 0.71875, "count": 2069}, {"lower": 0.71875, "upper": 1.4375, "count": 1262}, {"lower": 1.4375, "upper": 2.15625, "count": 1040}, {"lower": 2.15625, "upper": 2.875, "count": 0}, {"lower": 2.875, "upper": 3.59375, "count": 611}, {"lower": 3.59375, "upper": 4.3125, "count": 402}, {"lower": 4.3125, "upper": 5.03125, "count": 461}, {"lower": 5.03125, "upper": 5.75, "count": 0}, {"lower": 5.75, "upper": 6.46875, "count": 606}, {"lower": 6.46875, "upper": 7.1875, "count": 1114}, {"lower": 7.1875, "upper": 7.90625, "count": 0}, {"lower": 7.90625, "upper": 8.625, "count": 1562}, {"lower": 8.625, "upper": 9.34375, "count": 1833}, {"lower": 9.34375, "upper": 10.0625, "count": 1951}, {"lower": 10.0625, "upper": 10.78125, "count": 0}, {"lower": 10.78125, "upper": 11.5, "count": 2028}, {"lower": 11.5, "upper": 12.21875, "count": 2330}, {"lower": 12.21875, "upper": 12.9375, "count": 0}, {"lower": 12.9375, "upper": 13.65625, "count": 2020}, {"lower": 13.65625, "upper": 14.375, "count": 2116}, {"lower": 14.375, "upper": 15.09375, "count": 1988}, {"lower": 15.09375, "upper": 15.8125, "count": 0}, {"lower": 15.8125, "upper": 16.53125, "count": 2415}, {"lower": 16.53125, "upper": 17.25, "count": 2625}, {"lower": 17.25, "upper": 17.96875, "count": 0}, {"lower": 17.96875, "upper": 18.6875, "count": 2584}, {"lower": 18.6875, "upper": 19.40625, "count": 2159}, {"lower": 19.40625, "upper": 20.125, "count": 2041}, {"lower": 20.125, "upper": 20.84375, "count": 0}, {"lower": 20.84375, "upper": 21.5625, "count": 1698}, {"lower": 21.5625, "upper": 22.28125, "count": 1604}, {"lower": 22.28125, "upper": 23.0, "count": 1366}], "count_uniques": 24}, "dtype": "object"}, "UCR_PART": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 90, "match": 318983, "frequency": [{"value": "Part Three", "count": 158553}, {"value": "Part Two", "count": 97569}, {"value": "Part One", "count": 61629}, {"value": "Other", "count": 1232}], "count_uniques": 4}, "dtype": "object"}, "STREET": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 10871, "match": 308202, "frequency": [{"value": "WASHINGTON ST", "count": 14194}, {"value": "BLUE HILL AVE", "count": 7794}, {"value": "BOYLSTON ST", "count": 7221}, {"value": "DORCHESTER AVE", "count": 5149}, {"value": "TREMONT ST", "count": 4796}, {"value": "MASSACHUSETTS AVE", "count": 4708}, {"value": "HARRISON AVE", "count": 4609}, {"value": "CENTRE ST", "count": 4383}, {"value": "COMMONWEALTH AVE", "count": 4134}, {"value": "HYDE PARK AVE", "count": 3470}, {"value": "COLUMBIA RD", "count": 3195}, {"value": "HUNTINGTON AVE", "count": 2989}, {"value": "RIVER ST", "count": 2831}, {"value": "DUDLEY ST", "count": 2352}, {"value": "WARREN ST", "count": 2336}, {"value": "COLUMBUS AVE", "count": 2304}, {"value": "ADAMS ST", "count": 2042}, {"value": "BEACON ST", "count": 1966}, {"value": "NEWBURY ST", "count": 1889}, {"value": "CAMBRIDGE ST", "count": 1670}, {"value": "BOWDOIN ST", "count": 1656}, {"value": "AMERICAN LEGION HWY", "count": 1631}, {"value": "W BROADWAY", "count": 1630}, {"value": "GENEVA AVE", "count": 1628}, {"value": "ALBANY ST", "count": 1429}, {"value": "ALLSTATE RD", "count": 1296}, {"value": "SUMMER ST", "count": 1250}, {"value": "HANCOCK ST", "count": 1246}, {"value": "BORDER ST", "count": 1213}, {"value": "NEW SUDBURY ST", "count": 1213}, {"value": "CUMMINS HWY", "count": 1151}, {"value": "NORFOLK ST", "count": 1115}, {"value": "BENNINGTON ST", "count": 1084}], "count_uniques": 4657}, "dtype": "object"}, "Lat": {"profiler_dtype": "decimal", "stats": {"mismatch": 299074, "missing": 19999, "match": 0, "hist": [{"lower": -1.0, "upper": 0.3560950493749999, "count": 33}, {"lower": 0.3560950493749999, "upper": 1.7121900987499998, "count": 0}, {"lower": 1.7121900987499998, "upper": 3.0682851481249998, "count": 0}, {"lower": 3.0682851481249998, "upper": 4.4243801975, "count": 0}, {"lower": 4.4243801975, "upper": 5.780475246875, "count": 0}, {"lower": 5.780475246875, "upper": 7.1365702962499995, "count": 0}, {"lower": 7.1365702962499995, "upper": 8.492665345625, "count": 0}, {"lower": 8.492665345625, "upper": 9.848760395, "count": 0}, {"lower": 9.848760395, "upper": 11.204855444375, "count": 0}, {"lower": 11.204855444375, "upper": 12.56095049375, "count": 0}, {"lower": 12.56095049375, "upper": 13.917045543124999, "count": 0}, {"lower": 13.917045543124999, "upper": 15.273140592499999, "count": 0}, {"lower": 15.273140592499999, "upper": 16.629235641875, "count": 0}, {"lower": 16.629235641875, "upper": 17.98533069125, "count": 0}, {"lower": 17.98533069125, "upper": 19.341425740625, "count": 0}, {"lower": 19.341425740625, "upper": 20.69752079, "count": 0}, {"lower": 20.69752079, "upper": 22.053615839375, "count": 0}, {"lower": 22.053615839375, "upper": 23.40971088875, "count": 0}, {"lower": 23.40971088875, "upper": 24.765805938125, "count": 0}, {"lower": 24.765805938125, "upper": 26.1219009875, "count": 0}, {"lower": 26.1219009875, "upper": 27.477996036875, "count": 0}, {"lower": 27.477996036875, "upper": 28.834091086249998, "count": 0}, {"lower": 28.834091086249998, "upper": 30.190186135624998, "count": 0}, {"lower": 30.190186135624998, "upper": 31.546281184999998, "count": 0}, {"lower": 31.546281184999998, "upper": 32.902376234375, "count": 0}, {"lower": 32.902376234375, "upper": 34.25847128375, "count": 0}, {"lower": 34.25847128375, "upper": 35.614566333125, "count": 0}, {"lower": 35.614566333125, "upper": 36.9706613825, "count": 0}, {"lower": 36.9706613825, "upper": 38.326756431875, "count": 0}, {"lower": 38.326756431875, "upper": 39.68285148125, "count": 0}, {"lower": 39.68285148125, "upper": 41.038946530625, "count": 0}, {"lower": 41.038946530625, "upper": 42.39504158, "count": 39852}], "count_uniques": 18178}, "dtype": "object"}, "Long": {"profiler_dtype": "decimal", "stats": {"mismatch": 299074, "missing": 19999, "match": 0, "hist": [{"lower": -71.17867378, "upper": -68.985590224375, "count": 39852}, {"lower": -68.985590224375, "upper": -66.79250666875, "count": 0}, {"lower": -66.79250666875, "upper": -64.599423113125, "count": 0}, {"lower": -64.599423113125, "upper": -62.4063395575, "count": 0}, {"lower": -62.4063395575, "upper": -60.213256001874996, "count": 0}, {"lower": -60.213256001874996, "upper": -58.02017244625, "count": 0}, {"lower": -58.02017244625, "upper": -55.827088890625, "count": 0}, {"lower": -55.827088890625, "upper": -53.634005335, "count": 0}, {"lower": -53.634005335, "upper": -51.440921779374996, "count": 0}, {"lower": -51.440921779374996, "upper": -49.247838223749994, "count": 0}, {"lower": -49.247838223749994, "upper": -47.054754668125, "count": 0}, {"lower": -47.054754668125, "upper": -44.8616711125, "count": 0}, {"lower": -44.8616711125, "upper": -42.668587556874996, "count": 0}, {"lower": -42.668587556874996, "upper": -40.47550400125, "count": 0}, {"lower": -40.47550400125, "upper": -38.282420445625, "count": 0}, {"lower": -38.282420445625, "upper": -36.08933689, "count": 0}, {"lower": -36.08933689, "upper": -33.896253334375, "count": 0}, {"lower": -33.896253334375, "upper": -31.703169778749995, "count": 0}, {"lower": -31.703169778749995, "upper": -29.510086223125, "count": 0}, {"lower": -29.510086223125, "upper": -27.3170026675, "count": 0}, {"lower": -27.3170026675, "upper": -25.123919111874997, "count": 0}, {"lower": -25.123919111874997, "upper": -22.930835556250003, "count": 0}, {"lower": -22.930835556250003, "upper": -20.737752000625, "count": 0}, {"lower": -20.737752000625, "upper": -18.544668445, "count": 0}, {"lower": -18.544668445, "upper": -16.351584889374998, "count": 0}, {"lower": -16.351584889374998, "upper": -14.158501333749996, "count": 0}, {"lower": -14.158501333749996, "upper": -11.965417778125001, "count": 0}, {"lower": -11.965417778125001, "upper": -9.7723342225, "count": 0}, {"lower": -9.7723342225, "upper": -7.579250666874998, "count": 0}, {"lower": -7.579250666874998, "upper": -5.386167111250003, "count": 0}, {"lower": -5.386167111250003, "upper": -3.1930835556249946, "count": 0}, {"lower": -3.1930835556249946, "upper": -1.0, "count": 33}], "count_uniques": 18178}, "dtype": "object"}, "Location": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "(0.00000000, 0.00000000)", "count": 19999}, {"value": "(42.34862382, -71.08277637)", "count": 1243}, {"value": "(42.36183857, -71.05976489)", "count": 1208}, {"value": "(42.28482577, -71.09137369)", "count": 1121}, {"value": "(42.32866284, -71.08563401)", "count": 1042}, {"value": "(42.25621592, -71.12401947)", "count": 898}, {"value": "(42.29755533, -71.05970910)", "count": 783}, {"value": "(42.34128751, -71.05467933)", "count": 773}, {"value": "(-1.00000000, -1.00000000)", "count": 745}, {"value": "(42.33152148, -71.07085307)", "count": 735}, {"value": "(42.35231190, -71.06370510)", "count": 688}, {"value": "(42.33954199, -71.06940877)", "count": 655}, {"value": "(42.32696647, -71.06198607)", "count": 652}, {"value": "(42.35512339, -71.06087980)", "count": 584}, {"value": "(42.30971857, -71.10429432)", "count": 573}, {"value": "(42.29848866, -71.06313294)", "count": 562}, {"value": "(42.33401829, -71.07638124)", "count": 561}, {"value": "(42.33367922, -71.09187755)", "count": 550}, {"value": "(42.31043400, -71.06134010)", "count": 523}, {"value": "(42.35095909, -71.07412780)", "count": 523}, {"value": "(42.35241815, -71.06525499)", "count": 515}, {"value": "(42.37081805, -71.03929078)", "count": 507}, {"value": "(42.33428841, -71.07239518)", "count": 504}, {"value": "(42.34980175, -71.07840978)", "count": 472}, {"value": "(42.32696802, -71.08051941)", "count": 472}, {"value": "(42.33511904, -71.07491710)", "count": 455}, {"value": "(42.35037870, -71.07626098)", "count": 445}, {"value": "(42.34653820, -71.09880598)", "count": 444}, {"value": "(42.36643546, -71.06135413)", "count": 440}, {"value": "(42.28709355, -71.14822128)", "count": 438}, {"value": "(42.35602373, -71.06177615)", "count": 436}, {"value": "(42.34840576, -71.08688339)", "count": 432}, {"value": "(42.34905600, -71.15049850)", "count": 427}], "count_uniques": 18194}, "dtype": "object"}}, "name": null, "file_name": "crime.csv", "summary": {"cols_count": 17, "rows_count": 319073, "dtypes_list": ["object"], "total_count_dtypes": 1, "missing_count": 0, "p_missing": 0.0}}'

In [ ]:

In [14]:

df.cols.names()

Out[14]:

['INCIDENT_NUMBER',
 'OFFENSE_CODE',
 'OFFENSE_CODE_GROUP',
 'OFFENSE_DESCRIPTION',
 'DISTRICT',
 'REPORTING_AREA',
 'SHOOTING',
 'OCCURRED_ON_DATE',
 'YEAR',
 'MONTH',
 'DAY_OF_WEEK',
 'HOUR',
 'UCR_PART',
 'STREET',
 'Lat',
 'Long',
 'Location']

In [ ]:

df.OCCURRED_ON_DATE.astype()

In [12]:

%%time
import pandas as pd
from fastnumbers import fast_int
import pendulum
def func(value):
    return pendulum.parse(value)
df.OCCURRED_ON_DATE.apply(func, meta="object").compute()

Wall time: 3.49 s

In [50]:

df.compute()

Out[50]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
0	nan	00619	Larceny	LARCENY ALL OTHERS	D14	808	nan	2018-09-02 13:00:00+00:00	2018	9	2020-05-31	13	Part One	LINCOLN ST	42.3578	-71.1394	(42.35779134, -71.13937053)
1	nan	01402	Vandalism	VANDALISM	C11	347	nan	2018-08-21 00:00:00+00:00	2018	8	2020-05-26	0	Part Two	HECLA ST	42.3068	-71.0603	(42.30682138, -71.06030035)
2	nan	03410	Towed	TOWED MOTOR VEHICLE	D4	151	nan	2018-09-03 19:27:00+00:00	2018	9	2020-06-01	19	Part Three	CAZENOVE ST	42.3466	-71.0724	(42.34658879, -71.07242943)
3	nan	03114	Investigate Property	INVESTIGATE PROPERTY	D4	272	nan	2018-09-03 21:16:00+00:00	2018	9	2020-06-01	21	Part Three	NEWCOMB ST	42.3342	-71.0787	(42.33418175, -71.07866441)
4	nan	03114	Investigate Property	INVESTIGATE PROPERTY	B3	421	nan	2018-09-03 21:05:00+00:00	2018	9	2020-06-01	21	Part Three	DELHI ST	42.2754	-71.0904	(42.27536542, -71.09036101)
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
319068	nan	03125	Warrant Arrests	WARRANT ARREST	D4	285	nan	2016-06-05 17:25:00+00:00	2016	6	2020-05-31	17	Part Three	COVENTRY ST	42.337	-71.0857	(42.33695098, -71.08574813)
319069	nan	00111	Homicide	MURDER, NON-NEGLIGIENT MANSLAUGHTER	E18	520	nan	2015-07-09 13:38:00+00:00	2015	7	2020-05-28	13	Part One	RIVER ST	42.2559	-71.1232	(42.25592648, -71.12317207)
319070	nan	03125	Warrant Arrests	WARRANT ARREST	E18	520	nan	2015-07-09 13:38:00+00:00	2015	7	2020-05-28	13	Part Three	RIVER ST	42.2559	-71.1232	(42.25592648, -71.12317207)
319071	nan	03125	Warrant Arrests	WARRANT ARREST	E13	569	nan	2016-05-31 19:35:00+00:00	2016	5	2020-05-26	19	Part Three	NEW WASHINGTON ST	42.3023	-71.1116	(42.30233307, -71.11156487)
319072	142052550	03125	Warrant Arrests	WARRANT ARREST	D4	903	nan	2015-06-22 00:12:00+00:00	2015	6	2020-06-01	0	Part Three	WASHINGTON ST	42.3338	-71.0803	(42.33383935, -71.08029038)

319073 rows × 17 columns

In [17]:

a = df["INCIDENT_NUMBER"].compute() 
b =  df["OFFENSE_CODE"].compute()
from dask import dataframe as dd
dd.compute(a+b)

Out[17]:

(0            I18207094500619
 1            I18207094301402
 2            I18207094103410
 3            I18207094003114
 4            I18207093803114
                  ...        
 319068    I050310906-0003125
 319069    I030217815-0800111
 319070    I030217815-0803125
 319071    I010370257-0003125
 319072        14205255003125
 Length: 319073, dtype: object,)

In [10]:

def func(value):
    return value
df.apply(func)

---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-10-46167e4a0a69> in <module>
      1 def func(value):
      2     return value
----> 3 df.apply(func)

~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py in apply(self, func, axis, broadcast, raw, reduce, args, meta, **kwds)
   4145                 "  Try: df.apply(func, axis=1)"
   4146             )
-> 4147             raise NotImplementedError(msg)
   4148 
   4149         if meta is no_default:

NotImplementedError: dd.DataFrame.apply only supports axis=1
  Try: df.apply(func, axis=1)

In [8]:

%%time
# _output = df.ext.cast_and_profile(columns="*", flush=True, output="json")
            # df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()

Wall time: 0 ns

In [9]:

df.ext.display()

Viewing 10 of 319073 rows / 17 columns

8 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (object) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (object) not nullable	MONTH 10 (object) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (object) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (object) not nullable	Long 16 (object) not nullable	Location 17 (object) not nullable
I182070945	619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.0827326	(42.30607218,⋅-71.08273260)
I182070932	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)

Viewing 10 of 319073 rows / 17 columns

8 partition(s) <class 'dask.dataframe.core.DataFrame'>

In [26]:

df.ext.partitions()

Out[26]:

In [7]:

df.compute()

Out[7]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
0	I182070945	00619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.35779134	-71.13937053	(42.35779134, -71.13937053)
1	I182070943	01402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.30682138	-71.06030035	(42.30682138, -71.06030035)
2	I182070941	03410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.34658879	-71.07242943	(42.34658879, -71.07242943)
3	I182070940	03114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.33418175	-71.07866441	(42.33418175, -71.07866441)
4	I182070938	03114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.27536542	-71.09036101	(42.27536542, -71.09036101)
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
319068	I050310906-00	03125	Warrant Arrests	WARRANT ARREST	D4	285	NaN	2016-06-05 17:25:00	2016	6	Sunday	17	Part Three	COVENTRY ST	42.33695098	-71.08574813	(42.33695098, -71.08574813)
319069	I030217815-08	00111	Homicide	MURDER, NON-NEGLIGIENT MANSLAUGHTER	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part One	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319070	I030217815-08	03125	Warrant Arrests	WARRANT ARREST	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part Three	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319071	I010370257-00	03125	Warrant Arrests	WARRANT ARREST	E13	569	NaN	2016-05-31 19:35:00	2016	5	Tuesday	19	Part Three	NEW WASHINGTON ST	42.30233307	-71.11156487	(42.30233307, -71.11156487)
319072	142052550	03125	Warrant Arrests	WARRANT ARREST	D4	903	NaN	2015-06-22 00:12:00	2015	6	Monday	0	Part Three	WASHINGTON ST	42.33383935	-71.08029038	(42.33383935, -71.08029038)

319073 rows × 17 columns

In [19]:

df.cols.year("")

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-19-f77bfb6ed87a> in <module>
----> 1 df.cols.year("birth")

~\Documents\Optimus\optimus\engines\base\dask\columns.py in year(self, input_cols, output_cols)
    573         output_cols = get_output_cols(input_cols, output_cols)
    574         for input_col, output_col in zip(input_cols, output_cols):
--> 575             df = df.assign(**{output_col: df[input_col].dt.year})
    576         return df
    577 

pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__()

~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py in dt(self)
   2616     def dt(self):
   2617         """ Namespace of datetime methods """
-> 2618         return DatetimeAccessor(self)
   2619 
   2620     @cache_readonly

~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\accessor.py in __init__(self, series)
     34         if hasattr(series_meta, "to_series"):  # is index-like
     35             series_meta = series_meta.to_series()
---> 36         meta = getattr(series_meta, self._accessor_name)
     37 
     38         self._meta = meta

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in __getattr__(self, name)
   5268             or name in self._accessors
   5269         ):
-> 5270             return object.__getattribute__(self, name)
   5271         else:
   5272             if self._info_axis._can_hold_identifiers_and_holds_name(name):

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\accessor.py in __get__(self, obj, cls)
    185             # we're accessing the attribute of the class, i.e., Dataset.geo
    186             return self._accessor
--> 187         accessor_obj = self._accessor(obj)
    188         # Replace the property with the accessor object. Inspired by:
    189         # http://www.pydanny.com/cached-property.html

~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\accessors.py in __new__(cls, data)
    336             return DatetimeProperties(data, orig)
    337 
--> 338         raise AttributeError("Can only use .dt accessor with datetimelike values")

AttributeError: Can only use .dt accessor with datetimelike values

In [10]:

print(_output)

{"columns": {"customer_id": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object"}, "transactoin_date": {"profiler_dtype": "date", "profiler_dtype_fotmat": "date", "stats": {"mismatch": 11, "missing": 6, "match": 0}, "dtype": "datetime64[ns]"}, "ticket_price": {"profiler_dtype": "decimal", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": 9.99, "count": 7}, {"value": 29.99, "count": 5}, {"value": 14.99, "count": 4}, {"value": 9.91, "count": 1}], "count_uniques": 4}, "dtype": "object"}, "discount": {"profiler_dtype": "object", "stats": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}, "dtype": "object"}, "product": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object"}, "info": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "int64"}}, "name": null, "file_name": "tmp71_odjvg.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "datetime64[ns]", "int64"], "total_count_dtypes": 3, "missing_count": 0, "p_missing": 0.0}}

In [6]:

import pendulum
pendulum.from_format("11 11 1980", "DD MM YYYY")

Out[6]:

DateTime(1980, 11, 11, 0, 0, 0, tzinfo=Timezone('UTC'))

In [20]:

# df = op.load.file("data/test-types.csv").ext.cache()
df = op.load.file("data/dataset-transactions.csv").ext.cache()
a  = df.cols.infer_profiler_dtypes("*")
print(a)

{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'string', 'product': 'string', 'info': 'int'}

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [21]:

df = df.cols.cast_to_profiler_dtypes(columns={'transactoin_date': 'date'})

In [22]:

# df["a"] = df.transactoin_date.astype('M8[us]')
# b["transactoin_date"].dt.year

In [23]:

df.dtypes

Out[23]:

customer_id                 object
transactoin_date    datetime64[ns]
ticket_price                object
discount                    object
product                     object
info                        object
dtype: object

In [24]:

df.cols.year("transactoin_date", "new1").compute()
df.cols.month("transactoin_date", "new1").compute()
df.cols.day("transactoin_date", "new1").compute()

Out[24]:

	customer_id	transactoin_date	ticket_price	discount	product	info	new1
0	0f345kjh345oiuy345	2010-08-19	29.99	NaN	platinum	1	19.0
1	0fju234978rfjkhsdf	2012-01-05	29.99	NaN	platinum	2	5.0
2	0f34ruiy23e78y2r	2009-08-11	29.99	NaN	platinum	3	11.0
3	0fue298y2r23r23r2	2010-08-19	29.99	NaN	platinum	4	19.0
4	0f345kjh345oiuy346	2010-08-19	29.99	NaN	platinum	5	19.0
5	0fju234978rfjkhsdf	2010-08-19	9.99	5%	basic	6	19.0
6	0f34ruiy23e78y2r	2010-08-19	9.99	NaN	basic	7	19.0
7	0fue298y2r23r23r3	2010-08-19	9.99	NaN	basic	8	19.0
8	0f345kjh345oiuy347	2010-08-19	9.99	NaN	basic	9	19.0
9	0fju234978rfjkhsdf	2011-08-11	9.99	5%	basic	10	11.0
10	0f34ruiy23e78y2r	2015-08-09	9.99	5%	basic	10	9.0
11	0fue298y2r23r23r4	NaT	9.91	NaN	basic	10	NaN
12	0f345kjh345oiuy348	NaT	9.99	NaN	basic	10	NaN
13	0fju234978rfjkhsdf	NaT	14.99	5%	deluxe	10	NaN
14	0f34ruiy23e78y2r	NaT	14.99	NaN	deluxe	10	NaN
15	0fue298y2r23r23r5	NaT	14.99	5%	deluxe	10	NaN
16	0f345kjh345oiuy349	NaT	14.99	NaN	deluxe	10	NaN

In [382]:

df.ext.display(20)

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Viewing 17 of 17 rows / 6 columns

1 partition(s)

customer_id 1 (object) not nullable	transactoin_date 2 (object) not nullable	ticket_price 3 (object) not nullable	discount 4 (object) not nullable	product 5 (object) not nullable	info 6 (object) not nullable
0f345kjh345oiuy345	2010/08/19	29.99	nan	platinum	1
0fju234978rfjkhsdf	2012/01/05	29.99	nan	platinum	2
0f34ruiy23e78y2r	2009/08/11	29.99	nan	platinum	3
0fue298y2r23r23r2	2010/08/19	29.99	nan	platinum	4
0f345kjh345oiuy346	2010/08/19	29.99	nan	platinum	5
0fju234978rfjkhsdf	2010/08/19	9.99	5%	basic	6
0f34ruiy23e78y2r	2010/08/19	9.99	nan	basic	7
0fue298y2r23r23r3	2010/08/19	9.99	nan	basic	8
0f345kjh345oiuy347	2010/08/19	9.99	nan	basic	9
0fju234978rfjkhsdf	2011/08/11	9.99	5%	basic	10
0f34ruiy23e78y2r	2015/08/09	9.99	5%	basic	10
0fue298y2r23r23r4	nan	9.91	nan	basic	10
0f345kjh345oiuy348	nan	9.99	nan	basic	10
0fju234978rfjkhsdf	nan	14.99	5%	deluxe	10
0f34ruiy23e78y2r	nan	14.99	nan	deluxe	10
0fue298y2r23r23r5	nan	14.99	5%	deluxe	10
0f345kjh345oiuy349	nan	14.99	nan	deluxe	10

Viewing 17 of 17 rows / 6 columns

1 partition(s) <class 'dask.dataframe.core.DataFrame'>

In [ ]:

df= df.cols.unnest("transactoin_date", separator="/", splits=3, output_cols="transactoin_date").ext.cache()

In [ ]:

df.ext.profile(columns="*", infer=True, output="json", flush=True, bins=10)

In [293]:

df.ext.display()

Viewing 10 of 17 rows / 6 columns

1 partition(s)

customer_id 1 (object) not nullable	transactoin_date 2 (object) not nullable	ticket_price 3 (object) not nullable	discount 4 (object) not nullable	product 5 (object) not nullable	info 6 (object) not nullable
0f345kjh345oiuy345	2010/08/19	29.99	nan	platinum	1
0fju234978rfjkhsdf	2012/01/05	29.99	nan	platinum	2
0f34ruiy23e78y2r	2009/08/11	29.99	nan	platinum	3
0fue298y2r23r23r2	2010/08/19	29.99	nan	platinum	4
0f345kjh345oiuy346	2010/08/19	29.99	nan	platinum	5
0fju234978rfjkhsdf	2010/08/19	9.99	5%	basic	6
0f34ruiy23e78y2r	2010/08/19	9.99	nan	basic	7
0fue298y2r23r23r3	2010/08/19	9.99	nan	basic	8
0f345kjh345oiuy347	2010/08/19	9.99	nan	basic	9
0fju234978rfjkhsdf	2011/08/11	9.99	5%	basic	10

Viewing 10 of 17 rows / 6 columns

1 partition(s) <class 'dask.dataframe.core.DataFrame'>

In [8]:

df.cols.infer_profiler_dtypes("*")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 13 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Out[8]:

{'A': 'string', 'B': 'string'}

In [363]:

df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590426328940.csv").ext.cache()
df = df.ext.optimize()
df = df.ext.repartition(1).ext.cache()

In [364]:

col_name = "customer_id"
df.cols.count_mismatch({col_name:"int","transactoin_date":"date"})

Out[364]:

{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0},
 'transactoin_date': {'mismatch': 11, 'missing': 6, 'match': 0}}

In [79]:

df = op.load.file("data/crime.csv").ext.cache()

In [113]:

%%time
df.cols.cast("INCIDENT_NUMBER","int").compute()

Wall time: 447 ms

Out[113]:

	INCIDENT_NUMBER	OFFENSE_CODE	OFFENSE_CODE_GROUP	OFFENSE_DESCRIPTION	DISTRICT	REPORTING_AREA	SHOOTING	OCCURRED_ON_DATE	YEAR	MONTH	DAY_OF_WEEK	HOUR	UCR_PART	STREET	Lat	Long	Location
0	I182070945	00619	Larceny	LARCENY ALL OTHERS	D14	808	NaN	2018-09-02 13:00:00	2018	9	Sunday	13	Part One	LINCOLN ST	42.35779134	-71.13937053	(42.35779134, -71.13937053)
1	I182070943	01402	Vandalism	VANDALISM	C11	347	NaN	2018-08-21 00:00:00	2018	8	Tuesday	0	Part Two	HECLA ST	42.30682138	-71.06030035	(42.30682138, -71.06030035)
2	I182070941	03410	Towed	TOWED MOTOR VEHICLE	D4	151	NaN	2018-09-03 19:27:00	2018	9	Monday	19	Part Three	CAZENOVE ST	42.34658879	-71.07242943	(42.34658879, -71.07242943)
3	I182070940	03114	Investigate Property	INVESTIGATE PROPERTY	D4	272	NaN	2018-09-03 21:16:00	2018	9	Monday	21	Part Three	NEWCOMB ST	42.33418175	-71.07866441	(42.33418175, -71.07866441)
4	I182070938	03114	Investigate Property	INVESTIGATE PROPERTY	B3	421	NaN	2018-09-03 21:05:00	2018	9	Monday	21	Part Three	DELHI ST	42.27536542	-71.09036101	(42.27536542, -71.09036101)
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
319068	I050310906-00	03125	Warrant Arrests	WARRANT ARREST	D4	285	NaN	2016-06-05 17:25:00	2016	6	Sunday	17	Part Three	COVENTRY ST	42.33695098	-71.08574813	(42.33695098, -71.08574813)
319069	I030217815-08	00111	Homicide	MURDER, NON-NEGLIGIENT MANSLAUGHTER	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part One	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319070	I030217815-08	03125	Warrant Arrests	WARRANT ARREST	E18	520	NaN	2015-07-09 13:38:00	2015	7	Thursday	13	Part Three	RIVER ST	42.25592648	-71.12317207	(42.25592648, -71.12317207)
319071	I010370257-00	03125	Warrant Arrests	WARRANT ARREST	E13	569	NaN	2016-05-31 19:35:00	2016	5	Tuesday	19	Part Three	NEW WASHINGTON ST	42.30233307	-71.11156487	(42.30233307, -71.11156487)
319072	142052550	03125	Warrant Arrests	WARRANT ARREST	D4	903	NaN	2015-06-22 00:12:00	2015	6	Monday	0	Part Three	WASHINGTON ST	42.33383935	-71.08029038	(42.33383935, -71.08029038)

319073 rows × 17 columns

In [120]:

df.ext.display()

Viewing 10 of 319073 rows / 17 columns

8 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (object) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (object) not nullable	MONTH 10 (object) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (object) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (object) not nullable	Long 16 (object) not nullable	Location 17 (object) not nullable
I182070945	00619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	01402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	03410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	03114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	03114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	03820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	00724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.08273260	(42.30607218,⋅-71.08273260)
I182070932	03301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	00301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	03301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)

Viewing 10 of 319073 rows / 17 columns

8 partition(s) <class 'dask.dataframe.core.DataFrame'>

In [140]:

import numpy as np
a = np.array([1,2,3])
b = np.array([1,2,"a"])

def func(a, b):
    try:
        r =a+b
    except:
        r = np.nan
    return r

func(a,b)

Out[140]:

nan

In [224]:

df[["INCIDENT_NUMBER","REPORTING_AREA"]].compute().to_numpy()[0]

Out[224]:

array(['I182070945', '808'], dtype=object)

In [ ]:

def op(value):
    return value
#     return value["INCIDENT_NUMBER"]
#     print(value)
#     try:
#         return value["INCIDENT_NUMBER"] + value["REPORTING_AREA"]
#     except():
#         return np.nan

def func(pdf):    
#     print(type(pdf))
    return pdf.apply(op, axis=1)
    

df[["INCIDENT_NUMBER","REPORTING_AREA"]].map_partitions(func, meta=object).compute()

In [198]:

%%time
from numpy.core._exceptions import UFuncTypeError

def func(a,b):
    return a+b

def myfunc(a, b, expr):
    "Return a-b if a>b, otherwise return a+b"
    
    try:
        return func(a,b)
    except (UFuncTypeError, TypeError):
        return np.nan
    
vfunc = np.vectorize(myfunc)
print(func)
arr= [1, 2, 3,4,5,6,7,8,9,"A"]*1000000

b = vfunc(arr, 2, func)
# vfunc([1, 2, 3, "A"], 2)

<function func at 0x00000259BE4C1AF8>
Wall time: 5.37 s

In [189]:

len(arr)

Out[189]:

10000000

In [104]:

df = df.ext.repartition(8)

In [128]:

import re
comp = re.compile("^(?=.)([+-]?([0-9]*)(\.([0-9]+))?)$")    
%time df["INCIDENT_NUMBER"].str.match(comp).compute()

Wall time: 326 ms

Out[128]:

0         False
1         False
2         False
3         False
4         False
          ...  
319068    False
319069    False
319070    False
319071    False
319072     True
Name: INCIDENT_NUMBER, Length: 319073, dtype: bool

In [80]:

df.cols.names()

Out[80]:

['INCIDENT_NUMBER',
 'OFFENSE_CODE',
 'OFFENSE_CODE_GROUP',
 'OFFENSE_DESCRIPTION',
 'DISTRICT',
 'REPORTING_AREA',
 'SHOOTING',
 'OCCURRED_ON_DATE',
 'YEAR',
 'MONTH',
 'DAY_OF_WEEK',
 'HOUR',
 'UCR_PART',
 'STREET',
 'Lat',
 'Long',
 'Location']

In [366]:

from dateutil.parser import parse as dparse
dparse("2018-09-02 13:00:00")

Out[366]:

datetime.datetime(2018, 9, 2, 13, 0)

In [367]:

df.ext.display()

Viewing 10 of 319073 rows / 17 columns

1 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (object) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (object) not nullable	MONTH 10 (object) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (object) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (object) not nullable	Long 16 (object) not nullable	Location 17 (object) not nullable
I182070945	00619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	01402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	03410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	03114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	03114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	03820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	00724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.08273260	(42.30607218,⋅-71.08273260)
I182070932	03301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	00301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	03301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)

Viewing 10 of 319073 rows / 17 columns

1 partition(s) <class 'dask.dataframe.core.DataFrame'>

In [368]:

df.cols.infer_profiler_dtypes("*")

Out[368]:

{'INCIDENT_NUMBER': 'string',
 'OFFENSE_CODE': 'zip_code',
 'OFFENSE_CODE_GROUP': 'string',
 'OFFENSE_DESCRIPTION': 'string',
 'DISTRICT': 'string',
 'REPORTING_AREA': 'int',
 'SHOOTING': 'object',
 'OCCURRED_ON_DATE': 'string',
 'YEAR': 'int',
 'MONTH': 'int',
 'DAY_OF_WEEK': 'string',
 'HOUR': 'int',
 'UCR_PART': 'string',
 'STREET': 'string',
 'Lat': 'decimal',
 'Long': 'decimal',
 'Location': 'string'}

In [ ]:

In [34]:

df.ext.display(20)

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

Viewing 17 of 17 rows / 6 columns

1 partition(s)

customer_id 1 (object) not nullable	transactoin_date 2 (object) not nullable	ticket_price 3 (object) not nullable	discount 4 (object) not nullable	product 5 (object) not nullable	info 6 (uint8) not nullable
0f345kjh345oiuy345	2010/08/19	29.99	nan	platinum	1
0fju234978rfjkhsdf	2012/01/05	29.99	nan	platinum	2
0f34ruiy23e78y2r	2009/08/11	29.99	nan	platinum	3
0fue298y2r23r23r2	2010/08/19	29.99	nan	platinum	4
0f345kjh345oiuy346	2010/08/19	29.99	nan	platinum	5
0fju234978rfjkhsdf	2010/08/19	9.99	5%	basic	6
0f34ruiy23e78y2r	2010/08/19	9.99	nan	basic	7
0fue298y2r23r23r3	2010/08/19	9.99	nan	basic	8
0f345kjh345oiuy347	2010/08/19	9.99	nan	basic	9
0fju234978rfjkhsdf	2011/08/11	9.99	5%	basic	10
0f34ruiy23e78y2r	2015/08/09	9.99	5%	basic	10
0fue298y2r23r23r4	nan	9.91	nan	basic	10
0f345kjh345oiuy348	nan	9.99	nan	basic	10
0fju234978rfjkhsdf	nan	14.99	5%	deluxe	10
0f34ruiy23e78y2r	nan	14.99	nan	deluxe	10
0fue298y2r23r23r5	nan	14.99	5%	deluxe	10
0f345kjh345oiuy349	nan	14.99	nan	deluxe	10

Viewing 17 of 17 rows / 6 columns

1 partition(s) <class 'dask.dataframe.core.DataFrame'>

In [69]:

col_name ="customer_id"

print(df.cols.count_mismatch({col_name:"int"}))
print(df.cols.count_mismatch({col_name:"decimal"}))
print(df.cols.count_mismatch({col_name:"boolean"}))
print(df.cols.count_mismatch({col_name:"date"}))
print(df.cols.count_mismatch({col_name:"array"}))
print(df.cols.count_mismatch({col_name:"object"}))
print(df.cols.count_mismatch({col_name:"gender"}))
print(df.cols.count_mismatch({col_name:"ip"}))
print(df.cols.count_mismatch({col_name:"url"}))
print(df.cols.count_mismatch({col_name:"email"}))
print(df.cols.count_mismatch({col_name:"credit_card_number"}))
print(df.cols.count_mismatch({col_name:"zip_code"}))

{'customer_id': {'mismatch': 0, 'missing': 0, 'match': 17, 'profiler_dtype': 'string'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'int'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'decimal'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'boolean'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'date'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'array'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'object'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'gender'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'ip'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'url'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'email'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'credit_card_number'}}
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'zip_code'}}

In [65]:

_output = df.ext.profile(columns=col_name, infer=True, output="json", flush=True)
# _output = df.ext.set_buffer("*")
# _output = df.ext.set_buffer("*")
# _output = df.ext.buffer_window("*", 0, 17).ext.to_json("*")
print(_output)

cols_and_inferred_dtype {'customer_id': 'string'}
{"columns": {"customer_id": {"stats": {"mismatch": 0, "missing": 0, "match": 3, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object", "profiler_dtype": "string"}}, "name": null, "file_name": "tmpthwlzht1.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "uint8"], "total_count_dtypes": 2, "missing_count": 0, "p_missing": 0.0}}

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [56]:

cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
print(cols_and_inferred_dtype)
mismatch = df.cols.count_mismatch(cols_and_inferred_dtype, infer=True, compute=True)
print(mismatch)

{'customer_id': 'string', 'transactoin_date': 'string', 'ticket_price': 'decimal', 'discount': 'object', 'product': 'string', 'info': 'int'}
{'customer_id': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'transactoin_date': {'mismatch': 0, 'missing': 3, 'match': 0, 'profiler_dtype': 'string'}, 'ticket_price': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'decimal'}, 'discount': {'mismatch': 1, 'missing': 2, 'match': 0, 'profiler_dtype': 'object'}, 'product': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'info': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'int'}}

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [33]:

df = op.load.file("data/crime.csv").ext.cache()
df = df.ext.optimize().ext.cache()

In [36]:

df.cols.count_mismatch({"OFFENSE_DESCRIPTION":"string"})

Out[36]:

{'OFFENSE_DESCRIPTION': {'mismatch': 0,
  'missing': 0,
  'match': 319073,
  'profiler_dtype': 'string'}}

In [35]:

df.ext.display()

Viewing 10 of 319073 rows / 17 columns

1 partition(s)

INCIDENT_NUMBER 1 (object) not nullable	OFFENSE_CODE 2 (uint16) not nullable	OFFENSE_CODE_GROUP 3 (object) not nullable	OFFENSE_DESCRIPTION 4 (object) not nullable	DISTRICT 5 (object) not nullable	REPORTING_AREA 6 (object) not nullable	SHOOTING 7 (object) not nullable	OCCURRED_ON_DATE 8 (object) not nullable	YEAR 9 (uint16) not nullable	MONTH 10 (uint8) not nullable	DAY_OF_WEEK 11 (object) not nullable	HOUR 12 (uint8) not nullable	UCR_PART 13 (object) not nullable	STREET 14 (object) not nullable	Lat 15 (object) not nullable	Long 16 (object) not nullable	Location 17 (object) not nullable
I182070945	619	Larceny	LARCENY⋅ALL⋅OTHERS	D14	808	nan	2018-09-02⋅13:00:00	2018	9	Sunday	13	Part⋅One	LINCOLN⋅ST	42.35779134	-71.13937053	(42.35779134,⋅-71.13937053)
I182070943	1402	Vandalism	VANDALISM	C11	347	nan	2018-08-21⋅00:00:00	2018	8	Tuesday	0	Part⋅Two	HECLA⋅ST	42.30682138	-71.06030035	(42.30682138,⋅-71.06030035)
I182070941	3410	Towed	TOWED⋅MOTOR⋅VEHICLE	D4	151	nan	2018-09-03⋅19:27:00	2018	9	Monday	19	Part⋅Three	CAZENOVE⋅ST	42.34658879	-71.07242943	(42.34658879,⋅-71.07242943)
I182070940	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	D4	272	nan	2018-09-03⋅21:16:00	2018	9	Monday	21	Part⋅Three	NEWCOMB⋅ST	42.33418175	-71.07866441	(42.33418175,⋅-71.07866441)
I182070938	3114	Investigate⋅Property	INVESTIGATE⋅PROPERTY	B3	421	nan	2018-09-03⋅21:05:00	2018	9	Monday	21	Part⋅Three	DELHI⋅ST	42.27536542	-71.09036101	(42.27536542,⋅-71.09036101)
I182070936	3820	Motor⋅Vehicle⋅Accident⋅Response	M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY	C11	398	nan	2018-09-03⋅21:09:00	2018	9	Monday	21	Part⋅Three	TALBOT⋅AVE	42.29019621	-71.07159012	(42.29019621,⋅-71.07159012)
I182070933	724	Auto⋅Theft	AUTO⋅THEFT	B2	330	nan	2018-09-03⋅21:25:00	2018	9	Monday	21	Part⋅One	NORMANDY⋅ST	42.30607218	-71.08273260	(42.30607218,⋅-71.08273260)
I182070932	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	B2	584	nan	2018-09-03⋅20:39:37	2018	9	Monday	20	Part⋅Three	LAWN⋅ST	42.32701648	-71.10555088	(42.32701648,⋅-71.10555088)
I182070931	301	Robbery	ROBBERY⋅-⋅STREET	C6	177	nan	2018-09-03⋅20:48:00	2018	9	Monday	20	Part⋅One	MASSACHUSETTS⋅AVE	42.33152148	-71.07085307	(42.33152148,⋅-71.07085307)
I182070929	3301	Verbal⋅Disputes	VERBAL⋅DISPUTE	C11	364	nan	2018-09-03⋅20:38:00	2018	9	Monday	20	Part⋅Three	LESLIE⋅ST	42.29514664	-71.05860832	(42.29514664,⋅-71.05860832)

Viewing 10 of 319073 rows / 17 columns

1 partition(s) <class 'dask.dataframe.core.DataFrame'>

In [99]:

import functools
import dask
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({
    'x': range(0, 100),
    'y': range(0, 100),
    'z': range(0, 100)
})

ddf = dd.from_pandas(pdf, npartitions=8)

print('Number of partitions', ddf.npartitions)


def compute_stats(row):
    return {
        'sum': row['x'] + row['y'] + row['z'],
        'min': min(row),
        'max': max(row)
    }


def accum_stats(stats_accum, stats):
    return {
        'sum': stats_accum['sum'] + stats['sum'],
        'min': min(stats_accum['min'], stats['min']),
        'max': max(stats_accum['max'], stats['max'])
    }


def compute_stats_partition(pdf):
    pds = pdf.apply(compute_stats, axis=1)
    print(pdf)
    print("99999999999999999999999999999999999999999999999")
    return pds
    return functools.reduce(accum_stats, pds)


def merge_stats_series(pds):
    print(pds)
    print("--")
    return pds
    return functools.reduce(accum_stats, pds)


res = ddf.reduction(
    compute_stats_partition,
    merge_stats_series,
    meta={
        'sum': 'int64',
        'min': 'int64',
        'max': 'int64'
    })

# singleton dataframe to list of delayed objects
# where each row is a delayed object
# and in this case we just want the first one
delayed_dict = res.to_delayed()[0]
a = dd.compute(delayed_dict)

Number of partitions 8
     x   y   z
13  13  13  13
14  14  14  14
15  15  15  15
16  16  16  16
17  17  17  17
18  18  18  18
19  19  19  19
20  20  20  20
21  21  21  21
22  22  22  22
23  23  23  23
24  24  24  24
25  25  25  25     x   y   z
26  26  26  26
27  27  27  27
28  28  28  28
29  29  29  29
30  30  30  30
31  31  31  31
32  32  32  32
33  33  33  33
34  34  34  34
35  35  35  35
36  36  36  36
37  37  37  37
38  38  38  38
99999999999999999999999999999999999999999999999
     x   y   z
39  39  39  39
40  40  40  40
41  41  41  41
42  42  42  42
43  43  43  43
44  44  44  44
45  45  45  45
46  46  46  46
47  47  47  47
48  48  48  48
49  49  49  49
50  50  50  50
51  51  51  51
99999999999999999999999999999999999999999999999
     x   y   z
0    0   0   0
1    1   1   1
2    2   2   2
3    3   3   3
4    4   4   4
5    5   5   5
6    6   6   6
7    7   7   7
8    8   8   8
9    9   9   9
10  10  10  10
11  11  11  11
12  12  12  12
99999999999999999999999999999999999999999999999

99999999999999999999999999999999999999999999999     x   y   z
52  52  52  52
53  53  53  53
54  54  54  54
55  55  55  55
56  56  56  56
57  57  57  57
58  58  58  58
59  59  59  59
60  60  60  60
61  61  61  61
62  62  62  62
63  63  63  63
64  64  64  64
99999999999999999999999999999999999999999999999
     x   y   z
91  91  91  91
92  92  92  92
93  93  93  93
94  94  94  94
95  95  95  95
96  96  96  96
97  97  97  97
98  98  98  98
99  99  99  99
99999999999999999999999999999999999999999999999

     x   y   z
65  65  65  65
66  66  66  66
67  67  67  67
68  68  68  68
69  69  69  69
70  70  70  70
71  71  71  71
72  72  72  72
73  73  73  73
74  74  74  74
75  75  75  75
76  76  76  76
77  77  77  77
99999999999999999999999999999999999999999999999
     x   y   z
78  78  78  78
79  79  79  79
80  80  80  80
81  81  81  81
82  82  82  82
83  83  83  83
84  84  84  84
85  85  85  85
86  86  86  86
87  87  87  87
88  88  88  88
89  89  89  89
90  90  90  90
99999999999999999999999999999999999999999999999
                               0                               1   \
0  {'sum': 0, 'min': 0, 'max': 0}  {'sum': 3, 'min': 1, 'max': 1}   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   

                               2                               3   \
0  {'sum': 6, 'min': 2, 'max': 2}  {'sum': 9, 'min': 3, 'max': 3}   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   
0                             NaN                             NaN   

                                4                                5   \
0  {'sum': 12, 'min': 4, 'max': 4}  {'sum': 15, 'min': 5, 'max': 5}   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   

                                6                                7   \
0  {'sum': 18, 'min': 6, 'max': 6}  {'sum': 21, 'min': 7, 'max': 7}   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   
0                              NaN                              NaN   

                                8                                9   ...  \
0  {'sum': 24, 'min': 8, 'max': 8}  {'sum': 27, 'min': 9, 'max': 9}  ...   
0                              NaN                              NaN  ...   
0                              NaN                              NaN  ...   
0                              NaN                              NaN  ...   
0                              NaN                              NaN  ...   
0                              NaN                              NaN  ...   
0                              NaN                              NaN  ...   
0                              NaN                              NaN  ...   

                                   90                                  91  \
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0  {'sum': 270, 'min': 90, 'max': 90}                                 NaN   
0                                 NaN  {'sum': 273, 'min': 91, 'max': 91}   

                                   92                                  93  \
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0  {'sum': 276, 'min': 92, 'max': 92}  {'sum': 279, 'min': 93, 'max': 93}   

                                   94                                  95  \
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0  {'sum': 282, 'min': 94, 'max': 94}  {'sum': 285, 'min': 95, 'max': 95}   

                                   96                                  97  \
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0                                 NaN                                 NaN   
0  {'sum': 288, 'min': 96, 'max': 96}  {'sum': 291, 'min': 97, 'max': 97}   

                                   98                                  99  
0                                 NaN                                 NaN  
0                                 NaN                                 NaN  
0                                 NaN                                 NaN  
0                                 NaN                                 NaN  
0                                 NaN                                 NaN  
0                                 NaN                                 NaN  
0                                 NaN                                 NaN  
0  {'sum': 294, 'min': 98, 'max': 98}  {'sum': 297, 'min': 99, 'max': 99}  

[8 rows x 100 columns]
--

In [361]:

df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590447280612.csv").ext.cache()
df = df.ext.optimize()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.profile(columns="*", infer=True, output="json")

C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`.
  warnings.warn(msg.format(n, len(r)))

In [362]:

print(_output)

{"columns": {"customer_id": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}}, "transactoin_date": {"stats": {"mismatch": 0, "missing": 6, "match": 11, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 6, "match": 11, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}}, "ticket_price": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "9.99", "count": 7}, {"value": "29.99", "count": 5}, {"value": "14.99", "count": 4}, {"value": "9.91", "count": 1}], "count_uniques": 4}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "9.99", "count": 7}, {"value": "29.99", "count": 5}, {"value": "14.99", "count": 4}, {"value": "9.91", "count": 1}], "count_uniques": 4}}, "discount": {"stats": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}, "dtype": "object", "profiler_dtype": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}}, "product": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}}, "info": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "uint8", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}}}, "name": null, "file_name": "tmp_vk79fcm.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "uint8"], "total_count_dtypes": 2, "missing_count": 0, "p_missing": 0.0}}

In [22]:

##Bag
# each element is an integer
import dask.bag as db
b = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], npartitions=1)
b.take(3)

Out[22]:

(1, 2, 3)

In [23]:

%%time
b.to_dataframe().compute()

Wall time: 850 ms

Out[23]:

	0
0	1
1	2
2	3
3	4
4	5
5	6
6	7
7	8
8	9
9	10

In [50]:

from optimus.helpers.functions import match_date

In [78]:

import re
pattern = match_date("dd MM yy h")
string = "11 05 1980 2"
prog = re.compile(pattern)
result = prog.match(string)
print(bool(result))

['dd', ' ', 'MM', ' ', 'yy', ' ', 'h']
False