%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
# import sys,os,os.path
# os.environ
def xray(df, col_name):
for i in df[col_name]:
print(i, type(i))
op = Optimus("dask", threads_per_worker=8, n_workers=1, comm=True)
preview_df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv", n_rows=35).ext.cache()
C:\Users\argenisleon\Anaconda3\lib\site-packages\distributed\dashboard\core.py:79: UserWarning: Port 8787 is already in use. Perhaps you already have a cluster running? Hosting the diagnostics dashboard on a random port instead. warnings.warn("\n" + msg) C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 35 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
import fastnumbers
fastnumbers.fast_float(None)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-29-8e1775e35bdf> in <module> 1 import fastnumbers ----> 2 fastnumbers.fast_float(None) TypeError: float() argument must be a string or a number, not 'NoneType'
f = df.cols.set(value='df["ticket_price"]*df["discount"]', where='df["ticket_price"]!=None', output_cols="ticket_price").ext.cache()
_output = df.ext.profile(columns="*", output="json")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
_output = {**preview_df.ext.to_json("*"), "meta": preview_df.meta.get() if (preview_df.meta and preview_df.meta.get) else {} }
_output = preview_df.ext.profile(columns="*", output="json")
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv").ext.cache()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.profile(columns="*", output="json")
_output = df.ext.set_buffer("*")
_output = df.ext.buffer_window("*", 0, 17).ext.to_json("*")
print(_output)
_output = df.ext.set_buffer("*")
_output = df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]', where='df["ticket_price"]!=None', output_cols="new ticket_price").ext.to_json("*")
_df_profile = df.ext.buffer_window("*").cols.set(value='df["ticket_price"]', where='df["ticket_price"]!=None', output_cols="new ticket_price")
_output = { "profile": _df_profile.ext.profile(["new ticket_price"], output="json")}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'sample': {'columns': [{'title': 'customer_id'}, {'title': 'transactoin_date'}, {'title': 'ticket_price'}, {'title': 'discount'}, {'title': 'product'}, {'title': 'info'}], 'value': [['0f345kjh345oiuy345', '2010/08/19', '29.99', nan, 'platinum', '1'], ['0fju234978rfjkhsdf', '2012/01/05', '29.99', nan, 'platinum', '2'], ['0f34ruiy23e78y2r', '2009/08/11', '29.99', nan, 'platinum', '3'], ['0fue298y2r23r23r2', '2010/08/19', '29.99', nan, 'platinum', '4'], ['0f345kjh345oiuy346', '2010/08/19', '29.99', nan, 'platinum', '5'], ['0fju234978rfjkhsdf', '2010/08/19', '9.99', '5%', 'basic', '6'], ['0f34ruiy23e78y2r', '2010/08/19', '9.99', nan, 'basic', '7'], ['0fue298y2r23r23r3', '2010/08/19', '9.99', nan, 'basic', '8'], ['0f345kjh345oiuy347', '2010/08/19', '9.99', nan, 'basic', '9'], ['0fju234978rfjkhsdf', '2011/08/11', '9.99', '5%', 'basic', '10'], ['0f34ruiy23e78y2r', '2015/08/09', '9.99', '5%', 'basic', '10'], ['0fue298y2r23r23r4', nan, '9.91', nan, 'basic', '10'], ['0f345kjh345oiuy348', nan, '9.99', nan, 'basic', '10'], ['0fju234978rfjkhsdf', nan, '14.99', '5%', 'deluxe', '10'], ['0f34ruiy23e78y2r', nan, '14.99', nan, 'deluxe', '10'], ['0fue298y2r23r23r5', nan, '14.99', '5%', 'deluxe', '10'], ['0f345kjh345oiuy349', nan, '14.99', nan, 'deluxe', 'a']]}} pdf ticket_price 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 a 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 Name: new ticket_price, dtype: float64 pdf ticket_price 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 a 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 Name: new ticket_price, dtype: float64 dtype int {'new ticket_price': {'match': 17, 'missing': 0, 'mismatch': 17}}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r))) C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df.compute().ext.head()
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5% | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5% | basic | 10 |
_output = df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]*df["discount"]', where='df["ticket_price"]!=None', output_cols="new ticket_price").ext.to_json("*")
pdf ticket_price 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 a 0 899.4001 1 899.4001 2 899.4001 3 899.4001 4 899.4001 5 99.8001 6 99.8001 7 99.8001 8 99.8001 9 99.8001 10 99.8001 11 98.2081 12 99.8001 13 224.7001 14 224.7001 15 224.7001 16 224.7001 Name: new ticket_price, dtype: float64
df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]*df["ticket_price"]',output_cols="new 1")
from optimus import Optimus
op = Optimus("dask", n_workers=1, threads_per_worker=8, processes=False, memory_limit="3G", comm=True)
%%time
df = op.load.file("data/dataset-transactions.csv").ext.cache()
# df = df.ext.optimize()
df = df.ext.repartition(1).ext.cache()
df.ext.display(20)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (object)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
0f34ruiy23e78y2r
|
2015/08/09
|
9.99
|
5%
|
basic
|
10
|
0fue298y2r23r23r4
|
nan
|
9.91
|
nan
|
basic
|
10
|
0f345kjh345oiuy348
|
nan
|
9.99
|
nan
|
basic
|
10
|
0fju234978rfjkhsdf
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f34ruiy23e78y2r
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
0fue298y2r23r23r5
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f345kjh345oiuy349
|
nan
|
14.99
|
nan
|
deluxe
|
a
|
Wall time: 163 ms
# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
# df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()
# # result = df.ext.profile(columns=columns, bins=bins, output=output, flush=flush, size=size)
# print(cols_and_inferred_dtype)
# df.cols.infer_profiler_dtypes("*")
p = df.ext.profile("*", flush=True)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
AAAA {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\r\n', 'quotechar': '"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 16.185, 'count': 0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}} BBBB {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\r\n', 'quotechar': '"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {'actions': [{'profiler_dtype': 'customer_id'}, {'profiler_dtype': 'transactoin_date'}, {'profiler_dtype': 'ticket_price'}, {'profiler_dtype': 'discount'}, {'profiler_dtype': 'product'}, {'profiler_dtype': 'info'}]}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object', 'profiler_dtype': 'date'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 16.185, 'count': 0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object', 'profiler_dtype': 'decimal'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object', 'profiler_dtype': 'int'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}}
df.meta.get()["profile"]["columns"]["customer_id"]
{'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}
# df, p = df.ext.cast_and_profile("*")
assert(p["columns"]["ticket_price"]["stats"]["match"]==17)
assert(p["columns"]["ticket_price"]["stats"]["mismatch"]==0)
assert(p["columns"]["ticket_price"]["stats"]["missing"]==0)
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'string', 'product': 'string', 'info': 'int'}
df = df.cols.replace("discount", search=["%"], replace_by="", search_by="chars", ignore_case=True)
df.compute()
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5 | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5 | basic | 10 |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 9.99 | 5 | basic | 10 |
11 | 0fue298y2r23r23r4 | NaN | 9.91 | NaN | basic | 10 |
12 | 0f345kjh345oiuy348 | NaN | 9.99 | NaN | basic | 10 |
13 | 0fju234978rfjkhsdf | NaN | 14.99 | 5 | deluxe | 10 |
14 | 0f34ruiy23e78y2r | NaN | 14.99 | NaN | deluxe | 10 |
15 | 0fue298y2r23r23r5 | NaN | 14.99 | 5 | deluxe | 10 |
16 | 0f345kjh345oiuy349 | NaN | 14.99 | NaN | deluxe | a |
# df.cols.infer_profiler_dtypes("*")
from optimus.infer import Infer
df.ext.head("*",50).applymap(Infer.parse_pandas)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 50 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | string | date | decimal | null | string | int |
1 | string | date | decimal | null | string | int |
2 | string | date | decimal | null | string | int |
3 | string | date | decimal | null | string | int |
4 | string | date | decimal | null | string | int |
5 | string | date | decimal | int | string | int |
6 | string | date | decimal | null | string | int |
7 | string | date | decimal | null | string | int |
8 | string | date | decimal | null | string | int |
9 | string | date | decimal | int | string | int |
10 | string | date | decimal | int | string | int |
11 | string | null | decimal | null | string | int |
12 | string | null | decimal | null | string | int |
13 | string | null | decimal | int | string | int |
14 | string | null | decimal | null | string | int |
15 | string | null | decimal | int | string | int |
16 | string | null | decimal | null | string | string |
xray(df, "discount")
nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> 5 <class 'str'> nan <class 'float'>
p = df.ext.profile("discount")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'int', 'product': 'string', 'info': 'int'}
# df["transactoin_date"].compute()
df.compute()
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5 | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5 | basic | 10 |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 9.99 | 5 | basic | 10 |
11 | 0fue298y2r23r23r4 | NaN | 9.91 | NaN | basic | 10 |
12 | 0f345kjh345oiuy348 | NaN | 9.99 | NaN | basic | 10 |
13 | 0fju234978rfjkhsdf | NaN | 14.99 | 5 | deluxe | 10 |
14 | 0f34ruiy23e78y2r | NaN | 14.99 | NaN | deluxe | 10 |
15 | 0fue298y2r23r23r5 | NaN | 14.99 | 5 | deluxe | 10 |
16 | 0f345kjh345oiuy349 | NaN | 14.99 | NaN | deluxe | a |
xray(df, "discount")
nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> 5 <class 'str'> nan <class 'float'>
df = df.cols.unnest("transactoin_date", separator="/", splits=2, output_cols="transactoin_date").ext.cache()
_output = df.ext.profile(columns="*", output="json", flush=True)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df.meta.get()["profile"]["columns"]["customer_id"]
{'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'customer_id': 'string', 'transactoin_date': 'date', 'transactoin_date_0': 'int', 'transactoin_date_1': 'int', 'ticket_price': 'decimal', 'discount': 'int', 'product': 'string', 'info': 'int'}
df.compute()
customer_id | transactoin_date | transactoin_date_0 | transactoin_date_1 | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 2010 | 08 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 2012 | 01 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 2009 | 08 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 2010 | 08 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 2010 | 08 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 2010 | 08 | 9.99 | 5 | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 2010 | 08 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 2010 | 08 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 2010 | 08 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 2011 | 08 | 9.99 | 5 | basic | 10 |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 2015 | 08 | 9.99 | 5 | basic | 10 |
11 | 0fue298y2r23r23r4 | NaN | nan | None | 9.91 | NaN | basic | 10 |
12 | 0f345kjh345oiuy348 | NaN | nan | None | 9.99 | NaN | basic | 10 |
13 | 0fju234978rfjkhsdf | NaN | nan | None | 14.99 | 5 | deluxe | 10 |
14 | 0f34ruiy23e78y2r | NaN | nan | None | 14.99 | NaN | deluxe | 10 |
15 | 0fue298y2r23r23r5 | NaN | nan | None | 14.99 | 5 | deluxe | 10 |
16 | 0f345kjh345oiuy349 | NaN | nan | None | 14.99 | NaN | deluxe | a |
df.meta.get()["profile"]["columns"]["customer_id"]
{'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}
df1 = df.cols.set(value='df["ticket_price"]*df["discount"]*10', output_cols="new ticket_price")
cols df["ticket_price"]*df["discount"]*10 columns ['ticket_price', 'discount'] cols None
df.cols.names()
['customer_id', 'transactoin_date', 'ticket_price', 'discount', 'product', 'info']
df1.compute()
customer_id | transactoin_date | ticket_price | discount | product | info | new ticket_price | |
---|---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 | NaN |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 | NaN |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 | NaN |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 | NaN |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 | NaN |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5% | basic | 6 | NaN |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 | NaN |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 | NaN |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 | NaN |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5% | basic | 10 | NaN |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 9.99 | 5% | basic | 10 | NaN |
11 | 0fue298y2r23r23r4 | NaN | 9.91 | NaN | basic | 10 | NaN |
12 | 0f345kjh345oiuy348 | NaN | 9.99 | NaN | basic | 10 | NaN |
13 | 0fju234978rfjkhsdf | NaN | 14.99 | 5% | deluxe | 10 | NaN |
14 | 0f34ruiy23e78y2r | NaN | 14.99 | NaN | deluxe | 10 | NaN |
15 | 0fue298y2r23r23r5 | NaN | 14.99 | 5% | deluxe | 10 | NaN |
16 | 0f345kjh345oiuy349 | NaN | 14.99 | NaN | deluxe | a | NaN |
df.cols.set(value='mask["ticket_price"]+mask["transactoin_date"]', where='df["ticket_price"]!=None', output_cols="new ticket_price_2").compute()
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590599684769.csv").ext.cache()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.cast_and_profile(columns="*", output="json")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
dtype string dtype date dtype decimal dtype string dtype string dtype int dtypedtype string dtype date dtype decimal dtype string dtype dtype string dtype string string dtype int dtype string dtype datedtype string dtype decimal dtype string dtype string date dtype decimal dtype string dtype date dtype decimaldtype string dtype datedtype dtypedtypedtype date dtype decimal dtype string dtypedtype date dtype decimal dtype string dtype decimal dtype string int stringdtypestring string dtype string dtype date dtype int string dtype int dtypedtypedtype string dtype string dtype int dtype decimal dtype string dtype string dtype int int string dtype int
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
unmasked = "DD/MM/YY"
masked = unmasked[1:4]
masked
'ABB'
print(_output)
{"columns": {"customer_id": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object"}, "transactoin_date": {"profiler_dtype": "date", "stats": {"mismatch": 11, "missing": 6, "match": 0, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}, "dtype": "object"}, "ticket_price": {"profiler_dtype": "decimal", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 9.91, "upper": 10.5375, "count": 0}, {"lower": 10.5375, "upper": 11.165, "count": 0}, {"lower": 11.165, "upper": 11.7925, "count": 0}, {"lower": 11.7925, "upper": 12.42, "count": 0}, {"lower": 12.42, "upper": 13.0475, "count": 0}, {"lower": 13.0475, "upper": 13.675, "count": 0}, {"lower": 13.675, "upper": 14.3025, "count": 0}, {"lower": 14.3025, "upper": 14.93, "count": 0}, {"lower": 14.93, "upper": 15.5575, "count": 3}, {"lower": 15.5575, "upper": 16.185, "count": 0}, {"lower": 16.185, "upper": 16.8125, "count": 0}, {"lower": 16.8125, "upper": 17.439999999999998, "count": 0}, {"lower": 17.439999999999998, "upper": 18.0675, "count": 0}, {"lower": 18.0675, "upper": 18.695, "count": 0}, {"lower": 18.695, "upper": 19.322499999999998, "count": 0}, {"lower": 19.322499999999998, "upper": 19.95, "count": 0}, {"lower": 19.95, "upper": 20.5775, "count": 0}, {"lower": 20.5775, "upper": 21.205, "count": 0}, {"lower": 21.205, "upper": 21.8325, "count": 0}, {"lower": 21.8325, "upper": 22.46, "count": 0}, {"lower": 22.46, "upper": 23.0875, "count": 0}, {"lower": 23.0875, "upper": 23.715, "count": 0}, {"lower": 23.715, "upper": 24.3425, "count": 0}, {"lower": 24.3425, "upper": 24.97, "count": 0}, {"lower": 24.97, "upper": 25.597499999999997, "count": 0}, {"lower": 25.597499999999997, "upper": 26.224999999999998, "count": 0}, {"lower": 26.224999999999998, "upper": 26.8525, "count": 0}, {"lower": 26.8525, "upper": 27.48, "count": 0}, {"lower": 27.48, "upper": 28.107499999999998, "count": 0}, {"lower": 28.107499999999998, "upper": 28.735, "count": 0}, {"lower": 28.735, "upper": 29.362499999999997, "count": 0}, {"lower": 29.362499999999997, "upper": 29.99, "count": 0}], "count_uniques": 4}, "dtype": "object"}, "discount": {"profiler_dtype": "object", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "nan", "count": 12}, {"value": "5%", "count": 5}], "count_uniques": 2}, "dtype": "object"}, "product": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object"}, "info": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "object"}}, "name": null, "file_name": "tmpk28b8e9o.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object"], "total_count_dtypes": 1, "missing_count": 0, "p_missing": 0.0}}
# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
# print(cols_and_inferred_dtype)
# df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()
df.compute()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
%%time
df.cols.cast(columns = {"INCIDENT_NUMBER":"int"}).compute()
Wall time: 589 ms
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | NaN | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | NaN | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | NaN | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | NaN | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | NaN | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | NaN | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | NaN | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | NaN | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
%%time
df.ext.cast_and_profile(columns="*", flush=True, output="json")
Wall time: 9.43 s
'{"columns": {"INCIDENT_NUMBER": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "I162030584", "count": 13}, {"value": "I152080623", "count": 11}, {"value": "I172013170", "count": 10}, {"value": "I182065208", "count": 10}, {"value": "I172096394", "count": 10}, {"value": "I162071327", "count": 9}, {"value": "I162001871", "count": 9}, {"value": "I172056883", "count": 9}, {"value": "I172022524", "count": 9}, {"value": "I172054429", "count": 9}, {"value": "I162098170", "count": 9}, {"value": "I162078338", "count": 8}, {"value": "I162074826", "count": 8}, {"value": "I162090278", "count": 8}, {"value": "I172069723", "count": 8}, {"value": "I162022140", "count": 8}, {"value": "I130041200-00", "count": 8}, {"value": "I162087224", "count": 8}, {"value": "I152076465", "count": 8}, {"value": "I162056703", "count": 8}, {"value": "I162064331", "count": 8}, {"value": "I162082917", "count": 8}, {"value": "I152105431", "count": 8}, {"value": "I172053616", "count": 8}, {"value": "I152101399", "count": 7}, {"value": "I152095733", "count": 7}, {"value": "I162054378", "count": 7}, {"value": "I152067057", "count": 7}, {"value": "I152096998", "count": 7}, {"value": "I172018004", "count": 7}, {"value": "I162083089", "count": 7}, {"value": "I162045680", "count": 7}, {"value": "I152071480", "count": 7}], "count_uniques": 282517}, "dtype": "object"}, "OFFENSE_CODE": {"profiler_dtype": "zip_code", "stats": {"mismatch": 251, "missing": 0, "match": 318822, "frequency": [{"value": "03006", "count": 18783}, {"value": "03115", "count": 18746}, {"value": "03831", "count": 16323}, {"value": "01402", "count": 15152}, {"value": "00802", "count": 14791}, {"value": "03301", "count": 13099}, {"value": "03410", "count": 11287}, {"value": "03114", "count": 11123}, {"value": "00617", "count": 9069}, {"value": "02647", "count": 9039}, {"value": "00614", "count": 8893}, {"value": "03201", "count": 8892}, {"value": "03125", "count": 8389}, {"value": "00613", "count": 7949}, {"value": "03802", "count": 6557}, {"value": "00619", "count": 5963}, {"value": "03803", "count": 5131}, {"value": "00413", "count": 4886}, {"value": "01102", "count": 4413}, {"value": "03502", "count": 4365}, {"value": "02629", "count": 4007}, {"value": "03501", "count": 3766}, {"value": "03207", "count": 3698}, {"value": "00724", "count": 3629}, {"value": "02610", "count": 3245}, {"value": "01106", "count": 3147}, {"value": "00301", "count": 3056}, {"value": "03801", "count": 2925}, {"value": "00423", "count": 2910}, {"value": "02900", "count": 2894}, {"value": "02907", "count": 2616}, {"value": "00520", "count": 2585}, {"value": "01849", "count": 2584}], "count_uniques": 263}, "dtype": "object"}, "OFFENSE_CODE_GROUP": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "Motor Vehicle Accident Response", "count": 37132}, {"value": "Larceny", "count": 25935}, {"value": "Medical Assistance", "count": 23540}, {"value": "Investigate Person", "count": 18750}, {"value": "Other", "count": 18075}, {"value": "Drug Violation", "count": 16548}, {"value": "Simple Assault", "count": 15826}, {"value": "Vandalism", "count": 15415}, {"value": "Verbal Disputes", "count": 13099}, {"value": "Towed", "count": 11287}, {"value": "Investigate Property", "count": 11124}, {"value": "Larceny From Motor Vehicle", "count": 10847}, {"value": "Property Lost", "count": 9751}, {"value": "Warrant Arrests", "count": 8407}, {"value": "Aggravated Assault", "count": 7807}, {"value": "Violations", "count": 6095}, {"value": "Fraud", "count": 5829}, {"value": "Residential Burglary", "count": 5606}, {"value": "Missing Person Located", "count": 4958}, {"value": "Auto Theft", "count": 4851}, {"value": "Robbery", "count": 4624}, {"value": "Harassment", "count": 4007}, {"value": "Property Found", "count": 3925}, {"value": "Missing Person Reported", "count": 3797}, {"value": "Confidence Games", "count": 3147}, {"value": "Police Service Incidents", "count": 2781}, {"value": "Disorderly Conduct", "count": 2611}, {"value": "Fire Related Reports", "count": 1920}, {"value": "Firearm Violations", "count": 1777}, {"value": "License Violation", "count": 1701}, {"value": "Restraining Order Violations", "count": 1607}, {"value": "Recovered Stolen Property", "count": 1455}, {"value": "Counterfeiting", "count": 1454}], "count_uniques": 67}, "dtype": "object"}, "OFFENSE_DESCRIPTION": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "SICK/INJURED/MEDICAL - PERSON", "count": 18783}, {"value": "INVESTIGATE PERSON", "count": 18754}, {"value": "M/V - LEAVING SCENE - PROPERTY DAMAGE", "count": 16323}, {"value": "VANDALISM", "count": 15154}, {"value": "ASSAULT SIMPLE - BATTERY", "count": 14791}, {"value": "VERBAL DISPUTE", "count": 13099}, {"value": "TOWED MOTOR VEHICLE", "count": 11287}, {"value": "INVESTIGATE PROPERTY", "count": 11124}, {"value": "LARCENY THEFT FROM BUILDING", "count": 9069}, {"value": "THREATS TO DO BODILY HARM", "count": 9042}, {"value": "LARCENY THEFT FROM MV - NON-ACCESSORY", "count": 8893}, {"value": "PROPERTY - LOST", "count": 8893}, {"value": "WARRANT ARREST", "count": 8407}, {"value": "LARCENY SHOPLIFTING", "count": 7949}, {"value": "M/V ACCIDENT - PROPERTY \\u00a0DAMAGE", "count": 6557}, {"value": "LARCENY ALL OTHERS", "count": 5963}, {"value": "M/V ACCIDENT - PERSONAL INJURY", "count": 5131}, {"value": "ASSAULT - AGGRAVATED - BATTERY", "count": 4886}, {"value": "FRAUD - FALSE PRETENSE / SCHEME", "count": 4413}, {"value": "MISSING PERSON - LOCATED", "count": 4365}, {"value": "HARASSMENT", "count": 4007}, {"value": "MISSING PERSON", "count": 3766}, {"value": "PROPERTY - FOUND", "count": 3698}, {"value": "AUTO THEFT", "count": 3630}, {"value": "TRESPASSING", "count": 3254}, {"value": "FRAUD - CREDIT CARD / ATM FRAUD", "count": 3147}, {"value": "ROBBERY - STREET", "count": 3056}, {"value": "M/V ACCIDENT - OTHER", "count": 2925}, {"value": "ASSAULT - AGGRAVATED", "count": 2910}, {"value": "VAL - VIOLATION OF AUTO LAW - OTHER", "count": 2894}, {"value": "VAL - OPERATING AFTER REV/SUSP.", "count": 2618}, {"value": "DRUGS - POSS CLASS B - COCAINE, ETC.", "count": 2591}, {"value": "BURGLARY - RESIDENTIAL - FORCE", "count": 2585}], "count_uniques": 244}, "dtype": "object"}, "DISTRICT": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 1765, "match": 317308, "frequency": [{"value": "B2", "count": 49945}, {"value": "C11", "count": 42530}, {"value": "D4", "count": 41915}, {"value": "A1", "count": 35717}, {"value": "B3", "count": 35442}, {"value": "C6", "count": 23460}, {"value": "D14", "count": 20127}, {"value": "E13", "count": 17536}, {"value": "E18", "count": 17348}, {"value": "A7", "count": 13544}, {"value": "E5", "count": 13239}, {"value": "A15", "count": 6505}], "count_uniques": 12}, "dtype": "object"}, "REPORTING_AREA": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 20250, "match": 298823, "hist": [{"lower": 0.0, "upper": 30.0625, "count": 1502}, {"lower": 30.0625, "upper": 60.125, "count": 740}, {"lower": 60.125, "upper": 90.1875, "count": 1376}, {"lower": 90.1875, "upper": 120.25, "count": 2618}, {"lower": 120.25, "upper": 150.3125, "count": 2175}, {"lower": 150.3125, "upper": 180.375, "count": 2034}, {"lower": 180.375, "upper": 210.4375, "count": 1349}, {"lower": 210.4375, "upper": 240.5, "count": 1116}, {"lower": 240.5, "upper": 270.5625, "count": 1501}, {"lower": 270.5625, "upper": 300.625, "count": 1968}, {"lower": 300.625, "upper": 330.6875, "count": 2684}, {"lower": 330.6875, "upper": 360.75, "count": 2336}, {"lower": 360.75, "upper": 390.8125, "count": 939}, {"lower": 390.8125, "upper": 420.875, "count": 1051}, {"lower": 420.875, "upper": 450.9375, "count": 2021}, {"lower": 450.9375, "upper": 481.0, "count": 1848}, {"lower": 481.0, "upper": 511.0625, "count": 996}, {"lower": 511.0625, "upper": 541.125, "count": 804}, {"lower": 541.125, "upper": 571.1875, "count": 897}, {"lower": 571.1875, "upper": 601.25, "count": 1146}, {"lower": 601.25, "upper": 631.3125, "count": 1610}, {"lower": 631.3125, "upper": 661.375, "count": 537}, {"lower": 661.375, "upper": 691.4375, "count": 423}, {"lower": 691.4375, "upper": 721.5, "count": 277}, {"lower": 721.5, "upper": 751.5625, "count": 337}, {"lower": 751.5625, "upper": 781.625, "count": 678}, {"lower": 781.625, "upper": 811.6875, "count": 1466}, {"lower": 811.6875, "upper": 841.75, "count": 455}, {"lower": 841.75, "upper": 871.8125, "count": 0}, {"lower": 871.8125, "upper": 901.875, "count": 245}, {"lower": 901.875, "upper": 931.9375, "count": 1546}, {"lower": 931.9375, "upper": 962.0, "count": 1210}], "count_uniques": 878}, "dtype": "object"}, "SHOOTING": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "nan", "count": 318054}, {"value": "Y", "count": 1019}], "count_uniques": 2}, "dtype": "object"}, "OCCURRED_ON_DATE": {"profiler_dtype": "date", "stats": {"mismatch": 319073, "missing": 0, "match": 0, "frequency": [{"value": "2017-06-01 00:00:00+00:00", "count": 29}, {"value": "2015-07-01 00:00:00+00:00", "count": 27}, {"value": "2016-08-01 00:00:00+00:00", "count": 27}, {"value": "2015-06-18 05:00:00+00:00", "count": 22}, {"value": "2017-08-01 00:00:00+00:00", "count": 22}, {"value": "2017-01-01 00:00:00+00:00", "count": 21}, {"value": "2016-04-01 00:00:00+00:00", "count": 20}, {"value": "2017-05-01 00:00:00+00:00", "count": 20}, {"value": "2015-12-07 11:38:00+00:00", "count": 20}, {"value": "2017-04-01 00:00:00+00:00", "count": 19}, {"value": "2016-09-01 00:00:00+00:00", "count": 19}, {"value": "2018-01-01 00:00:00+00:00", "count": 18}, {"value": "2016-11-01 00:00:00+00:00", "count": 18}, {"value": "2018-06-04 12:40:00+00:00", "count": 18}, {"value": "2015-12-01 00:00:00+00:00", "count": 18}, {"value": "2017-07-05 00:00:00+00:00", "count": 18}, {"value": "2017-11-01 00:00:00+00:00", "count": 17}, {"value": "2018-02-01 00:00:00+00:00", "count": 16}, {"value": "2016-02-01 00:00:00+00:00", "count": 16}, {"value": "2018-07-19 00:00:00+00:00", "count": 16}, {"value": "2017-03-01 00:00:00+00:00", "count": 15}, {"value": "2018-03-07 06:00:00+00:00", "count": 15}, {"value": "2018-08-18 08:45:00+00:00", "count": 15}, {"value": "2017-10-01 00:00:00+00:00", "count": 15}, {"value": "2016-12-15 06:00:00+00:00", "count": 14}, {"value": "2015-06-20 00:00:00+00:00", "count": 14}, {"value": "2016-06-24 17:30:00+00:00", "count": 14}, {"value": "2016-11-10 16:00:00+00:00", "count": 13}, {"value": "2018-04-02 00:00:00+00:00", "count": 13}, {"value": "2015-12-22 20:25:00+00:00", "count": 13}, {"value": "2018-01-03 18:00:00+00:00", "count": 13}, {"value": "2015-07-17 00:00:00+00:00", "count": 13}, {"value": "2016-04-20 11:07:00+00:00", "count": 13}], "count_uniques": 233229}, "dtype": "object"}, "YEAR": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 2015.0, "upper": 2015.09375, "count": 39649}, {"lower": 2015.09375, "upper": 2015.1875, "count": 0}, {"lower": 2015.1875, "upper": 2015.28125, "count": 0}, {"lower": 2015.28125, "upper": 2015.375, "count": 0}, {"lower": 2015.375, "upper": 2015.46875, "count": 0}, {"lower": 2015.46875, "upper": 2015.5625, "count": 0}, {"lower": 2015.5625, "upper": 2015.65625, "count": 0}, {"lower": 2015.65625, "upper": 2015.75, "count": 0}, {"lower": 2015.75, "upper": 2015.84375, "count": 0}, {"lower": 2015.84375, "upper": 2015.9375, "count": 0}, {"lower": 2015.9375, "upper": 2016.03125, "count": 184}, {"lower": 2016.03125, "upper": 2016.125, "count": 0}, {"lower": 2016.125, "upper": 2016.21875, "count": 0}, {"lower": 2016.21875, "upper": 2016.3125, "count": 0}, {"lower": 2016.3125, "upper": 2016.40625, "count": 0}, {"lower": 2016.40625, "upper": 2016.5, "count": 0}, {"lower": 2016.5, "upper": 2016.59375, "count": 0}, {"lower": 2016.59375, "upper": 2016.6875, "count": 0}, {"lower": 2016.6875, "upper": 2016.78125, "count": 0}, {"lower": 2016.78125, "upper": 2016.875, "count": 0}, {"lower": 2016.875, "upper": 2016.96875, "count": 0}, {"lower": 2016.96875, "upper": 2017.0625, "count": 39}, {"lower": 2017.0625, "upper": 2017.15625, "count": 0}, {"lower": 2017.15625, "upper": 2017.25, "count": 0}, {"lower": 2017.25, "upper": 2017.34375, "count": 0}, {"lower": 2017.34375, "upper": 2017.4375, "count": 0}, {"lower": 2017.4375, "upper": 2017.53125, "count": 0}, {"lower": 2017.53125, "upper": 2017.625, "count": 0}, {"lower": 2017.625, "upper": 2017.71875, "count": 0}, {"lower": 2017.71875, "upper": 2017.8125, "count": 0}, {"lower": 2017.8125, "upper": 2017.90625, "count": 0}, {"lower": 2017.90625, "upper": 2018.0, "count": 13}], "count_uniques": 4}, "dtype": "object"}, "MONTH": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 1.0, "upper": 1.34375, "count": 55}, {"lower": 1.34375, "upper": 1.6875, "count": 0}, {"lower": 1.6875, "upper": 2.03125, "count": 28}, {"lower": 2.03125, "upper": 2.375, "count": 0}, {"lower": 2.375, "upper": 2.71875, "count": 0}, {"lower": 2.71875, "upper": 3.0625, "count": 31}, {"lower": 3.0625, "upper": 3.40625, "count": 0}, {"lower": 3.40625, "upper": 3.75, "count": 0}, {"lower": 3.75, "upper": 4.09375, "count": 30}, {"lower": 4.09375, "upper": 4.4375, "count": 0}, {"lower": 4.4375, "upper": 4.78125, "count": 0}, {"lower": 4.78125, "upper": 5.125, "count": 16}, {"lower": 5.125, "upper": 5.46875, "count": 0}, {"lower": 5.46875, "upper": 5.8125, "count": 0}, {"lower": 5.8125, "upper": 6.15625, "count": 4184}, {"lower": 6.15625, "upper": 6.5, "count": 0}, {"lower": 6.5, "upper": 6.84375, "count": 0}, {"lower": 6.84375, "upper": 7.1875, "count": 8293}, {"lower": 7.1875, "upper": 7.53125, "count": 0}, {"lower": 7.53125, "upper": 7.875, "count": 0}, {"lower": 7.875, "upper": 8.21875, "count": 8298}, {"lower": 8.21875, "upper": 8.5625, "count": 0}, {"lower": 8.5625, "upper": 8.90625, "count": 0}, {"lower": 8.90625, "upper": 9.25, "count": 8334}, {"lower": 9.25, "upper": 9.59375, "count": 0}, {"lower": 9.59375, "upper": 9.9375, "count": 0}, {"lower": 9.9375, "upper": 10.28125, "count": 8151}, {"lower": 10.28125, "upper": 10.625, "count": 0}, {"lower": 10.625, "upper": 10.96875, "count": 0}, {"lower": 10.96875, "upper": 11.3125, "count": 2392}, {"lower": 11.3125, "upper": 11.65625, "count": 0}, {"lower": 11.65625, "upper": 12.0, "count": 73}], "count_uniques": 12}, "dtype": "object"}, "DAY_OF_WEEK": {"profiler_dtype": "date", "stats": {"mismatch": 319073, "missing": 0, "match": 0, "frequency": [{"value": "Friday", "count": 48495}, {"value": "Wednesday", "count": 46729}, {"value": "Thursday", "count": 46656}, {"value": "Tuesday", "count": 46383}, {"value": "Monday", "count": 45679}, {"value": "Saturday", "count": 44818}, {"value": "Sunday", "count": 40313}], "count_uniques": 7}, "dtype": "object"}, "HOUR": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 0.0, "upper": 0.71875, "count": 2069}, {"lower": 0.71875, "upper": 1.4375, "count": 1262}, {"lower": 1.4375, "upper": 2.15625, "count": 1040}, {"lower": 2.15625, "upper": 2.875, "count": 0}, {"lower": 2.875, "upper": 3.59375, "count": 611}, {"lower": 3.59375, "upper": 4.3125, "count": 402}, {"lower": 4.3125, "upper": 5.03125, "count": 461}, {"lower": 5.03125, "upper": 5.75, "count": 0}, {"lower": 5.75, "upper": 6.46875, "count": 606}, {"lower": 6.46875, "upper": 7.1875, "count": 1114}, {"lower": 7.1875, "upper": 7.90625, "count": 0}, {"lower": 7.90625, "upper": 8.625, "count": 1562}, {"lower": 8.625, "upper": 9.34375, "count": 1833}, {"lower": 9.34375, "upper": 10.0625, "count": 1951}, {"lower": 10.0625, "upper": 10.78125, "count": 0}, {"lower": 10.78125, "upper": 11.5, "count": 2028}, {"lower": 11.5, "upper": 12.21875, "count": 2330}, {"lower": 12.21875, "upper": 12.9375, "count": 0}, {"lower": 12.9375, "upper": 13.65625, "count": 2020}, {"lower": 13.65625, "upper": 14.375, "count": 2116}, {"lower": 14.375, "upper": 15.09375, "count": 1988}, {"lower": 15.09375, "upper": 15.8125, "count": 0}, {"lower": 15.8125, "upper": 16.53125, "count": 2415}, {"lower": 16.53125, "upper": 17.25, "count": 2625}, {"lower": 17.25, "upper": 17.96875, "count": 0}, {"lower": 17.96875, "upper": 18.6875, "count": 2584}, {"lower": 18.6875, "upper": 19.40625, "count": 2159}, {"lower": 19.40625, "upper": 20.125, "count": 2041}, {"lower": 20.125, "upper": 20.84375, "count": 0}, {"lower": 20.84375, "upper": 21.5625, "count": 1698}, {"lower": 21.5625, "upper": 22.28125, "count": 1604}, {"lower": 22.28125, "upper": 23.0, "count": 1366}], "count_uniques": 24}, "dtype": "object"}, "UCR_PART": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 90, "match": 318983, "frequency": [{"value": "Part Three", "count": 158553}, {"value": "Part Two", "count": 97569}, {"value": "Part One", "count": 61629}, {"value": "Other", "count": 1232}], "count_uniques": 4}, "dtype": "object"}, "STREET": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 10871, "match": 308202, "frequency": [{"value": "WASHINGTON ST", "count": 14194}, {"value": "BLUE HILL AVE", "count": 7794}, {"value": "BOYLSTON ST", "count": 7221}, {"value": "DORCHESTER AVE", "count": 5149}, {"value": "TREMONT ST", "count": 4796}, {"value": "MASSACHUSETTS AVE", "count": 4708}, {"value": "HARRISON AVE", "count": 4609}, {"value": "CENTRE ST", "count": 4383}, {"value": "COMMONWEALTH AVE", "count": 4134}, {"value": "HYDE PARK AVE", "count": 3470}, {"value": "COLUMBIA RD", "count": 3195}, {"value": "HUNTINGTON AVE", "count": 2989}, {"value": "RIVER ST", "count": 2831}, {"value": "DUDLEY ST", "count": 2352}, {"value": "WARREN ST", "count": 2336}, {"value": "COLUMBUS AVE", "count": 2304}, {"value": "ADAMS ST", "count": 2042}, {"value": "BEACON ST", "count": 1966}, {"value": "NEWBURY ST", "count": 1889}, {"value": "CAMBRIDGE ST", "count": 1670}, {"value": "BOWDOIN ST", "count": 1656}, {"value": "AMERICAN LEGION HWY", "count": 1631}, {"value": "W BROADWAY", "count": 1630}, {"value": "GENEVA AVE", "count": 1628}, {"value": "ALBANY ST", "count": 1429}, {"value": "ALLSTATE RD", "count": 1296}, {"value": "SUMMER ST", "count": 1250}, {"value": "HANCOCK ST", "count": 1246}, {"value": "BORDER ST", "count": 1213}, {"value": "NEW SUDBURY ST", "count": 1213}, {"value": "CUMMINS HWY", "count": 1151}, {"value": "NORFOLK ST", "count": 1115}, {"value": "BENNINGTON ST", "count": 1084}], "count_uniques": 4657}, "dtype": "object"}, "Lat": {"profiler_dtype": "decimal", "stats": {"mismatch": 299074, "missing": 19999, "match": 0, "hist": [{"lower": -1.0, "upper": 0.3560950493749999, "count": 33}, {"lower": 0.3560950493749999, "upper": 1.7121900987499998, "count": 0}, {"lower": 1.7121900987499998, "upper": 3.0682851481249998, "count": 0}, {"lower": 3.0682851481249998, "upper": 4.4243801975, "count": 0}, {"lower": 4.4243801975, "upper": 5.780475246875, "count": 0}, {"lower": 5.780475246875, "upper": 7.1365702962499995, "count": 0}, {"lower": 7.1365702962499995, "upper": 8.492665345625, "count": 0}, {"lower": 8.492665345625, "upper": 9.848760395, "count": 0}, {"lower": 9.848760395, "upper": 11.204855444375, "count": 0}, {"lower": 11.204855444375, "upper": 12.56095049375, "count": 0}, {"lower": 12.56095049375, "upper": 13.917045543124999, "count": 0}, {"lower": 13.917045543124999, "upper": 15.273140592499999, "count": 0}, {"lower": 15.273140592499999, "upper": 16.629235641875, "count": 0}, {"lower": 16.629235641875, "upper": 17.98533069125, "count": 0}, {"lower": 17.98533069125, "upper": 19.341425740625, "count": 0}, {"lower": 19.341425740625, "upper": 20.69752079, "count": 0}, {"lower": 20.69752079, "upper": 22.053615839375, "count": 0}, {"lower": 22.053615839375, "upper": 23.40971088875, "count": 0}, {"lower": 23.40971088875, "upper": 24.765805938125, "count": 0}, {"lower": 24.765805938125, "upper": 26.1219009875, "count": 0}, {"lower": 26.1219009875, "upper": 27.477996036875, "count": 0}, {"lower": 27.477996036875, "upper": 28.834091086249998, "count": 0}, {"lower": 28.834091086249998, "upper": 30.190186135624998, "count": 0}, {"lower": 30.190186135624998, "upper": 31.546281184999998, "count": 0}, {"lower": 31.546281184999998, "upper": 32.902376234375, "count": 0}, {"lower": 32.902376234375, "upper": 34.25847128375, "count": 0}, {"lower": 34.25847128375, "upper": 35.614566333125, "count": 0}, {"lower": 35.614566333125, "upper": 36.9706613825, "count": 0}, {"lower": 36.9706613825, "upper": 38.326756431875, "count": 0}, {"lower": 38.326756431875, "upper": 39.68285148125, "count": 0}, {"lower": 39.68285148125, "upper": 41.038946530625, "count": 0}, {"lower": 41.038946530625, "upper": 42.39504158, "count": 39852}], "count_uniques": 18178}, "dtype": "object"}, "Long": {"profiler_dtype": "decimal", "stats": {"mismatch": 299074, "missing": 19999, "match": 0, "hist": [{"lower": -71.17867378, "upper": -68.985590224375, "count": 39852}, {"lower": -68.985590224375, "upper": -66.79250666875, "count": 0}, {"lower": -66.79250666875, "upper": -64.599423113125, "count": 0}, {"lower": -64.599423113125, "upper": -62.4063395575, "count": 0}, {"lower": -62.4063395575, "upper": -60.213256001874996, "count": 0}, {"lower": -60.213256001874996, "upper": -58.02017244625, "count": 0}, {"lower": -58.02017244625, "upper": -55.827088890625, "count": 0}, {"lower": -55.827088890625, "upper": -53.634005335, "count": 0}, {"lower": -53.634005335, "upper": -51.440921779374996, "count": 0}, {"lower": -51.440921779374996, "upper": -49.247838223749994, "count": 0}, {"lower": -49.247838223749994, "upper": -47.054754668125, "count": 0}, {"lower": -47.054754668125, "upper": -44.8616711125, "count": 0}, {"lower": -44.8616711125, "upper": -42.668587556874996, "count": 0}, {"lower": -42.668587556874996, "upper": -40.47550400125, "count": 0}, {"lower": -40.47550400125, "upper": -38.282420445625, "count": 0}, {"lower": -38.282420445625, "upper": -36.08933689, "count": 0}, {"lower": -36.08933689, "upper": -33.896253334375, "count": 0}, {"lower": -33.896253334375, "upper": -31.703169778749995, "count": 0}, {"lower": -31.703169778749995, "upper": -29.510086223125, "count": 0}, {"lower": -29.510086223125, "upper": -27.3170026675, "count": 0}, {"lower": -27.3170026675, "upper": -25.123919111874997, "count": 0}, {"lower": -25.123919111874997, "upper": -22.930835556250003, "count": 0}, {"lower": -22.930835556250003, "upper": -20.737752000625, "count": 0}, {"lower": -20.737752000625, "upper": -18.544668445, "count": 0}, {"lower": -18.544668445, "upper": -16.351584889374998, "count": 0}, {"lower": -16.351584889374998, "upper": -14.158501333749996, "count": 0}, {"lower": -14.158501333749996, "upper": -11.965417778125001, "count": 0}, {"lower": -11.965417778125001, "upper": -9.7723342225, "count": 0}, {"lower": -9.7723342225, "upper": -7.579250666874998, "count": 0}, {"lower": -7.579250666874998, "upper": -5.386167111250003, "count": 0}, {"lower": -5.386167111250003, "upper": -3.1930835556249946, "count": 0}, {"lower": -3.1930835556249946, "upper": -1.0, "count": 33}], "count_uniques": 18178}, "dtype": "object"}, "Location": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "(0.00000000, 0.00000000)", "count": 19999}, {"value": "(42.34862382, -71.08277637)", "count": 1243}, {"value": "(42.36183857, -71.05976489)", "count": 1208}, {"value": "(42.28482577, -71.09137369)", "count": 1121}, {"value": "(42.32866284, -71.08563401)", "count": 1042}, {"value": "(42.25621592, -71.12401947)", "count": 898}, {"value": "(42.29755533, -71.05970910)", "count": 783}, {"value": "(42.34128751, -71.05467933)", "count": 773}, {"value": "(-1.00000000, -1.00000000)", "count": 745}, {"value": "(42.33152148, -71.07085307)", "count": 735}, {"value": "(42.35231190, -71.06370510)", "count": 688}, {"value": "(42.33954199, -71.06940877)", "count": 655}, {"value": "(42.32696647, -71.06198607)", "count": 652}, {"value": "(42.35512339, -71.06087980)", "count": 584}, {"value": "(42.30971857, -71.10429432)", "count": 573}, {"value": "(42.29848866, -71.06313294)", "count": 562}, {"value": "(42.33401829, -71.07638124)", "count": 561}, {"value": "(42.33367922, -71.09187755)", "count": 550}, {"value": "(42.31043400, -71.06134010)", "count": 523}, {"value": "(42.35095909, -71.07412780)", "count": 523}, {"value": "(42.35241815, -71.06525499)", "count": 515}, {"value": "(42.37081805, -71.03929078)", "count": 507}, {"value": "(42.33428841, -71.07239518)", "count": 504}, {"value": "(42.34980175, -71.07840978)", "count": 472}, {"value": "(42.32696802, -71.08051941)", "count": 472}, {"value": "(42.33511904, -71.07491710)", "count": 455}, {"value": "(42.35037870, -71.07626098)", "count": 445}, {"value": "(42.34653820, -71.09880598)", "count": 444}, {"value": "(42.36643546, -71.06135413)", "count": 440}, {"value": "(42.28709355, -71.14822128)", "count": 438}, {"value": "(42.35602373, -71.06177615)", "count": 436}, {"value": "(42.34840576, -71.08688339)", "count": 432}, {"value": "(42.34905600, -71.15049850)", "count": 427}], "count_uniques": 18194}, "dtype": "object"}}, "name": null, "file_name": "crime.csv", "summary": {"cols_count": 17, "rows_count": 319073, "dtypes_list": ["object"], "total_count_dtypes": 1, "missing_count": 0, "p_missing": 0.0}}'
df.cols.names()
['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long', 'Location']
df.OCCURRED_ON_DATE.astype()
%%time
import pandas as pd
from fastnumbers import fast_int
import pendulum
def func(value):
return pendulum.parse(value)
df.OCCURRED_ON_DATE.apply(func, meta="object").compute()
Wall time: 3.49 s
df.compute()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | nan | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | nan | 2018-09-02 13:00:00+00:00 | 2018 | 9 | 2020-05-31 | 13 | Part One | LINCOLN ST | 42.3578 | -71.1394 | (42.35779134, -71.13937053) |
1 | nan | 01402 | Vandalism | VANDALISM | C11 | 347 | nan | 2018-08-21 00:00:00+00:00 | 2018 | 8 | 2020-05-26 | 0 | Part Two | HECLA ST | 42.3068 | -71.0603 | (42.30682138, -71.06030035) |
2 | nan | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | nan | 2018-09-03 19:27:00+00:00 | 2018 | 9 | 2020-06-01 | 19 | Part Three | CAZENOVE ST | 42.3466 | -71.0724 | (42.34658879, -71.07242943) |
3 | nan | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | nan | 2018-09-03 21:16:00+00:00 | 2018 | 9 | 2020-06-01 | 21 | Part Three | NEWCOMB ST | 42.3342 | -71.0787 | (42.33418175, -71.07866441) |
4 | nan | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | nan | 2018-09-03 21:05:00+00:00 | 2018 | 9 | 2020-06-01 | 21 | Part Three | DELHI ST | 42.2754 | -71.0904 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | nan | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | nan | 2016-06-05 17:25:00+00:00 | 2016 | 6 | 2020-05-31 | 17 | Part Three | COVENTRY ST | 42.337 | -71.0857 | (42.33695098, -71.08574813) |
319069 | nan | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | nan | 2015-07-09 13:38:00+00:00 | 2015 | 7 | 2020-05-28 | 13 | Part One | RIVER ST | 42.2559 | -71.1232 | (42.25592648, -71.12317207) |
319070 | nan | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | nan | 2015-07-09 13:38:00+00:00 | 2015 | 7 | 2020-05-28 | 13 | Part Three | RIVER ST | 42.2559 | -71.1232 | (42.25592648, -71.12317207) |
319071 | nan | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | nan | 2016-05-31 19:35:00+00:00 | 2016 | 5 | 2020-05-26 | 19 | Part Three | NEW WASHINGTON ST | 42.3023 | -71.1116 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | nan | 2015-06-22 00:12:00+00:00 | 2015 | 6 | 2020-06-01 | 0 | Part Three | WASHINGTON ST | 42.3338 | -71.0803 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
a = df["INCIDENT_NUMBER"].compute()
b = df["OFFENSE_CODE"].compute()
from dask import dataframe as dd
dd.compute(a+b)
(0 I18207094500619 1 I18207094301402 2 I18207094103410 3 I18207094003114 4 I18207093803114 ... 319068 I050310906-0003125 319069 I030217815-0800111 319070 I030217815-0803125 319071 I010370257-0003125 319072 14205255003125 Length: 319073, dtype: object,)
def func(value):
return value
df.apply(func)
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) <ipython-input-10-46167e4a0a69> in <module> 1 def func(value): 2 return value ----> 3 df.apply(func) ~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py in apply(self, func, axis, broadcast, raw, reduce, args, meta, **kwds) 4145 " Try: df.apply(func, axis=1)" 4146 ) -> 4147 raise NotImplementedError(msg) 4148 4149 if meta is no_default: NotImplementedError: dd.DataFrame.apply only supports axis=1 Try: df.apply(func, axis=1)
%%time
# _output = df.ext.cast_and_profile(columns="*", flush=True, output="json")
# df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()
Wall time: 0 ns
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (object)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (object)
not nullable
|
MONTH
10 (object)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (object)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
df.ext.partitions()
8
df.compute()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
df.cols.year("")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-19-f77bfb6ed87a> in <module> ----> 1 df.cols.year("birth") ~\Documents\Optimus\optimus\engines\base\dask\columns.py in year(self, input_cols, output_cols) 573 output_cols = get_output_cols(input_cols, output_cols) 574 for input_col, output_col in zip(input_cols, output_cols): --> 575 df = df.assign(**{output_col: df[input_col].dt.year}) 576 return df 577 pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__() ~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py in dt(self) 2616 def dt(self): 2617 """ Namespace of datetime methods """ -> 2618 return DatetimeAccessor(self) 2619 2620 @cache_readonly ~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\accessor.py in __init__(self, series) 34 if hasattr(series_meta, "to_series"): # is index-like 35 series_meta = series_meta.to_series() ---> 36 meta = getattr(series_meta, self._accessor_name) 37 38 self._meta = meta ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in __getattr__(self, name) 5268 or name in self._accessors 5269 ): -> 5270 return object.__getattribute__(self, name) 5271 else: 5272 if self._info_axis._can_hold_identifiers_and_holds_name(name): ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\accessor.py in __get__(self, obj, cls) 185 # we're accessing the attribute of the class, i.e., Dataset.geo 186 return self._accessor --> 187 accessor_obj = self._accessor(obj) 188 # Replace the property with the accessor object. Inspired by: 189 # http://www.pydanny.com/cached-property.html ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\accessors.py in __new__(cls, data) 336 return DatetimeProperties(data, orig) 337 --> 338 raise AttributeError("Can only use .dt accessor with datetimelike values") AttributeError: Can only use .dt accessor with datetimelike values
print(_output)
{"columns": {"customer_id": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object"}, "transactoin_date": {"profiler_dtype": "date", "profiler_dtype_fotmat": "date", "stats": {"mismatch": 11, "missing": 6, "match": 0}, "dtype": "datetime64[ns]"}, "ticket_price": {"profiler_dtype": "decimal", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": 9.99, "count": 7}, {"value": 29.99, "count": 5}, {"value": 14.99, "count": 4}, {"value": 9.91, "count": 1}], "count_uniques": 4}, "dtype": "object"}, "discount": {"profiler_dtype": "object", "stats": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}, "dtype": "object"}, "product": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object"}, "info": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "int64"}}, "name": null, "file_name": "tmp71_odjvg.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "datetime64[ns]", "int64"], "total_count_dtypes": 3, "missing_count": 0, "p_missing": 0.0}}
import pendulum
pendulum.from_format("11 11 1980", "DD MM YYYY")
DateTime(1980, 11, 11, 0, 0, 0, tzinfo=Timezone('UTC'))
# df = op.load.file("data/test-types.csv").ext.cache()
df = op.load.file("data/dataset-transactions.csv").ext.cache()
a = df.cols.infer_profiler_dtypes("*")
print(a)
{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'string', 'product': 'string', 'info': 'int'}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df = df.cols.cast_to_profiler_dtypes(columns={'transactoin_date': 'date'})
# df["a"] = df.transactoin_date.astype('M8[us]')
# b["transactoin_date"].dt.year
df.dtypes
customer_id object transactoin_date datetime64[ns] ticket_price object discount object product object info object dtype: object
df.cols.year("transactoin_date", "new1").compute()
df.cols.month("transactoin_date", "new1").compute()
df.cols.day("transactoin_date", "new1").compute()
customer_id | transactoin_date | ticket_price | discount | product | info | new1 | |
---|---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010-08-19 | 29.99 | NaN | platinum | 1 | 19.0 |
1 | 0fju234978rfjkhsdf | 2012-01-05 | 29.99 | NaN | platinum | 2 | 5.0 |
2 | 0f34ruiy23e78y2r | 2009-08-11 | 29.99 | NaN | platinum | 3 | 11.0 |
3 | 0fue298y2r23r23r2 | 2010-08-19 | 29.99 | NaN | platinum | 4 | 19.0 |
4 | 0f345kjh345oiuy346 | 2010-08-19 | 29.99 | NaN | platinum | 5 | 19.0 |
5 | 0fju234978rfjkhsdf | 2010-08-19 | 9.99 | 5% | basic | 6 | 19.0 |
6 | 0f34ruiy23e78y2r | 2010-08-19 | 9.99 | NaN | basic | 7 | 19.0 |
7 | 0fue298y2r23r23r3 | 2010-08-19 | 9.99 | NaN | basic | 8 | 19.0 |
8 | 0f345kjh345oiuy347 | 2010-08-19 | 9.99 | NaN | basic | 9 | 19.0 |
9 | 0fju234978rfjkhsdf | 2011-08-11 | 9.99 | 5% | basic | 10 | 11.0 |
10 | 0f34ruiy23e78y2r | 2015-08-09 | 9.99 | 5% | basic | 10 | 9.0 |
11 | 0fue298y2r23r23r4 | NaT | 9.91 | NaN | basic | 10 | NaN |
12 | 0f345kjh345oiuy348 | NaT | 9.99 | NaN | basic | 10 | NaN |
13 | 0fju234978rfjkhsdf | NaT | 14.99 | 5% | deluxe | 10 | NaN |
14 | 0f34ruiy23e78y2r | NaT | 14.99 | NaN | deluxe | 10 | NaN |
15 | 0fue298y2r23r23r5 | NaT | 14.99 | 5% | deluxe | 10 | NaN |
16 | 0f345kjh345oiuy349 | NaT | 14.99 | NaN | deluxe | 10 | NaN |
df.ext.display(20)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (object)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
0f34ruiy23e78y2r
|
2015/08/09
|
9.99
|
5%
|
basic
|
10
|
0fue298y2r23r23r4
|
nan
|
9.91
|
nan
|
basic
|
10
|
0f345kjh345oiuy348
|
nan
|
9.99
|
nan
|
basic
|
10
|
0fju234978rfjkhsdf
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f34ruiy23e78y2r
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
0fue298y2r23r23r5
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f345kjh345oiuy349
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
df= df.cols.unnest("transactoin_date", separator="/", splits=3, output_cols="transactoin_date").ext.cache()
df.ext.profile(columns="*", infer=True, output="json", flush=True, bins=10)
df.ext.display()
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (object)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 13 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'A': 'string', 'B': 'string'}
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590426328940.csv").ext.cache()
df = df.ext.optimize()
df = df.ext.repartition(1).ext.cache()
col_name = "customer_id"
df.cols.count_mismatch({col_name:"int","transactoin_date":"date"})
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0}, 'transactoin_date': {'mismatch': 11, 'missing': 6, 'match': 0}}
df = op.load.file("data/crime.csv").ext.cache()
%%time
df.cols.cast("INCIDENT_NUMBER","int").compute()
Wall time: 447 ms
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (object)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (object)
not nullable
|
MONTH
10 (object)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (object)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
00619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
01402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
03410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
03820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
00724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
00301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
import numpy as np
a = np.array([1,2,3])
b = np.array([1,2,"a"])
def func(a, b):
try:
r =a+b
except:
r = np.nan
return r
func(a,b)
nan
df[["INCIDENT_NUMBER","REPORTING_AREA"]].compute().to_numpy()[0]
array(['I182070945', '808'], dtype=object)
def op(value):
return value
# return value["INCIDENT_NUMBER"]
# print(value)
# try:
# return value["INCIDENT_NUMBER"] + value["REPORTING_AREA"]
# except():
# return np.nan
def func(pdf):
# print(type(pdf))
return pdf.apply(op, axis=1)
df[["INCIDENT_NUMBER","REPORTING_AREA"]].map_partitions(func, meta=object).compute()
%%time
from numpy.core._exceptions import UFuncTypeError
def func(a,b):
return a+b
def myfunc(a, b, expr):
"Return a-b if a>b, otherwise return a+b"
try:
return func(a,b)
except (UFuncTypeError, TypeError):
return np.nan
vfunc = np.vectorize(myfunc)
print(func)
arr= [1, 2, 3,4,5,6,7,8,9,"A"]*1000000
b = vfunc(arr, 2, func)
# vfunc([1, 2, 3, "A"], 2)
<function func at 0x00000259BE4C1AF8> Wall time: 5.37 s
len(arr)
10000000
df = df.ext.repartition(8)
import re
comp = re.compile("^(?=.)([+-]?([0-9]*)(\.([0-9]+))?)$")
%time df["INCIDENT_NUMBER"].str.match(comp).compute()
Wall time: 326 ms
0 False 1 False 2 False 3 False 4 False ... 319068 False 319069 False 319070 False 319071 False 319072 True Name: INCIDENT_NUMBER, Length: 319073, dtype: bool
df.cols.names()
['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long', 'Location']
from dateutil.parser import parse as dparse
dparse("2018-09-02 13:00:00")
datetime.datetime(2018, 9, 2, 13, 0)
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (object)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (object)
not nullable
|
MONTH
10 (object)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (object)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
00619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
01402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
03410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
03820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
00724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
00301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
df.cols.infer_profiler_dtypes("*")
{'INCIDENT_NUMBER': 'string', 'OFFENSE_CODE': 'zip_code', 'OFFENSE_CODE_GROUP': 'string', 'OFFENSE_DESCRIPTION': 'string', 'DISTRICT': 'string', 'REPORTING_AREA': 'int', 'SHOOTING': 'object', 'OCCURRED_ON_DATE': 'string', 'YEAR': 'int', 'MONTH': 'int', 'DAY_OF_WEEK': 'string', 'HOUR': 'int', 'UCR_PART': 'string', 'STREET': 'string', 'Lat': 'decimal', 'Long': 'decimal', 'Location': 'string'}
df.ext.display(20)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (uint8)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
0f34ruiy23e78y2r
|
2015/08/09
|
9.99
|
5%
|
basic
|
10
|
0fue298y2r23r23r4
|
nan
|
9.91
|
nan
|
basic
|
10
|
0f345kjh345oiuy348
|
nan
|
9.99
|
nan
|
basic
|
10
|
0fju234978rfjkhsdf
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f34ruiy23e78y2r
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
0fue298y2r23r23r5
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f345kjh345oiuy349
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
col_name ="customer_id"
print(df.cols.count_mismatch({col_name:"int"}))
print(df.cols.count_mismatch({col_name:"decimal"}))
print(df.cols.count_mismatch({col_name:"boolean"}))
print(df.cols.count_mismatch({col_name:"date"}))
print(df.cols.count_mismatch({col_name:"array"}))
print(df.cols.count_mismatch({col_name:"object"}))
print(df.cols.count_mismatch({col_name:"gender"}))
print(df.cols.count_mismatch({col_name:"ip"}))
print(df.cols.count_mismatch({col_name:"url"}))
print(df.cols.count_mismatch({col_name:"email"}))
print(df.cols.count_mismatch({col_name:"credit_card_number"}))
print(df.cols.count_mismatch({col_name:"zip_code"}))
{'customer_id': {'mismatch': 0, 'missing': 0, 'match': 17, 'profiler_dtype': 'string'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'int'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'decimal'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'boolean'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'date'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'array'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'object'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'gender'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'ip'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'url'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'email'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'credit_card_number'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'zip_code'}}
_output = df.ext.profile(columns=col_name, infer=True, output="json", flush=True)
# _output = df.ext.set_buffer("*")
# _output = df.ext.set_buffer("*")
# _output = df.ext.buffer_window("*", 0, 17).ext.to_json("*")
print(_output)
cols_and_inferred_dtype {'customer_id': 'string'} {"columns": {"customer_id": {"stats": {"mismatch": 0, "missing": 0, "match": 3, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object", "profiler_dtype": "string"}}, "name": null, "file_name": "tmpthwlzht1.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "uint8"], "total_count_dtypes": 2, "missing_count": 0, "p_missing": 0.0}}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
print(cols_and_inferred_dtype)
mismatch = df.cols.count_mismatch(cols_and_inferred_dtype, infer=True, compute=True)
print(mismatch)
{'customer_id': 'string', 'transactoin_date': 'string', 'ticket_price': 'decimal', 'discount': 'object', 'product': 'string', 'info': 'int'} {'customer_id': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'transactoin_date': {'mismatch': 0, 'missing': 3, 'match': 0, 'profiler_dtype': 'string'}, 'ticket_price': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'decimal'}, 'discount': {'mismatch': 1, 'missing': 2, 'match': 0, 'profiler_dtype': 'object'}, 'product': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'info': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'int'}}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df = op.load.file("data/crime.csv").ext.cache()
df = df.ext.optimize().ext.cache()
df.cols.count_mismatch({"OFFENSE_DESCRIPTION":"string"})
{'OFFENSE_DESCRIPTION': {'mismatch': 0, 'missing': 0, 'match': 319073, 'profiler_dtype': 'string'}}
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (uint16)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (uint16)
not nullable
|
MONTH
10 (uint8)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (uint8)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
import functools
import dask
import dask.dataframe as dd
import pandas as pd
pdf = pd.DataFrame({
'x': range(0, 100),
'y': range(0, 100),
'z': range(0, 100)
})
ddf = dd.from_pandas(pdf, npartitions=8)
print('Number of partitions', ddf.npartitions)
def compute_stats(row):
return {
'sum': row['x'] + row['y'] + row['z'],
'min': min(row),
'max': max(row)
}
def accum_stats(stats_accum, stats):
return {
'sum': stats_accum['sum'] + stats['sum'],
'min': min(stats_accum['min'], stats['min']),
'max': max(stats_accum['max'], stats['max'])
}
def compute_stats_partition(pdf):
pds = pdf.apply(compute_stats, axis=1)
print(pdf)
print("99999999999999999999999999999999999999999999999")
return pds
return functools.reduce(accum_stats, pds)
def merge_stats_series(pds):
print(pds)
print("--")
return pds
return functools.reduce(accum_stats, pds)
res = ddf.reduction(
compute_stats_partition,
merge_stats_series,
meta={
'sum': 'int64',
'min': 'int64',
'max': 'int64'
})
# singleton dataframe to list of delayed objects
# where each row is a delayed object
# and in this case we just want the first one
delayed_dict = res.to_delayed()[0]
a = dd.compute(delayed_dict)
Number of partitions 8 x y z 13 13 13 13 14 14 14 14 15 15 15 15 16 16 16 16 17 17 17 17 18 18 18 18 19 19 19 19 20 20 20 20 21 21 21 21 22 22 22 22 23 23 23 23 24 24 24 24 25 25 25 25 x y z 26 26 26 26 27 27 27 27 28 28 28 28 29 29 29 29 30 30 30 30 31 31 31 31 32 32 32 32 33 33 33 33 34 34 34 34 35 35 35 35 36 36 36 36 37 37 37 37 38 38 38 38 99999999999999999999999999999999999999999999999 x y z 39 39 39 39 40 40 40 40 41 41 41 41 42 42 42 42 43 43 43 43 44 44 44 44 45 45 45 45 46 46 46 46 47 47 47 47 48 48 48 48 49 49 49 49 50 50 50 50 51 51 51 51 99999999999999999999999999999999999999999999999 x y z 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7 8 8 8 8 9 9 9 9 10 10 10 10 11 11 11 11 12 12 12 12 99999999999999999999999999999999999999999999999 99999999999999999999999999999999999999999999999 x y z 52 52 52 52 53 53 53 53 54 54 54 54 55 55 55 55 56 56 56 56 57 57 57 57 58 58 58 58 59 59 59 59 60 60 60 60 61 61 61 61 62 62 62 62 63 63 63 63 64 64 64 64 99999999999999999999999999999999999999999999999 x y z 91 91 91 91 92 92 92 92 93 93 93 93 94 94 94 94 95 95 95 95 96 96 96 96 97 97 97 97 98 98 98 98 99 99 99 99 99999999999999999999999999999999999999999999999 x y z 65 65 65 65 66 66 66 66 67 67 67 67 68 68 68 68 69 69 69 69 70 70 70 70 71 71 71 71 72 72 72 72 73 73 73 73 74 74 74 74 75 75 75 75 76 76 76 76 77 77 77 77 99999999999999999999999999999999999999999999999 x y z 78 78 78 78 79 79 79 79 80 80 80 80 81 81 81 81 82 82 82 82 83 83 83 83 84 84 84 84 85 85 85 85 86 86 86 86 87 87 87 87 88 88 88 88 89 89 89 89 90 90 90 90 99999999999999999999999999999999999999999999999 0 1 \ 0 {'sum': 0, 'min': 0, 'max': 0} {'sum': 3, 'min': 1, 'max': 1} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 2 3 \ 0 {'sum': 6, 'min': 2, 'max': 2} {'sum': 9, 'min': 3, 'max': 3} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 4 5 \ 0 {'sum': 12, 'min': 4, 'max': 4} {'sum': 15, 'min': 5, 'max': 5} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 6 7 \ 0 {'sum': 18, 'min': 6, 'max': 6} {'sum': 21, 'min': 7, 'max': 7} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 8 9 ... \ 0 {'sum': 24, 'min': 8, 'max': 8} {'sum': 27, 'min': 9, 'max': 9} ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 90 91 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 270, 'min': 90, 'max': 90} NaN 0 NaN {'sum': 273, 'min': 91, 'max': 91} 92 93 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 276, 'min': 92, 'max': 92} {'sum': 279, 'min': 93, 'max': 93} 94 95 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 282, 'min': 94, 'max': 94} {'sum': 285, 'min': 95, 'max': 95} 96 97 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 288, 'min': 96, 'max': 96} {'sum': 291, 'min': 97, 'max': 97} 98 99 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 294, 'min': 98, 'max': 98} {'sum': 297, 'min': 99, 'max': 99} [8 rows x 100 columns] --
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590447280612.csv").ext.cache()
df = df.ext.optimize()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.profile(columns="*", infer=True, output="json")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
print(_output)
{"columns": {"customer_id": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}}, "transactoin_date": {"stats": {"mismatch": 0, "missing": 6, "match": 11, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 6, "match": 11, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}}, "ticket_price": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "9.99", "count": 7}, {"value": "29.99", "count": 5}, {"value": "14.99", "count": 4}, {"value": "9.91", "count": 1}], "count_uniques": 4}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "9.99", "count": 7}, {"value": "29.99", "count": 5}, {"value": "14.99", "count": 4}, {"value": "9.91", "count": 1}], "count_uniques": 4}}, "discount": {"stats": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}, "dtype": "object", "profiler_dtype": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}}, "product": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}}, "info": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "uint8", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}}}, "name": null, "file_name": "tmp_vk79fcm.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "uint8"], "total_count_dtypes": 2, "missing_count": 0, "p_missing": 0.0}}
##Bag
# each element is an integer
import dask.bag as db
b = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], npartitions=1)
b.take(3)
(1, 2, 3)
%%time
b.to_dataframe().compute()
Wall time: 850 ms
0 | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
4 | 5 |
5 | 6 |
6 | 7 |
7 | 8 |
8 | 9 |
9 | 10 |
from optimus.helpers.functions import match_date
import re
pattern = match_date("dd MM yy h")
string = "11 05 1980 2"
prog = re.compile(pattern)
result = prog.match(string)
print(bool(result))
['dd', ' ', 'MM', ' ', 'yy', ' ', 'h'] False