%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
op = Optimus("dask")
[nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] C:\Users\argenisleon\AppData\Roaming\nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date!
df.cols.max()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-6-3661798f3454> in <module> ----> 1 df.cols.max() NameError: name 'df' is not defined
# df = op.create.dataframe({"name": ["A1", "B2"]*20})
df = op.load.csv("store.csv", dtype="str")
df.cols.len("*").cols.max()
{'id': 5, 'transaction_date': 9, 'name': 10, 'price': 18, 'discount': 3}
df.cols.min(compute=False)
Delayed('list-fa556bb4-e62a-4827-a9d3-73c09fc9ff0e')
df.cols.min(compute=True)
{'id': '1', 'transaction_date': '1/1/2015', 'name': 'PANTS FG00', 'price': '10', 'discount': ''}
df
id
1 (object)
not nullable
|
transaction_date
2 (object)
not nullable
|
name
3 (object)
not nullable
|
price
4 (object)
not nullable
|
discount
5 (object)
not nullable
|
---|---|---|---|---|
1
|
1/7/2019
|
PANTS-rt38
|
260
|
50
|
2
|
1/4/2019
|
PANTS-FG63
|
82.99
|
|
3
|
1/5/2015
|
Pants-SG08
|
372.99
|
|
4
|
1/10/2019
|
SHIRT-St05
|
150
|
|
5
|
1/4/2017
|
shirt-Ft67
|
52.99
|
|
6
|
1/10/2018
|
shoes-FT13
|
62.99
|
|
7
|
1/10/2017
|
Pants-FT46
|
375
|
|
8
|
1/5/2016
|
pants-ft75
|
77.99
|
|
9
|
1/8/2018
|
SHIRT-Fg07
|
232.99
|
|
10
|
1/12/2015
|
PANTS-Rt74
|
245
|
|
11
|
1/11/2018
|
PANTS-Fg76
|
85
|
5
|
df.cols.hist("id")
{'hist': {'id': [{'lower': 1.0, 'upper': 2812.4848484848485, 'count': 2812}, {'lower': 2812.4848484848485, 'upper': 5623.969696969697, 'count': 2811}, {'lower': 5623.969696969697, 'upper': 8435.454545454546, 'count': 2812}, {'lower': 8435.454545454546, 'upper': 11246.939393939394, 'count': 2811}, {'lower': 11246.939393939394, 'upper': 14058.424242424242, 'count': 2812}, {'lower': 14058.424242424242, 'upper': 16869.909090909092, 'count': 2811}, {'lower': 16869.909090909092, 'upper': 19681.39393939394, 'count': 2812}, {'lower': 19681.39393939394, 'upper': 22492.878787878788, 'count': 2811}, {'lower': 22492.878787878788, 'upper': 25304.363636363636, 'count': 2812}, {'lower': 25304.363636363636, 'upper': 28115.848484848484, 'count': 2811}, {'lower': 28115.848484848484, 'upper': 30927.333333333332, 'count': 2812}, {'lower': 30927.333333333332, 'upper': 33738.818181818184, 'count': 2811}, {'lower': 33738.818181818184, 'upper': 36550.30303030303, 'count': 2812}, {'lower': 36550.30303030303, 'upper': 39361.78787878788, 'count': 2811}, {'lower': 39361.78787878788, 'upper': 42173.27272727273, 'count': 2812}, {'lower': 42173.27272727273, 'upper': 44984.757575757576, 'count': 2811}, {'lower': 44984.757575757576, 'upper': 47796.242424242424, 'count': 2812}, {'lower': 47796.242424242424, 'upper': 50607.72727272727, 'count': 2811}, {'lower': 50607.72727272727, 'upper': 53419.21212121212, 'count': 2812}, {'lower': 53419.21212121212, 'upper': 56230.69696969697, 'count': 2811}, {'lower': 56230.69696969697, 'upper': 59042.181818181816, 'count': 2812}, {'lower': 59042.181818181816, 'upper': 61853.666666666664, 'count': 2811}, {'lower': 61853.666666666664, 'upper': 64665.15151515151, 'count': 2812}, {'lower': 64665.15151515151, 'upper': 67476.63636363637, 'count': 2811}, {'lower': 67476.63636363637, 'upper': 70288.12121212122, 'count': 2812}, {'lower': 70288.12121212122, 'upper': 73099.60606060606, 'count': 2811}, {'lower': 73099.60606060606, 'upper': 75911.09090909091, 'count': 2812}, {'lower': 75911.09090909091, 'upper': 78722.57575757576, 'count': 2811}, {'lower': 78722.57575757576, 'upper': 81534.06060606061, 'count': 2812}, {'lower': 81534.06060606061, 'upper': 84345.54545454546, 'count': 2811}, {'lower': 84345.54545454546, 'upper': 87157.0303030303, 'count': 2812}, {'lower': 87157.0303030303, 'upper': 89968.51515151515, 'count': 2811}, {'lower': 89968.51515151515, 'upper': 92780.0, 'count': 2812}]}}
%%time
df.cols.hist("id")
Wall time: 288 ms
{'hist': {'id': [{'lower': 1.0, 'upper': 2812.4848484848485, 'count': 2812}, {'lower': 2812.4848484848485, 'upper': 5623.969696969697, 'count': 2811}, {'lower': 5623.969696969697, 'upper': 8435.454545454546, 'count': 2812}, {'lower': 8435.454545454546, 'upper': 11246.939393939394, 'count': 2811}, {'lower': 11246.939393939394, 'upper': 14058.424242424242, 'count': 2812}, {'lower': 14058.424242424242, 'upper': 16869.909090909092, 'count': 2811}, {'lower': 16869.909090909092, 'upper': 19681.39393939394, 'count': 2812}, {'lower': 19681.39393939394, 'upper': 22492.878787878788, 'count': 2811}, {'lower': 22492.878787878788, 'upper': 25304.363636363636, 'count': 2812}, {'lower': 25304.363636363636, 'upper': 28115.848484848484, 'count': 2811}, {'lower': 28115.848484848484, 'upper': 30927.333333333332, 'count': 2812}, {'lower': 30927.333333333332, 'upper': 33738.818181818184, 'count': 2811}, {'lower': 33738.818181818184, 'upper': 36550.30303030303, 'count': 2812}, {'lower': 36550.30303030303, 'upper': 39361.78787878788, 'count': 2811}, {'lower': 39361.78787878788, 'upper': 42173.27272727273, 'count': 2812}, {'lower': 42173.27272727273, 'upper': 44984.757575757576, 'count': 2811}, {'lower': 44984.757575757576, 'upper': 47796.242424242424, 'count': 2812}, {'lower': 47796.242424242424, 'upper': 50607.72727272727, 'count': 2811}, {'lower': 50607.72727272727, 'upper': 53419.21212121212, 'count': 2812}, {'lower': 53419.21212121212, 'upper': 56230.69696969697, 'count': 2811}, {'lower': 56230.69696969697, 'upper': 59042.181818181816, 'count': 2812}, {'lower': 59042.181818181816, 'upper': 61853.666666666664, 'count': 2811}, {'lower': 61853.666666666664, 'upper': 64665.15151515151, 'count': 2812}, {'lower': 64665.15151515151, 'upper': 67476.63636363637, 'count': 2811}, {'lower': 67476.63636363637, 'upper': 70288.12121212122, 'count': 2812}, {'lower': 70288.12121212122, 'upper': 73099.60606060606, 'count': 2811}, {'lower': 73099.60606060606, 'upper': 75911.09090909091, 'count': 2812}, {'lower': 75911.09090909091, 'upper': 78722.57575757576, 'count': 2811}, {'lower': 78722.57575757576, 'upper': 81534.06060606061, 'count': 2812}, {'lower': 81534.06060606061, 'upper': 84345.54545454546, 'count': 2811}, {'lower': 84345.54545454546, 'upper': 87157.0303030303, 'count': 2812}, {'lower': 87157.0303030303, 'upper': 89968.51515151515, 'count': 2811}, {'lower': 89968.51515151515, 'upper': 92780.0, 'count': 2812}]}}
df.meta
df.cols.frequency("*", n=10, count_uniques=True, compute=True)
{'frequency': {'id': {'values': [{'value': '1', 'count': 1}, {'value': '65673', 'count': 1}, {'value': '65671', 'count': 1}, {'value': '65670', 'count': 1}, {'value': '6567', 'count': 1}, {'value': '65669', 'count': 1}, {'value': '65668', 'count': 1}, {'value': '65667', 'count': 1}, {'value': '65666', 'count': 1}, {'value': '65665', 'count': 1}], 'count_uniques': 92780}, 'transaction_date': {'values': [{'value': '1/7/2017', 'count': 2267}, {'value': '1/8/2016', 'count': 2219}, {'value': '1/6/2016', 'count': 2208}, {'value': '1/10/2016', 'count': 2194}, {'value': '1/5/2016', 'count': 2167}, {'value': '1/4/2017', 'count': 2147}, {'value': '1/3/2016', 'count': 2142}, {'value': '1/2/2017', 'count': 2135}, {'value': '1/4/2016', 'count': 2128}, {'value': '1/5/2017', 'count': 2128}], 'count_uniques': 60}, 'name': {'values': [{'value': 'Pants-FG35', 'count': 38}, {'value': 'Pants-SG42', 'count': 36}, {'value': 'pants-SG82', 'count': 35}, {'value': 'Pants-FG65', 'count': 34}, {'value': 'PANTS-FG24', 'count': 32}, {'value': 'pants-SG81', 'count': 31}, {'value': 'PANTS-SG14', 'count': 30}, {'value': 'PANTS-SG42', 'count': 30}, {'value': 'pants-FG41', 'count': 30}, {'value': 'pants-FG51', 'count': 29}], 'count_uniques': 20527}, 'price': {'values': [{'value': '45', 'count': 1132}, {'value': '50', 'count': 917}, {'value': '40', 'count': 879}, {'value': '90', 'count': 774}, {'value': '80', 'count': 735}, {'value': '155', 'count': 726}, {'value': '165', 'count': 724}, {'value': '175', 'count': 717}, {'value': '110', 'count': 716}, {'value': '160', 'count': 713}], 'count_uniques': 320}, 'discount': {'values': [{'value': '', 'count': 54051}, {'value': '5', 'count': 10594}, {'value': '20', 'count': 6984}, {'value': '15', 'count': 6947}, {'value': '5%', 'count': 3348}, {'value': '15%', 'count': 2385}, {'value': '20%', 'count': 2363}, {'value': '50', 'count': 2312}, {'value': '75', 'count': 2288}, {'value': '75%', 'count': 760}], 'count_uniques': 11}}}
df.cols.names()
['id', 'transaction_date', 'name', 'price', 'discount']
df.rows.count()
92780
df.cols.hist("id")
asdfasdf
{'hist': {'id': [{'lower': 1.0, 'upper': 2812.4848484848485, 'count': 2812}, {'lower': 2812.4848484848485, 'upper': 5623.969696969697, 'count': 2811}, {'lower': 5623.969696969697, 'upper': 8435.454545454546, 'count': 2812}, {'lower': 8435.454545454546, 'upper': 11246.939393939394, 'count': 2811}, {'lower': 11246.939393939394, 'upper': 14058.424242424242, 'count': 2812}, {'lower': 14058.424242424242, 'upper': 16869.909090909092, 'count': 2811}, {'lower': 16869.909090909092, 'upper': 19681.39393939394, 'count': 2812}, {'lower': 19681.39393939394, 'upper': 22492.878787878788, 'count': 2811}, {'lower': 22492.878787878788, 'upper': 25304.363636363636, 'count': 2812}, {'lower': 25304.363636363636, 'upper': 28115.848484848484, 'count': 2811}, {'lower': 28115.848484848484, 'upper': 30927.333333333332, 'count': 2812}, {'lower': 30927.333333333332, 'upper': 33738.818181818184, 'count': 2811}, {'lower': 33738.818181818184, 'upper': 36550.30303030303, 'count': 2812}, {'lower': 36550.30303030303, 'upper': 39361.78787878788, 'count': 2811}, {'lower': 39361.78787878788, 'upper': 42173.27272727273, 'count': 2812}, {'lower': 42173.27272727273, 'upper': 44984.757575757576, 'count': 2811}, {'lower': 44984.757575757576, 'upper': 47796.242424242424, 'count': 2812}, {'lower': 47796.242424242424, 'upper': 50607.72727272727, 'count': 2811}, {'lower': 50607.72727272727, 'upper': 53419.21212121212, 'count': 2812}, {'lower': 53419.21212121212, 'upper': 56230.69696969697, 'count': 2811}, {'lower': 56230.69696969697, 'upper': 59042.181818181816, 'count': 2812}, {'lower': 59042.181818181816, 'upper': 61853.666666666664, 'count': 2811}, {'lower': 61853.666666666664, 'upper': 64665.15151515151, 'count': 2812}, {'lower': 64665.15151515151, 'upper': 67476.63636363637, 'count': 2811}, {'lower': 67476.63636363637, 'upper': 70288.12121212122, 'count': 2812}, {'lower': 70288.12121212122, 'upper': 73099.60606060606, 'count': 2811}, {'lower': 73099.60606060606, 'upper': 75911.09090909091, 'count': 2812}, {'lower': 75911.09090909091, 'upper': 78722.57575757576, 'count': 2811}, {'lower': 78722.57575757576, 'upper': 81534.06060606061, 'count': 2812}, {'lower': 81534.06060606061, 'upper': 84345.54545454546, 'count': 2811}, {'lower': 84345.54545454546, 'upper': 87157.0303030303, 'count': 2812}, {'lower': 87157.0303030303, 'upper': 89968.51515151515, 'count': 2811}, {'lower': 89968.51515151515, 'upper': 92780.0, 'count': 2812}]}}
df.cols.hist("price")
asdfasdf
{'hist': {'price': [{'lower': 4.99, 'upper': 17.065151515151516, 'count': 1873}, {'lower': 17.065151515151516, 'upper': 29.14030303030303, 'count': 2187}, {'lower': 29.14030303030303, 'upper': 41.21545454545455, 'count': 3685}, {'lower': 41.21545454545455, 'upper': 53.290606060606066, 'count': 4540}, {'lower': 53.290606060606066, 'upper': 65.36575757575758, 'count': 3932}, {'lower': 65.36575757575758, 'upper': 77.44090909090909, 'count': 2755}, {'lower': 77.44090909090909, 'upper': 89.5160606060606, 'count': 3402}, {'lower': 89.5160606060606, 'upper': 101.59121212121212, 'count': 3690}, {'lower': 101.59121212121212, 'upper': 113.66636363636364, 'count': 3344}, {'lower': 113.66636363636364, 'upper': 125.74151515151516, 'count': 3660}, {'lower': 125.74151515151516, 'upper': 137.8166666666667, 'count': 2766}, {'lower': 137.8166666666667, 'upper': 149.8918181818182, 'count': 3260}, {'lower': 149.8918181818182, 'upper': 161.96696969696973, 'count': 3719}, {'lower': 161.96696969696973, 'upper': 174.04212121212123, 'count': 3332}, {'lower': 174.04212121212123, 'upper': 186.11727272727276, 'count': 3764}, {'lower': 186.11727272727276, 'upper': 198.19242424242427, 'count': 3120}, {'lower': 198.19242424242427, 'upper': 210.26757575757577, 'count': 2965}, {'lower': 210.26757575757577, 'upper': 222.3427272727273, 'count': 1947}, {'lower': 222.3427272727273, 'upper': 234.4178787878788, 'count': 2299}, {'lower': 234.4178787878788, 'upper': 246.49303030303034, 'count': 2587}, {'lower': 246.49303030303034, 'upper': 258.5681818181818, 'count': 2417}, {'lower': 258.5681818181818, 'upper': 270.6433333333334, 'count': 2537}, {'lower': 270.6433333333334, 'upper': 282.7184848484849, 'count': 1919}, {'lower': 282.7184848484849, 'upper': 294.7936363636364, 'count': 2234}, {'lower': 294.7936363636364, 'upper': 306.8687878787879, 'count': 2539}, {'lower': 306.8687878787879, 'upper': 318.94393939393944, 'count': 2340}, {'lower': 318.94393939393944, 'upper': 331.01909090909095, 'count': 2555}, {'lower': 331.01909090909095, 'upper': 343.09424242424245, 'count': 2167}, {'lower': 343.09424242424245, 'upper': 355.16939393939396, 'count': 2652}, {'lower': 355.16939393939396, 'upper': 367.2445454545455, 'count': 1948}, {'lower': 367.2445454545455, 'upper': 379.319696969697, 'count': 2271}, {'lower': 379.319696969697, 'upper': 391.3948484848485, 'count': 2534}, {'lower': 391.3948484848485, 'upper': 403.47, 'count': 1840}]}}
%%time
df.cols.hist()
distributed.worker - WARNING - Compute Failed Function: execute_task args: ((<function apply at 0x000001A39503C0D8>, <function _reduction_aggregate at 0x000001A3967F6048>, [(<function _concat at 0x000001A396571438>, [(<function apply at 0x000001A39503C0D8>, <function _reduction_chunk at 0x000001A3967F4EE8>, [(subgraph_callable, 'getitem-d88a0ae422f4c1a9d42629e3644fd1fa', (<function pandas_read_text at 0x000001A396830048>, <function read_csv at 0x000001A3961CC0D8>, (<function read_block_from_file at 0x000001A39694BD38>, <OpenFile 'C:/Users/argenisleon/Documents/Optimus/examples/store.csv'>, 0, 64000000, b'\n'), b'id,transaction_date,name,price,discount\n', {'sep': ',', 'header': 0, 'encoding': 'utf-8', 'quoting': 0, 'error_bad_lines': False, 'keep_default_na': True, 'na_values': None, 'engine': 'c', 'na_filter': False, 'low_memory': False, 'dtype': 'str'}, {'id': dtype('O'), 'transaction_date': dtype('O'), 'name': dtype('O'), 'price': dtype('O'), 'discount': dtype('O')}, ['id', 'transaction_date', 'name', 'price', 'discount'], False, False, None), 'transaction kwargs: {} Exception: ValueError("could not convert string to float: '1/7/2019'")
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <timed eval> in <module> ~\Documents\Optimus\optimus\engines\dask\columns.py in hist(self, columns, buckets, compute) 37 dfd = df.data[col_name].astype("float") 38 ---> 39 if len(dfd) > 0: 40 _count, _bins = dask.compute(da.histogram(dfd, bins=buckets, range=[dfd.min(), dfd.max()]))[0] 41 result[col_name] = [ ~\Anaconda3\lib\site-packages\dask\dataframe\core.py in __len__(self) 556 def __len__(self): 557 return self.reduction( --> 558 len, np.sum, token="len", meta=int, split_every=False 559 ).compute() 560 ~\Anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs) 281 dask.base.compute 282 """ --> 283 (result,) = compute(self, traverse=False, **kwargs) 284 return result 285 ~\Anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs) 563 postcomputes.append(x.__dask_postcompute__()) 564 --> 565 results = schedule(dsk, keys, **kwargs) 566 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)]) 567 ~\Anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs) 2710 should_rejoin = False 2711 try: -> 2712 results = self.gather(packed, asynchronous=asynchronous, direct=direct) 2713 finally: 2714 for f in futures.values(): ~\Anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, direct, asynchronous) 1989 direct=direct, 1990 local_worker=local_worker, -> 1991 asynchronous=asynchronous, 1992 ) 1993 ~\Anaconda3\lib\site-packages\distributed\client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs) 832 else: 833 return sync( --> 834 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs 835 ) 836 ~\Anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, callback_timeout, *args, **kwargs) 337 if error[0]: 338 typ, exc, tb = error[0] --> 339 raise exc.with_traceback(tb) 340 else: 341 return result[0] ~\Anaconda3\lib\site-packages\distributed\utils.py in f() 321 if callback_timeout is not None: 322 future = asyncio.wait_for(future, callback_timeout) --> 323 result[0] = yield future 324 except Exception as exc: 325 error[0] = sys.exc_info() ~\Anaconda3\lib\site-packages\tornado\gen.py in run(self) 733 734 try: --> 735 value = future.result() 736 except Exception: 737 exc_info = sys.exc_info() ~\Anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker) 1848 exc = CancelledError(key) 1849 else: -> 1850 raise exception.with_traceback(traceback) 1851 raise exc 1852 if errors == "skip": ~\Anaconda3\lib\site-packages\dask\optimization.py in __call__(self, *args) 961 if not len(args) == len(self.inkeys): 962 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args))) --> 963 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args))) 964 965 def __reduce__(self): ~\Anaconda3\lib\site-packages\dask\core.py in get(dsk, out, cache) 149 for key in toposort(dsk): 150 task = dsk[key] --> 151 result = _execute_task(task, cache) 152 cache[key] = result 153 result = _execute_task(out, cache) ~\Anaconda3\lib\site-packages\dask\core.py in _execute_task(arg, cache, dsk) 119 # temporaries by their reference count and can execute certain 120 # operations in-place. --> 121 return func(*(_execute_task(a, cache) for a in args)) 122 elif not ishashable(arg): 123 return arg ~\Anaconda3\lib\site-packages\dask\utils.py in apply(func, args, kwargs) 33 def apply(func, args, kwargs=None): 34 if kwargs: ---> 35 return func(*args, **kwargs) 36 else: 37 return func(*args) ~\Anaconda3\lib\site-packages\dask\utils.py in __call__(self, obj, *args, **kwargs) 899 900 def __call__(self, obj, *args, **kwargs): --> 901 return getattr(obj, self.method)(*args, **kwargs) 902 903 def __reduce__(self): ~\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors) 5870 else: 5871 # else, only a single dtype is given -> 5872 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 5873 return self._constructor(new_data).__finalize__(self, method="astype") 5874 ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, copy, errors) 629 self, dtype, copy: bool = False, errors: str = "raise" 630 ) -> "BlockManager": --> 631 return self.apply("astype", dtype=dtype, copy=copy, errors=errors) 632 633 def convert( ~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, align_keys, ignore_failures, **kwargs) 425 applied = b.apply(f, **kwargs) 426 else: --> 427 applied = getattr(b, f)(**kwargs) 428 except (TypeError, NotImplementedError): 429 if not ignore_failures: ~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors) 671 vals1d = values.ravel() 672 try: --> 673 values = astype_nansafe(vals1d, dtype, copy=True) 674 except (ValueError, TypeError): 675 # e.g. astype_nansafe can fail on object-dtype of strings ~\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna) 1095 if copy or is_object_dtype(arr) or is_object_dtype(dtype): 1096 # Explicit copy, or required since NumPy can't view from / to object. -> 1097 return arr.astype(dtype, copy=True) 1098 1099 return arr.view(dtype) ValueError: could not convert string to float: '1/7/2019'
# import sys,os,os.path
# os.environ
def xray(df, col_name):
for i in df[col_name]:
print(i, type(i))
op = Optimus("dask", threads_per_worker=8, n_workers=1, comm=True)
preview_df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv", n_rows=35).ext.cache()
C:\Users\argenisleon\Anaconda3\lib\site-packages\distributed\dashboard\core.py:79: UserWarning: Port 8787 is already in use. Perhaps you already have a cluster running? Hosting the diagnostics dashboard on a random port instead. warnings.warn("\n" + msg) C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 35 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
import fastnumbers
fastnumbers.fast_float(None)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-29-8e1775e35bdf> in <module> 1 import fastnumbers ----> 2 fastnumbers.fast_float(None) TypeError: float() argument must be a string or a number, not 'NoneType'
f = df.cols.set(value='df["ticket_price"]*df["discount"]', where='df["ticket_price"]!=None', output_cols="ticket_price").ext.cache()
_output = df.ext.profile(columns="*", output="json")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
_output = {**preview_df.ext.to_json("*"), "meta": preview_df.meta.get() if (preview_df.meta and preview_df.meta.get) else {} }
_output = preview_df.ext.profile(columns="*", output="json")
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1591123129359.csv").ext.cache()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.profile(columns="*", output="json")
_output = df.ext.set_buffer("*")
_output = df.ext.buffer_window("*", 0, 17).ext.to_json("*")
print(_output)
_output = df.ext.set_buffer("*")
_output = df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]', where='df["ticket_price"]!=None', output_cols="new ticket_price").ext.to_json("*")
_df_profile = df.ext.buffer_window("*").cols.set(value='df["ticket_price"]', where='df["ticket_price"]!=None', output_cols="new ticket_price")
_output = { "profile": _df_profile.ext.profile(["new ticket_price"], output="json")}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'sample': {'columns': [{'title': 'customer_id'}, {'title': 'transactoin_date'}, {'title': 'ticket_price'}, {'title': 'discount'}, {'title': 'product'}, {'title': 'info'}], 'value': [['0f345kjh345oiuy345', '2010/08/19', '29.99', nan, 'platinum', '1'], ['0fju234978rfjkhsdf', '2012/01/05', '29.99', nan, 'platinum', '2'], ['0f34ruiy23e78y2r', '2009/08/11', '29.99', nan, 'platinum', '3'], ['0fue298y2r23r23r2', '2010/08/19', '29.99', nan, 'platinum', '4'], ['0f345kjh345oiuy346', '2010/08/19', '29.99', nan, 'platinum', '5'], ['0fju234978rfjkhsdf', '2010/08/19', '9.99', '5%', 'basic', '6'], ['0f34ruiy23e78y2r', '2010/08/19', '9.99', nan, 'basic', '7'], ['0fue298y2r23r23r3', '2010/08/19', '9.99', nan, 'basic', '8'], ['0f345kjh345oiuy347', '2010/08/19', '9.99', nan, 'basic', '9'], ['0fju234978rfjkhsdf', '2011/08/11', '9.99', '5%', 'basic', '10'], ['0f34ruiy23e78y2r', '2015/08/09', '9.99', '5%', 'basic', '10'], ['0fue298y2r23r23r4', nan, '9.91', nan, 'basic', '10'], ['0f345kjh345oiuy348', nan, '9.99', nan, 'basic', '10'], ['0fju234978rfjkhsdf', nan, '14.99', '5%', 'deluxe', '10'], ['0f34ruiy23e78y2r', nan, '14.99', nan, 'deluxe', '10'], ['0fue298y2r23r23r5', nan, '14.99', '5%', 'deluxe', '10'], ['0f345kjh345oiuy349', nan, '14.99', nan, 'deluxe', 'a']]}} pdf ticket_price 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 a 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 Name: new ticket_price, dtype: float64 pdf ticket_price 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 a 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 Name: new ticket_price, dtype: float64 dtype int {'new ticket_price': {'match': 17, 'missing': 0, 'mismatch': 17}}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r))) C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 500000 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df.compute().ext.head()
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5% | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5% | basic | 10 |
_output = df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]*df["discount"]', where='df["ticket_price"]!=None', output_cols="new ticket_price").ext.to_json("*")
pdf ticket_price 0 29.99 1 29.99 2 29.99 3 29.99 4 29.99 5 9.99 6 9.99 7 9.99 8 9.99 9 9.99 10 9.99 11 9.91 12 9.99 13 14.99 14 14.99 15 14.99 16 14.99 a 0 899.4001 1 899.4001 2 899.4001 3 899.4001 4 899.4001 5 99.8001 6 99.8001 7 99.8001 8 99.8001 9 99.8001 10 99.8001 11 98.2081 12 99.8001 13 224.7001 14 224.7001 15 224.7001 16 224.7001 Name: new ticket_price, dtype: float64
df.ext.buffer_window("*", 0, 17).cols.set(value='df["ticket_price"]*df["ticket_price"]',output_cols="new 1")
from optimus import Optimus
op = Optimus("dask", n_workers=1, threads_per_worker=8, processes=False, memory_limit="3G", comm=True)
%%time
df = op.load.file("data/dataset-transactions.csv").ext.cache()
# df = df.ext.optimize()
df = df.ext.repartition(1).ext.cache()
df.ext.display(20)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (object)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
0f34ruiy23e78y2r
|
2015/08/09
|
9.99
|
5%
|
basic
|
10
|
0fue298y2r23r23r4
|
nan
|
9.91
|
nan
|
basic
|
10
|
0f345kjh345oiuy348
|
nan
|
9.99
|
nan
|
basic
|
10
|
0fju234978rfjkhsdf
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f34ruiy23e78y2r
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
0fue298y2r23r23r5
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f345kjh345oiuy349
|
nan
|
14.99
|
nan
|
deluxe
|
a
|
Wall time: 163 ms
# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
# df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()
# # result = df.ext.profile(columns=columns, bins=bins, output=output, flush=flush, size=size)
# print(cols_and_inferred_dtype)
# df.cols.infer_profiler_dtypes("*")
p = df.ext.profile("*", flush=True)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
AAAA {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\r\n', 'quotechar': '"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 16.185, 'count': 0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}} BBBB {'file_name': 'dataset-transactions.csv', 'mime_info': [{'mime': 'text/plain', 'encoding': 'us-ascii', 'file_ext': 'csv', 'file_type': 'csv', 'properties': {'delimiter': ',', 'doublequote': False, 'escapechar': None, 'lineterminator': '\r\n', 'quotechar': '"', 'quoting': 0, 'skipinitialspace': False}}], 'transformations': {'actions': [{'profiler_dtype': 'customer_id'}, {'profiler_dtype': 'transactoin_date'}, {'profiler_dtype': 'ticket_price'}, {'profiler_dtype': 'discount'}, {'profiler_dtype': 'product'}, {'profiler_dtype': 'info'}]}, 'profile': {'columns': {'customer_id': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'transactoin_date': {'stats': {'mismatch': 11, 'missing': 6, 'match': 0, 'frequency': [{'value': '2010/08/19', 'count': 7}, {'value': '2015/08/09', 'count': 1}, {'value': '2012/01/05', 'count': 1}, {'value': '2011/08/11', 'count': 1}, {'value': '2009/08/11', 'count': 1}], 'count_uniques': 5}, 'dtype': 'object', 'profiler_dtype': 'date'}, 'ticket_price': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'hist': [{'lower': 9.91, 'upper': 10.5375, 'count': 8}, {'lower': 10.5375, 'upper': 11.165, 'count': 0}, {'lower': 11.165, 'upper': 11.7925, 'count': 0}, {'lower': 11.7925, 'upper': 12.42, 'count': 0}, {'lower': 12.42, 'upper': 13.0475, 'count': 0}, {'lower': 13.0475, 'upper': 13.675, 'count': 0}, {'lower': 13.675, 'upper': 14.3025, 'count': 0}, {'lower': 14.3025, 'upper': 14.93, 'count': 0}, {'lower': 14.93, 'upper': 15.5575, 'count': 4}, {'lower': 15.5575, 'upper': 16.185, 'count': 0}, {'lower': 16.185, 'upper': 16.8125, 'count': 0}, {'lower': 16.8125, 'upper': 17.439999999999998, 'count': 0}, {'lower': 17.439999999999998, 'upper': 18.0675, 'count': 0}, {'lower': 18.0675, 'upper': 18.695, 'count': 0}, {'lower': 18.695, 'upper': 19.322499999999998, 'count': 0}, {'lower': 19.322499999999998, 'upper': 19.95, 'count': 0}, {'lower': 19.95, 'upper': 20.5775, 'count': 0}, {'lower': 20.5775, 'upper': 21.205, 'count': 0}, {'lower': 21.205, 'upper': 21.8325, 'count': 0}, {'lower': 21.8325, 'upper': 22.46, 'count': 0}, {'lower': 22.46, 'upper': 23.0875, 'count': 0}, {'lower': 23.0875, 'upper': 23.715, 'count': 0}, {'lower': 23.715, 'upper': 24.3425, 'count': 0}, {'lower': 24.3425, 'upper': 24.97, 'count': 0}, {'lower': 24.97, 'upper': 25.597499999999997, 'count': 0}, {'lower': 25.597499999999997, 'upper': 26.224999999999998, 'count': 0}, {'lower': 26.224999999999998, 'upper': 26.8525, 'count': 0}, {'lower': 26.8525, 'upper': 27.48, 'count': 0}, {'lower': 27.48, 'upper': 28.107499999999998, 'count': 0}, {'lower': 28.107499999999998, 'upper': 28.735, 'count': 0}, {'lower': 28.735, 'upper': 29.362499999999997, 'count': 0}, {'lower': 29.362499999999997, 'upper': 29.99, 'count': 5}], 'count_uniques': 4}, 'dtype': 'object', 'profiler_dtype': 'decimal'}, 'discount': {'stats': {'mismatch': 0, 'missing': 12, 'match': 5, 'frequency': [{'value': '5%', 'count': 5}], 'count_uniques': 1}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'product': {'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': 'basic', 'count': 8}, {'value': 'platinum', 'count': 5}, {'value': 'deluxe', 'count': 4}], 'count_uniques': 3}, 'dtype': 'object', 'profiler_dtype': 'string'}, 'info': {'stats': {'mismatch': 1, 'missing': 0, 'match': 16}, 'dtype': 'object', 'profiler_dtype': 'int'}}, 'name': None, 'file_name': 'dataset-transactions.csv', 'summary': {'cols_count': 6, 'rows_count': 17, 'dtypes_list': ['object'], 'total_count_dtypes': 1, 'missing_count': 0, 'p_missing': 0.0}}}
df.meta.get()["profile"]["columns"]["customer_id"]
{'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}
# df, p = df.ext.cast_and_profile("*")
assert(p["columns"]["ticket_price"]["stats"]["match"]==17)
assert(p["columns"]["ticket_price"]["stats"]["mismatch"]==0)
assert(p["columns"]["ticket_price"]["stats"]["missing"]==0)
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'string', 'product': 'string', 'info': 'int'}
df = df.cols.replace("discount", search=["%"], replace_by="", search_by="chars", ignore_case=True)
df.compute()
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5 | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5 | basic | 10 |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 9.99 | 5 | basic | 10 |
11 | 0fue298y2r23r23r4 | NaN | 9.91 | NaN | basic | 10 |
12 | 0f345kjh345oiuy348 | NaN | 9.99 | NaN | basic | 10 |
13 | 0fju234978rfjkhsdf | NaN | 14.99 | 5 | deluxe | 10 |
14 | 0f34ruiy23e78y2r | NaN | 14.99 | NaN | deluxe | 10 |
15 | 0fue298y2r23r23r5 | NaN | 14.99 | 5 | deluxe | 10 |
16 | 0f345kjh345oiuy349 | NaN | 14.99 | NaN | deluxe | a |
# df.cols.infer_profiler_dtypes("*")
from optimus.infer import Infer
df.ext.head("*",50).applymap(Infer.parse_pandas)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 50 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | string | date | decimal | null | string | int |
1 | string | date | decimal | null | string | int |
2 | string | date | decimal | null | string | int |
3 | string | date | decimal | null | string | int |
4 | string | date | decimal | null | string | int |
5 | string | date | decimal | int | string | int |
6 | string | date | decimal | null | string | int |
7 | string | date | decimal | null | string | int |
8 | string | date | decimal | null | string | int |
9 | string | date | decimal | int | string | int |
10 | string | date | decimal | int | string | int |
11 | string | null | decimal | null | string | int |
12 | string | null | decimal | null | string | int |
13 | string | null | decimal | int | string | int |
14 | string | null | decimal | null | string | int |
15 | string | null | decimal | int | string | int |
16 | string | null | decimal | null | string | string |
xray(df, "discount")
nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> 5 <class 'str'> nan <class 'float'>
p = df.ext.profile("discount")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'int', 'product': 'string', 'info': 'int'}
# df["transactoin_date"].compute()
df.compute()
customer_id | transactoin_date | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5 | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5 | basic | 10 |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 9.99 | 5 | basic | 10 |
11 | 0fue298y2r23r23r4 | NaN | 9.91 | NaN | basic | 10 |
12 | 0f345kjh345oiuy348 | NaN | 9.99 | NaN | basic | 10 |
13 | 0fju234978rfjkhsdf | NaN | 14.99 | 5 | deluxe | 10 |
14 | 0f34ruiy23e78y2r | NaN | 14.99 | NaN | deluxe | 10 |
15 | 0fue298y2r23r23r5 | NaN | 14.99 | 5 | deluxe | 10 |
16 | 0f345kjh345oiuy349 | NaN | 14.99 | NaN | deluxe | a |
xray(df, "discount")
nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> 5 <class 'str'> nan <class 'float'> nan <class 'float'> 5 <class 'str'> nan <class 'float'> 5 <class 'str'> nan <class 'float'>
df = df.cols.unnest("transactoin_date", separator="/", splits=2, output_cols="transactoin_date").ext.cache()
_output = df.ext.profile(columns="*", output="json", flush=True)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df.meta.get()["profile"]["columns"]["customer_id"]
{'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'customer_id': 'string', 'transactoin_date': 'date', 'transactoin_date_0': 'int', 'transactoin_date_1': 'int', 'ticket_price': 'decimal', 'discount': 'int', 'product': 'string', 'info': 'int'}
df.compute()
customer_id | transactoin_date | transactoin_date_0 | transactoin_date_1 | ticket_price | discount | product | info | |
---|---|---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 2010 | 08 | 29.99 | NaN | platinum | 1 |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 2012 | 01 | 29.99 | NaN | platinum | 2 |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 2009 | 08 | 29.99 | NaN | platinum | 3 |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 2010 | 08 | 29.99 | NaN | platinum | 4 |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 2010 | 08 | 29.99 | NaN | platinum | 5 |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 2010 | 08 | 9.99 | 5 | basic | 6 |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 2010 | 08 | 9.99 | NaN | basic | 7 |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 2010 | 08 | 9.99 | NaN | basic | 8 |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 2010 | 08 | 9.99 | NaN | basic | 9 |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 2011 | 08 | 9.99 | 5 | basic | 10 |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 2015 | 08 | 9.99 | 5 | basic | 10 |
11 | 0fue298y2r23r23r4 | NaN | nan | None | 9.91 | NaN | basic | 10 |
12 | 0f345kjh345oiuy348 | NaN | nan | None | 9.99 | NaN | basic | 10 |
13 | 0fju234978rfjkhsdf | NaN | nan | None | 14.99 | 5 | deluxe | 10 |
14 | 0f34ruiy23e78y2r | NaN | nan | None | 14.99 | NaN | deluxe | 10 |
15 | 0fue298y2r23r23r5 | NaN | nan | None | 14.99 | 5 | deluxe | 10 |
16 | 0f345kjh345oiuy349 | NaN | nan | None | 14.99 | NaN | deluxe | a |
df.meta.get()["profile"]["columns"]["customer_id"]
{'stats': {'mismatch': 0, 'missing': 0, 'match': 17, 'frequency': [{'value': '0fju234978rfjkhsdf', 'count': 4}, {'value': '0f34ruiy23e78y2r', 'count': 4}, {'value': '0fue298y2r23r23r5', 'count': 1}, {'value': '0fue298y2r23r23r4', 'count': 1}, {'value': '0fue298y2r23r23r3', 'count': 1}, {'value': '0fue298y2r23r23r2', 'count': 1}, {'value': '0f345kjh345oiuy349', 'count': 1}, {'value': '0f345kjh345oiuy348', 'count': 1}, {'value': '0f345kjh345oiuy347', 'count': 1}, {'value': '0f345kjh345oiuy346', 'count': 1}, {'value': '0f345kjh345oiuy345', 'count': 1}], 'count_uniques': 11}, 'dtype': 'object', 'profiler_dtype': 'string'}
df1 = df.cols.set(value='df["ticket_price"]*df["discount"]*10', output_cols="new ticket_price")
cols df["ticket_price"]*df["discount"]*10 columns ['ticket_price', 'discount'] cols None
df.cols.names()
['customer_id', 'transactoin_date', 'ticket_price', 'discount', 'product', 'info']
df1.compute()
customer_id | transactoin_date | ticket_price | discount | product | info | new ticket_price | |
---|---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010/08/19 | 29.99 | NaN | platinum | 1 | NaN |
1 | 0fju234978rfjkhsdf | 2012/01/05 | 29.99 | NaN | platinum | 2 | NaN |
2 | 0f34ruiy23e78y2r | 2009/08/11 | 29.99 | NaN | platinum | 3 | NaN |
3 | 0fue298y2r23r23r2 | 2010/08/19 | 29.99 | NaN | platinum | 4 | NaN |
4 | 0f345kjh345oiuy346 | 2010/08/19 | 29.99 | NaN | platinum | 5 | NaN |
5 | 0fju234978rfjkhsdf | 2010/08/19 | 9.99 | 5% | basic | 6 | NaN |
6 | 0f34ruiy23e78y2r | 2010/08/19 | 9.99 | NaN | basic | 7 | NaN |
7 | 0fue298y2r23r23r3 | 2010/08/19 | 9.99 | NaN | basic | 8 | NaN |
8 | 0f345kjh345oiuy347 | 2010/08/19 | 9.99 | NaN | basic | 9 | NaN |
9 | 0fju234978rfjkhsdf | 2011/08/11 | 9.99 | 5% | basic | 10 | NaN |
10 | 0f34ruiy23e78y2r | 2015/08/09 | 9.99 | 5% | basic | 10 | NaN |
11 | 0fue298y2r23r23r4 | NaN | 9.91 | NaN | basic | 10 | NaN |
12 | 0f345kjh345oiuy348 | NaN | 9.99 | NaN | basic | 10 | NaN |
13 | 0fju234978rfjkhsdf | NaN | 14.99 | 5% | deluxe | 10 | NaN |
14 | 0f34ruiy23e78y2r | NaN | 14.99 | NaN | deluxe | 10 | NaN |
15 | 0fue298y2r23r23r5 | NaN | 14.99 | 5% | deluxe | 10 | NaN |
16 | 0f345kjh345oiuy349 | NaN | 14.99 | NaN | deluxe | a | NaN |
df.cols.set(value='mask["ticket_price"]+mask["transactoin_date"]', where='df["ticket_price"]!=None', output_cols="new ticket_price_2").compute()
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590599684769.csv").ext.cache()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.cast_and_profile(columns="*", output="json")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
dtype string dtype date dtype decimal dtype string dtype string dtype int dtypedtype string dtype date dtype decimal dtype string dtype dtype string dtype string string dtype int dtype string dtype datedtype string dtype decimal dtype string dtype string date dtype decimal dtype string dtype date dtype decimaldtype string dtype datedtype dtypedtypedtype date dtype decimal dtype string dtypedtype date dtype decimal dtype string dtype decimal dtype string int stringdtypestring string dtype string dtype date dtype int string dtype int dtypedtypedtype string dtype string dtype int dtype decimal dtype string dtype string dtype int int string dtype int
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
unmasked = "DD/MM/YY"
masked = unmasked[1:4]
masked
'ABB'
print(_output)
{"columns": {"customer_id": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object"}, "transactoin_date": {"profiler_dtype": "date", "stats": {"mismatch": 11, "missing": 6, "match": 0, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}, "dtype": "object"}, "ticket_price": {"profiler_dtype": "decimal", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 9.91, "upper": 10.5375, "count": 0}, {"lower": 10.5375, "upper": 11.165, "count": 0}, {"lower": 11.165, "upper": 11.7925, "count": 0}, {"lower": 11.7925, "upper": 12.42, "count": 0}, {"lower": 12.42, "upper": 13.0475, "count": 0}, {"lower": 13.0475, "upper": 13.675, "count": 0}, {"lower": 13.675, "upper": 14.3025, "count": 0}, {"lower": 14.3025, "upper": 14.93, "count": 0}, {"lower": 14.93, "upper": 15.5575, "count": 3}, {"lower": 15.5575, "upper": 16.185, "count": 0}, {"lower": 16.185, "upper": 16.8125, "count": 0}, {"lower": 16.8125, "upper": 17.439999999999998, "count": 0}, {"lower": 17.439999999999998, "upper": 18.0675, "count": 0}, {"lower": 18.0675, "upper": 18.695, "count": 0}, {"lower": 18.695, "upper": 19.322499999999998, "count": 0}, {"lower": 19.322499999999998, "upper": 19.95, "count": 0}, {"lower": 19.95, "upper": 20.5775, "count": 0}, {"lower": 20.5775, "upper": 21.205, "count": 0}, {"lower": 21.205, "upper": 21.8325, "count": 0}, {"lower": 21.8325, "upper": 22.46, "count": 0}, {"lower": 22.46, "upper": 23.0875, "count": 0}, {"lower": 23.0875, "upper": 23.715, "count": 0}, {"lower": 23.715, "upper": 24.3425, "count": 0}, {"lower": 24.3425, "upper": 24.97, "count": 0}, {"lower": 24.97, "upper": 25.597499999999997, "count": 0}, {"lower": 25.597499999999997, "upper": 26.224999999999998, "count": 0}, {"lower": 26.224999999999998, "upper": 26.8525, "count": 0}, {"lower": 26.8525, "upper": 27.48, "count": 0}, {"lower": 27.48, "upper": 28.107499999999998, "count": 0}, {"lower": 28.107499999999998, "upper": 28.735, "count": 0}, {"lower": 28.735, "upper": 29.362499999999997, "count": 0}, {"lower": 29.362499999999997, "upper": 29.99, "count": 0}], "count_uniques": 4}, "dtype": "object"}, "discount": {"profiler_dtype": "object", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "nan", "count": 12}, {"value": "5%", "count": 5}], "count_uniques": 2}, "dtype": "object"}, "product": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object"}, "info": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "object"}}, "name": null, "file_name": "tmpk28b8e9o.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object"], "total_count_dtypes": 1, "missing_count": 0, "p_missing": 0.0}}
# cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
# print(cols_and_inferred_dtype)
# df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()
df.compute()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
%%time
df.cols.cast(columns = {"INCIDENT_NUMBER":"int"}).compute()
Wall time: 589 ms
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | NaN | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | NaN | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | NaN | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | NaN | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | NaN | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | NaN | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | NaN | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | NaN | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
%%time
df.ext.cast_and_profile(columns="*", flush=True, output="json")
Wall time: 9.43 s
'{"columns": {"INCIDENT_NUMBER": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "I162030584", "count": 13}, {"value": "I152080623", "count": 11}, {"value": "I172013170", "count": 10}, {"value": "I182065208", "count": 10}, {"value": "I172096394", "count": 10}, {"value": "I162071327", "count": 9}, {"value": "I162001871", "count": 9}, {"value": "I172056883", "count": 9}, {"value": "I172022524", "count": 9}, {"value": "I172054429", "count": 9}, {"value": "I162098170", "count": 9}, {"value": "I162078338", "count": 8}, {"value": "I162074826", "count": 8}, {"value": "I162090278", "count": 8}, {"value": "I172069723", "count": 8}, {"value": "I162022140", "count": 8}, {"value": "I130041200-00", "count": 8}, {"value": "I162087224", "count": 8}, {"value": "I152076465", "count": 8}, {"value": "I162056703", "count": 8}, {"value": "I162064331", "count": 8}, {"value": "I162082917", "count": 8}, {"value": "I152105431", "count": 8}, {"value": "I172053616", "count": 8}, {"value": "I152101399", "count": 7}, {"value": "I152095733", "count": 7}, {"value": "I162054378", "count": 7}, {"value": "I152067057", "count": 7}, {"value": "I152096998", "count": 7}, {"value": "I172018004", "count": 7}, {"value": "I162083089", "count": 7}, {"value": "I162045680", "count": 7}, {"value": "I152071480", "count": 7}], "count_uniques": 282517}, "dtype": "object"}, "OFFENSE_CODE": {"profiler_dtype": "zip_code", "stats": {"mismatch": 251, "missing": 0, "match": 318822, "frequency": [{"value": "03006", "count": 18783}, {"value": "03115", "count": 18746}, {"value": "03831", "count": 16323}, {"value": "01402", "count": 15152}, {"value": "00802", "count": 14791}, {"value": "03301", "count": 13099}, {"value": "03410", "count": 11287}, {"value": "03114", "count": 11123}, {"value": "00617", "count": 9069}, {"value": "02647", "count": 9039}, {"value": "00614", "count": 8893}, {"value": "03201", "count": 8892}, {"value": "03125", "count": 8389}, {"value": "00613", "count": 7949}, {"value": "03802", "count": 6557}, {"value": "00619", "count": 5963}, {"value": "03803", "count": 5131}, {"value": "00413", "count": 4886}, {"value": "01102", "count": 4413}, {"value": "03502", "count": 4365}, {"value": "02629", "count": 4007}, {"value": "03501", "count": 3766}, {"value": "03207", "count": 3698}, {"value": "00724", "count": 3629}, {"value": "02610", "count": 3245}, {"value": "01106", "count": 3147}, {"value": "00301", "count": 3056}, {"value": "03801", "count": 2925}, {"value": "00423", "count": 2910}, {"value": "02900", "count": 2894}, {"value": "02907", "count": 2616}, {"value": "00520", "count": 2585}, {"value": "01849", "count": 2584}], "count_uniques": 263}, "dtype": "object"}, "OFFENSE_CODE_GROUP": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "Motor Vehicle Accident Response", "count": 37132}, {"value": "Larceny", "count": 25935}, {"value": "Medical Assistance", "count": 23540}, {"value": "Investigate Person", "count": 18750}, {"value": "Other", "count": 18075}, {"value": "Drug Violation", "count": 16548}, {"value": "Simple Assault", "count": 15826}, {"value": "Vandalism", "count": 15415}, {"value": "Verbal Disputes", "count": 13099}, {"value": "Towed", "count": 11287}, {"value": "Investigate Property", "count": 11124}, {"value": "Larceny From Motor Vehicle", "count": 10847}, {"value": "Property Lost", "count": 9751}, {"value": "Warrant Arrests", "count": 8407}, {"value": "Aggravated Assault", "count": 7807}, {"value": "Violations", "count": 6095}, {"value": "Fraud", "count": 5829}, {"value": "Residential Burglary", "count": 5606}, {"value": "Missing Person Located", "count": 4958}, {"value": "Auto Theft", "count": 4851}, {"value": "Robbery", "count": 4624}, {"value": "Harassment", "count": 4007}, {"value": "Property Found", "count": 3925}, {"value": "Missing Person Reported", "count": 3797}, {"value": "Confidence Games", "count": 3147}, {"value": "Police Service Incidents", "count": 2781}, {"value": "Disorderly Conduct", "count": 2611}, {"value": "Fire Related Reports", "count": 1920}, {"value": "Firearm Violations", "count": 1777}, {"value": "License Violation", "count": 1701}, {"value": "Restraining Order Violations", "count": 1607}, {"value": "Recovered Stolen Property", "count": 1455}, {"value": "Counterfeiting", "count": 1454}], "count_uniques": 67}, "dtype": "object"}, "OFFENSE_DESCRIPTION": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "SICK/INJURED/MEDICAL - PERSON", "count": 18783}, {"value": "INVESTIGATE PERSON", "count": 18754}, {"value": "M/V - LEAVING SCENE - PROPERTY DAMAGE", "count": 16323}, {"value": "VANDALISM", "count": 15154}, {"value": "ASSAULT SIMPLE - BATTERY", "count": 14791}, {"value": "VERBAL DISPUTE", "count": 13099}, {"value": "TOWED MOTOR VEHICLE", "count": 11287}, {"value": "INVESTIGATE PROPERTY", "count": 11124}, {"value": "LARCENY THEFT FROM BUILDING", "count": 9069}, {"value": "THREATS TO DO BODILY HARM", "count": 9042}, {"value": "LARCENY THEFT FROM MV - NON-ACCESSORY", "count": 8893}, {"value": "PROPERTY - LOST", "count": 8893}, {"value": "WARRANT ARREST", "count": 8407}, {"value": "LARCENY SHOPLIFTING", "count": 7949}, {"value": "M/V ACCIDENT - PROPERTY \\u00a0DAMAGE", "count": 6557}, {"value": "LARCENY ALL OTHERS", "count": 5963}, {"value": "M/V ACCIDENT - PERSONAL INJURY", "count": 5131}, {"value": "ASSAULT - AGGRAVATED - BATTERY", "count": 4886}, {"value": "FRAUD - FALSE PRETENSE / SCHEME", "count": 4413}, {"value": "MISSING PERSON - LOCATED", "count": 4365}, {"value": "HARASSMENT", "count": 4007}, {"value": "MISSING PERSON", "count": 3766}, {"value": "PROPERTY - FOUND", "count": 3698}, {"value": "AUTO THEFT", "count": 3630}, {"value": "TRESPASSING", "count": 3254}, {"value": "FRAUD - CREDIT CARD / ATM FRAUD", "count": 3147}, {"value": "ROBBERY - STREET", "count": 3056}, {"value": "M/V ACCIDENT - OTHER", "count": 2925}, {"value": "ASSAULT - AGGRAVATED", "count": 2910}, {"value": "VAL - VIOLATION OF AUTO LAW - OTHER", "count": 2894}, {"value": "VAL - OPERATING AFTER REV/SUSP.", "count": 2618}, {"value": "DRUGS - POSS CLASS B - COCAINE, ETC.", "count": 2591}, {"value": "BURGLARY - RESIDENTIAL - FORCE", "count": 2585}], "count_uniques": 244}, "dtype": "object"}, "DISTRICT": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 1765, "match": 317308, "frequency": [{"value": "B2", "count": 49945}, {"value": "C11", "count": 42530}, {"value": "D4", "count": 41915}, {"value": "A1", "count": 35717}, {"value": "B3", "count": 35442}, {"value": "C6", "count": 23460}, {"value": "D14", "count": 20127}, {"value": "E13", "count": 17536}, {"value": "E18", "count": 17348}, {"value": "A7", "count": 13544}, {"value": "E5", "count": 13239}, {"value": "A15", "count": 6505}], "count_uniques": 12}, "dtype": "object"}, "REPORTING_AREA": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 20250, "match": 298823, "hist": [{"lower": 0.0, "upper": 30.0625, "count": 1502}, {"lower": 30.0625, "upper": 60.125, "count": 740}, {"lower": 60.125, "upper": 90.1875, "count": 1376}, {"lower": 90.1875, "upper": 120.25, "count": 2618}, {"lower": 120.25, "upper": 150.3125, "count": 2175}, {"lower": 150.3125, "upper": 180.375, "count": 2034}, {"lower": 180.375, "upper": 210.4375, "count": 1349}, {"lower": 210.4375, "upper": 240.5, "count": 1116}, {"lower": 240.5, "upper": 270.5625, "count": 1501}, {"lower": 270.5625, "upper": 300.625, "count": 1968}, {"lower": 300.625, "upper": 330.6875, "count": 2684}, {"lower": 330.6875, "upper": 360.75, "count": 2336}, {"lower": 360.75, "upper": 390.8125, "count": 939}, {"lower": 390.8125, "upper": 420.875, "count": 1051}, {"lower": 420.875, "upper": 450.9375, "count": 2021}, {"lower": 450.9375, "upper": 481.0, "count": 1848}, {"lower": 481.0, "upper": 511.0625, "count": 996}, {"lower": 511.0625, "upper": 541.125, "count": 804}, {"lower": 541.125, "upper": 571.1875, "count": 897}, {"lower": 571.1875, "upper": 601.25, "count": 1146}, {"lower": 601.25, "upper": 631.3125, "count": 1610}, {"lower": 631.3125, "upper": 661.375, "count": 537}, {"lower": 661.375, "upper": 691.4375, "count": 423}, {"lower": 691.4375, "upper": 721.5, "count": 277}, {"lower": 721.5, "upper": 751.5625, "count": 337}, {"lower": 751.5625, "upper": 781.625, "count": 678}, {"lower": 781.625, "upper": 811.6875, "count": 1466}, {"lower": 811.6875, "upper": 841.75, "count": 455}, {"lower": 841.75, "upper": 871.8125, "count": 0}, {"lower": 871.8125, "upper": 901.875, "count": 245}, {"lower": 901.875, "upper": 931.9375, "count": 1546}, {"lower": 931.9375, "upper": 962.0, "count": 1210}], "count_uniques": 878}, "dtype": "object"}, "SHOOTING": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "nan", "count": 318054}, {"value": "Y", "count": 1019}], "count_uniques": 2}, "dtype": "object"}, "OCCURRED_ON_DATE": {"profiler_dtype": "date", "stats": {"mismatch": 319073, "missing": 0, "match": 0, "frequency": [{"value": "2017-06-01 00:00:00+00:00", "count": 29}, {"value": "2015-07-01 00:00:00+00:00", "count": 27}, {"value": "2016-08-01 00:00:00+00:00", "count": 27}, {"value": "2015-06-18 05:00:00+00:00", "count": 22}, {"value": "2017-08-01 00:00:00+00:00", "count": 22}, {"value": "2017-01-01 00:00:00+00:00", "count": 21}, {"value": "2016-04-01 00:00:00+00:00", "count": 20}, {"value": "2017-05-01 00:00:00+00:00", "count": 20}, {"value": "2015-12-07 11:38:00+00:00", "count": 20}, {"value": "2017-04-01 00:00:00+00:00", "count": 19}, {"value": "2016-09-01 00:00:00+00:00", "count": 19}, {"value": "2018-01-01 00:00:00+00:00", "count": 18}, {"value": "2016-11-01 00:00:00+00:00", "count": 18}, {"value": "2018-06-04 12:40:00+00:00", "count": 18}, {"value": "2015-12-01 00:00:00+00:00", "count": 18}, {"value": "2017-07-05 00:00:00+00:00", "count": 18}, {"value": "2017-11-01 00:00:00+00:00", "count": 17}, {"value": "2018-02-01 00:00:00+00:00", "count": 16}, {"value": "2016-02-01 00:00:00+00:00", "count": 16}, {"value": "2018-07-19 00:00:00+00:00", "count": 16}, {"value": "2017-03-01 00:00:00+00:00", "count": 15}, {"value": "2018-03-07 06:00:00+00:00", "count": 15}, {"value": "2018-08-18 08:45:00+00:00", "count": 15}, {"value": "2017-10-01 00:00:00+00:00", "count": 15}, {"value": "2016-12-15 06:00:00+00:00", "count": 14}, {"value": "2015-06-20 00:00:00+00:00", "count": 14}, {"value": "2016-06-24 17:30:00+00:00", "count": 14}, {"value": "2016-11-10 16:00:00+00:00", "count": 13}, {"value": "2018-04-02 00:00:00+00:00", "count": 13}, {"value": "2015-12-22 20:25:00+00:00", "count": 13}, {"value": "2018-01-03 18:00:00+00:00", "count": 13}, {"value": "2015-07-17 00:00:00+00:00", "count": 13}, {"value": "2016-04-20 11:07:00+00:00", "count": 13}], "count_uniques": 233229}, "dtype": "object"}, "YEAR": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 2015.0, "upper": 2015.09375, "count": 39649}, {"lower": 2015.09375, "upper": 2015.1875, "count": 0}, {"lower": 2015.1875, "upper": 2015.28125, "count": 0}, {"lower": 2015.28125, "upper": 2015.375, "count": 0}, {"lower": 2015.375, "upper": 2015.46875, "count": 0}, {"lower": 2015.46875, "upper": 2015.5625, "count": 0}, {"lower": 2015.5625, "upper": 2015.65625, "count": 0}, {"lower": 2015.65625, "upper": 2015.75, "count": 0}, {"lower": 2015.75, "upper": 2015.84375, "count": 0}, {"lower": 2015.84375, "upper": 2015.9375, "count": 0}, {"lower": 2015.9375, "upper": 2016.03125, "count": 184}, {"lower": 2016.03125, "upper": 2016.125, "count": 0}, {"lower": 2016.125, "upper": 2016.21875, "count": 0}, {"lower": 2016.21875, "upper": 2016.3125, "count": 0}, {"lower": 2016.3125, "upper": 2016.40625, "count": 0}, {"lower": 2016.40625, "upper": 2016.5, "count": 0}, {"lower": 2016.5, "upper": 2016.59375, "count": 0}, {"lower": 2016.59375, "upper": 2016.6875, "count": 0}, {"lower": 2016.6875, "upper": 2016.78125, "count": 0}, {"lower": 2016.78125, "upper": 2016.875, "count": 0}, {"lower": 2016.875, "upper": 2016.96875, "count": 0}, {"lower": 2016.96875, "upper": 2017.0625, "count": 39}, {"lower": 2017.0625, "upper": 2017.15625, "count": 0}, {"lower": 2017.15625, "upper": 2017.25, "count": 0}, {"lower": 2017.25, "upper": 2017.34375, "count": 0}, {"lower": 2017.34375, "upper": 2017.4375, "count": 0}, {"lower": 2017.4375, "upper": 2017.53125, "count": 0}, {"lower": 2017.53125, "upper": 2017.625, "count": 0}, {"lower": 2017.625, "upper": 2017.71875, "count": 0}, {"lower": 2017.71875, "upper": 2017.8125, "count": 0}, {"lower": 2017.8125, "upper": 2017.90625, "count": 0}, {"lower": 2017.90625, "upper": 2018.0, "count": 13}], "count_uniques": 4}, "dtype": "object"}, "MONTH": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 1.0, "upper": 1.34375, "count": 55}, {"lower": 1.34375, "upper": 1.6875, "count": 0}, {"lower": 1.6875, "upper": 2.03125, "count": 28}, {"lower": 2.03125, "upper": 2.375, "count": 0}, {"lower": 2.375, "upper": 2.71875, "count": 0}, {"lower": 2.71875, "upper": 3.0625, "count": 31}, {"lower": 3.0625, "upper": 3.40625, "count": 0}, {"lower": 3.40625, "upper": 3.75, "count": 0}, {"lower": 3.75, "upper": 4.09375, "count": 30}, {"lower": 4.09375, "upper": 4.4375, "count": 0}, {"lower": 4.4375, "upper": 4.78125, "count": 0}, {"lower": 4.78125, "upper": 5.125, "count": 16}, {"lower": 5.125, "upper": 5.46875, "count": 0}, {"lower": 5.46875, "upper": 5.8125, "count": 0}, {"lower": 5.8125, "upper": 6.15625, "count": 4184}, {"lower": 6.15625, "upper": 6.5, "count": 0}, {"lower": 6.5, "upper": 6.84375, "count": 0}, {"lower": 6.84375, "upper": 7.1875, "count": 8293}, {"lower": 7.1875, "upper": 7.53125, "count": 0}, {"lower": 7.53125, "upper": 7.875, "count": 0}, {"lower": 7.875, "upper": 8.21875, "count": 8298}, {"lower": 8.21875, "upper": 8.5625, "count": 0}, {"lower": 8.5625, "upper": 8.90625, "count": 0}, {"lower": 8.90625, "upper": 9.25, "count": 8334}, {"lower": 9.25, "upper": 9.59375, "count": 0}, {"lower": 9.59375, "upper": 9.9375, "count": 0}, {"lower": 9.9375, "upper": 10.28125, "count": 8151}, {"lower": 10.28125, "upper": 10.625, "count": 0}, {"lower": 10.625, "upper": 10.96875, "count": 0}, {"lower": 10.96875, "upper": 11.3125, "count": 2392}, {"lower": 11.3125, "upper": 11.65625, "count": 0}, {"lower": 11.65625, "upper": 12.0, "count": 73}], "count_uniques": 12}, "dtype": "object"}, "DAY_OF_WEEK": {"profiler_dtype": "date", "stats": {"mismatch": 319073, "missing": 0, "match": 0, "frequency": [{"value": "Friday", "count": 48495}, {"value": "Wednesday", "count": 46729}, {"value": "Thursday", "count": 46656}, {"value": "Tuesday", "count": 46383}, {"value": "Monday", "count": 45679}, {"value": "Saturday", "count": 44818}, {"value": "Sunday", "count": 40313}], "count_uniques": 7}, "dtype": "object"}, "HOUR": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "hist": [{"lower": 0.0, "upper": 0.71875, "count": 2069}, {"lower": 0.71875, "upper": 1.4375, "count": 1262}, {"lower": 1.4375, "upper": 2.15625, "count": 1040}, {"lower": 2.15625, "upper": 2.875, "count": 0}, {"lower": 2.875, "upper": 3.59375, "count": 611}, {"lower": 3.59375, "upper": 4.3125, "count": 402}, {"lower": 4.3125, "upper": 5.03125, "count": 461}, {"lower": 5.03125, "upper": 5.75, "count": 0}, {"lower": 5.75, "upper": 6.46875, "count": 606}, {"lower": 6.46875, "upper": 7.1875, "count": 1114}, {"lower": 7.1875, "upper": 7.90625, "count": 0}, {"lower": 7.90625, "upper": 8.625, "count": 1562}, {"lower": 8.625, "upper": 9.34375, "count": 1833}, {"lower": 9.34375, "upper": 10.0625, "count": 1951}, {"lower": 10.0625, "upper": 10.78125, "count": 0}, {"lower": 10.78125, "upper": 11.5, "count": 2028}, {"lower": 11.5, "upper": 12.21875, "count": 2330}, {"lower": 12.21875, "upper": 12.9375, "count": 0}, {"lower": 12.9375, "upper": 13.65625, "count": 2020}, {"lower": 13.65625, "upper": 14.375, "count": 2116}, {"lower": 14.375, "upper": 15.09375, "count": 1988}, {"lower": 15.09375, "upper": 15.8125, "count": 0}, {"lower": 15.8125, "upper": 16.53125, "count": 2415}, {"lower": 16.53125, "upper": 17.25, "count": 2625}, {"lower": 17.25, "upper": 17.96875, "count": 0}, {"lower": 17.96875, "upper": 18.6875, "count": 2584}, {"lower": 18.6875, "upper": 19.40625, "count": 2159}, {"lower": 19.40625, "upper": 20.125, "count": 2041}, {"lower": 20.125, "upper": 20.84375, "count": 0}, {"lower": 20.84375, "upper": 21.5625, "count": 1698}, {"lower": 21.5625, "upper": 22.28125, "count": 1604}, {"lower": 22.28125, "upper": 23.0, "count": 1366}], "count_uniques": 24}, "dtype": "object"}, "UCR_PART": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 90, "match": 318983, "frequency": [{"value": "Part Three", "count": 158553}, {"value": "Part Two", "count": 97569}, {"value": "Part One", "count": 61629}, {"value": "Other", "count": 1232}], "count_uniques": 4}, "dtype": "object"}, "STREET": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 10871, "match": 308202, "frequency": [{"value": "WASHINGTON ST", "count": 14194}, {"value": "BLUE HILL AVE", "count": 7794}, {"value": "BOYLSTON ST", "count": 7221}, {"value": "DORCHESTER AVE", "count": 5149}, {"value": "TREMONT ST", "count": 4796}, {"value": "MASSACHUSETTS AVE", "count": 4708}, {"value": "HARRISON AVE", "count": 4609}, {"value": "CENTRE ST", "count": 4383}, {"value": "COMMONWEALTH AVE", "count": 4134}, {"value": "HYDE PARK AVE", "count": 3470}, {"value": "COLUMBIA RD", "count": 3195}, {"value": "HUNTINGTON AVE", "count": 2989}, {"value": "RIVER ST", "count": 2831}, {"value": "DUDLEY ST", "count": 2352}, {"value": "WARREN ST", "count": 2336}, {"value": "COLUMBUS AVE", "count": 2304}, {"value": "ADAMS ST", "count": 2042}, {"value": "BEACON ST", "count": 1966}, {"value": "NEWBURY ST", "count": 1889}, {"value": "CAMBRIDGE ST", "count": 1670}, {"value": "BOWDOIN ST", "count": 1656}, {"value": "AMERICAN LEGION HWY", "count": 1631}, {"value": "W BROADWAY", "count": 1630}, {"value": "GENEVA AVE", "count": 1628}, {"value": "ALBANY ST", "count": 1429}, {"value": "ALLSTATE RD", "count": 1296}, {"value": "SUMMER ST", "count": 1250}, {"value": "HANCOCK ST", "count": 1246}, {"value": "BORDER ST", "count": 1213}, {"value": "NEW SUDBURY ST", "count": 1213}, {"value": "CUMMINS HWY", "count": 1151}, {"value": "NORFOLK ST", "count": 1115}, {"value": "BENNINGTON ST", "count": 1084}], "count_uniques": 4657}, "dtype": "object"}, "Lat": {"profiler_dtype": "decimal", "stats": {"mismatch": 299074, "missing": 19999, "match": 0, "hist": [{"lower": -1.0, "upper": 0.3560950493749999, "count": 33}, {"lower": 0.3560950493749999, "upper": 1.7121900987499998, "count": 0}, {"lower": 1.7121900987499998, "upper": 3.0682851481249998, "count": 0}, {"lower": 3.0682851481249998, "upper": 4.4243801975, "count": 0}, {"lower": 4.4243801975, "upper": 5.780475246875, "count": 0}, {"lower": 5.780475246875, "upper": 7.1365702962499995, "count": 0}, {"lower": 7.1365702962499995, "upper": 8.492665345625, "count": 0}, {"lower": 8.492665345625, "upper": 9.848760395, "count": 0}, {"lower": 9.848760395, "upper": 11.204855444375, "count": 0}, {"lower": 11.204855444375, "upper": 12.56095049375, "count": 0}, {"lower": 12.56095049375, "upper": 13.917045543124999, "count": 0}, {"lower": 13.917045543124999, "upper": 15.273140592499999, "count": 0}, {"lower": 15.273140592499999, "upper": 16.629235641875, "count": 0}, {"lower": 16.629235641875, "upper": 17.98533069125, "count": 0}, {"lower": 17.98533069125, "upper": 19.341425740625, "count": 0}, {"lower": 19.341425740625, "upper": 20.69752079, "count": 0}, {"lower": 20.69752079, "upper": 22.053615839375, "count": 0}, {"lower": 22.053615839375, "upper": 23.40971088875, "count": 0}, {"lower": 23.40971088875, "upper": 24.765805938125, "count": 0}, {"lower": 24.765805938125, "upper": 26.1219009875, "count": 0}, {"lower": 26.1219009875, "upper": 27.477996036875, "count": 0}, {"lower": 27.477996036875, "upper": 28.834091086249998, "count": 0}, {"lower": 28.834091086249998, "upper": 30.190186135624998, "count": 0}, {"lower": 30.190186135624998, "upper": 31.546281184999998, "count": 0}, {"lower": 31.546281184999998, "upper": 32.902376234375, "count": 0}, {"lower": 32.902376234375, "upper": 34.25847128375, "count": 0}, {"lower": 34.25847128375, "upper": 35.614566333125, "count": 0}, {"lower": 35.614566333125, "upper": 36.9706613825, "count": 0}, {"lower": 36.9706613825, "upper": 38.326756431875, "count": 0}, {"lower": 38.326756431875, "upper": 39.68285148125, "count": 0}, {"lower": 39.68285148125, "upper": 41.038946530625, "count": 0}, {"lower": 41.038946530625, "upper": 42.39504158, "count": 39852}], "count_uniques": 18178}, "dtype": "object"}, "Long": {"profiler_dtype": "decimal", "stats": {"mismatch": 299074, "missing": 19999, "match": 0, "hist": [{"lower": -71.17867378, "upper": -68.985590224375, "count": 39852}, {"lower": -68.985590224375, "upper": -66.79250666875, "count": 0}, {"lower": -66.79250666875, "upper": -64.599423113125, "count": 0}, {"lower": -64.599423113125, "upper": -62.4063395575, "count": 0}, {"lower": -62.4063395575, "upper": -60.213256001874996, "count": 0}, {"lower": -60.213256001874996, "upper": -58.02017244625, "count": 0}, {"lower": -58.02017244625, "upper": -55.827088890625, "count": 0}, {"lower": -55.827088890625, "upper": -53.634005335, "count": 0}, {"lower": -53.634005335, "upper": -51.440921779374996, "count": 0}, {"lower": -51.440921779374996, "upper": -49.247838223749994, "count": 0}, {"lower": -49.247838223749994, "upper": -47.054754668125, "count": 0}, {"lower": -47.054754668125, "upper": -44.8616711125, "count": 0}, {"lower": -44.8616711125, "upper": -42.668587556874996, "count": 0}, {"lower": -42.668587556874996, "upper": -40.47550400125, "count": 0}, {"lower": -40.47550400125, "upper": -38.282420445625, "count": 0}, {"lower": -38.282420445625, "upper": -36.08933689, "count": 0}, {"lower": -36.08933689, "upper": -33.896253334375, "count": 0}, {"lower": -33.896253334375, "upper": -31.703169778749995, "count": 0}, {"lower": -31.703169778749995, "upper": -29.510086223125, "count": 0}, {"lower": -29.510086223125, "upper": -27.3170026675, "count": 0}, {"lower": -27.3170026675, "upper": -25.123919111874997, "count": 0}, {"lower": -25.123919111874997, "upper": -22.930835556250003, "count": 0}, {"lower": -22.930835556250003, "upper": -20.737752000625, "count": 0}, {"lower": -20.737752000625, "upper": -18.544668445, "count": 0}, {"lower": -18.544668445, "upper": -16.351584889374998, "count": 0}, {"lower": -16.351584889374998, "upper": -14.158501333749996, "count": 0}, {"lower": -14.158501333749996, "upper": -11.965417778125001, "count": 0}, {"lower": -11.965417778125001, "upper": -9.7723342225, "count": 0}, {"lower": -9.7723342225, "upper": -7.579250666874998, "count": 0}, {"lower": -7.579250666874998, "upper": -5.386167111250003, "count": 0}, {"lower": -5.386167111250003, "upper": -3.1930835556249946, "count": 0}, {"lower": -3.1930835556249946, "upper": -1.0, "count": 33}], "count_uniques": 18178}, "dtype": "object"}, "Location": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 319073, "frequency": [{"value": "(0.00000000, 0.00000000)", "count": 19999}, {"value": "(42.34862382, -71.08277637)", "count": 1243}, {"value": "(42.36183857, -71.05976489)", "count": 1208}, {"value": "(42.28482577, -71.09137369)", "count": 1121}, {"value": "(42.32866284, -71.08563401)", "count": 1042}, {"value": "(42.25621592, -71.12401947)", "count": 898}, {"value": "(42.29755533, -71.05970910)", "count": 783}, {"value": "(42.34128751, -71.05467933)", "count": 773}, {"value": "(-1.00000000, -1.00000000)", "count": 745}, {"value": "(42.33152148, -71.07085307)", "count": 735}, {"value": "(42.35231190, -71.06370510)", "count": 688}, {"value": "(42.33954199, -71.06940877)", "count": 655}, {"value": "(42.32696647, -71.06198607)", "count": 652}, {"value": "(42.35512339, -71.06087980)", "count": 584}, {"value": "(42.30971857, -71.10429432)", "count": 573}, {"value": "(42.29848866, -71.06313294)", "count": 562}, {"value": "(42.33401829, -71.07638124)", "count": 561}, {"value": "(42.33367922, -71.09187755)", "count": 550}, {"value": "(42.31043400, -71.06134010)", "count": 523}, {"value": "(42.35095909, -71.07412780)", "count": 523}, {"value": "(42.35241815, -71.06525499)", "count": 515}, {"value": "(42.37081805, -71.03929078)", "count": 507}, {"value": "(42.33428841, -71.07239518)", "count": 504}, {"value": "(42.34980175, -71.07840978)", "count": 472}, {"value": "(42.32696802, -71.08051941)", "count": 472}, {"value": "(42.33511904, -71.07491710)", "count": 455}, {"value": "(42.35037870, -71.07626098)", "count": 445}, {"value": "(42.34653820, -71.09880598)", "count": 444}, {"value": "(42.36643546, -71.06135413)", "count": 440}, {"value": "(42.28709355, -71.14822128)", "count": 438}, {"value": "(42.35602373, -71.06177615)", "count": 436}, {"value": "(42.34840576, -71.08688339)", "count": 432}, {"value": "(42.34905600, -71.15049850)", "count": 427}], "count_uniques": 18194}, "dtype": "object"}}, "name": null, "file_name": "crime.csv", "summary": {"cols_count": 17, "rows_count": 319073, "dtypes_list": ["object"], "total_count_dtypes": 1, "missing_count": 0, "p_missing": 0.0}}'
df.cols.names()
['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long', 'Location']
df.OCCURRED_ON_DATE.astype()
%%time
import pandas as pd
from fastnumbers import fast_int
import pendulum
def func(value):
return pendulum.parse(value)
df.OCCURRED_ON_DATE.apply(func, meta="object").compute()
Wall time: 3.49 s
df.compute()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | nan | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | nan | 2018-09-02 13:00:00+00:00 | 2018 | 9 | 2020-05-31 | 13 | Part One | LINCOLN ST | 42.3578 | -71.1394 | (42.35779134, -71.13937053) |
1 | nan | 01402 | Vandalism | VANDALISM | C11 | 347 | nan | 2018-08-21 00:00:00+00:00 | 2018 | 8 | 2020-05-26 | 0 | Part Two | HECLA ST | 42.3068 | -71.0603 | (42.30682138, -71.06030035) |
2 | nan | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | nan | 2018-09-03 19:27:00+00:00 | 2018 | 9 | 2020-06-01 | 19 | Part Three | CAZENOVE ST | 42.3466 | -71.0724 | (42.34658879, -71.07242943) |
3 | nan | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | nan | 2018-09-03 21:16:00+00:00 | 2018 | 9 | 2020-06-01 | 21 | Part Three | NEWCOMB ST | 42.3342 | -71.0787 | (42.33418175, -71.07866441) |
4 | nan | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | nan | 2018-09-03 21:05:00+00:00 | 2018 | 9 | 2020-06-01 | 21 | Part Three | DELHI ST | 42.2754 | -71.0904 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | nan | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | nan | 2016-06-05 17:25:00+00:00 | 2016 | 6 | 2020-05-31 | 17 | Part Three | COVENTRY ST | 42.337 | -71.0857 | (42.33695098, -71.08574813) |
319069 | nan | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | nan | 2015-07-09 13:38:00+00:00 | 2015 | 7 | 2020-05-28 | 13 | Part One | RIVER ST | 42.2559 | -71.1232 | (42.25592648, -71.12317207) |
319070 | nan | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | nan | 2015-07-09 13:38:00+00:00 | 2015 | 7 | 2020-05-28 | 13 | Part Three | RIVER ST | 42.2559 | -71.1232 | (42.25592648, -71.12317207) |
319071 | nan | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | nan | 2016-05-31 19:35:00+00:00 | 2016 | 5 | 2020-05-26 | 19 | Part Three | NEW WASHINGTON ST | 42.3023 | -71.1116 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | nan | 2015-06-22 00:12:00+00:00 | 2015 | 6 | 2020-06-01 | 0 | Part Three | WASHINGTON ST | 42.3338 | -71.0803 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
a = df["INCIDENT_NUMBER"].compute()
b = df["OFFENSE_CODE"].compute()
from dask import dataframe as dd
dd.compute(a+b)
(0 I18207094500619 1 I18207094301402 2 I18207094103410 3 I18207094003114 4 I18207093803114 ... 319068 I050310906-0003125 319069 I030217815-0800111 319070 I030217815-0803125 319071 I010370257-0003125 319072 14205255003125 Length: 319073, dtype: object,)
def func(value):
return value
df.apply(func)
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) <ipython-input-10-46167e4a0a69> in <module> 1 def func(value): 2 return value ----> 3 df.apply(func) ~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py in apply(self, func, axis, broadcast, raw, reduce, args, meta, **kwds) 4145 " Try: df.apply(func, axis=1)" 4146 ) -> 4147 raise NotImplementedError(msg) 4148 4149 if meta is no_default: NotImplementedError: dd.DataFrame.apply only supports axis=1 Try: df.apply(func, axis=1)
%%time
# _output = df.ext.cast_and_profile(columns="*", flush=True, output="json")
# df = df.cols.cast_to_profiler_dtypes(columns=cols_and_inferred_dtype).persist()
Wall time: 0 ns
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (object)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (object)
not nullable
|
MONTH
10 (object)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (object)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
df.ext.partitions()
8
df.compute()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
df.cols.year("")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-19-f77bfb6ed87a> in <module> ----> 1 df.cols.year("birth") ~\Documents\Optimus\optimus\engines\base\dask\columns.py in year(self, input_cols, output_cols) 573 output_cols = get_output_cols(input_cols, output_cols) 574 for input_col, output_col in zip(input_cols, output_cols): --> 575 df = df.assign(**{output_col: df[input_col].dt.year}) 576 return df 577 pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__() ~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py in dt(self) 2616 def dt(self): 2617 """ Namespace of datetime methods """ -> 2618 return DatetimeAccessor(self) 2619 2620 @cache_readonly ~\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\accessor.py in __init__(self, series) 34 if hasattr(series_meta, "to_series"): # is index-like 35 series_meta = series_meta.to_series() ---> 36 meta = getattr(series_meta, self._accessor_name) 37 38 self._meta = meta ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in __getattr__(self, name) 5268 or name in self._accessors 5269 ): -> 5270 return object.__getattribute__(self, name) 5271 else: 5272 if self._info_axis._can_hold_identifiers_and_holds_name(name): ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\accessor.py in __get__(self, obj, cls) 185 # we're accessing the attribute of the class, i.e., Dataset.geo 186 return self._accessor --> 187 accessor_obj = self._accessor(obj) 188 # Replace the property with the accessor object. Inspired by: 189 # http://www.pydanny.com/cached-property.html ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\indexes\accessors.py in __new__(cls, data) 336 return DatetimeProperties(data, orig) 337 --> 338 raise AttributeError("Can only use .dt accessor with datetimelike values") AttributeError: Can only use .dt accessor with datetimelike values
print(_output)
{"columns": {"customer_id": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object"}, "transactoin_date": {"profiler_dtype": "date", "profiler_dtype_fotmat": "date", "stats": {"mismatch": 11, "missing": 6, "match": 0}, "dtype": "datetime64[ns]"}, "ticket_price": {"profiler_dtype": "decimal", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": 9.99, "count": 7}, {"value": 29.99, "count": 5}, {"value": 14.99, "count": 4}, {"value": 9.91, "count": 1}], "count_uniques": 4}, "dtype": "object"}, "discount": {"profiler_dtype": "object", "stats": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}, "dtype": "object"}, "product": {"profiler_dtype": "string", "stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object"}, "info": {"profiler_dtype": "int", "stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "int64"}}, "name": null, "file_name": "tmp71_odjvg.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "datetime64[ns]", "int64"], "total_count_dtypes": 3, "missing_count": 0, "p_missing": 0.0}}
import pendulum
pendulum.from_format("11 11 1980", "DD MM YYYY")
DateTime(1980, 11, 11, 0, 0, 0, tzinfo=Timezone('UTC'))
# df = op.load.file("data/test-types.csv").ext.cache()
df = op.load.file("data/dataset-transactions.csv").ext.cache()
a = df.cols.infer_profiler_dtypes("*")
print(a)
{'customer_id': 'string', 'transactoin_date': 'date', 'ticket_price': 'decimal', 'discount': 'string', 'product': 'string', 'info': 'int'}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df = df.cols.cast_to_profiler_dtypes(columns={'transactoin_date': 'date'})
# df["a"] = df.transactoin_date.astype('M8[us]')
# b["transactoin_date"].dt.year
df.dtypes
customer_id object transactoin_date datetime64[ns] ticket_price object discount object product object info object dtype: object
df.cols.year("transactoin_date", "new1").compute()
df.cols.month("transactoin_date", "new1").compute()
df.cols.day("transactoin_date", "new1").compute()
customer_id | transactoin_date | ticket_price | discount | product | info | new1 | |
---|---|---|---|---|---|---|---|
0 | 0f345kjh345oiuy345 | 2010-08-19 | 29.99 | NaN | platinum | 1 | 19.0 |
1 | 0fju234978rfjkhsdf | 2012-01-05 | 29.99 | NaN | platinum | 2 | 5.0 |
2 | 0f34ruiy23e78y2r | 2009-08-11 | 29.99 | NaN | platinum | 3 | 11.0 |
3 | 0fue298y2r23r23r2 | 2010-08-19 | 29.99 | NaN | platinum | 4 | 19.0 |
4 | 0f345kjh345oiuy346 | 2010-08-19 | 29.99 | NaN | platinum | 5 | 19.0 |
5 | 0fju234978rfjkhsdf | 2010-08-19 | 9.99 | 5% | basic | 6 | 19.0 |
6 | 0f34ruiy23e78y2r | 2010-08-19 | 9.99 | NaN | basic | 7 | 19.0 |
7 | 0fue298y2r23r23r3 | 2010-08-19 | 9.99 | NaN | basic | 8 | 19.0 |
8 | 0f345kjh345oiuy347 | 2010-08-19 | 9.99 | NaN | basic | 9 | 19.0 |
9 | 0fju234978rfjkhsdf | 2011-08-11 | 9.99 | 5% | basic | 10 | 11.0 |
10 | 0f34ruiy23e78y2r | 2015-08-09 | 9.99 | 5% | basic | 10 | 9.0 |
11 | 0fue298y2r23r23r4 | NaT | 9.91 | NaN | basic | 10 | NaN |
12 | 0f345kjh345oiuy348 | NaT | 9.99 | NaN | basic | 10 | NaN |
13 | 0fju234978rfjkhsdf | NaT | 14.99 | 5% | deluxe | 10 | NaN |
14 | 0f34ruiy23e78y2r | NaT | 14.99 | NaN | deluxe | 10 | NaN |
15 | 0fue298y2r23r23r5 | NaT | 14.99 | 5% | deluxe | 10 | NaN |
16 | 0f345kjh345oiuy349 | NaT | 14.99 | NaN | deluxe | 10 | NaN |
df.ext.display(20)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (object)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
0f34ruiy23e78y2r
|
2015/08/09
|
9.99
|
5%
|
basic
|
10
|
0fue298y2r23r23r4
|
nan
|
9.91
|
nan
|
basic
|
10
|
0f345kjh345oiuy348
|
nan
|
9.99
|
nan
|
basic
|
10
|
0fju234978rfjkhsdf
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f34ruiy23e78y2r
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
0fue298y2r23r23r5
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f345kjh345oiuy349
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
df= df.cols.unnest("transactoin_date", separator="/", splits=3, output_cols="transactoin_date").ext.cache()
df.ext.profile(columns="*", infer=True, output="json", flush=True, bins=10)
df.ext.display()
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (object)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
df.cols.infer_profiler_dtypes("*")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 13 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
{'A': 'string', 'B': 'string'}
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590426328940.csv").ext.cache()
df = df.ext.optimize()
df = df.ext.repartition(1).ext.cache()
col_name = "customer_id"
df.cols.count_mismatch({col_name:"int","transactoin_date":"date"})
{'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0}, 'transactoin_date': {'mismatch': 11, 'missing': 6, 'match': 0}}
df = op.load.file("data/crime.csv").ext.cache()
%%time
df.cols.cast("INCIDENT_NUMBER","int").compute()
Wall time: 447 ms
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 00619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.35779134 | -71.13937053 | (42.35779134, -71.13937053) |
1 | I182070943 | 01402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.30682138 | -71.06030035 | (42.30682138, -71.06030035) |
2 | I182070941 | 03410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.34658879 | -71.07242943 | (42.34658879, -71.07242943) |
3 | I182070940 | 03114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.33418175 | -71.07866441 | (42.33418175, -71.07866441) |
4 | I182070938 | 03114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.27536542 | -71.09036101 | (42.27536542, -71.09036101) |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
319068 | I050310906-00 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 285 | NaN | 2016-06-05 17:25:00 | 2016 | 6 | Sunday | 17 | Part Three | COVENTRY ST | 42.33695098 | -71.08574813 | (42.33695098, -71.08574813) |
319069 | I030217815-08 | 00111 | Homicide | MURDER, NON-NEGLIGIENT MANSLAUGHTER | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part One | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319070 | I030217815-08 | 03125 | Warrant Arrests | WARRANT ARREST | E18 | 520 | NaN | 2015-07-09 13:38:00 | 2015 | 7 | Thursday | 13 | Part Three | RIVER ST | 42.25592648 | -71.12317207 | (42.25592648, -71.12317207) |
319071 | I010370257-00 | 03125 | Warrant Arrests | WARRANT ARREST | E13 | 569 | NaN | 2016-05-31 19:35:00 | 2016 | 5 | Tuesday | 19 | Part Three | NEW WASHINGTON ST | 42.30233307 | -71.11156487 | (42.30233307, -71.11156487) |
319072 | 142052550 | 03125 | Warrant Arrests | WARRANT ARREST | D4 | 903 | NaN | 2015-06-22 00:12:00 | 2015 | 6 | Monday | 0 | Part Three | WASHINGTON ST | 42.33383935 | -71.08029038 | (42.33383935, -71.08029038) |
319073 rows × 17 columns
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (object)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (object)
not nullable
|
MONTH
10 (object)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (object)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
00619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
01402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
03410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
03820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
00724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
00301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
import numpy as np
a = np.array([1,2,3])
b = np.array([1,2,"a"])
def func(a, b):
try:
r =a+b
except:
r = np.nan
return r
func(a,b)
nan
df[["INCIDENT_NUMBER","REPORTING_AREA"]].compute().to_numpy()[0]
array(['I182070945', '808'], dtype=object)
def op(value):
return value
# return value["INCIDENT_NUMBER"]
# print(value)
# try:
# return value["INCIDENT_NUMBER"] + value["REPORTING_AREA"]
# except():
# return np.nan
def func(pdf):
# print(type(pdf))
return pdf.apply(op, axis=1)
df[["INCIDENT_NUMBER","REPORTING_AREA"]].map_partitions(func, meta=object).compute()
%%time
from numpy.core._exceptions import UFuncTypeError
def func(a,b):
return a+b
def myfunc(a, b, expr):
"Return a-b if a>b, otherwise return a+b"
try:
return func(a,b)
except (UFuncTypeError, TypeError):
return np.nan
vfunc = np.vectorize(myfunc)
print(func)
arr= [1, 2, 3,4,5,6,7,8,9,"A"]*1000000
b = vfunc(arr, 2, func)
# vfunc([1, 2, 3, "A"], 2)
<function func at 0x00000259BE4C1AF8> Wall time: 5.37 s
len(arr)
10000000
df = df.ext.repartition(8)
import re
comp = re.compile("^(?=.)([+-]?([0-9]*)(\.([0-9]+))?)$")
%time df["INCIDENT_NUMBER"].str.match(comp).compute()
Wall time: 326 ms
0 False 1 False 2 False 3 False 4 False ... 319068 False 319069 False 319070 False 319071 False 319072 True Name: INCIDENT_NUMBER, Length: 319073, dtype: bool
df.cols.names()
['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long', 'Location']
from dateutil.parser import parse as dparse
dparse("2018-09-02 13:00:00")
datetime.datetime(2018, 9, 2, 13, 0)
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (object)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (object)
not nullable
|
MONTH
10 (object)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (object)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
00619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
01402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
03410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
03114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
03820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
00724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
00301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
03301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
df.cols.infer_profiler_dtypes("*")
{'INCIDENT_NUMBER': 'string', 'OFFENSE_CODE': 'zip_code', 'OFFENSE_CODE_GROUP': 'string', 'OFFENSE_DESCRIPTION': 'string', 'DISTRICT': 'string', 'REPORTING_AREA': 'int', 'SHOOTING': 'object', 'OCCURRED_ON_DATE': 'string', 'YEAR': 'int', 'MONTH': 'int', 'DAY_OF_WEEK': 'string', 'HOUR': 'int', 'UCR_PART': 'string', 'STREET': 'string', 'Lat': 'decimal', 'Long': 'decimal', 'Location': 'string'}
df.ext.display(20)
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 20 elements requested, only 17 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
customer_id
1 (object)
not nullable
|
transactoin_date
2 (object)
not nullable
|
ticket_price
3 (object)
not nullable
|
discount
4 (object)
not nullable
|
product
5 (object)
not nullable
|
info
6 (uint8)
not nullable
|
---|---|---|---|---|---|
0f345kjh345oiuy345
|
2010/08/19
|
29.99
|
nan
|
platinum
|
1
|
0fju234978rfjkhsdf
|
2012/01/05
|
29.99
|
nan
|
platinum
|
2
|
0f34ruiy23e78y2r
|
2009/08/11
|
29.99
|
nan
|
platinum
|
3
|
0fue298y2r23r23r2
|
2010/08/19
|
29.99
|
nan
|
platinum
|
4
|
0f345kjh345oiuy346
|
2010/08/19
|
29.99
|
nan
|
platinum
|
5
|
0fju234978rfjkhsdf
|
2010/08/19
|
9.99
|
5%
|
basic
|
6
|
0f34ruiy23e78y2r
|
2010/08/19
|
9.99
|
nan
|
basic
|
7
|
0fue298y2r23r23r3
|
2010/08/19
|
9.99
|
nan
|
basic
|
8
|
0f345kjh345oiuy347
|
2010/08/19
|
9.99
|
nan
|
basic
|
9
|
0fju234978rfjkhsdf
|
2011/08/11
|
9.99
|
5%
|
basic
|
10
|
0f34ruiy23e78y2r
|
2015/08/09
|
9.99
|
5%
|
basic
|
10
|
0fue298y2r23r23r4
|
nan
|
9.91
|
nan
|
basic
|
10
|
0f345kjh345oiuy348
|
nan
|
9.99
|
nan
|
basic
|
10
|
0fju234978rfjkhsdf
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f34ruiy23e78y2r
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
0fue298y2r23r23r5
|
nan
|
14.99
|
5%
|
deluxe
|
10
|
0f345kjh345oiuy349
|
nan
|
14.99
|
nan
|
deluxe
|
10
|
col_name ="customer_id"
print(df.cols.count_mismatch({col_name:"int"}))
print(df.cols.count_mismatch({col_name:"decimal"}))
print(df.cols.count_mismatch({col_name:"boolean"}))
print(df.cols.count_mismatch({col_name:"date"}))
print(df.cols.count_mismatch({col_name:"array"}))
print(df.cols.count_mismatch({col_name:"object"}))
print(df.cols.count_mismatch({col_name:"gender"}))
print(df.cols.count_mismatch({col_name:"ip"}))
print(df.cols.count_mismatch({col_name:"url"}))
print(df.cols.count_mismatch({col_name:"email"}))
print(df.cols.count_mismatch({col_name:"credit_card_number"}))
print(df.cols.count_mismatch({col_name:"zip_code"}))
{'customer_id': {'mismatch': 0, 'missing': 0, 'match': 17, 'profiler_dtype': 'string'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'int'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'decimal'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'boolean'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'date'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'array'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'object'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'gender'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'ip'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'url'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'email'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'credit_card_number'}} {'customer_id': {'mismatch': 17, 'missing': 0, 'match': 0, 'profiler_dtype': 'zip_code'}}
_output = df.ext.profile(columns=col_name, infer=True, output="json", flush=True)
# _output = df.ext.set_buffer("*")
# _output = df.ext.set_buffer("*")
# _output = df.ext.buffer_window("*", 0, 17).ext.to_json("*")
print(_output)
cols_and_inferred_dtype {'customer_id': 'string'} {"columns": {"customer_id": {"stats": {"mismatch": 0, "missing": 0, "match": 3, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object", "profiler_dtype": "string"}}, "name": null, "file_name": "tmpthwlzht1.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "uint8"], "total_count_dtypes": 2, "missing_count": 0, "p_missing": 0.0}}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
cols_and_inferred_dtype = df.cols.infer_profiler_dtypes("*")
print(cols_and_inferred_dtype)
mismatch = df.cols.count_mismatch(cols_and_inferred_dtype, infer=True, compute=True)
print(mismatch)
{'customer_id': 'string', 'transactoin_date': 'string', 'ticket_price': 'decimal', 'discount': 'object', 'product': 'string', 'info': 'int'} {'customer_id': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'transactoin_date': {'mismatch': 0, 'missing': 3, 'match': 0, 'profiler_dtype': 'string'}, 'ticket_price': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'decimal'}, 'discount': {'mismatch': 1, 'missing': 2, 'match': 0, 'profiler_dtype': 'object'}, 'product': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'string'}, 'info': {'mismatch': 0, 'missing': 0, 'match': 3, 'profiler_dtype': 'int'}}
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
df = op.load.file("data/crime.csv").ext.cache()
df = df.ext.optimize().ext.cache()
df.cols.count_mismatch({"OFFENSE_DESCRIPTION":"string"})
{'OFFENSE_DESCRIPTION': {'mismatch': 0, 'missing': 0, 'match': 319073, 'profiler_dtype': 'string'}}
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (uint16)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (uint16)
not nullable
|
MONTH
10 (uint8)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (uint8)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (object)
not nullable
|
Long
16 (object)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.08273260
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
import functools
import dask
import dask.dataframe as dd
import pandas as pd
pdf = pd.DataFrame({
'x': range(0, 100),
'y': range(0, 100),
'z': range(0, 100)
})
ddf = dd.from_pandas(pdf, npartitions=8)
print('Number of partitions', ddf.npartitions)
def compute_stats(row):
return {
'sum': row['x'] + row['y'] + row['z'],
'min': min(row),
'max': max(row)
}
def accum_stats(stats_accum, stats):
return {
'sum': stats_accum['sum'] + stats['sum'],
'min': min(stats_accum['min'], stats['min']),
'max': max(stats_accum['max'], stats['max'])
}
def compute_stats_partition(pdf):
pds = pdf.apply(compute_stats, axis=1)
print(pdf)
print("99999999999999999999999999999999999999999999999")
return pds
return functools.reduce(accum_stats, pds)
def merge_stats_series(pds):
print(pds)
print("--")
return pds
return functools.reduce(accum_stats, pds)
res = ddf.reduction(
compute_stats_partition,
merge_stats_series,
meta={
'sum': 'int64',
'min': 'int64',
'max': 'int64'
})
# singleton dataframe to list of delayed objects
# where each row is a delayed object
# and in this case we just want the first one
delayed_dict = res.to_delayed()[0]
a = dd.compute(delayed_dict)
Number of partitions 8 x y z 13 13 13 13 14 14 14 14 15 15 15 15 16 16 16 16 17 17 17 17 18 18 18 18 19 19 19 19 20 20 20 20 21 21 21 21 22 22 22 22 23 23 23 23 24 24 24 24 25 25 25 25 x y z 26 26 26 26 27 27 27 27 28 28 28 28 29 29 29 29 30 30 30 30 31 31 31 31 32 32 32 32 33 33 33 33 34 34 34 34 35 35 35 35 36 36 36 36 37 37 37 37 38 38 38 38 99999999999999999999999999999999999999999999999 x y z 39 39 39 39 40 40 40 40 41 41 41 41 42 42 42 42 43 43 43 43 44 44 44 44 45 45 45 45 46 46 46 46 47 47 47 47 48 48 48 48 49 49 49 49 50 50 50 50 51 51 51 51 99999999999999999999999999999999999999999999999 x y z 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7 8 8 8 8 9 9 9 9 10 10 10 10 11 11 11 11 12 12 12 12 99999999999999999999999999999999999999999999999 99999999999999999999999999999999999999999999999 x y z 52 52 52 52 53 53 53 53 54 54 54 54 55 55 55 55 56 56 56 56 57 57 57 57 58 58 58 58 59 59 59 59 60 60 60 60 61 61 61 61 62 62 62 62 63 63 63 63 64 64 64 64 99999999999999999999999999999999999999999999999 x y z 91 91 91 91 92 92 92 92 93 93 93 93 94 94 94 94 95 95 95 95 96 96 96 96 97 97 97 97 98 98 98 98 99 99 99 99 99999999999999999999999999999999999999999999999 x y z 65 65 65 65 66 66 66 66 67 67 67 67 68 68 68 68 69 69 69 69 70 70 70 70 71 71 71 71 72 72 72 72 73 73 73 73 74 74 74 74 75 75 75 75 76 76 76 76 77 77 77 77 99999999999999999999999999999999999999999999999 x y z 78 78 78 78 79 79 79 79 80 80 80 80 81 81 81 81 82 82 82 82 83 83 83 83 84 84 84 84 85 85 85 85 86 86 86 86 87 87 87 87 88 88 88 88 89 89 89 89 90 90 90 90 99999999999999999999999999999999999999999999999 0 1 \ 0 {'sum': 0, 'min': 0, 'max': 0} {'sum': 3, 'min': 1, 'max': 1} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 2 3 \ 0 {'sum': 6, 'min': 2, 'max': 2} {'sum': 9, 'min': 3, 'max': 3} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 4 5 \ 0 {'sum': 12, 'min': 4, 'max': 4} {'sum': 15, 'min': 5, 'max': 5} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 6 7 \ 0 {'sum': 18, 'min': 6, 'max': 6} {'sum': 21, 'min': 7, 'max': 7} 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 8 9 ... \ 0 {'sum': 24, 'min': 8, 'max': 8} {'sum': 27, 'min': 9, 'max': 9} ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 0 NaN NaN ... 90 91 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 270, 'min': 90, 'max': 90} NaN 0 NaN {'sum': 273, 'min': 91, 'max': 91} 92 93 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 276, 'min': 92, 'max': 92} {'sum': 279, 'min': 93, 'max': 93} 94 95 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 282, 'min': 94, 'max': 94} {'sum': 285, 'min': 95, 'max': 95} 96 97 \ 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 288, 'min': 96, 'max': 96} {'sum': 291, 'min': 97, 'max': 97} 98 99 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 NaN NaN 0 {'sum': 294, 'min': 98, 'max': 98} {'sum': 297, 'min': 99, 'max': 99} [8 rows x 100 columns] --
df = op.load.file("http://159.65.217.17:5003/uploads/datasetFile-1590447280612.csv").ext.cache()
df = df.ext.optimize()
df = df.ext.repartition(8).ext.cache()
_output = df.ext.profile(columns="*", infer=True, output="json")
C:\Users\argenisleon\AppData\Roaming\Python\Python37\site-packages\dask\dataframe\core.py:5979: UserWarning: Insufficient elements for `head`. 30 elements requested, only 2 elements available. Try passing larger `npartitions` to `head`. warnings.warn(msg.format(n, len(r)))
print(_output)
{"columns": {"customer_id": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "0fju234978rfjkhsdf", "count": 4}, {"value": "0f34ruiy23e78y2r", "count": 4}, {"value": "0fue298y2r23r23r5", "count": 1}, {"value": "0fue298y2r23r23r4", "count": 1}, {"value": "0fue298y2r23r23r3", "count": 1}, {"value": "0fue298y2r23r23r2", "count": 1}, {"value": "0f345kjh345oiuy349", "count": 1}, {"value": "0f345kjh345oiuy348", "count": 1}, {"value": "0f345kjh345oiuy347", "count": 1}, {"value": "0f345kjh345oiuy346", "count": 1}, {"value": "0f345kjh345oiuy345", "count": 1}], "count_uniques": 11}}, "transactoin_date": {"stats": {"mismatch": 0, "missing": 6, "match": 11, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 6, "match": 11, "frequency": [{"value": "2010/08/19", "count": 7}, {"value": "2015/08/09", "count": 1}, {"value": "2012/01/05", "count": 1}, {"value": "2011/08/11", "count": 1}, {"value": "2009/08/11", "count": 1}], "count_uniques": 5}}, "ticket_price": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "9.99", "count": 7}, {"value": "29.99", "count": 5}, {"value": "14.99", "count": 4}, {"value": "9.91", "count": 1}], "count_uniques": 4}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "9.99", "count": 7}, {"value": "29.99", "count": 5}, {"value": "14.99", "count": 4}, {"value": "9.91", "count": 1}], "count_uniques": 4}}, "discount": {"stats": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}, "dtype": "object", "profiler_dtype": {"mismatch": 5, "missing": 12, "match": 0, "frequency": [{"value": "5%", "count": 5}], "count_uniques": 1}}, "product": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}, "dtype": "object", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "frequency": [{"value": "basic", "count": 8}, {"value": "platinum", "count": 5}, {"value": "deluxe", "count": 4}], "count_uniques": 3}}, "info": {"stats": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}, "dtype": "uint8", "profiler_dtype": {"mismatch": 0, "missing": 0, "match": 17, "hist": [{"lower": 1.0, "upper": 1.28125, "count": 0}, {"lower": 1.28125, "upper": 1.5625, "count": 0}, {"lower": 1.5625, "upper": 1.84375, "count": 0}, {"lower": 1.84375, "upper": 2.125, "count": 0}, {"lower": 2.125, "upper": 2.40625, "count": 0}, {"lower": 2.40625, "upper": 2.6875, "count": 0}, {"lower": 2.6875, "upper": 2.96875, "count": 0}, {"lower": 2.96875, "upper": 3.25, "count": 0}, {"lower": 3.25, "upper": 3.53125, "count": 0}, {"lower": 3.53125, "upper": 3.8125, "count": 0}, {"lower": 3.8125, "upper": 4.09375, "count": 0}, {"lower": 4.09375, "upper": 4.375, "count": 0}, {"lower": 4.375, "upper": 4.65625, "count": 0}, {"lower": 4.65625, "upper": 4.9375, "count": 0}, {"lower": 4.9375, "upper": 5.21875, "count": 0}, {"lower": 5.21875, "upper": 5.5, "count": 0}, {"lower": 5.5, "upper": 5.78125, "count": 0}, {"lower": 5.78125, "upper": 6.0625, "count": 0}, {"lower": 6.0625, "upper": 6.34375, "count": 0}, {"lower": 6.34375, "upper": 6.625, "count": 0}, {"lower": 6.625, "upper": 6.90625, "count": 0}, {"lower": 6.90625, "upper": 7.1875, "count": 0}, {"lower": 7.1875, "upper": 7.46875, "count": 0}, {"lower": 7.46875, "upper": 7.75, "count": 0}, {"lower": 7.75, "upper": 8.03125, "count": 0}, {"lower": 8.03125, "upper": 8.3125, "count": 0}, {"lower": 8.3125, "upper": 8.59375, "count": 0}, {"lower": 8.59375, "upper": 8.875, "count": 0}, {"lower": 8.875, "upper": 9.15625, "count": 0}, {"lower": 9.15625, "upper": 9.4375, "count": 0}, {"lower": 9.4375, "upper": 9.71875, "count": 0}, {"lower": 9.71875, "upper": 10.0, "count": 3}], "count_uniques": 10}}}, "name": null, "file_name": "tmp_vk79fcm.csv", "summary": {"cols_count": 6, "rows_count": 17, "dtypes_list": ["object", "uint8"], "total_count_dtypes": 2, "missing_count": 0, "p_missing": 0.0}}
##Bag
# each element is an integer
import dask.bag as db
b = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], npartitions=1)
b.take(3)
(1, 2, 3)
%%time
b.to_dataframe().compute()
Wall time: 850 ms
0 | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
4 | 5 |
5 | 6 |
6 | 7 |
7 | 8 |
8 | 9 |
9 | 10 |
from optimus.helpers.functions import match_date
import re
pattern = match_date("dd MM yy h")
string = "11 05 1980 2"
prog = re.compile(pattern)
result = prog.match(string)
print(bool(result))
['dd', ' ', 'MM', ' ', 'yy', ' ', 'h'] False