%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working from collections import Callable C:\Users\argenisleon\Anaconda3\lib\site-packages\statsmodels\base\wrapper.py:100: DeprecationWarning: `formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly defaults=argspec[3]) You are using PySparkling of version 2.4.10, but your PySpark is of version 2.3.1. Please make sure Spark and PySparkling versions are compatible.
# op = Optimus("spark")
# df = op.load.csv("data/crime.csv")
op = Optimus("dask")
df = op.load.csv("data/crime.csv", charset="ISO-8859–1")
df = op.load.json("data/foo.json", multiline=True)
df = op.load.parquet("data/foo.parquet")
df = op.load.excel("data/titanic3.xls")
Ds data/titanic3.xls data/titanic3.xls data/titanic3.xls
df.ext.display()
pclass
1 (int64)
not nullable
|
survived
2 (int64)
not nullable
|
name
3 (object)
not nullable
|
sex
4 (object)
not nullable
|
age
5 (float64)
not nullable
|
sibsp
6 (int64)
not nullable
|
parch
7 (int64)
not nullable
|
ticket
8 (object)
not nullable
|
fare
9 (float64)
not nullable
|
cabin
10 (object)
not nullable
|
embarked
11 (object)
not nullable
|
boat
12 (object)
not nullable
|
body
13 (float64)
not nullable
|
home.dest
14 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1
|
1
|
Allen,⋅Miss.⋅Elisabeth⋅Walton
|
female
|
29.0
|
0
|
0
|
24160
|
211.3375
|
B5
|
S
|
2
|
nan
|
St⋅Louis,⋅MO
|
1
|
1
|
Allison,⋅Master.⋅Hudson⋅Trevor
|
male
|
0.9167
|
1
|
2
|
113781
|
151.55
|
C22⋅C26
|
S
|
11
|
nan
|
Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
|
1
|
0
|
Allison,⋅Miss.⋅Helen⋅Loraine
|
female
|
2.0
|
1
|
2
|
113781
|
151.55
|
C22⋅C26
|
S
|
nan
|
nan
|
Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
|
1
|
0
|
Allison,⋅Mr.⋅Hudson⋅Joshua⋅Creighton
|
male
|
30.0
|
1
|
2
|
113781
|
151.55
|
C22⋅C26
|
S
|
nan
|
135.0
|
Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
|
1
|
0
|
Allison,⋅Mrs.⋅Hudson⋅J⋅C⋅(Bessie⋅Waldo⋅Daniels)
|
female
|
25.0
|
1
|
2
|
113781
|
151.55
|
C22⋅C26
|
S
|
nan
|
nan
|
Montreal,⋅PQ⋅/⋅Chesterville,⋅ON
|
1
|
1
|
Anderson,⋅Mr.⋅Harry
|
male
|
48.0
|
0
|
0
|
19952
|
26.55
|
E12
|
S
|
3
|
nan
|
New⋅York,⋅NY
|
1
|
1
|
Andrews,⋅Miss.⋅Kornelia⋅Theodosia
|
female
|
63.0
|
1
|
0
|
13502
|
77.9583
|
D7
|
S
|
10
|
nan
|
Hudson,⋅NY
|
1
|
0
|
Andrews,⋅Mr.⋅Thomas⋅Jr
|
male
|
39.0
|
0
|
0
|
112050
|
0.0
|
A36
|
S
|
nan
|
nan
|
Belfast,⋅NI
|
1
|
1
|
Appleton,⋅Mrs.⋅Edward⋅Dale⋅(Charlotte⋅Lamson)
|
female
|
53.0
|
2
|
0
|
11769
|
51.4792
|
C101
|
S
|
D
|
nan
|
Bayside,⋅Queens,⋅NY
|
1
|
0
|
Artagaveytia,⋅Mr.⋅Ramon
|
male
|
71.0
|
0
|
0
|
PC⋅17609
|
49.5042
|
nan
|
C
|
nan
|
22.0
|
Montevideo,⋅Uruguay
|
1
|
0
|
Astor,⋅Col.⋅John⋅Jacob
|
male
|
47.0
|
1
|
0
|
PC⋅17757
|
227.525
|
C62⋅C64
|
C
|
nan
|
124.0
|
New⋅York,⋅NY
|
df = op.load.avro("data/foo.avro")
df.ext.display()
id
1 (int32)
not nullable
|
firstName
2 (object)
not nullable
|
lastName
3 (object)
not nullable
|
billingId
4 (int32)
not nullable
|
product
5 (object)
not nullable
|
price
6 (int32)
not nullable
|
birth
7 (object)
not nullable
|
dummyCol
8 (object)
not nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
11
|
Isaac
|
Newton
|
992
|
pasta
|
9
|
1999/02/15
|
never⋅
|
%%time
df.min().compute()
Wall time: 5.85 s
INCIDENT_NUMBER 142052550 OFFENSE_CODE 111 OFFENSE_CODE_GROUP Aggravated Assault OFFENSE_DESCRIPTION A&B HANDS, FEET, ETC. - MED. ATTENTION REQ. REPORTING_AREA OCCURRED_ON_DATE 2015-06-15 00:00:00 YEAR 2015 MONTH 1 DAY_OF_WEEK Friday HOUR 0 Lat -1 Long -71.1787 Location (-1.00000000, -1.00000000) dtype: object
%%time
df.cols.min("*")
[('min', Dask Series Structure: npartitions=1 DAY_OF_WEEK object YEAR ... dtype: object Dask Name: dataframe-min-agg, 6 tasks)] Wall time: 5.9 s
{'min': {'INCIDENT_NUMBER': '142052550', 'OFFENSE_CODE': 111, 'OFFENSE_CODE_GROUP': 'Aggravated Assault', 'OFFENSE_DESCRIPTION': 'A&B HANDS, FEET, ETC. - MED. ATTENTION REQ.', 'REPORTING_AREA': ' ', 'OCCURRED_ON_DATE': '2015-06-15 00:00:00', 'YEAR': 2015, 'MONTH': 1, 'DAY_OF_WEEK': 'Friday', 'HOUR': 0, 'Lat': -1.0, 'Long': -71.17867378, 'Location': '(-1.00000000, -1.00000000)'}}
df.cols.schema_dtype()
[numpy.object_, numpy.int64, numpy.object_, numpy.object_, numpy.object_, numpy.object_, numpy.object_, numpy.object_, numpy.int64, numpy.int64, numpy.object_, numpy.int64, numpy.object_, numpy.object_, numpy.float64, numpy.float64, numpy.object_]
%%time
df.min().compute()
Wall time: 5.49 s
INCIDENT_NUMBER 142052550 OFFENSE_CODE 111 OFFENSE_CODE_GROUP Aggravated Assault OFFENSE_DESCRIPTION A&B HANDS, FEET, ETC. - MED. ATTENTION REQ. REPORTING_AREA OCCURRED_ON_DATE 2015-06-15 00:00:00 YEAR 2015 MONTH 1 DAY_OF_WEEK Friday HOUR 0 Lat -1 Long -71.1787 Location (-1.00000000, -1.00000000) dtype: object
df.cols.min("*")
{'min': {'INCIDENT_NUMBER': '142052550', 'OFFENSE_CODE': 111, 'OFFENSE_CODE_GROUP': 'Aggravated Assault', 'OFFENSE_DESCRIPTION': 'A&B HANDS, FEET, ETC. - MED. ATTENTION REQ.', 'REPORTING_AREA': ' ', 'OCCURRED_ON_DATE': '2015-06-15 00:00:00', 'YEAR': 2015, 'MONTH': 1, 'DAY_OF_WEEK': 'Friday', 'HOUR': 0, 'Lat': -1.0, 'Long': -71.17867378, 'Location': '(-1.00000000, -1.00000000)'}}
%%time
df.cols.kurt("*")
Wall time: 16.5 s
{'Long': nan, 'MONTH': 1.9528853000207569, 'HOUR': 2.3994270670757927, 'Lat': nan, 'YEAR': 1.9469652465192953, 'OFFENSE_CODE': 1.5838041686767816}
%%time
df.cols.min("*")
Wall time: 5.91 s
{'min': {'INCIDENT_NUMBER': '142052550', 'OFFENSE_CODE': 111, 'OFFENSE_CODE_GROUP': 'Aggravated Assault', 'OFFENSE_DESCRIPTION': 'A&B HANDS, FEET, ETC. - MED. ATTENTION REQ.', 'REPORTING_AREA': ' ', 'OCCURRED_ON_DATE': '2015-06-15 00:00:00', 'YEAR': 2015, 'MONTH': 1, 'DAY_OF_WEEK': 'Friday', 'HOUR': 0, 'Lat': -1.0, 'Long': -71.17867378, 'Location': '(-1.00000000, -1.00000000)'}}
%%time
from optimus.profiler.profiler import Profiler
p= Profiler()
p.run(df)
['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long', 'Location']
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <timed exec> in <module> ~\Documents\Optimus\optimus\helpers\decorators.py in timed(*args, **kw) 8 def timed(*args, **kw): 9 start_time = timeit.default_timer() ---> 10 f = method(*args, **kw) 11 _time = round(timeit.default_timer() - start_time, 2) 12 logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=_time)) ~\Documents\Optimus\optimus\profiler\profiler.py in run(self, df, columns, buckets, infer, relative_error, approx_count, mismatch, advanced_stats) 72 columns = parse_columns(df, columns) 73 output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict", ---> 74 mismatch=mismatch, advanced_stats=advanced_stats) 75 76 # Load jinja ~\Documents\Optimus\optimus\profiler\profiler.py in dataset(self, df, columns, buckets, infer, relative_error, approx_count, sample, stats, format, mismatch, advanced_stats) 325 self.cols_count = cols_count = len(df.columns) 326 updated_columns = self.columns_stats(df, cols_to_profile, buckets, infer, relative_error, approx_count, --> 327 mismatch, advanced_stats) 328 329 output_columns = update_dict(output_columns, updated_columns) ~\Documents\Optimus\optimus\profiler\profiler.py in columns_stats(self, df, columns, buckets, infer, relative_error, approx_count, mismatch, advanced_stats) 434 435 # Aggregation --> 436 stats = self.columns_agg(df, columns, buckets, relative_error, approx_count, advanced_stats) 437 438 # Calculate Frequency ~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(self, df, columns, buckets, relative_error, approx_count, advanced_stats) 490 df.functions.sum, df.functions.variance, df.functions.zeros_agg] 491 print(cols) --> 492 exprs.extend(df.cols.create_exprs(cols, funcs)) 493 494 # TODO: None in basic calculation ~\Documents\Optimus\optimus\dask\columns.py in create_exprs(columns, funcs, *args) 119 exprs[col_name].update(func(col_name, args)(df)) 120 else: --> 121 exprs[col_name] = func(col_name, args)(df) 122 123 result = {} ~\Documents\Optimus\optimus\dask\functions.py in _kurtoris(serie) 134 def kurtosis(col_name, args): 135 def _kurtoris(serie): --> 136 result = {"kurtosis": float(stats.kurtosis(serie[col_name]))} 137 return result 138 ~\Anaconda3\lib\site-packages\dask\array\stats.py in kurtosis(a, axis, fisher, bias, nan_policy) 227 olderr = np.seterr(all='ignore') 228 try: --> 229 vals = da.where(zero, 0, m4 / m2**2.0) 230 finally: 231 np.seterr(**olderr) TypeError: unsupported operand type(s) for ** or pow(): 'Array' and 'float'
df.rows.count()
319073
df.cols.min(["OFFENSE_CODE","YEAR"])
{'YEAR': {'min': 2015}, 'OFFENSE_CODE': {'min': 111}}
%%time
df.cols.range(["OFFENSE_CODE","YEAR"])
Wall time: 7.52 s
{'YEAR': {'min': 2015, 'max': 2018}, 'OFFENSE_CODE': {'min': 111, 'max': 3831}}
%%time
df.cols.hist(["OFFENSE_CODE","YEAR"])
Wall time: 12 s
{'YEAR': {'hist': [{'count': 53388, 'lower': 2015.0, 'upper': 2015.15}, {'count': 0, 'lower': 2015.15, 'upper': 2015.3}, {'count': 0, 'lower': 2015.3, 'upper': 2015.45}, {'count': 0, 'lower': 2015.45, 'upper': 2015.6}, {'count': 0, 'lower': 2015.6, 'upper': 2015.75}, {'count': 0, 'lower': 2015.75, 'upper': 2015.9}, {'count': 99114, 'lower': 2015.9, 'upper': 2016.05}, {'count': 0, 'lower': 2016.05, 'upper': 2016.2}, {'count': 0, 'lower': 2016.2, 'upper': 2016.35}, {'count': 0, 'lower': 2016.35, 'upper': 2016.5}, {'count': 0, 'lower': 2016.5, 'upper': 2016.65}, {'count': 0, 'lower': 2016.65, 'upper': 2016.8}, {'count': 0, 'lower': 2016.8, 'upper': 2016.95}, {'count': 100886, 'lower': 2016.95, 'upper': 2017.1}, {'count': 0, 'lower': 2017.1, 'upper': 2017.25}, {'count': 0, 'lower': 2017.25, 'upper': 2017.4}, {'count': 0, 'lower': 2017.4, 'upper': 2017.55}, {'count': 0, 'lower': 2017.55, 'upper': 2017.7}, {'count': 0, 'lower': 2017.7, 'upper': 2017.85}, {'count': 65685, 'lower': 2017.85, 'upper': 2018.0}]}, 'OFFENSE_CODE': {'hist': [{'count': 169, 'lower': 111.0, 'upper': 297.0}, {'count': 12431, 'lower': 297.0, 'upper': 483.0}, {'count': 44188, 'lower': 483.0, 'upper': 669.0}, {'count': 21795, 'lower': 669.0, 'upper': 855.0}, {'count': 1548, 'lower': 855.0, 'upper': 1041.0}, {'count': 9272, 'lower': 1041.0, 'upper': 1227.0}, {'count': 16609, 'lower': 1227.0, 'upper': 1413.0}, {'count': 2185, 'lower': 1413.0, 'upper': 1599.0}, {'count': 216, 'lower': 1599.0, 'upper': 1785.0}, {'count': 16536, 'lower': 1785.0, 'upper': 1971.0}, {'count': 2759, 'lower': 1971.0, 'upper': 2157.0}, {'count': 256, 'lower': 2157.0, 'upper': 2343.0}, {'count': 2655, 'lower': 2343.0, 'upper': 2529.0}, {'count': 20908, 'lower': 2529.0, 'upper': 2715.0}, {'count': 2894, 'lower': 2715.0, 'upper': 2901.0}, {'count': 29658, 'lower': 2901.0, 'upper': 3087.0}, {'count': 63012, 'lower': 3087.0, 'upper': 3273.0}, {'count': 25653, 'lower': 3273.0, 'upper': 3459.0}, {'count': 9197, 'lower': 3459.0, 'upper': 3645.0}, {'count': 37132, 'lower': 3645.0, 'upper': 3831.0}]}}
df.cols.percentile(["OFFENSE_CODE","YEAR"], values =[0.5,0.9])
{'YEAR': {'percentile': {0.5: 2017.0, 0.9: 2018.0}}, 'OFFENSE_CODE': {'percentile': {0.5: 2907.0, 0.9: 3802.0}}}
import numpy as np
a ={'hist_agg': {'hist': np.array([ 169, 12431, 44188, 21795, 1548, 9272, 16609, 2185, 216,
16536, 2759, 256, 2655, 20908, 2894, 29658, 63012, 25653,
9197, 37132]),
'bins': np.array([ 111., 297., 483., 669., 855., 1041., 1227., 1413., 1599.,
1785., 1971., 2157., 2343., 2529., 2715., 2901., 3087., 3273.,
3459., 3645., 3831.])}}
x = a["hist_agg"]["hist"]
y = a["hist_agg"]["bins"]
for idx, v in enumerate(y):
if idx <len(y)-1:
print({"count":x[idx],"lower":y[idx], "upper":y[idx+1]})
{'count': 169, 'lower': 111.0, 'upper': 297.0} {'count': 12431, 'lower': 297.0, 'upper': 483.0} {'count': 44188, 'lower': 483.0, 'upper': 669.0} {'count': 21795, 'lower': 669.0, 'upper': 855.0} {'count': 1548, 'lower': 855.0, 'upper': 1041.0} {'count': 9272, 'lower': 1041.0, 'upper': 1227.0} {'count': 16609, 'lower': 1227.0, 'upper': 1413.0} {'count': 2185, 'lower': 1413.0, 'upper': 1599.0} {'count': 216, 'lower': 1599.0, 'upper': 1785.0} {'count': 16536, 'lower': 1785.0, 'upper': 1971.0} {'count': 2759, 'lower': 1971.0, 'upper': 2157.0} {'count': 256, 'lower': 2157.0, 'upper': 2343.0} {'count': 2655, 'lower': 2343.0, 'upper': 2529.0} {'count': 20908, 'lower': 2529.0, 'upper': 2715.0} {'count': 2894, 'lower': 2715.0, 'upper': 2901.0} {'count': 29658, 'lower': 2901.0, 'upper': 3087.0} {'count': 63012, 'lower': 3087.0, 'upper': 3273.0} {'count': 25653, 'lower': 3273.0, 'upper': 3459.0} {'count': 9197, 'lower': 3459.0, 'upper': 3645.0} {'count': 37132, 'lower': 3645.0, 'upper': 3831.0}
%%time
import dask.array as da
# x = da.random.normal(10, 0.1, size=(100000,), chunks=(1000,))
h, bins = da.histogram(df["OFFENSE_CODE"], bins=30, range=[111, 3831])
print(h.compute(), bins)
[ 169 3799 8632 7406 41698 16879 94 5868 4858 1455 15415 1924 216 11327 5209 2185 830 0 2611 44 20908 0 8989 23563 63012 14054 11599 8724 473 37132] [ 111. 235. 359. 483. 607. 731. 855. 979. 1103. 1227. 1351. 1475. 1599. 1723. 1847. 1971. 2095. 2219. 2343. 2467. 2591. 2715. 2839. 2963. 3087. 3211. 3335. 3459. 3583. 3707. 3831.] Wall time: 1.26 s
%%time
df.cols.hist(["OFFENSE_CODE"])
Wall time: 3.11 s
{'OFFENSE_CODE': {'hist_agg': array([ 169, 12431, 44188, 21795, 1548, 9272, 16609, 2185, 216, 16536, 2759, 256, 2655, 20908, 2894, 29658, 63012, 25653, 9197, 37132], dtype=int64)}}
da.histogram(serie[col_name], bins=bins, range=[min_max["min"], min_max["max"]])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-15-f7c9ec5b4f51> in <module> ----> 1 da.histogram(serie[col_name], bins=bins, range=[min_max["min"], min_max["max"]]) NameError: name 'da' is not defined
print(df.cols.test_agg("OFFENSE_CODE"))
(<dask.dataframe.groupby.Aggregation object at 0x000001FEF04D0550>,)
print(res)
(<dask.dataframe.groupby.Aggregation object at 0x000001FEEA135FD0>,)
%%time
df.cols.percentile(["YEAR","OFFENSE_CODE"], values=[0.5,0.95])
Wall time: 1.2 s
{'YEAR': {'percentile': {'percentile': {0.5: 2017.0, 0.95: 2018.0}}}, 'OFFENSE_CODE': {'percentile': {'percentile': {0.5: 2907.0, 0.95: 3831.0}}}}
custom_mean = dd.Aggregation(
name='custom_mean',
chunk=lambda s: (s.count(), s.sum()),
agg=lambda count, sum: (count.sum(), sum.sum()),
finalize=lambda count, sum: sum / count,
) # doctest: +SKIP
df.groupby('g').agg(custom_mean) # doctest: +SKIP
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-161-72873dc9eb19> in <module> ----> 1 custom_mean = dd.Aggregation( 2 name='custom_mean', 3 chunk=lambda s: (s.count(), s.sum()), 4 agg=lambda count, sum: (count.sum(), sum.sum()), 5 finalize=lambda count, sum: sum / count, NameError: name 'dd' is not defined
a[0][1]["percentile"].names
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-118-df5250a9f6f7> in <module> ----> 1 a[0][1]["percentile"].names ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in __getattr__(self, name) 5065 if self._info_axis._can_hold_identifiers_and_holds_name(name): 5066 return self[name] -> 5067 return object.__getattribute__(self, name) 5068 5069 def __setattr__(self, name, value): AttributeError: 'DataFrame' object has no attribute 'names'
df.cols.names()
['INCIDENT_NUMBER', 'OFFENSE_CODE', 'OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'REPORTING_AREA', 'SHOOTING', 'OCCURRED_ON_DATE', 'YEAR', 'MONTH', 'DAY_OF_WEEK', 'HOUR', 'UCR_PART', 'STREET', 'Lat', 'Long', 'Location']
# df.cols.percentile("*")
{"percentile": {str(i): j for i, j in df[["INCIDENT_NUMBER","OFFENSE_CODE"]].quantile(0.5).iteritems()}}
{'percentile': {'OFFENSE_CODE': 2907.0}}
columns = ["YEAR","OFFENSE_CODE"]
values= [0.5,0.9]
result ={}
for index, row in df[columns].quantile(values).iterrows():
for i, c in enumerate(columns):
result.setdefault(c,{})
result[c].setdefault("percentile",{})
result[c]["percentile"][index] = row[i]
print(r)
{'YEAR': {'percentile': {0.5: 2017.0, 0.9: 2018.0}}, 'OFFENSE_CODE': {'percentile': {0.5: 2907.0, 0.9: 3802.0}}}
# %reset_selective?
for i in df.cols.names():
df.cols.kurtosis(i)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-23-8251d58bc6f0> in <module> 1 for i in df.cols.names(): ----> 2 df.cols.kurtosis(i) AttributeError: 'Cols' object has no attribute 'kurtosis'
df.cols.min("*")
{'INCIDENT_NUMBER': {'min': '142052550'}, 'OFFENSE_CODE': {'min': 111}, 'OFFENSE_CODE_GROUP': {'min': 'Aggravated Assault'}, 'OFFENSE_DESCRIPTION': {'min': 'A&B HANDS, FEET, ETC. - MED. ATTENTION REQ.'}, 'REPORTING_AREA': {'min': ' '}, 'OCCURRED_ON_DATE': {'min': '2015-06-15 00:00:00'}, 'YEAR': {'min': 2015}, 'MONTH': {'min': 1}, 'DAY_OF_WEEK': {'min': 'Friday'}, 'HOUR': {'min': 0}, 'Lat': {'min': -1.0}, 'Long': {'min': -71.17867378}, 'Location': {'min': '(-1.00000000, -1.00000000)'}}
%%time
a = df.min().compute()
print(type(a))
# print({k:{"min":v} for k,v in a.items()} )
<class 'pandas.core.series.Series'> Wall time: 2.53 s
df.cols.lower("INCIDENT_NUMBER").ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (int64)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (int64)
not nullable
|
MONTH
10 (int64)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (int64)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (float64)
not nullable
|
Long
16 (float64)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
i182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
i182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
i182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
i182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
i182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
i182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
i182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
i182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
i182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
i182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
i182070928
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C6
|
913
|
nan
|
2018-09-03⋅19:55:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
OCEAN⋅VIEW⋅DR
|
42.31957856
|
-71.04032766
|
(42.31957856,⋅-71.04032766)
|
df.cols.upper("OFFENSE_CODE_GROUP").ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (int64)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (int64)
not nullable
|
MONTH
10 (int64)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (int64)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (float64)
not nullable
|
Long
16 (float64)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
LARCENY
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
VANDALISM
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
TOWED
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
INVESTIGATE⋅PROPERTY
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
INVESTIGATE⋅PROPERTY
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
MOTOR⋅VEHICLE⋅ACCIDENT⋅RESPONSE
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
AUTO⋅THEFT
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
VERBAL⋅DISPUTES
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
ROBBERY
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
VERBAL⋅DISPUTES
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
I182070928
|
3301
|
VERBAL⋅DISPUTES
|
VERBAL⋅DISPUTE
|
C6
|
913
|
nan
|
2018-09-03⋅19:55:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
OCEAN⋅VIEW⋅DR
|
42.31957856
|
-71.04032766
|
(42.31957856,⋅-71.04032766)
|
df.head()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | LAT | LONG | LOCATION | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.357791 | -71.139371 | (42.35779134, -71.13937053) |
1 | I182070943 | 1402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.306821 | -71.060300 | (42.30682138, -71.06030035) |
2 | I182070941 | 3410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.346589 | -71.072429 | (42.34658879, -71.07242943) |
3 | I182070940 | 3114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.334182 | -71.078664 | (42.33418175, -71.07866441) |
4 | I182070938 | 3114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.275365 | -71.090361 | (42.27536542, -71.09036101) |
df.cols.trim("OFFENSE_CODE_GROUP").ext.display()
OFFENSE_CODE_GROUP OFFENSE_CODE_GROUP None <class 'str'>
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (int64)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (int64)
not nullable
|
MONTH
10 (int64)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (int64)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (float64)
not nullable
|
Long
16 (float64)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
I182070928
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C6
|
913
|
nan
|
2018-09-03⋅19:55:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
OCEAN⋅VIEW⋅DR
|
42.31957856
|
-71.04032766
|
(42.31957856,⋅-71.04032766)
|
# def polar(data):
# data=scale(sid.polarity_scores(data.tweet)['compound'])
# return data
# hola_df.map_partitions(clean_text, meta=df)
def _lower(text, args):
a = args[0]
return text[a].str.lower()
def apply(df,cols, func):
kwargs = {cols:df[[cols]].map_partitions(_lower, args=(cols,1), meta=(cols,str))}
return df.assign(**kwargs)
apply(df, "INCIDENT_NUMBER", _lower).head()
# df["INCIDENT_NUMBER"].map_partitions(_lower, meta=("INCIDENT_NUMBER",str)).head()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | LAT | LONG | LOCATION | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | i182070945 | 619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.357791 | -71.139371 | (42.35779134, -71.13937053) |
1 | i182070943 | 1402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.306821 | -71.060300 | (42.30682138, -71.06030035) |
2 | i182070941 | 3410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.346589 | -71.072429 | (42.34658879, -71.07242943) |
3 | i182070940 | 3114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.334182 | -71.078664 | (42.33418175, -71.07866441) |
4 | i182070938 | 3114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.275365 | -71.090361 | (42.27536542, -71.09036101) |
df1["hola"] = hola_df["INCIDENT_NUMBER"].replace(20,0)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-26-07556533af62> in <module> ----> 1 df1["hola"] = hola_df["INCIDENT_NUMBER"].replace(20,0) NameError: name 'hola_df' is not defined
df1 = df.cols.replace("*",["a","b","c"],"")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-9-8f2979ed9daa> in <module> ----> 1 df1 = df.cols.replace("*",["a","b","c"],"") AttributeError: 'Cols' object has no attribute 'replace'
df1.ext.display()
INCIDENT_NUMBER
1 (string)
nullable
|
OFFENSE_CODE
2 (string)
nullable
|
OFFENSE_CODE_GROUP
3 (string)
nullable
|
OFFENSE_DESCRIPTION
4 (string)
nullable
|
DISTRICT
5 (string)
nullable
|
REPORTING_AREA
6 (string)
nullable
|
SHOOTING
7 (string)
nullable
|
OCCURRED_ON_DATE
8 (timestamp)
nullable
|
YEAR
9 (string)
nullable
|
MONTH
10 (string)
nullable
|
DAY_OF_WEEK
11 (string)
nullable
|
HOUR
12 (string)
nullable
|
UCR_PART
13 (string)
nullable
|
STREET
14 (string)
nullable
|
Lat
15 (string)
nullable
|
Long
16 (string)
nullable
|
Location
17 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Lreny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
None
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sundy
|
13
|
Prt⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vndlism
|
VANDALISM
|
C11
|
347
|
None
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesdy
|
0
|
Prt⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
None
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Mondy
|
19
|
Prt⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigte⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
None
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Mondy
|
21
|
Prt⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigte⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
None
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Mondy
|
21
|
Prt⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehile⋅Aident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
None
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Mondy
|
21
|
Prt⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
None
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Mondy
|
21
|
Prt⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verl⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
None
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Mondy
|
20
|
Prt⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Roery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
None
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Mondy
|
20
|
Prt⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verl⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
None
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Mondy
|
20
|
Prt⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
df.cols.count_by_dtypes("OFFENSE_CODE")
{'OFFENSE_CODE': {'int': 319073, 'float': 0}}
df.cols.percentile("OFFENSE_CODE",[0.5])
invalid escape sequence \d
[('OFFENSE_CODE', {'percentile': {'0.5': 2907.0}})]
{'OFFENSE_CODE': {'percentile': {'0.5': 2907.0}}}
df.cols.median("*")
{'OFFENSE_CODE': 0.5 2907.0 Name: OFFENSE_CODE, dtype: float64, 'YEAR': 0.5 2017.0 Name: YEAR, dtype: float64, 'MONTH': 0.5 7.0 Name: MONTH, dtype: float64, 'HOUR': 0.5 14.0 Name: HOUR, dtype: float64, 'Lat': 0.5 42.325538 Name: Lat, dtype: float64, 'Long': 0.5 -71.077524 Name: Long, dtype: float64}
df.cols.hist("OFFENSE_CODE")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-28-bf27b0e7abaa> in <module> ----> 1 df.cols.hist("OFFENSE_CODE") AttributeError: 'Cols' object has no attribute 'hist'
df.ext.display()
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (int64)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (int64)
not nullable
|
MONTH
10 (int64)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (int64)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (float64)
not nullable
|
Long
16 (float64)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
I182070928
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C6
|
913
|
nan
|
2018-09-03⋅19:55:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
OCEAN⋅VIEW⋅DR
|
42.31957856
|
-71.04032766
|
(42.31957856,⋅-71.04032766)
|
df.cols.dtypes("INCIDENT_NUMBER")
{'INCIDENT_NUMBER': 'object'}
df.head()
invalid escape sequence \d
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.357791 | -71.139371 | (42.35779134, -71.13937053) |
1 | I182070943 | 1402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.306821 | -71.060300 | (42.30682138, -71.06030035) |
2 | I182070941 | 3410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.346589 | -71.072429 | (42.34658879, -71.07242943) |
3 | I182070940 | 3114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.334182 | -71.078664 | (42.33418175, -71.07866441) |
4 | I182070938 | 3114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.275365 | -71.090361 | (42.27536542, -71.09036101) |
%%time
df["INCIDENT_NUMBER"].min().compute()
Wall time: 1.19 s
'142052550'
%%time
df.cols.min("*")
[('OFFENSE_CODE', {'min': dd.Scalar<series-..., dtype=int64>}), ('YEAR', {'min': dd.Scalar<series-..., dtype=int64>}), ('MONTH', {'min': dd.Scalar<series-..., dtype=int64>}), ('HOUR', {'min': dd.Scalar<series-..., dtype=int64>}), ('LAT', {'min': dd.Scalar<series-..., dtype=float64>}), ('LONG', {'min': dd.Scalar<series-..., dtype=float64>})]
distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.44 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.46 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.49 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.52 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.54 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.57 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 1.52 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.52 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Worker is at 76% memory usage. Resuming worker. Process memory: 1.53 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.53 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.56 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.59 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Worker is at 80% memory usage. Pausing worker. Process memory: 1.61 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.61 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.64 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.67 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.63 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. Perhaps some other process is leaking memory? Process memory: 1.65 GB -- Worker memory limit: 2.00 GB distributed.worker - WARNING - Worker is at 61% memory usage. Resuming worker. Process memory: 1.22 GB -- Worker memory limit: 2.00 GB
Wall time: 5.78 s
{'OFFENSE_CODE': {'min': 111}, 'YEAR': {'min': 2015}, 'MONTH': {'min': 1}, 'HOUR': {'min': 0}, 'LAT': {'min': -1.0}, 'LONG': {'min': -71.17867378}}
!pip install graphviz
Collecting graphviz Downloading https://files.pythonhosted.org/packages/94/cd/7b37f2b658995033879719e1ea4c9f171bf7a14c16b79220bd19f9eda3fe/graphviz-0.13-py2.py3-none-any.whl Installing collected packages: graphviz Successfully installed graphviz-0.13
df.min().visualize()
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) ~\Anaconda3\lib\site-packages\graphviz\backend.py in run(cmd, input, capture_output, check, quiet, **kwargs) 157 try: --> 158 proc = subprocess.Popen(cmd, startupinfo=get_startupinfo(), **kwargs) 159 except OSError as e: ~\Anaconda3\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text) 768 errread, errwrite, --> 769 restore_signals, start_new_session) 770 except: ~\Anaconda3\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session) 1171 os.fspath(cwd) if cwd is not None else None, -> 1172 startupinfo) 1173 finally: FileNotFoundError: [WinError 2] The system cannot find the file specified During handling of the above exception, another exception occurred: ExecutableNotFound Traceback (most recent call last) <ipython-input-15-e7fadf3d4654> in <module> ----> 1 df.min().visualize() ~\Anaconda3\lib\site-packages\dask\base.py in visualize(self, filename, format, optimize_graph, **kwargs) 86 """ 87 return visualize(self, filename=filename, format=format, ---> 88 optimize_graph=optimize_graph, **kwargs) 89 90 def persist(self, **kwargs): ~\Anaconda3\lib\site-packages\dask\base.py in visualize(*args, **kwargs) 479 raise NotImplementedError("Unknown value color=%s" % color) 480 --> 481 return dot_graph(dsk, filename=filename, **kwargs) 482 483 ~\Anaconda3\lib\site-packages\dask\dot.py in dot_graph(dsk, filename, format, **kwargs) 254 format = 'png' 255 --> 256 data = g.pipe(format=format) 257 if not data: 258 raise RuntimeError("Graphviz failed to properly produce an image. " ~\Anaconda3\lib\site-packages\graphviz\files.py in pipe(self, format, renderer, formatter, quiet) 136 out = backend.pipe(self._engine, format, data, 137 renderer=renderer, formatter=formatter, --> 138 quiet=quiet) 139 140 return out ~\Anaconda3\lib\site-packages\graphviz\backend.py in pipe(engine, format, data, renderer, formatter, quiet) 226 """ 227 cmd, _ = command(engine, format, None, renderer, formatter) --> 228 out, _ = run(cmd, input=data, capture_output=True, check=True, quiet=quiet) 229 return out 230 ~\Anaconda3\lib\site-packages\graphviz\backend.py in run(cmd, input, capture_output, check, quiet, **kwargs) 159 except OSError as e: 160 if e.errno == errno.ENOENT: --> 161 raise ExecutableNotFound(cmd) 162 else: 163 raise ExecutableNotFound: failed to execute ['dot', '-Tpng'], make sure the Graphviz executables are on your systems' PATH
df.cols.create_exprs("*",df.functions.min)
[('OFFENSE_CODE', {'min': dd.Scalar<series-..., dtype=int64>}), ('YEAR', {'min': dd.Scalar<series-..., dtype=int64>}), ('MONTH', {'min': dd.Scalar<series-..., dtype=int64>}), ('HOUR', {'min': dd.Scalar<series-..., dtype=int64>}), ('Lat', {'min': dd.Scalar<series-..., dtype=float64>}), ('Long', {'min': dd.Scalar<series-..., dtype=float64>})]
[('OFFENSE_CODE', {'min': dd.Scalar<series-..., dtype=int64>}), ('YEAR', {'min': dd.Scalar<series-..., dtype=int64>}), ('MONTH', {'min': dd.Scalar<series-..., dtype=int64>}), ('HOUR', {'min': dd.Scalar<series-..., dtype=int64>}), ('Lat', {'min': dd.Scalar<series-..., dtype=float64>}), ('Long', {'min': dd.Scalar<series-..., dtype=float64>})]
df.cols.dtypes()
{'INCIDENT_NUMBER': 'object', 'OFFENSE_CODE': 'int64', 'OFFENSE_CODE_GROUP': 'object', 'OFFENSE_DESCRIPTION': 'object', 'DISTRICT': 'object', 'REPORTING_AREA': 'object', 'SHOOTING': 'object', 'OCCURRED_ON_DATE': 'object', 'YEAR': 'int64', 'MONTH': 'int64', 'DAY_OF_WEEK': 'object', 'HOUR': 'int64', 'UCR_PART': 'object', 'STREET': 'object', 'Lat': 'float64', 'Long': 'float64', 'Location': 'object'}
for c in df.cols.names():
print(c)
df.cols.min(c)
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in f(values, axis, skipna, **kwds) 126 else: --> 127 result = alt(values, axis=axis, skipna=skipna, **kwds) 128 except Exception: ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in reduction(values, axis, skipna, mask) 741 else: --> 742 result = getattr(values, meth)(axis) 743 ~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _amin(a, axis, out, keepdims, initial) 31 initial=_NoValue): ---> 32 return umr_minimum(a, axis, None, out, keepdims, initial) 33 TypeError: '<=' not supported between instances of 'str' and 'float' During handling of the above exception, another exception occurred: TypeError Traceback (most recent call last) <ipython-input-24-8c2153824c92> in <module> 1 for c in df.cols.names(): 2 print(c) ----> 3 df.cols.min(c) ~\Documents\Optimus\optimus\dask\columns.py in min(columns) 463 :return: 464 """ --> 465 return Cols.agg_exprs(columns, self.functions.min) 466 467 return Cols() ~\Documents\Optimus\optimus\dask\columns.py in agg_exprs(columns, funcs, *args) 326 """ 327 # print(args) --> 328 return Cols.exec_agg(Cols.create_exprs(columns, funcs, *args)) 329 330 @staticmethod ~\Documents\Optimus\optimus\dask\columns.py in exec_agg(exprs) 364 if is_list_of_futures(agg_list): 365 for agg_element in agg_list: --> 366 agg_result.append(agg_element.result()) 367 else: 368 agg_result = agg_list[0] ~\Anaconda3\lib\site-packages\distributed\client.py in result(self, timeout) 193 raiseit=False) 194 if self.status == 'error': --> 195 six.reraise(*result) 196 elif self.status == 'cancelled': 197 raise result ~\Anaconda3\lib\site-packages\six.py in reraise(tp, value, tb) 690 value = tp() 691 if value.__traceback__ is not tb: --> 692 raise value.with_traceback(tb) 693 raise value 694 finally: ~\Anaconda3\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs) 91 def apply(func, args, kwargs=None): 92 if kwargs: ---> 93 return func(*args, **kwargs) 94 else: 95 return func(*args) ~\Anaconda3\lib\site-packages\dask\dataframe\core.py in _reduction_chunk(x, aca_chunk, **kwargs) 4314 4315 def _reduction_chunk(x, aca_chunk=None, **kwargs): -> 4316 o = aca_chunk(x, **kwargs) 4317 # Return a dataframe so that the concatenated version is also a dataframe 4318 return o.to_frame().T if isinstance(o, pd.Series) else o ~\Anaconda3\lib\site-packages\dask\utils.py in __call__(self, obj, *args, **kwargs) 692 693 def __call__(self, obj, *args, **kwargs): --> 694 return getattr(obj, self.method)(*args, **kwargs) 695 696 def __reduce__(self): ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\generic.py in stat_func(self, axis, skipna, level, numeric_only, **kwargs) 10954 skipna=skipna) 10955 return self._reduce(f, name, axis=axis, skipna=skipna, > 10956 numeric_only=numeric_only) 10957 10958 return set_function_name(stat_func, name, cls) ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds) 3628 'numeric_only.'.format(name)) 3629 with np.errstate(all='ignore'): -> 3630 return op(delegate, skipna=skipna, **kwds) 3631 3632 # TODO(EA) dispatch to Index ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in f(values, axis, skipna, **kwds) 128 except Exception: 129 try: --> 130 result = alt(values, axis=axis, skipna=skipna, **kwds) 131 except ValueError as e: 132 # we want to transform an object array ~\AppData\Roaming\Python\Python37\site-packages\pandas\core\nanops.py in reduction(values, axis, skipna, mask) 740 result = np.nan 741 else: --> 742 result = getattr(values, meth)(axis) 743 744 result = _wrap_results(result, dtype, fill_value) ~\Anaconda3\lib\site-packages\numpy\core\_methods.py in _amin(a, axis, out, keepdims, initial) 30 def _amin(a, axis=None, out=None, keepdims=False, 31 initial=_NoValue): ---> 32 return umr_minimum(a, axis, None, out, keepdims, initial) 33 34 def _sum(a, axis=None, dtype=None, out=None, keepdims=False, TypeError: '<=' not supported between instances of 'str' and 'float'
import dask.array as da
df
da.histogram(df["INCIDENT_NUMBER"],bins=10, range=[9, 11])
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-100-0b0a8087ee90> in <module> ----> 1 da.histogram(df["INCIDENT_NUMBER"],bins=10, range=[9, 11]).compute() AttributeError: 'tuple' object has no attribute 'compute'
df.sum().compute()
%%time
from optimus.profiler.profiler import Profiler
p = Profiler()
p.run(df, "*")
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <timed exec> in <module> ~\Documents\Optimus\optimus\helpers\decorators.py in timed(*args, **kw) 8 def timed(*args, **kw): 9 start_time = timeit.default_timer() ---> 10 f = method(*args, **kw) 11 _time = round(timeit.default_timer() - start_time, 2) 12 logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=_time)) ~\Documents\Optimus\optimus\profiler\profiler.py in run(self, df, columns, buckets, infer, relative_error, approx_count) 118 # df.ext.set_meta({"initialized": True}) 119 --> 120 output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict") 121 122 # Load jinja ~\Documents\Optimus\optimus\profiler\profiler.py in dataset(self, df, columns, buckets, infer, relative_error, approx_count, sample, stats, format) 245 # if ["drop", "rename"] not in trans and self.already_run is False: 246 if stats is True: --> 247 output_columns = self.columns_stats(df, columns, buckets, infer, relative_error, approx_count) 248 249 assign(output_columns, "name", df.ext.get_name(), dict) ~\Documents\Optimus\optimus\profiler\profiler.py in columns_stats(self, df, columns, buckets, infer, relative_error, approx_count) 330 331 # Aggregation --> 332 stats = Profiler.columns_agg(df, columns, buckets, relative_error, approx_count) 333 334 # Calculate Frequency ~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(df, columns, buckets, relative_error, approx_count) 375 df.functions.kurtosis, df.functions.mean, df.functions.skewness, df.functions.sum, 376 df.functions.variance, df.functions.zeros_agg] --> 377 exprs.extend(df.cols.create_exprs(cols, funcs)) 378 379 # TODO: None in basic calculation ~\Documents\Optimus\optimus\dask\columns.py in create_exprs(columns, funcs, *args) 620 exprs[col_name].update(func(col_name, args)(self)) 621 else: --> 622 exprs[col_name] = func(col_name, args)(self) 623 624 result = {} ~\Documents\Optimus\optimus\dask\functions.py in _kurtoris(serie) 127 def kurtosis(col_name, args): 128 def _kurtoris(serie): --> 129 result = {"kurtosis": float(stats.kurtosis(serie[col_name]))} 130 return result 131 ~\Anaconda3\lib\site-packages\dask\array\stats.py in kurtosis(a, axis, fisher, bias, nan_policy) 227 olderr = np.seterr(all='ignore') 228 try: --> 229 vals = da.where(zero, 0, m4 / m2**2.0) 230 finally: 231 np.seterr(**olderr) TypeError: unsupported operand type(s) for ** or pow(): 'Array' and 'float'
df.ext.display(10)
INCIDENT_NUMBER
1 (object)
not nullable
|
OFFENSE_CODE
2 (int64)
not nullable
|
OFFENSE_CODE_GROUP
3 (object)
not nullable
|
OFFENSE_DESCRIPTION
4 (object)
not nullable
|
DISTRICT
5 (object)
not nullable
|
REPORTING_AREA
6 (object)
not nullable
|
SHOOTING
7 (object)
not nullable
|
OCCURRED_ON_DATE
8 (object)
not nullable
|
YEAR
9 (int64)
not nullable
|
MONTH
10 (int64)
not nullable
|
DAY_OF_WEEK
11 (object)
not nullable
|
HOUR
12 (int64)
not nullable
|
UCR_PART
13 (object)
not nullable
|
STREET
14 (object)
not nullable
|
Lat
15 (float64)
not nullable
|
Long
16 (float64)
not nullable
|
Location
17 (object)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
I182070945
|
619
|
Larceny
|
LARCENY⋅ALL⋅OTHERS
|
D14
|
808
|
nan
|
2018-09-02⋅13:00:00
|
2018
|
9
|
Sunday
|
13
|
Part⋅One
|
LINCOLN⋅ST
|
42.35779134
|
-71.13937053
|
(42.35779134,⋅-71.13937053)
|
I182070943
|
1402
|
Vandalism
|
VANDALISM
|
C11
|
347
|
nan
|
2018-08-21⋅00:00:00
|
2018
|
8
|
Tuesday
|
0
|
Part⋅Two
|
HECLA⋅ST
|
42.30682138
|
-71.06030035
|
(42.30682138,⋅-71.06030035)
|
I182070941
|
3410
|
Towed
|
TOWED⋅MOTOR⋅VEHICLE
|
D4
|
151
|
nan
|
2018-09-03⋅19:27:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
CAZENOVE⋅ST
|
42.34658879
|
-71.07242943
|
(42.34658879,⋅-71.07242943)
|
I182070940
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
D4
|
272
|
nan
|
2018-09-03⋅21:16:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
NEWCOMB⋅ST
|
42.33418175
|
-71.07866441
|
(42.33418175,⋅-71.07866441)
|
I182070938
|
3114
|
Investigate⋅Property
|
INVESTIGATE⋅PROPERTY
|
B3
|
421
|
nan
|
2018-09-03⋅21:05:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
DELHI⋅ST
|
42.27536542
|
-71.09036101
|
(42.27536542,⋅-71.09036101)
|
I182070936
|
3820
|
Motor⋅Vehicle⋅Accident⋅Response
|
M/V⋅ACCIDENT⋅INVOLVING⋅PEDESTRIAN⋅-⋅INJURY
|
C11
|
398
|
nan
|
2018-09-03⋅21:09:00
|
2018
|
9
|
Monday
|
21
|
Part⋅Three
|
TALBOT⋅AVE
|
42.29019621
|
-71.07159012
|
(42.29019621,⋅-71.07159012)
|
I182070933
|
724
|
Auto⋅Theft
|
AUTO⋅THEFT
|
B2
|
330
|
nan
|
2018-09-03⋅21:25:00
|
2018
|
9
|
Monday
|
21
|
Part⋅One
|
NORMANDY⋅ST
|
42.30607218
|
-71.0827326
|
(42.30607218,⋅-71.08273260)
|
I182070932
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
B2
|
584
|
nan
|
2018-09-03⋅20:39:37
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LAWN⋅ST
|
42.32701648
|
-71.10555088
|
(42.32701648,⋅-71.10555088)
|
I182070931
|
301
|
Robbery
|
ROBBERY⋅-⋅STREET
|
C6
|
177
|
nan
|
2018-09-03⋅20:48:00
|
2018
|
9
|
Monday
|
20
|
Part⋅One
|
MASSACHUSETTS⋅AVE
|
42.33152148
|
-71.07085307
|
(42.33152148,⋅-71.07085307)
|
I182070929
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C11
|
364
|
nan
|
2018-09-03⋅20:38:00
|
2018
|
9
|
Monday
|
20
|
Part⋅Three
|
LESLIE⋅ST
|
42.29514664
|
-71.05860832
|
(42.29514664,⋅-71.05860832)
|
I182070928
|
3301
|
Verbal⋅Disputes
|
VERBAL⋅DISPUTE
|
C6
|
913
|
nan
|
2018-09-03⋅19:55:00
|
2018
|
9
|
Monday
|
19
|
Part⋅Three
|
OCEAN⋅VIEW⋅DR
|
42.31957856
|
-71.04032766
|
(42.31957856,⋅-71.04032766)
|
df.rows.to_list("OFFENSE_CODE")
# type(df["OFFENSE_CODE"])
# type(df.cols.select("OFFENSE_CODE"))
# a = [v for v in df["OFFENSE_CODE"].iteritems()]
# df[["OFFENSE_CODE"]].iteritems()
df.cols.select("OFFENSE_CODE")
OFFENSE_CODE | |
---|---|
npartitions=1 | |
int64 | |
... |
df.sample(1).head()
sample does not support the number of sampled items parameter, 'n'. Please use the 'frac' parameter instead.
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
186823 | I162085248 | 3006 | Medical Assistance | SICK/INJURED/MEDICAL - PERSON | B2 | 314 | NaN | 2016-10-18 07:24:00 | 2016 | 10 | Tuesday | 7 | Part Three | HOWLAND ST | 42.314448 | -71.089934 | (42.31444840, -71.08993418) |
65160 | I182001199 | 1102 | Fraud | FRAUD - FALSE PRETENSE / SCHEME | E18 | 526 | NaN | 2017-11-01 10:00:00 | 2017 | 11 | Wednesday | 10 | Part Two | CLIFFORD ST | 42.234543 | -71.130915 | (42.23454320, -71.13091490) |
277221 | I152095488 | 361 | Robbery | ROBBERY - OTHER | D4 | 272 | NaN | 2015-11-17 14:52:00 | 2015 | 11 | Tuesday | 14 | Part One | WASHINGTON ST | 42.334831 | -71.079041 | (42.33483110, -71.07904134) |
44193 | I182023593 | 3805 | Motor Vehicle Accident Response | M/V ACCIDENT - POLICE VEHICLE | D4 | 171 | NaN | 2018-03-30 09:49:00 | 2018 | 3 | Friday | 9 | Part Three | ALBANY ST | 42.334288 | -71.072395 | (42.33428841, -71.07239518) |
13308 | I182056371 | 3115 | Investigate Person | INVESTIGATE PERSON | C11 | 250 | NaN | 2018-07-17 20:18:00 | 2018 | 7 | Tuesday | 20 | Part Three | SAVIN HILL AVE | 42.312482 | -71.048290 | (42.31248165, -71.04829028) |
df.cols.schema_dtype()
[numpy.object_, numpy.int64, numpy.object_, numpy.object_, numpy.object_, numpy.object_, numpy.object_, numpy.object_, numpy.int64, numpy.int64, numpy.object_, numpy.int64, numpy.object_, numpy.object_, numpy.float64, numpy.float64, numpy.object_]
import numpy as np
np.dtype(df["OFFENSE_CODE"]).type
numpy.int64
df.schema
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-77-2830e85e5da4> in <module> ----> 1 df.schema ~\Anaconda3\lib\site-packages\dask\dataframe\core.py in __getattr__(self, key) 2531 return new_dd_object(merge(self.dask, dsk), name, 2532 meta, self.divisions) -> 2533 raise AttributeError("'DataFrame' object has no attribute %r" % key) 2534 2535 def __dir__(self): AttributeError: 'DataFrame' object has no attribute 'schema'
%%time
df["INCIDENT_NUMBER"].value_counts().nlargest(5).compute()
Wall time: 2.51 s
I162030584 13 I152080623 11 I172013170 10 I182065208 10 I172096394 10 Name: INCIDENT_NUMBER, dtype: int64
df["INCIDENT_NUMBER"].nunique().compute()
282517
df.npartitions
1
df = df.repartition(npartitions=20)
df.npartitions
20
%%time
df["INCIDENT_NUMBER"].value_counts().nlargest(5).compute()
Wall time: 8.07 s
I162030584 13 I152080623 11 I172013170 10 I182065208 10 I172096394 10 Name: INCIDENT_NUMBER, dtype: int64
df["INCIDENT_NUMBER"].unique().count().compute()
282517
df.head()
INCIDENT_NUMBER | OFFENSE_CODE | OFFENSE_CODE_GROUP | OFFENSE_DESCRIPTION | DISTRICT | REPORTING_AREA | SHOOTING | OCCURRED_ON_DATE | YEAR | MONTH | DAY_OF_WEEK | HOUR | UCR_PART | STREET | Lat | Long | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I182070945 | 619 | Larceny | LARCENY ALL OTHERS | D14 | 808 | NaN | 2018-09-02 13:00:00 | 2018 | 9 | Sunday | 13 | Part One | LINCOLN ST | 42.357791 | -71.139371 | (42.35779134, -71.13937053) |
1 | I182070943 | 1402 | Vandalism | VANDALISM | C11 | 347 | NaN | 2018-08-21 00:00:00 | 2018 | 8 | Tuesday | 0 | Part Two | HECLA ST | 42.306821 | -71.060300 | (42.30682138, -71.06030035) |
2 | I182070941 | 3410 | Towed | TOWED MOTOR VEHICLE | D4 | 151 | NaN | 2018-09-03 19:27:00 | 2018 | 9 | Monday | 19 | Part Three | CAZENOVE ST | 42.346589 | -71.072429 | (42.34658879, -71.07242943) |
3 | I182070940 | 3114 | Investigate Property | INVESTIGATE PROPERTY | D4 | 272 | NaN | 2018-09-03 21:16:00 | 2018 | 9 | Monday | 21 | Part Three | NEWCOMB ST | 42.334182 | -71.078664 | (42.33418175, -71.07866441) |
4 | I182070938 | 3114 | Investigate Property | INVESTIGATE PROPERTY | B3 | 421 | NaN | 2018-09-03 21:05:00 | 2018 | 9 | Monday | 21 | Part Three | DELHI ST | 42.275365 | -71.090361 | (42.27536542, -71.09036101) |
import dask.dataframe as dd
def count_uniques_agg(col_name, args=True):
estimate = args[0]
def count_uniques_agg_(x):
print(estimate)
if estimate is True:
result = {"count_uniques_agg": x[col_name].nunique_approx()}
else:
result = {"count_uniques_agg": x[col_name].nunique()}
return result
return count_uniques_agg_
df.rows.count()
319073
import dask.array as da
def hist_agg(col_name, args):
df = args[0]
bins = args[1]
range = args[2]
def hist_agg_(x):
return {"hist_agg":list(da.histogram(x[col_name], bins=bins, range=range)[1]) }
return hist_agg_
def percentile_agg(col_name, args):
values = args[0]
def _percentile(x):
return {"percentile_agg": {i:j for i,j in x[col_name].quantile(values).iteritems()}}
return _percentile
%%time
import dask.dataframe as dd
# k, s, m = [stats.kurtosis(x), stats.skew(x), stats.moment(x, 5)]
# agg_func=["min","max"]
# agg_func=[percentile_agg(0.1)(df),"min","max", count_uniques_agg("INCIDENT_NUMBER",False)(df)]
# agg_func=[percentile_agg(0.1)(df),"min","max"]
agg_func=[hist_agg]
# agg_func=[df.functions.count_na_agg]
exprs = df.cols.create_exprs(["OFFENSE_CODE","YEAR"], agg_func, df, 10,[1,5])
# print(exprs)
print(dd.compute(exprs))
([('OFFENSE_CODE', {'hist_agg': [1.0, 1.4, 1.8, 2.2, 2.6, 3.0, 3.4000000000000004, 3.8000000000000003, 4.2, 4.6, 5.0]}), ('YEAR', {'hist_agg': [1.0, 1.4, 1.8, 2.2, 2.6, 3.0, 3.4000000000000004, 3.8000000000000003, 4.2, 4.6, 5.0]})],) Wall time: 4 ms
dask.dataframe.core.DataFrame
list(r["OFFENSE_CODE"]["hist_agg"][1])
[4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5]
for i in x:
print(i[0],i[1])
a {'min': 111}
print(df.cols.exec_agg(exprs))
{'OFFENSE_CODE': {'kurtosis': 1.5838041686767816, 'skew': array(-0.40994654), 'zeros': 15106, 'variance': 1404901.8183066112}, 'HOUR': {'kurtosis': 2.3994270670757927, 'skew': array(-0.4834518), 'zeros': 15106, 'variance': 39.61701992983649}}
import dask.dataframe as dd
df.min()
Dask Series Structure: npartitions=1 DAY_OF_WEEK object YEAR ... dtype: object Dask Name: dataframe-min-agg, 5 tasks
df.functions.percentile_agg(0.1)(df)
Dask Series Structure: npartitions=1 HOUR float64 YEAR ... Name: 0.1, dtype: float64 Dask Name: quantiles-concat, 28 tasks
dd.compute(df.min())
(INCIDENT_NUMBER 142052550 OFFENSE_CODE 111 OFFENSE_CODE_GROUP Aggravated Assault OFFENSE_DESCRIPTION A&B HANDS, FEET, ETC. - MED. ATTENTION REQ. REPORTING_AREA OCCURRED_ON_DATE 2015-06-15 00:00:00 YEAR 2015 MONTH 1 DAY_OF_WEEK Friday HOUR 0 Lat -1 Long -71.1787 Location (-1.00000000, -1.00000000) dtype: object,)
import dask.dataframe as dd
def frequency(col_name):
def frequency_(x):
print(x)
return
return frequency_
df.cols.frequency("INCIDENT_NUMBER")
({'I162030584': 13, 'I152080623': 11, 'I172013170': 10, 'I182065208': 10, 'I172096394': 10},)
print(dd.compute(df[col_name].value_counts().nlargest(5)))
(<function frequency.<locals>.frequency_ at 0x00000202489E19D8>,)
%%time
dd.compute(df["OFFENSE_CODE"].min(), df["INCIDENT_NUMBER"].value_counts().nlargest(5), df["OFFENSE_CODE"].value_counts().nlargest(5), df["OFFENSE_CODE"].nunique())
Wall time: 2.01 s
(111, I162030584 13 I152080623 11 I172013170 10 I182065208 10 I172096394 10 Name: INCIDENT_NUMBER, dtype: int64, 3006 18783 3115 18754 3831 16323 1402 15154 802 14799 Name: OFFENSE_CODE, dtype: int64, 222)
%%time
dd.compute(df["OFFENSE_CODE"].nunique())
Wall time: 1.26 s
(222,)
## https://distributed.dask.org/en/latest/web.html
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-27-bc0913579ac8> in <module> ----> 1 cluster.scheduler.processing NameError: name 'cluster' is not defined
# Another approach
# from dask.array import stats
# x = da.random.beta(1, 1, size=(1000,), chunks=10)
# k, s, m = [stats.kurtosis(x), stats.skew(x), stats.moment(x, 5)]
# dask.compute(k, s, m)
(1.7612340817172787, -0.064073498030693302, -0.00054523780628304799)
(1.7612340817172787, -0.0640734980306933, -0.000545237806283048)
import pandas as pd
data = [['tom', 10], ['nick', 15], ['juli', 0],['argenis', 10]]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['Name', 'Age'])
df.cols.count_by_dtypes("*")
ddf = dd.from_pandas(df, npartitions=2)
df.reset_index(drop=True)
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 0 |
3 | argenis | 10 |
# df[df == 0].count(axis=0)
# Speedy https://stackoverflow.com/questions/35277075/python-pandas-counting-the-occurrences-of-a-specific-value
def zeros(series):
return (series.values==0).sum()
zeros(df["Age"])
1
def percentile_agg(n):
def percentile_(x):
return x.quantile(n)
percentile_.__name__ = 'percentile_%s' % n
return percentile_
# percentile_agg(values=[0.1, 0.5])
print(ddf)
Dask DataFrame Structure: Name Age npartitions=2 0 object int64 2 ... ... 3 ... ... Dask Name: from_pandas, 2 tasks
dd.compute(ddf.min(), ddf.max())
(Name argenis Age 0 dtype: object, Name tom Age 15 dtype: object)
ddf.agg(
{
# # find the min, max, and sum of the duration column
# 'Name': [min, max, sum],
# # find the number of network type entries
# 'Age': "count",
# min, first, and number of unique dates per group
'Age': [min, 'max', 'nunique', zeros , percentile_agg(1)]
}
).to_dict()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-103-9d1e9f74206e> in <module> ----> 1 df.agg( 2 { 3 # # find the min, max, and sum of the duration column 4 # 'Name': [min, max, sum], 5 # # find the number of network type entries ~\Anaconda3\lib\site-packages\dask\dataframe\core.py in __getattr__(self, key) 2531 return new_dd_object(merge(self.dask, dsk), name, 2532 meta, self.divisions) -> 2533 raise AttributeError("'DataFrame' object has no attribute %r" % key) 2534 2535 def __dir__(self): AttributeError: 'DataFrame' object has no attribute 'agg'
df
Name | Age | |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 0 |
3 | argenis | 10 |
def agg_fn(x):
result = pd.Series(
dict(
C = x.quantile(1.0), # int
)
)
print(result)
return result
a=df[['Age']]
a.apply(agg_fn)
C 15.0 dtype: float64 C 15.0 dtype: float64
Age | |
---|---|
C | 15.0 |
a.apply(agg_fn)
C 15.0 dtype: float64 C 15.0 dtype: float64
Age | |
---|---|
C | 15.0 |
print(df)
Name Age 0 tom 10 1 nick 15 2 juli 0 3 argenis 10
#https://stackoverflow.com/questions/46080171/constructing-mode-and-corresponding-count-functions-using-custom-aggregation-fun
def chunk(s):
"""
The function applied to the
individual partition (map)
"""
return s.apply(lambda x: list(set(x)))
def agg(s):
"""
The function whic will aggrgate
the result from all the partitions(reduce)
"""
s = s._selected_obj
return s.groupby(level=list(range(s.index.nlevels))).sum()
def finalize(s):
"""
The optional functional that will be
applied to the result of the agg_tu functions
"""
return s.apply(lambda x: len(set(x)))
tunique = dd.Aggregation('tunique', chunk, agg, finalize)
df.groupby(['INCIDENT_NUMBER']).agg({'INCIDENT_NUMBER': tunique}).compute()
INCIDENT_NUMBER | |
---|---|
INCIDENT_NUMBER | |
142052550 | 1 |
I010370257-00 | 1 |
I030217815-08 | 1 |
I050310906-00 | 1 |
I060168073-00 | 1 |
I080542626-00 | 1 |
I090317057-00 | 1 |
I090321958-00 | 1 |
I100033064-00 | 1 |
I100222105-02 | 1 |
I100340225-00 | 1 |
I100636670-00 | 1 |
I110177502-00 | 1 |
I110261417-00 | 1 |
I110372326-00 | 1 |
I110551302-00 | 1 |
I110611058-00 | 1 |
I110694557-00 | 1 |
I120069826-00 | 1 |
I120189428-00 | 1 |
I120201612-00 | 1 |
I120260724-01 | 1 |
I120283195-00 | 1 |
I120470733-00 | 1 |
I120595668-00 | 1 |
I120719309-00 | 1 |
I120720047-00 | 1 |
I130007264-01 | 1 |
I130031413-00 | 1 |
I130041200-00 | 1 |
... | ... |
I182070901 | 1 |
I182070903 | 1 |
I182070904 | 1 |
I182070905 | 1 |
I182070906 | 1 |
I182070908 | 1 |
I182070909 | 1 |
I182070910 | 1 |
I182070911 | 1 |
I182070913 | 1 |
I182070915 | 1 |
I182070917 | 1 |
I182070918 | 1 |
I182070919 | 1 |
I182070920 | 1 |
I182070921 | 1 |
I182070922 | 1 |
I182070923 | 1 |
I182070927 | 1 |
I182070928 | 1 |
I182070929 | 1 |
I182070931 | 1 |
I182070932 | 1 |
I182070933 | 1 |
I182070936 | 1 |
I182070938 | 1 |
I182070940 | 1 |
I182070941 | 1 |
I182070943 | 1 |
I182070945 | 1 |
282517 rows × 1 columns
distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/10 remote=inproc://192.168.0.9/34804/1> distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/11 remote=inproc://192.168.0.9/34804/1> distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/12 remote=inproc://192.168.0.9/34804/1> distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/13 remote=inproc://192.168.0.9/34804/1> distributed.comm.inproc - WARNING - Closing dangling queue in <InProc local=inproc://192.168.0.9/34804/14 remote=inproc://192.168.0.9/34804/1>
df.apply({'g0': unique}, axis=1).compute()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-109-0d3e0320a332> in <module> ----> 1 df.apply({'g0': unique}, axis=1).compute() NameError: name 'unique' is not defined
from abc import abstractmethod, ABC
class AbstractCols(ABC):
@abstractmethod
def min(self, columns):
pass
class BaseCols(AbstractCols):
def __init__(self, functions):
self.functions = functions
@staticmethod
# @abstractmethod
def get_agg_function():
return 3
def min(self, columns):
return BaseCols.agg_exprs(columns, self.functions)
@staticmethod
def agg_exprs(columns, agg):
return agg
functions={"hola1"}
class DaskCols(BaseCols):
def __init__(self):
super().__init__(functions)
c = DaskCols()
print(c.min("cols"))
{'hola1'}