# Enable IPython's autoreload extension. Mode 2 reloads all modules before each
# cell executes, so edits to imported source are picked up without a restart.
%load_ext autoreload
%autoreload 2
import sys
# Put the parent directory first on the import search path so the `optimus`
# import below resolves from there — presumably a local checkout; TODO confirm.
sys.path.insert(0,"..")
# print(sys.path)
from optimus import Optimus
# op = Optimus(engine="dask",verbose=True)
# Create the Optimus entry point with "dask" as the engine argument. The keyword
# arguments look like Dask cluster settings (worker/thread counts, memory cap)
# — TODO confirm how Optimus forwards them.
op = Optimus("dask", n_workers=1, threads_per_worker=8, processes=False, memory_limit="3G")
C:\Users\argenisleon\Anaconda3\lib\site-packages\numpy\_distributor_init.py:32: UserWarning: loaded more than 1 DLL from .libs: C:\Users\argenisleon\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll C:\Users\argenisleon\Anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll stacklevel=1)
# df = op.load.file("data/Meteorite_Landings.csv")
# df = op.load.file("https://bumblebee.nyc3.digitaloceanspaces.com/luisaguirre/edad-32291509-7a3f-4d48-a3a5-a224e088dde7.csv")
# Load the crime CSV through the Optimus loader and call .cache() on the result.
df = op.load.file("data/crime.csv").cache()
# df = op.load.file("data/crime.csv")
# df= df.to_optimus_pandas()
# op.load.csv("data/airline-safety_csv.csv", sep=",", error_bad_lines=False, header=True, null_value="null",infer_schema="true").ext.cache()
# Materialise the frame with .compute(). Later cells index `pdf` by column name
# and compare with ==, so it is presumably a pandas DataFrame — TODO confirm.
pdf = df.compute()
import numpy as np
def case1(self, df, col_name, conditions, default):
    """Pick, element-wise, the value of the first condition that matches.

    Thin wrapper around ``numpy.select``.

    Args:
        self: Unused; kept so the signature stays unchanged.
        df: Unused.
        col_name: Unused.
        conditions: Either a mapping ``{condition: value}`` or an iterable of
            ``(condition, value)`` pairs. Each condition is a boolean mask.
            The pair form exists because NumPy arrays and pandas Series are
            unhashable and therefore cannot be used as mapping keys.
        default: Value used where no condition matches.

    Returns:
        numpy.ndarray: The selected values; where several conditions match,
        the earliest one wins.
    """
    if hasattr(conditions, "keys"):
        # Mapping form: keys are the masks, values are the choices.
        masks = list(conditions.keys())
        choices = list(conditions.values())
    else:
        # Pair form: split [(mask, value), ...] into two parallel lists.
        pairs = list(conditions)
        masks = [mask for mask, _ in pairs]
        choices = [choice for _, choice in pairs]
    return np.select(masks, choices, default=default)
# NOTE(review): no function named `case` is defined in the visible cells (the
# definition above is `case1`, which takes five arguments) — confirm the
# intended call.
case(df)
a = [("cond1", "value1"), ("cond2", "value2")]
# NOTE(review): the next line is not valid Python (see the SyntaxError recorded
# below). It looks like an attempt to walk the (condition, value) pairs in `a`
# — TODO rewrite as a proper `for` loop or comprehension.
x,y for i,j in a:
File "<ipython-input-4-c9364212201c>", line 3 x,y for i,j in a: ^ SyntaxError: invalid syntax
import numpy as np
class Case:
    """Fluent builder for a CASE/WHEN-style selection backed by ``numpy.select``.

    Usage::

        Case(df).when(mask_a, "A").when(mask_b, "B").else_("other").end()
    """

    def __init__(self, df):
        """Start an empty builder.

        Args:
            df: Stored on the instance as ``self.df``; no method of this class
                reads it.
        """
        self.df = df
        self._when = []    # boolean masks, in the order they were added
        self._values = []  # value paired with each mask
        # Fallback values in call order. Kept as a list so existing readers of
        # the attribute keep working; only the most recent entry is used.
        self._else_ = []

    @staticmethod
    def case(value):
        """Return ``value`` unchanged.

        Declared static so it works both as ``Case.case(v)`` and on an
        instance; without it an instance call would fail on the implicit
        ``self`` argument.
        """
        return value

    def when(self, expr, value):
        """Register a condition and the value to use where it holds.

        Args:
            expr: Boolean mask.
            value: Value selected where ``expr`` is true.

        Returns:
            Case: ``self``, to allow chaining.
        """
        self._when.append(expr)
        self._values.append(value)
        return self

    def else_(self, value):
        """Set the fallback used where no condition matches.

        Calling this more than once keeps every value in ``_else_`` but only
        the last one takes effect.

        Returns:
            Case: ``self``, to allow chaining.
        """
        self._else_.append(value)
        return self

    def end(self):
        """Evaluate the registered conditions.

        Returns:
            numpy.ndarray: Result of ``numpy.select`` over the registered
            masks and values. When ``else_`` was never called the fallback
            is ``0``, the default of ``numpy.select``.
        """
        # Pass a single fallback value rather than the whole list: an empty or
        # multi-element list does not broadcast against the masks.
        default = self._else_[-1] if self._else_ else 0
        return np.select(self._when, self._values, default=default)
# Sketch of a column-level case() call: a list of (condition, value) pairs plus
# a default. Alternative argument shape that was considered:
# {"when":[cond1, cond2], "values":["Easy","Medium"], "default":"Unknown"}
# NOTE(review): `cond1` and `value1` must exist before this runs; `value1` is
# not defined in the visible cells — TODO confirm.
condition = [
    (cond1, value1),
]
# Pass the list defined just above; the previous argument name did not match it.
df.cols.case("name", condition, default="Unknown")
#case
# Boolean masks for the two incident numbers of interest.
cond1 = pdf['INCIDENT_NUMBER']=="I182070945"
cond2 = pdf['INCIDENT_NUMBER']=="I182070943"
# The chain is wrapped in parentheses so it can span several lines; a trailing
# "." followed by lines that start with "." is not valid Python.
# NOTE(review): this relies on `pdf["name"]` exposing a `.case(...)` builder,
# which is not defined in the visible cells — TODO confirm.
pdf["name"] = (
    pdf["name"].case(pdf['INCIDENT_NUMBER'])
    .when(cond1, "Easy")
    .when(cond2, "Medium")
    .else_("Unknown")
    .end()
)
#if
# Bare expression so the notebook echoes the current value of `a`.
a
array(['Easy', 'Medium', 'Unknown', ..., 'Unknown', 'Unknown', 'Unknown'], dtype='<U7')
# Dump the builder's internal state: choice values, fallback, condition masks.
# Assumes `a` is a `Case` instance at this point — TODO confirm. The attribute
# set in `Case.__init__` is `_values`, not `_value`.
print(a._values)
print(a._else_)
print(a._when)
['Easy', 'Medium'] ['nknown'] [0 True 1 False 2 False 3 False 4 False ... 319068 False 319069 False 319070 False 319071 False 319072 False Name: INCIDENT_NUMBER, Length: 319073, dtype: bool, 0 False 1 True 2 False 3 False 4 False ... 319068 False 319069 False 319070 False 319071 False 319072 False Name: INCIDENT_NUMBER, Length: 319073, dtype: bool]
%%time
# Baseline timing: a direct numpy.select call with the same two incident-number
# conditions and labels used in the case/when chain earlier in the notebook.
# (The %%time cell magic must stay on the first line of the cell.)
import numpy as np
np.select(
[
# condition masks, checked in order
pdf['INCIDENT_NUMBER']=="I182070945",
pdf['INCIDENT_NUMBER']=="I182070943"
],
[
# label chosen for the corresponding mask
'Easy',
'Medium'
],
# label used where neither mask matches
default='Unknown'
)
Wall time: 83 ms