%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
import pandas as pd
from optimus import optimus as Optimus
C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working from collections import Callable You are using PySparkling of version 2.4.10, but your PySpark is of version 2.3.1. Please make sure Spark and PySparkling versions are compatible. `formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly
# Sample rows, each a [name, age] pair.
data = [
    ["tom", 10],
    ["nick", 15],
    ["juli", 0],
    ["argenis", 10],
]
# Load the rows into a pandas DataFrame with explicit column labels.
df = pd.DataFrame(data=data, columns=["Name", "Age"])
df.cols.count_by_dtypes("*")
{'Name': {'int': 0, 'decimal': 0, 'string': 0, 'date': 0, 'boolean': 0, 'binary': 0, 'array': 0, 'object': 4, 'null': 0, 'missing': 0}, 'Age': {'int': 0, 'decimal': 0, 'string': 0, 'date': 0, 'boolean': 0, 'binary': 0, 'array': 0, 'object': 0, 'null': 0, 'missing': 0}}
df
| | Name | Age |
---|---|---|
0 | tom | 10 |
1 | nick | 15 |
2 | juli | 0 |
3 | argenis | 10 |
# df[df == 0].count(axis=0)
# Faster alternative to the line above, see https://stackoverflow.com/questions/35277075/python-pandas-counting-the-occurrences-of-a-specific-value
def zeros(series):
    """Count the entries of ``series`` that are equal to zero.

    The comparison runs on the underlying ``.values`` array, producing a
    boolean mask whose ``True`` entries are then summed.

    Args:
        series: Object exposing a ``.values`` array, e.g. ``df["Age"]``.

    Returns:
        The number of elements that compare equal to ``0``.
    """
    return (series.values == 0).sum()
zeros(df["Age"])
1
def percentile_agg(col_name, values):
    """Return the requested quantiles of a column as a plain dict.

    Args:
        col_name: Despite its name, this must be the column data itself
            (an object with a ``quantile`` method, such as ``df["Age"]``),
            not the column label. A plain string has no ``quantile``
            attribute and raises ``AttributeError``.
        values: List of quantile levels, e.g. ``[0.1, 0.5]``.

    Returns:
        A dict mapping each requested level to the computed quantile.
    """
    return col_name.quantile(values).to_dict()
percentile_agg(df["Age"], [0.1, 0.5])
{0.1: 3.0000000000000004, 0.5: 10.0}
def first_value(series):
    """Return the first element of ``series`` by position."""
    return series.iloc[0]


def age_percentiles(series):
    """Return the 0.1 and 0.5 quantiles of ``series`` as a dict."""
    return percentile_agg(series, [0.1, 0.5])


# Each entry in the list is either a reducer name or a callable that pandas
# invokes with the column data. percentile_agg needs the column itself, so it
# is wrapped in a named callable instead of being called here with the label
# "Age" (a str has no ``quantile`` method). The callables carry distinct names
# because pandas uses the function name as the label of each result.
# NOTE(review): age_percentiles returns a dict, so that entry of the result is
# expected to be a nested dict -- confirm against the installed pandas version.
df.agg(
    {
        "Age": ["min", first_value, "nunique", age_percentiles],
    }
).to_dict()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-136-2dcbdc92d217> in <module> 6 # 'Age': "count", 7 # min, first, and number of unique dates per group ----> 8 'Age': [min, 'first', 'nunique', percentile_agg("Age",[0.1, 0.5])] 9 } 10 ).to_dict() <ipython-input-135-baf44f890966> in percentile_agg(col_name, values) 1 def percentile_agg(col_name, values): ----> 2 return col_name.quantile(values).to_dict() 3 4 5 percentile_agg(df["Age"], [0.1, 0.5]) AttributeError: 'str' object has no attribute 'quantile'
print(df)
Name Age 0 tom 10 1 nick 15 2 juli 0
# Exercise the profiler's column aggregation directly on the pandas frame.
from optimus.profiler.profiler import Profiler
p = Profiler()
# Debug aid: show which functions object is attached to the frame.
print(df.functions)
# NOTE: in the recorded run this call raised AttributeError, because
# count_uniques_agg received the column name (a str) and called
# .value_counts() on it.
p.columns_agg(df)
<optimus.pandas.functions.functions.<locals>.Functions object at 0x000001F961E9E438> <optimus.pandas.functions.functions.<locals>.Functions object at 0x000001F961E7EF60> [(<function functions.<locals>.Functions.count_uniques_agg at 0x000001F961EED1E0>, ('Name', True)), (<function functions.<locals>.Functions.count_uniques_agg at 0x000001F961EED1E0>, ('Age', True))]
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-138-02af911c93ea> in <module> 2 p = Profiler() 3 print(df.functions) ----> 4 p.columns_agg(df) ~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(df, columns, buckets, relative_error, approx_count) 371 print(df.functions) 372 funcs = [df.functions.count_uniques_agg] --> 373 exprs = df.cols.create_exprs(cols, funcs, approx_count) 374 375 # TODO: in basic calculations only funcs = [F.min, F.max] ~\Documents\Optimus\optimus\pandas\columns.py in create_exprs(columns, funcs, *args) 252 return _exprs 253 print(exprs) --> 254 r = _agg_exprs(exprs) 255 256 return r ~\Documents\Optimus\optimus\pandas\columns.py in _agg_exprs(_funcs) 242 243 if not _filter(_col_name, _func): --> 244 agg = _func(*_args) 245 if agg is not None: 246 func_name = _beautify_col_names(_func) ~\Documents\Optimus\optimus\pandas\functions.py in count_uniques_agg(col_name, estimate) 68 @staticmethod 69 def count_uniques_agg(col_name: pd.DataFrame, estimate=True ): ---> 70 return col_name.value_counts().to_dict() 71 72 # @staticmethod AttributeError: 'str' object has no attribute 'value_counts'
# Run the full profiler on the pandas frame.
from optimus.profiler.profiler import Profiler
p = Profiler()
# NOTE: in the recorded run this call failed inside create_exprs with
# AttributeError: 'Functions' object has no attribute 'zeros_agg'.
p.run(df)
<optimus.pandas.functions.functions.<locals>.Functions object at 0x0000021950468C50>
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-26-e792527df240> in <module> 1 from optimus.profiler.profiler import Profiler 2 p = Profiler() ----> 3 p.run(df) ~\Documents\Optimus\optimus\helpers\decorators.py in timed(*args, **kw) 8 def timed(*args, **kw): 9 start_time = timeit.default_timer() ---> 10 f = method(*args, **kw) 11 _time = round(timeit.default_timer() - start_time, 2) 12 logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=_time)) ~\Documents\Optimus\optimus\profiler\profiler.py in run(self, df, columns, buckets, infer, relative_error, approx_count) 119 # df.ext.set_meta({"initialized": True}) 120 --> 121 output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict") 122 123 # Load jinja ~\Documents\Optimus\optimus\profiler\profiler.py in dataset(self, df, columns, buckets, infer, relative_error, approx_count, sample, stats, format) 246 # if ["drop", "rename"] not in trans and self.already_run is False: 247 if stats is True: --> 248 output_columns = self.columns_stats(df, columns, buckets, infer, relative_error, approx_count) 249 250 assign(output_columns, "name", df.get_name(), dict) ~\Documents\Optimus\optimus\profiler\profiler.py in columns_stats(self, df, columns, buckets, infer, relative_error, approx_count) 331 332 # Aggregation --> 333 stats = Profiler.columns_agg(df, columns, buckets, relative_error, approx_count) 334 335 # Calculate Frequency ~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(df, columns, buckets, relative_error, approx_count) 371 print(df.functions) 372 funcs = [df.functions.count_uniques_agg] --> 373 exprs = df.cols.create_exprs(cols, funcs, approx_count) 374 375 # TODO: in basic calculations only funcs = [F.min, F.max] ~\Documents\Optimus\optimus\pandas\columns.py in create_exprs(columns, funcs, *args) 204 # Std, kurtosis, mean, skewness and other agg 
functions can not process date columns. 205 filters = {"date": [F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct, --> 206 self.functions.zeros_agg], 207 "array": [F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct, 208 self.functions.zeros_agg], AttributeError: 'Functions' object has no attribute 'zeros_agg'
df.cols.dtypes()
{'Name': 'object', 'Age': 'int64'}
print({k:str(v) for k,v in dict(df.dtypes).items()})
{'Name': 'object', 'Age': 'int64'}