In [1]:

%load_ext autoreload
%autoreload 2

In [2]:

import sys
sys.path.append("..")

In [3]:

import pandas as pd

In [4]:

from optimus import optimus as Optimus

C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
  from collections import Callable

    You are using PySparkling of version 2.4.10, but your PySpark is of
    version 2.3.1. Please make sure Spark and PySparkling versions are compatible. 
`formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly

In [57]:

data = [['tom', 10], ['nick', 15], ['juli', 0],['argenis', 10]]

# Create the pandas DataFrame 
df = pd.DataFrame(data, columns = ['Name', 'Age']) 
df.cols.count_by_dtypes("*")

Out[57]:

{'Name': {'int': 0,
  'decimal': 0,
  'string': 0,
  'date': 0,
  'boolean': 0,
  'binary': 0,
  'array': 0,
  'object': 4,
  'null': 0,
  'missing': 0},
 'Age': {'int': 0,
  'decimal': 0,
  'string': 0,
  'date': 0,
  'boolean': 0,
  'binary': 0,
  'array': 0,
  'object': 0,
  'null': 0,
  'missing': 0}}

In [58]:

df

Out[58]:

	Name	Age
0	tom	10
1	nick	15
2	juli	0
3	argenis	10

In [65]:

# df[df == 0].count(axis=0)
# Speedy https://stackoverflow.com/questions/35277075/python-pandas-counting-the-occurrences-of-a-specific-value
def zeros(series):
    return (series.values==0).sum()

zeros(df["Age"])

Out[65]:

In [135]:

def percentile_agg(col_name, values):  
    return col_name.quantile(values).to_dict()
    

percentile_agg(df["Age"], [0.1, 0.5])

Out[135]:

{0.1: 3.0000000000000004, 0.5: 10.0}

In [136]:

df.agg(
    {
#         # find the min, max, and sum of the duration column
#         'Name': [min, max, sum],
#          # find the number of network type entries
#         'Age': "count",
        # min, first, and number of unique dates per group
        'Age': [min, 'first', 'nunique', percentile_agg("Age",[0.1, 0.5])]
    }
).to_dict()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-136-2dcbdc92d217> in <module>
      6 #         'Age': "count",
      7         # min, first, and number of unique dates per group
----> 8         'Age': [min, 'first', 'nunique', percentile_agg("Age",[0.1, 0.5])]
      9     }
     10 ).to_dict()

<ipython-input-135-baf44f890966> in percentile_agg(col_name, values)
      1 def percentile_agg(col_name, values):
----> 2     return col_name.quantile(values).to_dict()
      3 
      4 
      5 percentile_agg(df["Age"], [0.1, 0.5])

AttributeError: 'str' object has no attribute 'quantile'

In [37]:

print(df)

   Name Age
0   tom  10
1  nick  15
2  juli   0

In [138]:

from optimus.profiler.profiler import Profiler
p = Profiler()
print(df.functions)
p.columns_agg(df)

<optimus.pandas.functions.functions.<locals>.Functions object at 0x000001F961E9E438>
<optimus.pandas.functions.functions.<locals>.Functions object at 0x000001F961E7EF60>
[(<function functions.<locals>.Functions.count_uniques_agg at 0x000001F961EED1E0>, ('Name', True)), (<function functions.<locals>.Functions.count_uniques_agg at 0x000001F961EED1E0>, ('Age', True))]

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-138-02af911c93ea> in <module>
      2 p = Profiler()
      3 print(df.functions)
----> 4 p.columns_agg(df)

~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(df, columns, buckets, relative_error, approx_count)
    371             print(df.functions)
    372             funcs = [df.functions.count_uniques_agg]
--> 373             exprs = df.cols.create_exprs(cols, funcs, approx_count)
    374 
    375             # TODO: in basic calculations only funcs = [F.min, F.max]

~\Documents\Optimus\optimus\pandas\columns.py in create_exprs(columns, funcs, *args)
    252                 return _exprs
    253             print(exprs)
--> 254             r = _agg_exprs(exprs)
    255 
    256             return r

~\Documents\Optimus\optimus\pandas\columns.py in _agg_exprs(_funcs)
    242 
    243                     if not _filter(_col_name, _func):
--> 244                         agg = _func(*_args)
    245                         if agg is not None:
    246                             func_name = _beautify_col_names(_func)

~\Documents\Optimus\optimus\pandas\functions.py in count_uniques_agg(col_name, estimate)
     68         @staticmethod
     69         def count_uniques_agg(col_name: pd.DataFrame, estimate=True ):
---> 70             return col_name.value_counts().to_dict()
     71 
     72         # @staticmethod

AttributeError: 'str' object has no attribute 'value_counts'

In [26]:

from optimus.profiler.profiler import Profiler
p = Profiler()
p.run(df)

<optimus.pandas.functions.functions.<locals>.Functions object at 0x0000021950468C50>

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-26-e792527df240> in <module>
      1 from optimus.profiler.profiler import Profiler
      2 p = Profiler()
----> 3 p.run(df)

~\Documents\Optimus\optimus\helpers\decorators.py in timed(*args, **kw)
      8     def timed(*args, **kw):
      9         start_time = timeit.default_timer()
---> 10         f = method(*args, **kw)
     11         _time = round(timeit.default_timer() - start_time, 2)
     12         logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=_time))

~\Documents\Optimus\optimus\profiler\profiler.py in run(self, df, columns, buckets, infer, relative_error, approx_count)
    119         # df.ext.set_meta({"initialized": True})
    120 
--> 121         output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict")
    122 
    123         # Load jinja

~\Documents\Optimus\optimus\profiler\profiler.py in dataset(self, df, columns, buckets, infer, relative_error, approx_count, sample, stats, format)
    246         # if ["drop", "rename"] not in trans and self.already_run is False:
    247         if stats is True:
--> 248             output_columns = self.columns_stats(df, columns, buckets, infer, relative_error, approx_count)
    249 
    250         assign(output_columns, "name", df.get_name(), dict)

~\Documents\Optimus\optimus\profiler\profiler.py in columns_stats(self, df, columns, buckets, infer, relative_error, approx_count)
    331 
    332         # Aggregation
--> 333         stats = Profiler.columns_agg(df, columns, buckets, relative_error, approx_count)
    334 
    335         # Calculate Frequency

~\Documents\Optimus\optimus\profiler\profiler.py in columns_agg(df, columns, buckets, relative_error, approx_count)
    371             print(df.functions)
    372             funcs = [df.functions.count_uniques_agg]
--> 373             exprs = df.cols.create_exprs(cols, funcs, approx_count)
    374 
    375             # TODO: in basic calculations only funcs = [F.min, F.max]

~\Documents\Optimus\optimus\pandas\columns.py in create_exprs(columns, funcs, *args)
    204             # Std, kurtosis, mean, skewness and other agg functions can not process date columns.
    205             filters = {"date": [F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct,
--> 206                                 self.functions.zeros_agg],
    207                        "array": [F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, F.approx_count_distinct,
    208                                  self.functions.zeros_agg],

AttributeError: 'Functions' object has no attribute 'zeros_agg'

In [54]:

df.cols.dtypes()

Out[54]:

{'Name': 'object', 'Age': 'int64'}

In [53]:

print({k:str(v) for k,v in dict(df.dtypes).items()})
    

{'Name': 'object', 'Age': 'int64'}

In [ ]: