%matplotlib inline # standard path wrangling to make notebook reproducible in dev and prod import sys from os.path import dirname, abspath from os import getcwd try: root = dirname(dirname(abspath(__file__))) except NameError: root = dirname(getcwd()) sys.path.append(root) from lib import auto_impala # imports specific to this notebook from impala.util import as_pandas import brewer2mpl import math from scipy.stats import chisquare import pandas as pd import brewer2mpl import lib.display_utils query = lib.display_utils.sql_query_from_file('examples/first_digits.sql') query with auto_impala() as cursor: cursor.execute(query) df = as_pandas(cursor) # Clean up the dataframe a bit df = df.dropna().sort('first digit').set_index('first digit') def benfordp(digit): return math.log10(1. + (1./digit)) scaled_df = df / df.sum()[0] scaled_df.rename(columns={'count': 'True Rate'}, inplace=True) benford_series = pd.Series(map(benfordp, scaled_df.index), index=scaled_df.index) benford_df = pd.DataFrame(benford_series, columns=['Benford Rate']) joined_df = pd.concat([benford_df, scaled_df], axis=1) joined_df colors = brewer2mpl.wesanderson.Moonrise1.mpl_colors # oh yeah joined_df.plot(kind='bar', figsize=(14,10), title="Charge 1st Digits vs Benford's Law", color=colors) chi2_score, p_value = chisquare(df['count'], benford_series * df.sum()[0]) p_value