# Read the tips dataset into memory.  The first line supplies the column
# labels; each remaining line becomes one record, kept as a list of raw
# strings (individual fields are not stripped, so values may carry padding).
# The `with` block guarantees the file handle is closed even if parsing fails.
data = []
with open("data/tips.csv") as fin:
    labels = fin.readline().strip().split(",")
    for line in fin:
        stripped = line.strip()
        if not stripped:
            # Skip blank lines (e.g. a trailing newline at end of file) so
            # later column lookups never hit an empty record.
            continue
        data.append(stripped.split(","))
labels
['obs', 'totbill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
# Show the third parsed record (index 2) to inspect the raw string fields.
data[2]
['3', '21.01', ' 3.50', 'M', 'No', 'Sun', 'Night', '3']
# Column 2 of every record is the tip; float() accepts the space-padded
# strings as they come out of the file.
tips = list(map(float, (record[2] for record in data)))
# Preview the first ten converted values.
tips[:10]
[1.01, 1.66, 3.5, 3.31, 3.61, 4.71, 2.0, 3.12, 1.96, 3.23]
import matplotlib.pyplot as plt
%matplotlib inline
# Distribution of all tip amounts, binned into 30 intervals.
plt.hist(tips, bins=30)
plt.xlabel("Tip Amount")
plt.ylabel("Count")
<matplotlib.text.Text at 0x10f0404a8>
# One histogram of tip amounts per distinct day value, laid out on a grid that
# is two columns wide.
# Sorting the distinct day names gives the panels a reproducible order; a bare
# set iterates in an order that can change from run to run.
days = sorted({row[5] for row in data})
# Keep the original 2x2 layout for up to four days, but add rows when there
# are more so plt.subplot() is never asked for a position outside the grid.
n_cols = 2
n_rows = max(2, (len(days) + n_cols - 1) // n_cols)
for i, day in enumerate(days):
    # Subplot positions are 1-based.
    plt.subplot(n_rows, n_cols, i + 1)
    tipsd = [float(row[2]) for row in data if row[5] == day]
    plt.hist(tipsd)
    plt.xlabel(day)
    # Same x-range on every panel so the distributions are comparable.
    plt.xlim([0, 12])
# Adjust spacing once, after all panels exist, so labels do not overlap.
plt.tight_layout()
# Draw each day's tip histogram on its own figure; figures are numbered
# from 1 in the order the days appear in `days`.
for i, day in enumerate(days):
    plt.figure(i + 1)
    tipsd = [float(record[2]) for record in data if record[5] == day]
    plt.hist(tipsd)
    plt.xlabel(day)
    # Shared x-range keeps the separate figures comparable.
    plt.xlim([0, 12])
# Column 1 of every record is the total bill, converted to float.
amounts = list(map(float, (record[1] for record in data)))
# Preview the first ten tips again for side-by-side comparison with the bills.
tips[:10]
[1.01, 1.66, 3.5, 3.31, 3.61, 4.71, 2.0, 3.12, 1.96, 3.23]
# Preview the first ten bill amounts.
amounts[:10]
[16.99, 10.34, 21.01, 23.68, 24.59, 25.29, 8.77, 26.88, 15.04, 14.78]
# Scatter of tip against bill: bill on the x-axis, tip on the y-axis.
plt.scatter(x=amounts, y=tips)
plt.ylabel("Tip")
plt.xlabel("Bill")
<matplotlib.text.Text at 0x114fe0a90>
# Tip expressed as a fraction of the bill, paired record by record.
tiprate = [tip / bill for tip, bill in zip(tips, amounts)]
# Plot that fraction against the bill amount.
plt.scatter(x=amounts, y=tiprate)
<matplotlib.collections.PathCollection at 0x11521ca58>
import scipy.stats as st
# Least-squares line for tip rate as a function of bill amount.  The result
# unpacks into five values, bound here to the names used further down.
regression = st.linregress(amounts, tiprate)
slope, intercept, r_value, p_value, std_error = regression
slope
-0.0023230242956649386
# Intercept of the fitted line, i.e. the fitted tip rate at a bill of zero.
intercept
0.20676580714825193
# Third value returned by linregress: the correlation coefficient of the fit.
r_value
-0.33862408496473467
# Fourth value returned by linregress: the p-value of the fit
# (per the SciPy docs, for the null hypothesis that the slope is zero).
p_value
5.8480589871585955e-08
# Fifth value returned by linregress: the standard error of the estimated slope.
std_error
0.00041493658617062503
import numpy as np
# 200 evenly spaced x-values from 0 to 60 (both ends included) at which the
# fitted line will be evaluated.
interps = np.linspace(start=0, stop=60, num=200)
len(interps)
200
# Preview the first ten evaluation points.
interps[:10]
array([ 0. , 0.30150754, 0.60301508, 0.90452261, 1.20603015, 1.50753769, 1.80904523, 2.11055276, 2.4120603 , 2.71356784])
# Observed tip rates in red, with the fitted regression line drawn on top.
plt.scatter(amounts, tiprate, color="r")
# Evaluate the line at every point of `interps` in one vectorized expression.
trend = intercept + slope * interps
plt.plot(interps, trend)
[<matplotlib.lines.Line2D at 0x117de0dd8>]