In [3]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import pandas 
import numpy as np

import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in('erikrood','3eqsrype8v')
In [4]:
cereal = pandas.read_csv('/Users/erikrood/desktop/ipython_datasets/cereal_timeseries.csv') 
In [5]:
cereal.head()
Out[5]:
Name Brand Clean_launch_year Clean_end_year Launch_decade End_decade Launch_Year End_year
0 Ceccettio's General Mills None None None None None None
1 Clackers General Mills None None None None None None
2 Clusters General Mills None None None None None None
3 Diamond Shreddies General Mills None None None None None None
4 Dudley Do-Right General Mills None None None None None None
In [6]:
df = cereal [[0,2]]
In [7]:
#removing rows with no launch year value
df = df[df.Clean_launch_year != 'None']
In [8]:
df.head(1000)
Out[8]:
Name Clean_launch_year
162 Honey Smacks 1953
163 MultiGrain Dark Chocolate Crunch Cheerios (201 2014
164 MultiGrain Dark Chocolate Crunch Cheerios (201 2014
165 S'mores Krave 2014
166 S'mores Krave 2014
167 Poppin' Pebbles 2014
168 Poppin' Pebbles 2014
169 Monsters University Cereal 2013
170 Rocky Mountain Chocolate Cereal 2013
171 Scoob 2013
172 Vanilla Chex (201 2013
173 Vanilla Chex (201 2013
174 Chocolate Toast Crunch 2013
175 Hershey's Cookies 'n' Creme Cereal 2013
176 Cinnamon Jacks 2013
177 Froots Loops Treasures Cereal 2013
178 Sesame Street Cereal 2013
179 Cookie Crisp Brownie (201 2013
180 Cookie Crisp Brownie (201 2013
181 Apple Cinnamon Chex (201 2012
182 Apple Cinnamon Chex (201 2012
183 Krave U.S Version - Kellogg's - (201 2012
184 Chocolate Krave - (201 2012
185 Double Chocolate Krave - (201 2012
186 Chocolate Krave - (201 2012
187 Double Chocolate Krave - (201 2012
188 MultiGrain Peanut Butter Crunch Cheerios (201 2012
189 MultiGrain Peanut Butter Crunch Cheerios (201 2012
190 Frosted Toast Crunch 2012
191 Honey Bunches of Oats Fruit Blends (201 2012
... ... ...
521 Kix 1937
522 CoCo Wheats 1930
523 Rice Krispies 1929
524 All-Bran 1916
525 Strawberry Chex (200 2008
526 Corn Flakes 1907
527 Grape-Nuts 1897
528 Choco Crunch (Re-introduced Version) 2007
529 Eggo (breakfast cereal) 2006
530 Star Wars cereal 2005
531 Homer's Cinnamon Donut Cereal 2001
532 Strawberry Chex (200 2008
533 Keebler Cookie Crunch Cereal 2008
534 Oat Crisp 1998
535 Chex 1997
536 Cupcake Pebbles 2009
537 Cupcake Pebbles 2009
538 Jumbo Krispies 2009
539 Jumbo Krispies 2009
540 Cookie Crisp Sprinkles (200 2009
541 Cookie Crisp Sprinkles (200 2009
542 Honey Bunches of Oats with Cinnamon Clusters (... 2006
543 Honey Bunches of Oats with Cinnamon Clusters (... 2006
544 Frosted Flakes Chocolate (201 2010
545 Yummy Mummy 1988
546 Yummy Mummy 1988
547 Sugar Smacks 1953
548 Frosted Flakes 1952
549 Golden Crisp 1949
550 Cheerios 1941

389 rows × 2 columns

In [9]:
df.to_csv('cereal_feeder_data.csv')
In [11]:
total_c_by_year = df.groupby('Clean_launch_year').count().reset_index()
total_c_by_year.head(100)
Out[11]:
Clean_launch_year Name
0 1897 1
1 1907 1
2 1913 1
3 1916 1
4 1929 1
5 1930 1
6 1937 3
7 1941 1
8 1944 1
9 1949 2
10 1950 4
11 1951 2
12 1952 1
13 1953 2
14 1955 1
15 1958 6
16 1959 1
17 1960 5
18 1961 1
19 1963 2
20 1964 2
21 1965 3
22 1967 3
23 1969 5
24 1970 12
25 1971 8
26 1972 3
27 1973 3
28 1974 2
29 1977 1
... ... ...
37 1985 6
38 1986 4
39 1987 5
40 1988 12
41 1989 4
42 1990 16
43 1991 3
44 1992 2
45 1993 3
46 1994 3
47 1995 7
48 1996 2
49 1997 8
50 1998 5
51 1999 7
52 2000 1
53 2001 3
54 2002 10
55 2003 19
56 2004 16
57 2005 15
58 2006 23
59 2007 18
60 2008 18
61 2009 12
62 2010 7
63 2011 7
64 2012 12
65 2013 12
66 2014 6

67 rows × 2 columns

In [13]:
#Creating a better looking scatterplot of Cereal vs. Year

_x = total_c_by_year['Clean_launch_year']
_y = total_c_by_year['Name']

# Create a trace
trace = go.Scatter(
    x = _x,
    y = _y,
    mode = 'markers',
        marker = dict(
        size = 6,
        color = 'rgb(156, 231, 193)',

    )
       
)

data = [trace]

layout =  dict(title='Number of cereals launched by year',
    hovermode='closest',
    xaxis=dict(
        title='Cereal launch year',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
)


# Plot and embed in ipython notebook
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='basic-scatter2')
Out[13]:
In [14]:
df1 = cereal [[0,4]]
df1 = df1[df1.Launch_decade != 'None']
df1.head()
Out[14]:
Name Launch_decade
162 Honey Smacks 1950
163 MultiGrain Dark Chocolate Crunch Cheerios (201 1910
164 MultiGrain Dark Chocolate Crunch Cheerios (201 1910
165 S'mores Krave 1910
166 S'mores Krave 1910
In [15]:
#Creating a histogram of cereal by launch decade
x = df1['Launch_decade']
data = [
    go.Histogram(
        x=x,
        marker=dict(
        color='rgb(255, 217, 102)'
    ),
    opacity=.45
)
    ]

layout = go.Layout(
    title='Histogram of cereal launches',
    xaxis=dict(
        title='Year bin'
    ),
    yaxis=dict(
        title='n-count'
    ))

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='basic-histogram')
Out[15]:
In [97]:
#cereal launches by brand
df2 = cereal [[1,2]]
#df2 = df2[df2.Clean_launch_year != 'None']
df2.head()
Out[97]:
Brand Clean_launch_year
0 General Mills None
1 General Mills None
2 General Mills None
3 General Mills None
4 General Mills None
In [98]:
ranked_brands = df2.groupby('Brand').count().sort_values(by='Clean_launch_year',ascending=False).reset_index()
ranked_brands = ranked_brands.head(5)
ranked_brands.head()
Out[98]:
Brand Clean_launch_year
0 General Mills 153
1 Kellogg's 147
2 Post 76
3 Quaker 46
4 Ralston 24
In [99]:
#bar chart


x = ranked_brands['Brand']
y = ranked_brands['Clean_launch_year']

data = [
    go.Bar(
        x=x,
        y=y,
        marker=dict(
            color='rgb(243, 118, 121)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='Number of cereal launches by brand',
    xaxis=dict(
        title='Brand'
    ),
    yaxis=dict(
        title='total # cereal launches'
    ),
    
    annotations=[
        dict(
            x=xi,
            y=yi,
            text=str(yi),
            xanchor='center',
            yanchor='bottom',
            showarrow=False,
        ) for xi, yi in zip(x, y)]
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cereal_launches')
Out[99]:

revenue info

general mills - 17.9 B, ~1.2B NI

kellogs 14.8 B, 5B

post ~5B revenue, 1.2 B profit,

quaker owned by pepsi

ralston 15B? (privately held)

Retail sales of ready-to-eat cold cereal have been declining at a compound annual growth rate (CAGR) of 1.5% for the past five years.

To combat the decline, brands are introducing new products and trying to get the message out that cereal is healthy.

Since the late 1990s, its popularity has been slowly fading. Sales, which totaled $13.9 billion in 2000, dipped last year to about $10 billion. http://www.seattletimes.com/life/food-drink/are-breakfast-cereals-of-baby-boomers-youth-now-pass/

In [131]:
df3 = cereal [[0,1,2,3]]
df3.head(5)
Out[131]:
Name Brand Clean_launch_year Clean_end_year
0 Ceccettio's General Mills None None
1 Clackers General Mills None None
2 Clusters General Mills None None
3 Diamond Shreddies General Mills None None
4 Dudley Do-Right General Mills None None
In [132]:
#longest running cereal
#shortest running cereal

#removing entries w/ empty launch year
df3 = df3[df3.Clean_launch_year != 'None']
df3.head()
Out[132]:
Name Brand Clean_launch_year Clean_end_year
162 Honey Smacks Kellogg's 1953 Currently available
163 MultiGrain Dark Chocolate Crunch Cheerios (201 General Mills 2014 None
164 MultiGrain Dark Chocolate Crunch Cheerios (201 General Mills 2014 None
165 S'mores Krave Kellogg's 2014 None
166 S'mores Krave Kellogg's 2014 None
In [133]:
#removing entries w/ empty/missing end year
df3 = df3[df3.Clean_end_year != 'None']
df3.head(4)
Out[133]:
Name Brand Clean_launch_year Clean_end_year
162 Honey Smacks Kellogg's 1953 Currently available
179 Cookie Crisp Brownie (201 General Mills 2013 Currently available
180 Cookie Crisp Brownie (201 General Mills 2013 Currently available
181 Apple Cinnamon Chex (201 General Mills 2012 Currently available
In [205]:
#defining function to switch 'Currently available' for cereal end date to 2016 to get present tenure
def func(row):
    if row['Clean_end_year'] == 'Currently available':
        return 2016
    else:
        return row['Clean_end_year']
In [213]:
#converting launch year from string to float
df3[['Clean_launch_year']] = df3[['Clean_launch_year']].astype(float)
df3[['Clean_end_year_2']] = df3[['Clean_end_year_2']].astype(float)
In [214]:
#adding new column, cereal tenure to show the cereals that have been around longest
df3['Cereal_tenure'] = (df3.Clean_end_year_2 - df3.Clean_launch_year)
In [234]:
df3.head(5)
Out[234]:
Name Brand Clean_launch_year Clean_end_year Clean_end_year_2 Cereal_tenure
162 Honey Smacks Kellogg's 1953.0 Currently available 2016.0 63.0
179 Cookie Crisp Brownie (201 General Mills 2013.0 Currently available 2016.0 3.0
180 Cookie Crisp Brownie (201 General Mills 2013.0 Currently available 2016.0 3.0
181 Apple Cinnamon Chex (201 General Mills 2012.0 Currently available 2016.0 4.0
182 Apple Cinnamon Chex (201 General Mills 2012.0 Currently available 2016.0 4.0
In [264]:
#putting the tenured cereals together to plot
#proper table made by exporting this to csv and using TablePress plug-in
df4 = df3 [[0,5]]
tenured_cereals = df4.groupby('Name').mean().sort_values(by='Cereal_tenure',ascending=False).round(decimals=0).reset_index()
tenured_cereals.head(5)
Out[264]:
Name Cereal_tenure
0 Grape-Nuts 119.0
1 Corn Flakes 109.0
2 All-Bran 100.0
3 Rice Krispies 87.0
4 CoCo Wheats 86.0
In [219]:
#plot.ly table of cereal tenure
colorscale = [[0, '#4a9eec'],[.5, 'd2d2d2'],[1, '#ffffff']]

table = FF.create_table(tenured_cereals, colorscale=colorscale)
plot_url = py.plot(table, filename='tenured_cereals')
In [224]:
#histogram of tenure for cereals
tenured_cereals_clean = tenured_cereals.head(200)
In [225]:
tenured_cereals_clean.head()
Out[225]:
Name Cereal_tenure
0 Grape-Nuts 119.0
1 Corn Flakes 109.0
2 All-Bran 100.0
3 Rice Krispies 87.0
4 CoCo Wheats 86.0
In [230]:
#Creating a histogram of cereal tenure
x = tenured_cereals_clean['Cereal_tenure']
data = [
    go.Histogram(
        x=x,
        marker=dict(
        color='rgb(31, 223, 105)'
    ),
    opacity=.45
)
    ]

layout = go.Layout(
    title='The majority of cereals last <5 years on the market',
    xaxis=dict(
        title='Year bin'
    ),
    yaxis=dict(
        title='n-count'
    ))

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='cereal_tenure_hist')
Out[230]:
In [231]:
#avg cereal tenure by company
df5 = df3 [[1,5]]
df5.head(5)
Out[231]:
Brand Cereal_tenure
162 Kellogg's 63.0
179 General Mills 3.0
180 General Mills 3.0
181 General Mills 4.0
182 General Mills 4.0
In [232]:
#major brands have similar cereal tenure trends
brand_tenure = df5.groupby('Brand').mean().sort_values(by='Cereal_tenure',ascending=False).round(decimals=0).reset_index()
brand_tenure.head(5)
Out[232]:
Brand Cereal_tenure
0 Little Crow Foods 86.0
1 General Mills 20.0
2 Kellogg's 19.0
3 Post 18.0
4 General Mills/Nestlé 17.0
In [233]:
df6 = cereal [[1,3]]
df6.head()
Out[233]:
Brand Clean_end_year
0 General Mills None
1 General Mills None
2 General Mills None
3 General Mills None
4 General Mills None
In [236]:
#defining function to switch 'Currently available' for cereal end date to 2016 to get present tenure
def func1(row):
    if row['Clean_end_year'] == 'Currently available':
        pass
    elif row['Clean_end_year'] == 'None':
        pass
    else:
        return row['Clean_end_year']
In [241]:
#switch currently available and no end year rows to null values
df6['Clean_end_year'] = df3.apply(func1, axis=1)
df6.head(5)
/Users/erikrood/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[241]:
Brand Clean_end_year
0 General Mills NaN
1 General Mills NaN
2 General Mills NaN
3 General Mills NaN
4 General Mills NaN
In [243]:
#remove the new null value rows from list

df6[['Clean_end_year']] = df6[['Clean_end_year']].astype(float)
df6 = df6[np.isfinite(df6['Clean_end_year'])]
/Users/erikrood/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:2369: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [246]:
df6.head(5)
Out[246]:
Brand Clean_end_year
206 Post 2011.0
211 Post 2011.0
212 NaN 2007.0
213 Kellogg's 2000.0
214 Kellogg's 2000.0
In [248]:
brand_cereal_sunsets = df6.groupby('Brand').count().sort_values(by='Clean_end_year',ascending=False).round(decimals=0).reset_index()
brand_cereal_sunsets.head()
Out[248]:
Brand Clean_end_year
0 Kellogg's 45
1 General Mills 37
2 Post 25
3 Quaker 14
4 Ralston 10
In [253]:
#performing left join to bring in n-count of total cereals by brand
s1 = pandas.merge(brand_cereal_sunsets, ranked_brands, how='left', on=['Brand'])
In [254]:
s1.head(10)
Out[254]:
Brand Clean_end_year Clean_launch_year
0 Kellogg's 45 147.0
1 General Mills 37 153.0
2 Post 25 76.0
3 Quaker 14 46.0
4 Ralston 10 24.0
5 Kellogg's/Disney 3 NaN
6 Ralston/Nintendo – 1 NaN
In [283]:
#adding a new field for the percent of total cereals sunsetted by each brand
s1['perc_failed_cereals'] = ((s1.Clean_end_year / s1.Clean_launch_year)*100)
s1.head(10)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-283-2c4b43fe0129> in <module>()
      1 #adding a new field for the percent of total cereals sunsetted by each brand
----> 2 s1['perc_failed_cereals'] = (round((s1.Clean_end_year / s1.Clean_launch_year)*100,1))
      3 s1.head(10)

/Users/erikrood/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in wrapper(self)
     78             return converter(self.iloc[0])
     79         raise TypeError("cannot convert the series to "
---> 80                         "{0}".format(str(converter)))
     81 
     82     return wrapper

TypeError: cannot convert the series to <type 'float'>
In [279]:
df7 = s1.head(4)
df7[['perc_failed_cereals']] = df7[['perc_failed_cereals']].astype(float)
df7.head()
Out[279]:
Brand Clean_end_year Clean_launch_year perc_failed_cereals
0 Kellogg's 45 147.00% 30.61%
1 General Mills 37 153.00% 24.18%
2 Post 25 76.00% 32.89%
3 Quaker 14 46.00% 30.43%
In [280]:
#plot what's above, number of cereals launched, # of cereals ended, % failed cereals (combo)
#bar chart
df8 = df7 [[0,3]]
#changing float format to % for this chart
pandas.options.display.float_format = '{:.2f}%'.format
df8.head()
Out[280]:
Brand perc_failed_cereals
0 Kellogg's 30.61%
1 General Mills 24.18%
2 Post 32.89%
3 Quaker 30.43%
In [285]:
brand_sunsets = df8.groupby('Brand').sum().sort_values(by='perc_failed_cereals',ascending=False).round(decimals=2).reset_index()
brand_sunsets.head()
Out[285]:
Brand perc_failed_cereals
0 Post 32.89%
1 Kellogg's 30.61%
2 Quaker 30.43%
3 General Mills 24.18%
In [287]:
#bar chart

x = brand_sunsets['Brand']
y = brand_sunsets['perc_failed_cereals']

data = [
    go.Bar(
        x=x,
        y=y,
        marker=dict(
            color='rgba(0, 0, 0, 0.52)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='Percent of total cereals that have been discontinued by brand',
    xaxis=dict(
        title='Brand'
    ),
    yaxis=dict(
        title='% of total cereals discontinued',
    ),
    
    annotations=[
        dict(
            x=xi,
            y=yi,
            text=str(yi),
            xanchor='center',
            yanchor='bottom',
            showarrow=False,
        ) for xi, yi in zip(x, y)]
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='cereal_sunsets')
Out[287]: