In [51]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import pandas 
import numpy as np

import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in('erikrood','MvFBSoiNNUMNbCgJPNAp')
In [52]:
NIPE = pandas.read_csv('/Users/erikrood/desktop/NIPE_main_cleaned.csv') 
In [53]:
NIPE.head(5)
Out[53]:
Company_name NIPE_MM Sector Industry Exchange Market_Cap Employees_Full_Time Net_income_MM Clean_market_cap
0 Host Hotels & Resorts Inc 2675 Financials Real Estate Investment Trusts (REITs) NYSE $11.8B 240 $642,000 12
1 Gilead Sciences Inc 2168 Health Care Biotechnology NASDAQ $118.9B 8000 $17,341,000 119
2 Realty Income Corp. 1967 Financials Real Estate Investment Trusts (REITs) NYSE $14.9B 132 $259,665 15
3 Welltower Inc 1719 Financials Real Estate Investment Trusts (REITs) NYSE $24.6B 476 $818,344 25
4 Equity Residential 1221 Financials Real Estate Investment Trusts (REITs) NYSE $24.9B 3500 $4,272,041 25
In [54]:
df = NIPE [[3,1]]
df[['NIPE_MM']] = df[['NIPE_MM']].astype(float)
df.head()
/Users/erikrood/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py:2378: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[54]:
Industry NIPE_MM
0 Real Estate Investment Trusts (REITs) 2675.0
1 Biotechnology 2168.0
2 Real Estate Investment Trusts (REITs) 1967.0
3 Real Estate Investment Trusts (REITs) 1719.0
4 Real Estate Investment Trusts (REITs) 1221.0
In [55]:
average_NIPE_industry = df.groupby('Industry').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
average_NIPE_industry_top7 = average_NIPE_industry.head(7)
average_NIPE_industry_top7.head(7)
Out[55]:
Industry NIPE_MM
0 Tobacco 609.0
1 Real Estate Investment Trusts (REITs) 487.0
2 Biotechnology 293.0
3 Diversified Financial Services 184.0
4 Road & Rail 130.0
5 Consumer Finance 128.0
6 Semiconductors & Semiconductor Equipment 106.0
In [56]:
average_NIPE_industry_bottom7 = average_NIPE_industry.tail(7)
average_NIPE_industry_bottom7.head(7)
Out[56]:
Industry NIPE_MM
53 Food & Staples Retailing 10.0
54 Auto Components 8.0
55 Containers & Packaging 8.0
56 Multiline Retail 7.0
57 Metals & Mining -20.0
58 Energy Equipment & Services -21.0
59 Oil, Gas & Consumable Fuels -421.0
In [57]:
#too lazy to create function, but dropping everything less the top 7 and bottom 7 companies by NIPE
average_NIPE_industry.drop([7,8,9,10,11,12,13,14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52], inplace = True)
In [58]:
average_NIPE_industry.head(14)
Out[58]:
Industry NIPE_MM
0 Tobacco 609.0
1 Real Estate Investment Trusts (REITs) 487.0
2 Biotechnology 293.0
3 Diversified Financial Services 184.0
4 Road & Rail 130.0
5 Consumer Finance 128.0
6 Semiconductors & Semiconductor Equipment 106.0
53 Food & Staples Retailing 10.0
54 Auto Components 8.0
55 Containers & Packaging 8.0
56 Multiline Retail 7.0
57 Metals & Mining -20.0
58 Energy Equipment & Services -21.0
59 Oil, Gas & Consumable Fuels -421.0
In [59]:
#bar chart


x = average_NIPE_industry['Industry']
y = average_NIPE_industry['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(255, 217, 102)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by industry [top 7/bottom 7 industries]',
    xaxis=dict(
        title='industry',
        tickangle = 47,
    ),
    yaxis=dict(
        title='avg. NIPE (k)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry')
Out[59]:
In [60]:
#scatter of employees vs. NI,(could drill into sectors/industries)
df4 = NIPE [[6,7]]
df4[['Employees_Full_Time']] = df4[['Employees_Full_Time']].astype(int)
df4.head()
df5 = df4.groupby('Net_income_MM').sum().round(decimals=0).reset_index()
df5.head()
/Users/erikrood/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py:2378: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[60]:
Net_income_MM Employees_Full_Time
0 $1,000,000 6700
1 $1,002,100 47145
2 $1,010,551 39556
3 $1,020,661 77800
4 $1,028,380 180835
In [61]:
#Creating a better looking scatterplot of Cereal vs. Year

_x = df5['Employees_Full_Time']
_y = df5['Net_income_MM']

# Create a trace
trace = go.Scatter(
    x = _x,
    y = _y,

    mode = 'markers',
        marker = dict(
        size = 6,
        color = 'rgb(156, 231, 193)',

    )
       
)

data = [trace]

layout =  dict(title='net income vs. number of employees',
    hovermode='closest',
    xaxis=dict(
        title='Net Income (MM)',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
               yaxis=dict(
        title='# of employees',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
)


# Plot and embed in ipython notebook
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='NIPE_vs_Employees')
Out[61]:
In [62]:
#rank top/bottom companies
df3 = NIPE [[0,1]]
#top 10
df6 = df3.head(10)
#bottom 10
df7 = df3.tail(10)
In [63]:
#bar chart

x = df6['Company_name']
y = df6['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(247, 135, 22)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by company [top 10]',
    xaxis=dict(
        title='company',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (k)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_top10')
Out[63]:
In [65]:
#bar chart, bottom 10

x = df7['Company_name']
y = df7['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(84, 226, 129)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by company [bottom 10]',
    xaxis=dict(
        title='company',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (k)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_bottom10')
Out[65]:
In [ ]:
#splitting out IT companies
IT_companies = ['Information Technology']
NIPE_IT = NIPE[NIPE['Sector'].isin(IT_companies)]
In [ ]:
NIPE_IT.head()
In [ ]:
df8 = NIPE_IT[[0,1]]
df8.head()
In [ ]:
IT_NIPE_cleaned = df8.groupby('Company_name').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
IT_NIPE_cleaned.head(5)
In [ ]:
#standard deviation in NIPE by industry
df9 = NIPE [[3,1]]
#converting NIPE to float/#
df9[['NIPE_MM']] = df9[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df10 = df9.groupby('Industry', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)
In [ ]:
#cleaning column names
df10.columns = ['Industry', 'NIPE_Std_dev'] #renaming columns
df10.head(5)
In [ ]:
#standard deviation in NIPE by sector
df11 = NIPE [[2,1]]
#converting NIPE to float/#
df11[['NIPE_MM']] = df11[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df12 = df11.groupby('Sector', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)
In [ ]:
df12.columns = ['Sector', 'NIPE_Std_dev'] #renaming columns
df12.head(10)
In [ ]:
#avg NIPE by sector
df13 = NIPE [[2,1]]
df14 = df13.groupby('Sector').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
df14.columns = ['Sector', 'Avg_NIPE'] #renaming columns
df14.head(5)
In [ ]:
#joining avg_NIPE w/ standard deviation NIPE table
j1 = df14.merge(df12, how='left', on="Sector")
j1.head(10)
In [ ]:
#bar chart

x = j1['Sector']
y1 = j1['Avg_NIPE']
y2 = j1['NIPE_Std_dev']


trace0 = go.Bar(
        x=x,
        y=y1,
        name='Average NIPE',
        marker=dict(
            color='rgb(255, 217, 102)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )

trace1 = go.Bar(
        x=x,
        y=y2,
        name='Standard deviation',
        marker=dict(
            color='rgb(84, 226, 129)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
        
data = [trace0, trace1]
    
layout = go.Layout(
      title='mean NIPE by sector, with standard deviation',
    xaxis=dict(
        title='industry',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (k)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_sector_stddev')
In [ ]:
#setting up y values for box plot (step 0), loading in the data I want
sector_nipe = NIPE[['Sector', 'NIPE_MM']]
sector_nipe[['NIPE_MM']] = sector_nipe[['NIPE_MM']].astype(float)
sector_nipe.head()
In [ ]:
#setting up x variables for box plot
x_data1 = {'Consumer Discretionary','Consumer Staples','Energy',\
         'Financials', 'Health Care','Industrials',\
          'Information Technology','Materials','Telecommunication Services','Utilities'}
x_data1
In [ ]:
#setting up y values for box plot (step 1)
sector_CD = sector_nipe[sector_nipe['Sector'] == 'Consumer Discretionary'].copy()
ConsumerDiscretionary = sector_CD[['NIPE_MM']].copy()

sector_CS = sector_nipe[sector_nipe['Sector']== 'Consumer Staples'].copy()
ConsumerStaples = sector_CS[['NIPE_MM']].copy()

sector_energy = sector_nipe[sector_nipe['Sector'] == 'Energy'].copy()
Energy = sector_energy[['NIPE_MM']].copy()

sector_financials = sector_nipe[sector_nipe['Sector'] == 'Financials'].copy()
Financials = sector_financials[['NIPE_MM']].copy()

sector_healthcare = sector_nipe[sector_nipe['Sector'] == 'Health Care'].copy()
Healthcare = sector_healthcare[['NIPE_MM']].copy()

sector_industrials = sector_nipe[sector_nipe['Sector'] == 'Industrials'].copy()
Industrials = sector_industrials[['NIPE_MM']].copy()

sector_IT = sector_nipe[sector_nipe['Sector'] == 'Information Technology'].copy()
InformationTechnology = sector_IT[['NIPE_MM']].copy()

sector_materials = sector_nipe[sector_nipe['Sector'] == 'Materials'].copy()
Materials = sector_materials[['NIPE_MM']].copy()

sector_TS = sector_nipe[sector_nipe['Sector'] == 'Telecommunication Services'].copy()
TelecommunicationServices = sector_TS[['NIPE_MM']].copy()

sector_utilities = sector_nipe[sector_nipe['Sector'] == 'Utilities'].copy()
Utilities = sector_utilities[['NIPE_MM']].copy()

#setting up y values for box plot (step 2)
y_data = [
    Industrials.values,
    Energy.values,
    Utilities.values,
    ConsumerStaples.values,
    Healthcare.values,
    Materials.values,
    TelecommunicationServices.values,
    Financials.values,
    ConsumerDiscretionary.values,
    InformationTechnology.values
]
In [ ]:
#putting together the box plot
traces = []
for xd, yd in zip(x_data1, y_data):
        traces.append(go.Box(
            y=yd,
            name=xd,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            marker=dict(
                size=2,
            ),
            line=dict(width=1),
        ))

layout = go.Layout(
    title='NIPE distribution by sector',
    yaxis=dict(
        range=[-300, 350],
        showgrid=True,
        zeroline=True,
        #dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=go.Margin(
        l=50,
        r=90,
        b=280,
        t=60,
        pad=4
    ),
    width=800,
    height=650,
    showlegend=False
)

    
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename = 'NIPE_distribution_sector')