In [1]:
%matplotlib inline 
import matplotlib.pyplot as plt 
import pandas 
import numpy as np

import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF
py.sign_in('erikrood','3eqsrype8v')
In [3]:
NIPE = pandas.read_csv('/Users/erikrood/desktop/NIPE_main_cleaned.csv') 
In [4]:
NIPE.head(5)
Out[4]:
Company_name NIPE_MM Sector Industry Exchange Market_Cap Employees_Full_Time Net_income_MM Clean_market_cap
0 Host Hotels & Resorts Inc 2675 Financials Real Estate Investment Trusts (REITs) NYSE $11.8B 240 $642,000 12
1 Gilead Sciences Inc 2168 Health Care Biotechnology NASDAQ $118.9B 8000 $17,341,000 119
2 Realty Income Corp. 1967 Financials Real Estate Investment Trusts (REITs) NYSE $14.9B 132 $259,665 15
3 Welltower Inc 1719 Financials Real Estate Investment Trusts (REITs) NYSE $24.6B 476 $818,344 25
4 Equity Residential 1221 Financials Real Estate Investment Trusts (REITs) NYSE $24.9B 3500 $4,272,041 25
In [5]:
df = NIPE [[3,1]]
df[['NIPE_MM']] = df[['NIPE_MM']].astype(float)
df.head()
/Users/erikrood/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py:2378: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[5]:
Industry NIPE_MM
0 Real Estate Investment Trusts (REITs) 2675.0
1 Biotechnology 2168.0
2 Real Estate Investment Trusts (REITs) 1967.0
3 Real Estate Investment Trusts (REITs) 1719.0
4 Real Estate Investment Trusts (REITs) 1221.0
In [6]:
average_NIPE_industry = df.groupby('Industry').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
average_NIPE_industry_top7 = average_NIPE_industry.head(7)
average_NIPE_industry_top7.head(7)
Out[6]:
Industry NIPE_MM
0 Tobacco 609.0
1 Real Estate Investment Trusts (REITs) 487.0
2 Biotechnology 293.0
3 Diversified Financial Services 184.0
4 Road & Rail 130.0
5 Consumer Finance 128.0
6 Semiconductors & Semiconductor Equipment 106.0
In [7]:
average_NIPE_industry_bottom7 = average_NIPE_industry.tail(7)
average_NIPE_industry_bottom7.head(7)
Out[7]:
Industry NIPE_MM
53 Food & Staples Retailing 10.0
54 Auto Components 8.0
55 Containers & Packaging 8.0
56 Multiline Retail 7.0
57 Metals & Mining -20.0
58 Energy Equipment & Services -21.0
59 Oil, Gas & Consumable Fuels -421.0
In [8]:
#too lazy to create function, but dropping everything less the top 7 and bottom 7 companies by NIPE
average_NIPE_industry.drop([7,8,9,10,11,12,13,14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52], inplace = True)
In [9]:
average_NIPE_industry.head(14)
Out[9]:
Industry NIPE_MM
0 Tobacco 609.0
1 Real Estate Investment Trusts (REITs) 487.0
2 Biotechnology 293.0
3 Diversified Financial Services 184.0
4 Road & Rail 130.0
5 Consumer Finance 128.0
6 Semiconductors & Semiconductor Equipment 106.0
53 Food & Staples Retailing 10.0
54 Auto Components 8.0
55 Containers & Packaging 8.0
56 Multiline Retail 7.0
57 Metals & Mining -20.0
58 Energy Equipment & Services -21.0
59 Oil, Gas & Consumable Fuels -421.0
In [10]:
#bar chart


x = average_NIPE_industry['Industry']
y = average_NIPE_industry['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(255, 217, 102)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by industry [top 7/bottom 7 industries]',
    xaxis=dict(
        title='industry',
        tickangle = 47,
    ),
    yaxis=dict(
        title='avg. NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry')
Out[10]:
In [11]:
#scatter of employees vs. NI,(could drill into sectors/industries)
df4 = NIPE [[6,7]]
df4[['Employees_Full_Time']] = df4[['Employees_Full_Time']].astype(int)
df4.head()
df5 = df4.groupby('Net_income_MM').sum().round(decimals=0).reset_index()
df5.head()
/Users/erikrood/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py:2378: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[11]:
Net_income_MM Employees_Full_Time
0 $1,000,000 6700
1 $1,002,100 47145
2 $1,010,551 39556
3 $1,020,661 77800
4 $1,028,380 180835
In [21]:
#Creating a better looking scatterplot of Cereal vs. Year

_x = df5['Employees_Full_Time']
_y = df5['Net_income_MM']

# Create a trace
trace = go.Scatter(
    x = _x,
    y = _y,

    mode = 'markers',
        marker = dict(
        size = 6,
        color = 'rgb(156, 231, 193)',

    )
       
)

data = [trace]

layout =  dict(title='net income vs. number of employees',
    hovermode='closest',
    xaxis=dict(
        title='Net Income (MM)',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
               yaxis=dict(
        title='# of employees',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
)


# Plot and embed in ipython notebook
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='NIPE_vs_Employees')
Out[21]:
In [22]:
#rank top/bottom companies
df3 = NIPE [[0,1]]
#top 10
df6 = df3.head(10)
#bottom 10
df7 = df3.tail(10)
In [23]:
#bar chart

x = df6['Company_name']
y = df6['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(247, 135, 22)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by company [top 10]',
    xaxis=dict(
        title='company',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_top10')
Out[23]:
In [24]:
#bar chart, bottom 10

x = df7['Company_name']
y = df7['NIPE_MM']

data = [
    go.Bar(
        x=x,
        y=y,
        
        marker=dict(
            color='rgb(84, 226, 129)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
]
layout = go.Layout(
      title='net income per employee by company [bottom 10]',
    xaxis=dict(
        title='company',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_industry_bottom10')
Out[24]:
In [25]:
#splitting out IT companies
IT_companies = ['Information Technology']
NIPE_IT = NIPE[NIPE['Sector'].isin(IT_companies)]
In [26]:
NIPE_IT.head()
Out[26]:
Company_name NIPE_MM Sector Industry Exchange Market_Cap Employees_Full_Time Net_income_MM Clean_market_cap
12 Visa Inc 607 Information Technology IT Services NYSE $184.2B 11300 $6,857,000 184
20 Apple Inc 461 Information Technology Technology Hardware, Storage & Peripherals NASDAQ $513.5B 110000 $50,678,000 514
23 Facebook Inc 369 Information Technology Internet Software & Services NASDAQ $336.3B 12691 $4,686,000 336
27 MasterCard Inc 332 Information Technology IT Services NYSE $106.9B 11300 $3,747,000 107
29 Alibaba Group Holding Ltd 305 Information Technology Internet Software & Services NYSE $192B 34985 $10,673,110 192
In [27]:
df8 = NIPE_IT[[0,1]]
df8.head()
Out[27]:
Company_name NIPE_MM
12 Visa Inc 607
20 Apple Inc 461
23 Facebook Inc 369
27 MasterCard Inc 332
29 Alibaba Group Holding Ltd 305
In [28]:
IT_NIPE_cleaned = df8.groupby('Company_name').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
IT_NIPE_cleaned.head(5)
Out[28]:
Company_name NIPE_MM
0 Visa Inc 607
1 Apple Inc 461
2 Facebook Inc 369
3 MasterCard Inc 332
4 Alibaba Group Holding Ltd 305
In [29]:
#standard deviation in NIPE by industry
df9 = NIPE [[3,1]]
#converting NIPE to float/#
df9[['NIPE_MM']] = df9[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df10 = df9.groupby('Industry', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)
/Users/erikrood/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py:2378: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [30]:
#cleaning column names
df10.columns = ['Industry', 'NIPE_Std_dev'] #renaming columns
df10.head(5)
Out[30]:
Industry NIPE_Std_dev
44 Oil, Gas & Consumable Fuels 1092.0
48 Real Estate Investment Trusts (REITs) 989.0
7 Biotechnology 596.0
56 Tobacco 427.0
41 Metals & Mining 331.0
In [31]:
#standard deviation in NIPE by sector
df11 = NIPE [[2,1]]
#converting NIPE to float/#
df11[['NIPE_MM']] = df11[['NIPE_MM']].astype(float)
#pivoting and calculating std deviation
df12 = df11.groupby('Sector', as_index=False).agg(np.std, ddof=0).sort_values(by='NIPE_MM',ascending=False).round(decimals=0)
/Users/erikrood/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py:2378: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [32]:
df12.columns = ['Sector', 'NIPE_Std_dev'] #renaming columns
df12.head(10)
Out[32]:
Sector NIPE_Std_dev
2 Energy 1058.0
3 Financials 510.0
4 Health Care 281.0
7 Materials 204.0
1 Consumer Staples 182.0
6 Information Technology 135.0
8 Telecommunication Services 73.0
0 Consumer Discretionary 56.0
5 Industrials 48.0
9 Utilities 40.0
In [33]:
#avg NIPE by sector
df13 = NIPE [[2,1]]
df14 = df13.groupby('Sector').mean().sort_values(by='NIPE_MM',ascending=False).round(decimals=0).reset_index()
df14.columns = ['Sector', 'Avg_NIPE'] #renaming columns
df14.head(5)
Out[33]:
Sector Avg_NIPE
0 Financials 185.0
1 Health Care 92.0
2 Consumer Staples 86.0
3 Utilities 83.0
4 Information Technology 74.0
In [34]:
#joining avg_NIPE w/ standard deviation NIPE table
j1 = df14.merge(df12, how='left', on="Sector")
j1.head(10)
Out[34]:
Sector Avg_NIPE NIPE_Std_dev
0 Financials 185.0 510.0
1 Health Care 92.0 281.0
2 Consumer Staples 86.0 182.0
3 Utilities 83.0 40.0
4 Information Technology 74.0 135.0
5 Industrials 43.0 48.0
6 Telecommunication Services 40.0 73.0
7 Consumer Discretionary 38.0 56.0
8 Materials 35.0 204.0
9 Energy -393.0 1058.0
In [35]:
#bar chart

x = j1['Sector']
y1 = j1['Avg_NIPE']
y2 = j1['NIPE_Std_dev']


trace0 = go.Bar(
        x=x,
        y=y1,
        name='Average NIPE',
        marker=dict(
            color='rgb(255, 217, 102)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )

trace1 = go.Bar(
        x=x,
        y=y2,
        name='Standard deviation',
        marker=dict(
            color='rgb(84, 226, 129)',
            line=dict(
                color='rgb(8,48,107)',
                width=1.5
            ),
        ),
        opacity=0.6
    )
        
data = [trace0, trace1]
    
layout = go.Layout(
      title='mean NIPE by sector, with standard deviation',
    xaxis=dict(
        title='industry',
        tickangle = 47,
    ),
    yaxis=dict(
        title='NIPE (MM)'
    ),
     margin=go.Margin(
        b = 220
        
    )

    
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='NIPE_sector_stddev')
Out[35]:
In [41]:
#setting up y values for box plot (step 0), loading in the data I want
sector_nipe = NIPE[['Sector', 'NIPE_MM']]
sector_nipe[['NIPE_MM']] = sector_nipe[['NIPE_MM']].astype(float)
sector_nipe.head()
/Users/erikrood/anaconda3/lib/python3.5/site-packages/pandas/core/frame.py:2378: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[41]:
Sector NIPE_MM
0 Financials 2675.0
1 Health Care 2168.0
2 Financials 1967.0
3 Financials 1719.0
4 Financials 1221.0
In [38]:
#setting up x variables for box plot
x_data1 = {'Consumer Discretionary','Consumer Staples','Energy',\
         'Financials', 'Health Care','Industrials',\
          'Information Technology','Materials','Telecommunication Services','Utilities'}
x_data1
Out[38]:
{'Consumer Discretionary',
 'Consumer Staples',
 'Energy',
 'Financials',
 'Health Care',
 'Industrials',
 'Information Technology',
 'Materials',
 'Telecommunication Services',
 'Utilities'}
In [39]:
#setting up y values for box plot (step 1)
sector_CD = sector_nipe[sector_nipe['Sector'] == 'Consumer Discretionary'].copy()
ConsumerDiscretionary = sector_CD[['NIPE_MM']].copy()

sector_CS = sector_nipe[sector_nipe['Sector']== 'Consumer Staples'].copy()
ConsumerStaples = sector_CS[['NIPE_MM']].copy()

sector_energy = sector_nipe[sector_nipe['Sector'] == 'Energy'].copy()
Energy = sector_energy[['NIPE_MM']].copy()

sector_financials = sector_nipe[sector_nipe['Sector'] == 'Financials'].copy()
Financials = sector_financials[['NIPE_MM']].copy()

sector_healthcare = sector_nipe[sector_nipe['Sector'] == 'Health Care'].copy()
Healthcare = sector_healthcare[['NIPE_MM']].copy()

sector_industrials = sector_nipe[sector_nipe['Sector'] == 'Industrials'].copy()
Industrials = sector_industrials[['NIPE_MM']].copy()

sector_IT = sector_nipe[sector_nipe['Sector'] == 'Information Technology'].copy()
InformationTechnology = sector_IT[['NIPE_MM']].copy()

sector_materials = sector_nipe[sector_nipe['Sector'] == 'Materials'].copy()
Materials = sector_materials[['NIPE_MM']].copy()

sector_TS = sector_nipe[sector_nipe['Sector'] == 'Telecommunication Services'].copy()
TelecommunicationServices = sector_TS[['NIPE_MM']].copy()

sector_utilities = sector_nipe[sector_nipe['Sector'] == 'Utilities'].copy()
Utilities = sector_utilities[['NIPE_MM']].copy()

#setting up y values for box plot (step 2)
y_data = [
    Industrials.values,
    Energy.values,
    Utilities.values,
    ConsumerStaples.values,
    Healthcare.values,
    Materials.values,
    TelecommunicationServices.values,
    Financials.values,
    ConsumerDiscretionary.values,
    InformationTechnology.values
]
In [40]:
#putting together the box plot
traces = []
for xd, yd in zip(x_data1, y_data):
        traces.append(go.Box(
            y=yd,
            name=xd,
            boxpoints='all',
            jitter=0.5,
            whiskerwidth=0.2,
            marker=dict(
                size=2,
            ),
            line=dict(width=1),
        ))

layout = go.Layout(
    title='NIPE distribution by sector',
    yaxis=dict(
        range=[-300, 350],
        showgrid=True,
        zeroline=True,
        #dtick=5,
        gridcolor='rgb(255, 255, 255)',
        gridwidth=1,
        zerolinecolor='rgb(255, 255, 255)',
        zerolinewidth=2,
    ),
    margin=go.Margin(
        l=50,
        r=90,
        b=280,
        t=60,
        pad=4
    ),
    width=800,
    height=650,
    showlegend=False
)

    
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename = 'NIPE_distribution_sector')
Out[40]:
In [ ]: