# For data
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
from __future__ import division
from datetime import datetime
# used to download the poll data from the web
import requests
# StringIO wraps the downloaded CSV text in a file-like object for pandas
from StringIO import StringIO
# Download the HuffPost Pollster CSV of Trump-vs-Clinton general-election polls.
url = 'http://elections.huffingtonpost.com/pollster/2016-general-election-trump-vs-clinton.csv'
# A timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being parsed as if it were CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
# Wrap the text in a file-like object so pandas can read it like a file.
poll_data = StringIO(source)
# Build the dataframe and print its column/dtype summary.
poll_df = pd.read_csv(poll_data)
poll_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1522 entries, 0 to 1521 Data columns (total 17 columns): Pollster 1522 non-null object Start Date 1522 non-null object End Date 1522 non-null object Entry Date/Time (ET) 1522 non-null object Number of Observations 1013 non-null float64 Population 1522 non-null object Mode 1522 non-null object Trump 1522 non-null float64 Clinton 1522 non-null float64 Other 1098 non-null float64 Undecided 1460 non-null float64 Pollster URL 1522 non-null object Source URL 1522 non-null object Partisan 1522 non-null object Affiliation 1522 non-null object Question Text 661 non-null object Question Iteration 1522 non-null int64 dtypes: float64(5), int64(1), object(11) memory usage: 202.2+ KB
# Show the first five rows of the raw poll data.
poll_df.head(n=5)
Pollster | Start Date | End Date | Entry Date/Time (ET) | Number of Observations | Population | Mode | Trump | Clinton | Other | Undecided | Pollster URL | Source URL | Partisan | Affiliation | Question Text | Question Iteration | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Insights West | 2016-11-04 | 2016-11-07 | 2016-11-08T12:16:30Z | 940.0 | Likely Voters | Internet | 41.0 | 45.0 | 2.0 | 8.0 | http://elections.huffingtonpost.com/pollster/p... | http://www.insightswest.com/news/clinton-is-ah... | Nonpartisan | None | As you may know, there will be a presidential ... | 1 |
1 | Insights West | 2016-11-04 | 2016-11-07 | 2016-11-08T12:16:30Z | NaN | Likely Voters - Democrat | Internet | 6.0 | 89.0 | 0.0 | 4.0 | http://elections.huffingtonpost.com/pollster/p... | http://www.insightswest.com/news/clinton-is-ah... | Nonpartisan | None | As you may know, there will be a presidential ... | 1 |
2 | Insights West | 2016-11-04 | 2016-11-07 | 2016-11-08T12:16:30Z | NaN | Likely Voters - Republican | Internet | 82.0 | 7.0 | 2.0 | 6.0 | http://elections.huffingtonpost.com/pollster/p... | http://www.insightswest.com/news/clinton-is-ah... | Nonpartisan | None | As you may know, there will be a presidential ... | 1 |
3 | Insights West | 2016-11-04 | 2016-11-07 | 2016-11-08T12:16:30Z | NaN | Likely Voters - independent | Internet | 38.0 | 43.0 | 4.0 | 7.0 | http://elections.huffingtonpost.com/pollster/p... | http://www.insightswest.com/news/clinton-is-ah... | Nonpartisan | None | As you may know, there will be a presidential ... | 1 |
4 | IBD/TIPP | 2016-11-04 | 2016-11-07 | 2016-11-08T12:10:06Z | 1107.0 | Likely Voters | Live Phone | 43.0 | 41.0 | 4.0 | 5.0 | http://elections.huffingtonpost.com/pollster/p... | http://www.investors.com/politics/ibd-tipp-pre... | Nonpartisan | None | NaN | 1 |
# Mean and standard deviation of every numeric poll column.
# Numeric columns are selected explicitly instead of relying on mean()/std()
# silently skipping text columns, which newer pandas versions no longer do.
numeric_polls = poll_df.select_dtypes(include=['number'])
# Drop the columns we don't need in the summary (sample size, question iteration).
numeric_polls = numeric_polls.drop(
    ['Number of Observations', 'Question Iteration'], axis=1)
# NOTE(review): the preview rows include partisan sub-samples (e.g. Population
# "Likely Voters - Democrat"); they are averaged in with the full samples here,
# which presumably widens the STD -- confirm whether that is intended.
avg = pd.DataFrame(numeric_polls.mean())
std = pd.DataFrame(numeric_polls.std())
# Combine both statistics side by side and label the columns.
poll_avg = pd.concat([avg, std], axis=1)
poll_avg.columns = ['Average', 'STD']
# Display the average and standard deviation table.
poll_avg
Average | STD | |
---|---|---|
Trump | 40.643890 | 23.566390 |
Clinton | 42.733903 | 25.298731 |
Other | 5.806011 | 5.009533 |
Undecided | 9.315068 | 6.253118 |
# Add a per-poll Clinton-minus-Trump margin, expressed as a fraction (points / 100).
# A POSITIVE value = leaning Clinton, a NEGATIVE value = leaning Trump.
poll_df['Difference'] = (poll_df['Clinton'] - poll_df['Trump']) / 100
# Average all polls that share a start date. The numeric columns are selected
# explicitly so the mean never touches text columns (pollster names, URLs, ...),
# rather than depending on pandas silently dropping them.
numeric_cols = poll_df.select_dtypes(include=['number']).columns.tolist()
# NOTE: this overwrites poll_df; from here on each row is one start date,
# not one poll.
poll_df = poll_df.groupby('Start Date', as_index=False)[numeric_cols].mean()
poll_df.head()
Start Date | Number of Observations | Trump | Clinton | Other | Undecided | Question Iteration | Difference | |
---|---|---|---|---|---|---|---|---|
0 | 2015-05-19 | 1046.00 | 34.25 | 48.75 | 2.5 | 14.00 | 1.0 | 0.1450 |
1 | 2015-06-20 | 420.75 | 35.00 | 47.25 | NaN | 17.75 | 1.0 | 0.1225 |
2 | 2015-06-21 | 1005.00 | 34.00 | 51.00 | 3.0 | 12.00 | 1.0 | 0.1700 |
3 | 2015-06-26 | 890.00 | 36.75 | 57.00 | 6.0 | 0.00 | 1.0 | 0.2025 |
4 | 2015-07-09 | 499.25 | 35.25 | 49.50 | NaN | 16.00 | 1.0 | 0.1425 |
# Overview: the averaged Difference column plotted against Start Date
# for the whole data set.
poll_df.plot(
    x='Start Date',
    y='Difference',
    figsize=(12, 4),
    marker='',
    linestyle='-',
    color='purple',
)
<matplotlib.axes._subplots.AxesSubplot at 0xde112b0>
# Zoom on x positions 226-262 and mark the three presidential debates in grey.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(226, 262))
# NOTE(review): marker positions are hard-coded x positions (presumably row
# positions of poll_df) -- re-check them against the dates if the data changes.
debate_positions = [
    225 + 2,   # Sept 26: 1st debate
    232 + 8,   # Oct 9: 2nd debate
    232 + 18,  # Oct 19: 3rd debate
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in debate_positions:
    ax.axvline(x=position, linewidth=4, color='grey', alpha=0.5)
<matplotlib.lines.Line2D at 0xe1fab38>
# Zoom on x positions 159-182 with blue and red vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(159, 182))
# (x position, colour) pairs.
# NOTE(review): this file does not say what the positions or colours denote.
markers = [
    (162, 'blue'),
    (167, 'blue'),
    (171, 'red'),
    (174, 'red'),
    (175, 'blue'),
    (180, 'blue'),
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position, colour in markers:
    ax.axvline(x=position, linewidth=4, color=colour, alpha=0.5)
<matplotlib.lines.Line2D at 0xe4e7400>
# Zoom on x positions 183-208 with one blue and one red vertical marker.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(183, 208))
# (x position, colour) pairs.
# NOTE(review): this file does not say what the positions or colours denote.
markers = [(189, 'blue'), (196, 'red')]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position, colour in markers:
    ax.axvline(x=position, linewidth=4, color=colour, alpha=0.5)
<matplotlib.lines.Line2D at 0xe336588>
# Zoom on x positions 209-231 with blue and grey vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(209, 231))
# (x position, colour) pairs.
# NOTE(review): this file does not say what the positions or colours denote.
markers = [
    (215, 'blue'),
    (216, 'blue'),
    (221, 'grey'),
    (227, 'grey'),
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position, colour in markers:
    ax.axvline(x=position, linewidth=4, color=colour, alpha=0.5)
<matplotlib.lines.Line2D at 0xe840d68>
# Zoom on x positions 232-262 with grey, red and blue vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(232, 262))
# (x position, colour) pairs.
# NOTE(review): this file does not say what the positions or colours denote.
markers = [
    (238, 'grey'),
    # NOTE(review): 238 is listed twice, as in the original cell; with alpha=0.5
    # the second line only darkens the first. Possibly a copy-paste slip for a
    # different position -- confirm and correct or remove.
    (238, 'grey'),
    (240, 'grey'),
    (243, 'red'),
    (250, 'grey'),
    (259, 'blue'),
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position, colour in markers:
    ax.axvline(x=position, linewidth=4, color=colour, alpha=0.5)
<matplotlib.lines.Line2D at 0xf0e83c8>
# Zoom on x positions 257-266 with a single blue vertical marker.
poll_df.plot(
    x='Start Date',
    y='Difference',
    figsize=(12, 6),
    marker='o',
    linestyle='-',
    color='purple',
    xlim=(257, 266),
)
# Marker at x = 232 + 27 (i.e. 259).
plt.axvline(x=232 + 27, linewidth=4, color='blue', alpha=0.5)
<matplotlib.lines.Line2D at 0xe59f908>
# Zoom on x positions 134-159 with orange and cyan vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(134, 159))
# (x position, colour) pairs.
# NOTE(review): this file does not say what the positions or colours denote.
markers = [
    (143, 'orange'),
    (152, 'cyan'),
    (157, 'orange'),
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position, colour in markers:
    ax.axvline(x=position, linewidth=4, color=colour, alpha=0.5)
<matplotlib.lines.Line2D at 0xf6936a0>
# Zoom on x positions 159-182 with one orange and one green vertical marker.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(159, 182))
# (x position, colour) pairs.
# NOTE(review): this file does not say what the positions or colours denote.
markers = [(169, 'orange'), (170, 'green')]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position, colour in markers:
    ax.axvline(x=position, linewidth=4, color=colour, alpha=0.5)
<matplotlib.lines.Line2D at 0xe08d160>
# Zoom on x positions 183-208 with green vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(183, 208))
# NOTE(review): this file does not say what the green positions denote.
green_positions = [185, 200, 203]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in green_positions:
    ax.axvline(x=position, linewidth=4, color='green', alpha=0.5)
<matplotlib.lines.Line2D at 0xf9f2518>
# Zoom on x positions 134-159 with red vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(134, 159))
# NOTE(review): this file does not say what the red positions denote.
red_positions = [
    134, 135, 136,
    141, 142,
    144, 145, 146, 147, 148,
    157, 158,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in red_positions:
    ax.axvline(x=position, linewidth=4, color='red', alpha=0.5)
<matplotlib.lines.Line2D at 0xfc41080>
# Zoom on x positions 159-182 with red vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(159, 182))
# NOTE(review): this file does not say what the red positions denote.
red_positions = [162, 163, 167, 177, 179, 180]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in red_positions:
    ax.axvline(x=position, linewidth=4, color='red', alpha=0.5)
<matplotlib.lines.Line2D at 0xfe03438>
# Zoom on x positions 183-209 with red vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(183, 209))
# NOTE(review): this file does not say what the red positions denote.
red_positions = [
    183, 184, 185, 186,
    188, 189, 190, 191, 192,
    195,
    197, 198, 199, 200, 201, 202, 203,
    208,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in red_positions:
    ax.axvline(x=position, linewidth=4, color='red', alpha=0.5)
<matplotlib.lines.Line2D at 0x10dc5898>
# Zoom on x positions 209-232 with red vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(209, 232))
# NOTE(review): this file does not say what the red positions denote.
red_positions = [
    209,
    212,
    214,
    217, 218, 219, 220,
    222, 223, 224, 225,
    228, 229, 230, 231,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in red_positions:
    ax.axvline(x=position, linewidth=4, color='red', alpha=0.5)
<matplotlib.lines.Line2D at 0x110d5710>
# Zoom on x positions 232-262 with red vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(232, 262))
# NOTE(review): this file does not say what the red positions denote.
red_positions = [
    232,
    234, 235, 236,
    241, 242, 243, 244, 245, 246,
    251, 252, 253,
    256, 257, 258, 259, 260, 261, 262,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in red_positions:
    ax.axvline(x=position, linewidth=4, color='red', alpha=0.5)
<matplotlib.lines.Line2D at 0x111926d8>
# Zoom on x positions 134-159 with blue vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(134, 159))
# NOTE(review): this file does not say what the blue positions denote.
blue_positions = [
    134, 135, 136, 137, 138,
    144,
    151,
    156,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in blue_positions:
    ax.axvline(x=position, linewidth=4, color='blue', alpha=0.5)
<matplotlib.lines.Line2D at 0x15644a58>
# Zoom on x positions 159-182 with blue vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(159, 182))
# NOTE(review): this file does not say what the blue positions denote.
blue_positions = [
    162,
    167, 168, 169,
    173,
    175, 176,
    180, 181, 182,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in blue_positions:
    ax.axvline(x=position, linewidth=4, color='blue', alpha=0.5)
<matplotlib.lines.Line2D at 0x1555e748>
# Zoom on x positions 183-208 with blue vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(183, 208))
# NOTE(review): this file does not say what the blue positions denote.
blue_positions = [
    183, 184,
    187,
    189,
    194, 195, 196,
    203,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in blue_positions:
    ax.axvline(x=position, linewidth=4, color='blue', alpha=0.5)
<matplotlib.lines.Line2D at 0x15ae2cf8>
# Zoom on x positions 209-231 with blue vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(209, 231))
# NOTE(review): this file does not say what the blue positions denote.
blue_positions = [
    211, 212, 213,
    218,
    222,
    224,
    228, 229,
    231,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in blue_positions:
    ax.axvline(x=position, linewidth=4, color='blue', alpha=0.5)
<matplotlib.lines.Line2D at 0x15fda160>
# Zoom on x positions 232-262 with blue vertical markers.
ax = poll_df.plot(x='Start Date', y='Difference', figsize=(12, 6),
                  marker='o', linestyle='-', color='purple', xlim=(232, 262))
# NOTE(review): this file does not say what the blue positions denote.
blue_positions = [
    234, 235,
    241, 242, 243,
    252, 253, 254, 255, 256, 257, 258, 259,
    261, 262,
]
# Draw on the axes returned by plot() rather than on pyplot's implicit current axes.
for position in blue_positions:
    ax.axvline(x=position, linewidth=4, color='blue', alpha=0.5)
<matplotlib.lines.Line2D at 0x161bd7b8>