In this notebook, we will:

- Clean and tokenize the article titles in the dataset
- Embed the words with a pre-trained Word2Vec model and visualize the embeddings with t-SNE
- Build document vectors by averaging word vectors, and visualize those as well
- Explore how title sentiment, news source, and publish date relate to article popularity
- Engineer features and use model stacking to try to predict the number of shares an article receives
import pandas as pd
import gensim
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
# Set options for pandas
def start():
    options = {
        'display': {
            'max_columns': None,
            'max_colwidth': 25,
            'expand_frame_repr': False,  # Don't wrap to multiple pages
            'max_rows': 14,
            'max_seq_items': 50,         # Max length of printed sequence
            'precision': 4,
            'show_dimensions': False
        },
        'mode': {
            'chained_assignment': None   # Controls SettingWithCopyWarning
        }
    }

    for category, option in options.items():
        for op, value in option.items():
            pd.set_option(f'{category}.{op}', value)  # Python 3.6+
start()
main_data = pd.read_csv('News_Final.csv')
main_data.head()
 | IDLink | Title | Headline | Source | Topic | PublishDate | SentimentTitle | SentimentHeadline | Facebook | GooglePlus | LinkedIn
---|---|---|---|---|---|---|---|---|---|---|---
0 | 99248.0 | Obama Lays Wreath at ... | Obama Lays Wreath at ... | USA TODAY | obama | 2002-04-02 00:00:00 | 0.0000 | -0.0533 | -1 | -1 | -1 |
1 | 10423.0 | A Look at the Health ... | Tim Haywood, investme... | Bloomberg | economy | 2008-09-20 00:00:00 | 0.2083 | -0.1564 | -1 | -1 | -1 |
2 | 18828.0 | Nouriel Roubini: Glob... | Nouriel Roubini, NYU ... | Bloomberg | economy | 2012-01-28 00:00:00 | -0.4252 | 0.1398 | -1 | -1 | -1 |
3 | 27788.0 | Finland GDP Expands I... | Finland's economy exp... | RTT News | economy | 2015-03-01 00:06:00 | 0.0000 | 0.0261 | -1 | -1 | -1 |
4 | 27789.0 | Tourism, govt spendin... | Tourism and public sp... | The Nation - Thailand... | economy | 2015-03-01 00:11:00 | 0.0000 | 0.1411 | -1 | -1 | -1 |
We'll start by doing some cleaning and manipulation on the text data. In particular, we need to pull out the article titles, tokenize them into individual words, drop punctuation and other non-alphabetic tokens, lowercase everything, and remove stopwords.
# Grab all the article titles
article_titles = main_data['Title']
article_titles.shape
(93239,)
# Create a list of strings, where each string is an article title
titles_list = [title for title in article_titles]
# Collapse the list of strings into a single long string for processing
big_title_string = ' '.join(titles_list)
from nltk.tokenize import word_tokenize
# Tokenize the string into words
tokens = word_tokenize(big_title_string)
# Remove non-alphabetic tokens, such as punctuation
words = [word.lower() for word in tokens if word.isalpha()]
# Filter out stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = [word for word in words if not word in stop_words]
# Print first 10 words
words[:10]
['obama', 'lays', 'wreath', 'arlington', 'national', 'cemetery', 'look', 'health', 'chinese', 'economy']
# Load Word2Vec model (trained on an enormous Google corpus)
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary = True)
# Check dimension of word vectors
model.vector_size
300
What a word vector looks like:
economy_vec = model['economy']
economy_vec[:20] # First 20 components
array([ 0.18847656, -0.0703125 , -0.16113281, 0.03857422, -0.04956055, -0.23046875, -0.15722656, -0.16992188, 0.17089844, -0.01208496, -0.16015625, -0.02770996, 0.05932617, 0.00491333, -0.13769531, 0.39257812, 0.06201172, 0.30859375, 0.12890625, -0.07373047], dtype=float32)
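Since raw components like these are hard to interpret by eye, one quick sanity check is to compare a few words directly using gensim's similarity helpers. A minimal sketch (the word pairs are arbitrary examples):

```python
# Sanity check: related words should score a higher cosine similarity than unrelated ones
print(model.similarity('economy', 'finance'))   # expect a relatively high value
print(model.similarity('economy', 'wreath'))    # expect a much lower value

# KeyedVectors can also list a word's nearest neighbors in the embedding space
print(model.most_similar('economy', topn=5))
```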
# Filter the list of vectors to include only those that Word2Vec has a vector for
vector_list = [model[word] for word in words if word in model.vocab]
# Create a list of the words corresponding to these vectors
words_filtered = [word for word in words if word in model.vocab]
# Zip the words together with their vector representations
word_vec_zip = zip(words_filtered, vector_list)
# Cast to a dict so we can turn it into a dataframe
word_vec_dict = dict(word_vec_zip)
df = pd.DataFrame.from_dict(word_vec_dict, orient='index')
df.shape
(21805, 300)
Now we have all of the in-vocabulary words and their vectors in a tidy DataFrame: each row is a word, and each of the 300 columns holds one component of that word's vector.
df.head(3)
 | 0 | 1 | 2 | 3 | 4 | ... | 299
---|---|---|---|---|---|---|---
obama | -0.1235 | 0.0723 | 0.1719 | 0.4023 | -0.1260 | ... | 0.0269
lays | -0.0522 | -0.0574 | 0.1934 | -0.1069 | 0.2988 | ... | 0.3535
wreath | 0.1670 | 0.1992 | -0.0820 | 0.1426 | 0.0554 | ... | 0.1885

(3 rows × 300 columns; the middle columns are omitted here for readability.)
Next, we're going to squish (read: do dimensionality reduction on) these word vectors using t-SNE. If you're not familiar with t-SNE and its interpretations, check out this excellent, interactive distill.pub article on t-SNE.
It's important to play around with the parameters for t-SNE, as different values can produce very different results. I tested several values between 0 and 100 for perplexity, and found that it produced roughly the same shape each time. I tested several learning rates between 20 and 400 as well, and decided to leave the learning rate at its default (200).
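If you'd like to see that sensitivity for yourself, a minimal sketch along these lines fits t-SNE at a few perplexity settings side by side (the values here are just illustrative):

```python
from sklearn.manifold import TSNE

# Compare t-SNE layouts across a few perplexity values (illustrative values only)
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for perp, axis in zip([5, 30, 100], axes):
    embedded = TSNE(n_components=2, init='random', random_state=10, perplexity=perp).fit_transform(df[:400])
    axis.scatter(embedded[:, 0], embedded[:, 1], alpha=0.5)
    axis.set_title('perplexity = {}'.format(perp))

plt.show()
```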
from sklearn.manifold import TSNE
# Initialize t-SNE
tsne = TSNE(n_components = 2, init = 'random', random_state = 10, perplexity = 100)
# Use only 400 rows to shorten processing time
tsne_df = tsne.fit_transform(df[:400])
Now we're ready to plot our reduced array of word vectors and see what patterns we can find.
sns.set()
fig, ax = plt.subplots(figsize = (11.7, 8.27))
sns.scatterplot(x = tsne_df[:, 0], y = tsne_df[:, 1], alpha = 0.5)
# Second plot: Import adjustText, initialize list of texts
from adjustText import adjust_text
texts = []
words_to_plot = list(np.arange(0, 400, 10))
# Append words to list
for word in words_to_plot:
    texts.append(plt.text(tsne_df[word, 0], tsne_df[word, 1], df.index[word], fontsize = 14))
# Plot text using adjust_text (because overlapping text is hard to read)
adjust_text(texts, force_points = 0.4, force_text = 0.4,
expand_points = (2,1), expand_text = (1,2),
arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))
plt.show()
Note that even though the text still overlaps some points, we no longer have overlapping text. Nifty!
If you're interested in trying out `adjust_text` for your own plotting needs, you can find it here. Be sure to import it using the camelcase `adjustText`, and please note that `adjustText` is currently not compatible with matplotlib 3.0 or higher.
It's encouraging to see that even when the vector embeddings have been reduced to 2 dimensions, we see certain items clustering together. For example, we have months in the left/upper left, we have corporate finance terms near the bottom, and we have more generic, non-topical words (like 'full', 'really', 'slew') in the middle.
Note that if we were to run the t-SNE again with different parameters, we may observe some similarities to this result, but we're not guaranteed to see the exact same patterns. t-SNE, as compared to PCA, is not deterministic. Relatedly, tightness of clusters and distances between clusters are not always meaningful. It is meant primarily as an exploratory tool, rather than as a decisive indicator of similarity.
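As a quick illustration of that non-determinism, a sketch like the following re-runs the same embedding with two different seeds; the layouts will typically differ:

```python
from sklearn.manifold import TSNE

# Same data, same parameters, different random seeds -> typically different layouts
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for seed, axis in zip([1, 2], axes):
    embedded = TSNE(n_components=2, init='random', random_state=seed, perplexity=100).fit_transform(df[:400])
    axis.scatter(embedded[:, 0], embedded[:, 1], alpha=0.5)
    axis.set_title('random_state = {}'.format(seed))

plt.show()
```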
We've gotten a sense of how word embeddings work as applied to this dataset. Now we can move on to some more interesting ML applications: finding titles that cluster together, and seeing what patterns emerge.
Instead of using Doc2Vec, which would require a lengthy training process, we can use a simpler (and sometimes even more effective) trick: averaging the embeddings of the word vectors in each document. In our case, a document refers to a title or a headline.
def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words, then average the remaining word vectors
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)
# Our earlier preprocessing was done when we were dealing only with word vectors
# Here, we need each document to remain a document
def preprocess(text):
    text = text.lower()
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc
# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(word2vec_model, doc):
    """Check whether at least one word of the document is in the
    word2vec vocabulary."""
    return any(word in word2vec_model.vocab for word in doc)
# Filter out documents
def filter_docs(corpus, texts, condition_on_doc):
    """
    Filter corpus and texts given the function condition_on_doc which takes
    a doc. The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]

    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts)
# Preprocess the corpus
corpus = [preprocess(title) for title in titles_list]
# Remove docs that don't include any words in W2V's vocab
corpus, titles_list = filter_docs(corpus, titles_list, lambda doc: has_vector_representation(model, doc))
# Filter out any empty docs
corpus, titles_list = filter_docs(corpus, titles_list, lambda doc: (len(doc) != 0))
37 docs removed
0 docs removed
# Build a document vector for each document in the corpus
x = []
for doc in corpus:  # append the vector for each document
    x.append(document_vector(model, doc))

X = np.array(x)  # list to array
Procedure and functions adapted from this notebook by Dimitris Spathis.
# Initialize t-SNE
tsne = TSNE(n_components = 2, init = 'random', random_state = 10, perplexity = 100)
# Use only 400 rows to shorten processing time
tsne_df = tsne.fit_transform(X[:400])
fig, ax = plt.subplots(figsize = (14, 10))
sns.scatterplot(x = tsne_df[:, 0], y = tsne_df[:, 1], alpha = 0.5)
# Second plot: Import adjustText, initialize list of texts
from adjustText import adjust_text
texts = []
titles_to_plot = list(np.arange(0, 400, 40)) # plots every 40th title in first 400 titles
# Append article titles to the list of texts
for title in titles_to_plot:
    texts.append(plt.text(tsne_df[title, 0], tsne_df[title, 1], titles_list[title], fontsize = 14))
# Plot text using adjust_text (because overlapping text is hard to read)
adjust_text(texts, force_points = 0.4, force_text = 0.4,
expand_points = (2,1), expand_text = (1,2),
arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))
plt.show()
Pretty interesting! We can see that the t-SNE has collapsed the document vectors into a dimensional space where the documents are spread out based on whether their content has more to do with countries, world leaders, and foreign affairs, or has more to do with technology companies.
# Drop all the rows where the article popularities are unknown (this is only about 11% of the data)
main_data = main_data.drop(main_data[(main_data.Facebook == -1) |
(main_data.GooglePlus == -1) |
(main_data.LinkedIn == -1)].index)
# Also drop all rows where we don't know the source
main_data = main_data.drop(main_data[main_data['Source'].isna()].index)
main_data.shape
(81417, 11)
We still have 81,000 rows left to work with.
Let's see if we can find any associations between sentiment and number of shares.
fig, ax = plt.subplots(1, 3, figsize=(15, 10))
subplots = [a for a in ax]
platforms = ['Facebook', 'GooglePlus', 'LinkedIn']
colors = list(sns.husl_palette(10, h=.5)[1:4])
for platform, subplot, color in zip(platforms, subplots, colors):
    sns.scatterplot(x = main_data[platform], y = main_data['SentimentTitle'], ax=subplot, color=color)
    subplot.set_title(platform, fontsize=18)
    subplot.set_xlabel('')
fig.suptitle('Plot of Popularity (Shares) by Title Sentiment', fontsize=24)
plt.show()
It's a bit hard to make out whether there's any relationship here, since a few articles are significant outliers in terms of their share counts. Let's try log-transforming the x-axis to see if we can reveal any patterns.
# Our data has over 80,000 rows, so let's subsample it to make the log-transformed scatterplot easier to read
subsample = main_data.sample(5000)
fig, ax = plt.subplots(1, 3, figsize=(15, 10))
subplots = [a for a in ax]
for platform, subplot, color in zip(platforms, subplots, colors):
    # Regression plot, so we can gauge the linear relationship
    sns.regplot(x = np.log(subsample[platform] + 1), y = subsample['SentimentTitle'],
                ax=subplot,
                color=color,
                # Pass an alpha value to regplot's scatterplot call
                scatter_kws={'alpha':0.5})

    # Set a nice title, get rid of x labels
    subplot.set_title(platform, fontsize=18)
    subplot.set_xlabel('')
fig.suptitle('Plot of log(Popularity) by Title Sentiment', fontsize=24)
plt.show()
/Users/charlenechambliss/anaconda3/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Contrary to what we might expect (from our idea of highly emotional, clickbaity headlines), in this dataset we find no relationship between headline sentiment and article popularity as measured by number of shares.
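To put a rough number on that impression, we can compute a rank correlation between title sentiment and shares on each platform; a quick sketch using Spearman's rho:

```python
from scipy.stats import spearmanr

# Rank correlation between title sentiment and share counts on each platform;
# values near zero support the "no relationship" reading of the plots above
for platform in ['Facebook', 'GooglePlus', 'LinkedIn']:
    rho, p_value = spearmanr(main_data['SentimentTitle'], main_data[platform])
    print('{}: rho = {:.3f} (p = {:.3g})'.format(platform, rho, p_value))
```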
import datetime
# Convert publish date column to make it compatible with other datetime objects
main_data['PublishDate'] = pd.to_datetime(main_data['PublishDate'])
# The Unix epoch (1 January 1970)
t = datetime.datetime(1970, 1, 1)
# Subtract this time from each article's publish date
main_data['TimeSinceEpoch'] = main_data['PublishDate'] - t
# Create another column for just the days from the timedelta objects
main_data['DaysSinceEpoch'] = main_data['TimeSinceEpoch'].astype('timedelta64[D]')
main_data['TimeSinceEpoch'].describe()
count                        81417
mean     16859 days 20:08:20.4...
std      64 days 18:03:50.779344
min          16747 days 05:25:00
25%          16805 days 20:54:00
50%          16857 days 06:04:23
75%          16913 days 08:15:00
max          16989 days 15:59:22
Name: TimeSinceEpoch, dtype: object
So it looks like all of the articles were published within about a 200-day window.
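We can confirm the width of that window directly from the converted dates; per the min and max above, it comes out to roughly 242 days:

```python
# Width of the publication window, in days
(main_data['PublishDate'].max() - main_data['PublishDate'].min()).days
```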
# Get the list of top 12 sources by number of articles
source_names = list(main_data['Source'].value_counts()[:12].index)
source_colors = list(sns.husl_palette(12, h=.5))
fig, ax = plt.subplots(4, 3, figsize=(20, 15), sharex=True, sharey=True)
ax = ax.flatten()
for ax, source, color in zip(ax, source_names, source_colors):
    sns.distplot(main_data.loc[main_data['Source'] == source]['SentimentTitle'],
                 ax=ax, color=color, kde_kws={'shade':True})
    ax.set_title(source, fontsize=14)
    ax.set_xlabel('')
plt.xlim(-0.75, 0.75)
plt.show()
# Overlay each density curve on the same plot for closer comparison
fig, ax = plt.subplots(figsize=(12, 8))
for source, color in zip(source_names, source_colors):
    sns.distplot(main_data.loc[main_data['Source'] == source]['SentimentTitle'],
                 ax=ax, hist=False, label=source, color=color)

ax.set_xlabel('')
plt.xlim(-0.75, 0.75)
plt.show()
By overlaying the density estimates on the same axes, we get a much better sense of how their shapes differ than we could by comparing the twelve separate plots above.
We see that the sources' Sentiment distributions (for article titles) are very similar - it doesn't look like any one source is an outlier in terms of positive or negative headlines. Instead, all 12 of the most common sources have distributions centered around 0 with modestly sized tails.
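To back that up with numbers rather than density curves, a quick sketch is to compare summary statistics of title sentiment across those same top sources:

```python
# Per-source summary statistics of title sentiment for the 12 most common sources
(main_data[main_data['Source'].isin(source_names)]
    .groupby('Source')['SentimentTitle']
    .describe())
```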
fig, ax = plt.subplots(3, 1, figsize=(15, 10))
subplots = [a for a in ax]
for platform, subplot, color in zip(platforms, subplots, colors):
    sns.distplot(np.log(main_data[platform] + 1), ax=subplot, color=color, kde_kws={'shade':True})

    # Set a nice title, get rid of x labels
    subplot.set_title(platform, fontsize=18)
    subplot.set_xlabel('')
fig.suptitle('Plot of Popularity by Platform', fontsize=24)
plt.show()
Our exploratory visualizations may not have yielded much to work from, but we can still see if there are any non-visual patterns we've missed that a model might discover. Our first task in preparing the data for modeling is to rejoin the document vectors with their respective titles. Thankfully, when we were preprocessing the corpus, we filtered `corpus` and `titles_list` simultaneously, so the vectors and the titles they represent still match up. Meanwhile, in `main_data` we have already dropped the articles with unknown (-1) popularity, so we'll need to drop the vectors that represent those article titles as well.
Training a model on these enormous vectors as-is will not be possible on this computer, but we'll see what we can do with a little dimension reduction.
from sklearn.decomposition import PCA
pca = PCA(n_components=15, random_state=10)
# x is the array with our 300-dimensional vectors
reduced_vecs = pca.fit_transform(x)
df_w_vectors = pd.DataFrame(reduced_vecs)
df_w_vectors['Title'] = titles_list
main_w_vectors = pd.concat((df_w_vectors, main_data), axis=1)
# Get rid of vectors that couldn't be matched with the main_df
main_w_vectors.dropna(axis=0, inplace=True)
main_w_vectors.head()
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | Title | IDLink | Title | Headline | Source | Topic | PublishDate | SentimentTitle | SentimentHeadline | Facebook | GooglePlus | LinkedIn | TimeSinceEpoch | DaysSinceEpoch
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
726 | 0.6100 | -0.3781 | -0.1349 | 0.0096 | 0.1040 | 0.1445 | -0.1544 | -0.1649 | 0.0073 | -0.0285 | 0.0396 | 0.0707 | -0.0077 | -0.0429 | 0.0253 | Microsoft offering £1... | 311.0 | Microsoft offering £1... | Just ahead of the mar... | International Busines... | microsoft | 2015-11-08 05:25:00 | 0.0000 | -0.1328 | 0.0 | 0.0 | 0.0 | 16747 days 05:25:00 | 16747.0 |
727 | -0.5408 | -0.4113 | -0.1794 | -0.2340 | 0.0801 | -0.0471 | -0.1286 | -0.0446 | -0.0060 | -0.1737 | -0.2527 | 0.0677 | -0.0050 | -0.0954 | -0.1532 | Economy to improve th... | 252.0 | Economy to improve th... | "In the coming s... | IANS India Private Li... | economy | 2015-11-08 05:40:00 | -0.0820 | 0.2055 | 0.0 | 0.0 | 0.0 | 16747 days 05:40:00 | 16747.0 |
728 | -0.5408 | -0.4113 | -0.1794 | -0.2340 | 0.0801 | -0.0471 | -0.1286 | -0.0446 | -0.0060 | -0.1737 | -0.2527 | 0.0677 | -0.0050 | -0.0954 | -0.1532 | Economy to improve th... | 227.0 | Economy to improve th... | New Delhi, Nov 8 (IAN... | IANS via Yahoo Maktoo... | economy | 2015-11-08 05:48:00 | -0.0820 | 0.1870 | 0.0 | 0.0 | 0.0 | 16747 days 05:48:00 | 16747.0 |
729 | 0.4975 | -0.4756 | -0.1600 | -0.1626 | 0.0125 | 0.0779 | -0.0786 | -0.0614 | -0.1395 | 0.0587 | -0.0891 | -0.1018 | 0.0672 | -0.0052 | 0.0592 | Microsoft scales back... | 307.0 | Microsoft scales back... | NEW YORK — Microsoft ... | The Columbian | microsoft | 2015-11-08 06:54:00 | 0.1063 | -0.2073 | 0.0 | 0.0 | 0.0 | 16747 days 06:54:00 | 16747.0 |
730 | -0.1604 | -0.2756 | 0.3429 | 0.1201 | 0.1550 | 0.2435 | 0.1748 | 0.2246 | -0.0212 | 0.0185 | 0.1277 | 0.0315 | -0.0493 | -0.1332 | 0.0156 | IDC predicts the emer... | 207.0 | IDC predicts the emer... | KUCHING: The global d... | The Borneo Post | economy | 2015-11-08 08:34:00 | 0.0000 | 0.0032 | 1.0 | 0.0 | 0.0 | 16747 days 08:34:00 | 16747.0 |
# All of the columns have been successfully appended
list(main_w_vectors.columns)[-13:]
['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate', 'SentimentTitle', 'SentimentHeadline', 'Facebook', 'GooglePlus', 'LinkedIn', 'TimeSinceEpoch', 'DaysSinceEpoch']
# Drop all non-numeric, non-dummy columns, for feeding into the models
cols_to_drop = ['IDLink', 'Title', 'TimeSinceEpoch', 'Headline', 'PublishDate', 'Source']
data_only_df = pd.get_dummies(main_w_vectors, columns = ['Topic']).drop(columns=cols_to_drop)
# Standardize DaysSinceEpoch since the raw numbers are larger in magnitude
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Reshape so we can feed the column to the scaler
standardized_days = np.array(data_only_df['DaysSinceEpoch']).reshape(-1, 1)
data_only_df['StandardizedDays'] = scaler.fit_transform(standardized_days)
# Drop the raw column; we don't need it anymore
data_only_df.drop(columns=['DaysSinceEpoch'], inplace=True)
# Look at the new range
data_only_df['StandardizedDays'].describe()
count    8.1398e+04
mean    -6.1554e-14
std      1.0000e+00
min     -1.7346e+00
25%     -8.3868e-01
50%     -3.5482e-02
75%      8.2950e-01
max      2.0034e+00
Name: StandardizedDays, dtype: float64
# Create individual DataFrames for each platform's regression task
fb_data_only_df = data_only_df.drop(columns=['GooglePlus', 'LinkedIn'])
goog_data_only_df = data_only_df.drop(columns=['Facebook', 'LinkedIn'])
li_data_only_df = data_only_df.drop(columns=['Facebook', 'GooglePlus'])
from sklearn.model_selection import train_test_split

X = fb_data_only_df.drop('Facebook', axis=1)
y = fb_data_only_df['Facebook']

# 80% of data goes to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)
from sklearn.metrics import mean_squared_error
# Instantiate an XGBRegressor
xgr = xgb.XGBRegressor(random_state=2)
# Fit the regressor to the training set
xgr.fit(X_train, y_train)
y_pred = xgr.predict(X_test)
mean_squared_error(y_test, y_pred)
353495.42702873156
Underwhelming results - let's do some hyperparameter tuning and see if we can improve. Hyperparameter grid repurposed from this Kaggle article.
from sklearn.model_selection import GridSearchCV
# Various hyper-parameters to tune
xgb1 = xgb.XGBRegressor()
parameters = {'nthread':[4],
'objective':['reg:linear'],
'learning_rate': [.03, 0.05, .07],
'max_depth': [5, 6, 7],
'min_child_weight': [4],
'silent': [1],
'subsample': [0.7],
'colsample_bytree': [0.7],
'n_estimators': [250]}
xgb_grid = GridSearchCV(xgb1,
parameters,
cv = 2,
n_jobs = 5,
verbose=True)
xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
Fitting 2 folds for each of 9 candidates, totalling 18 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers. [Parallel(n_jobs=5)]: Done 18 out of 18 | elapsed: 1.5min finished
0.01814513627786679
{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 250, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4,
'n_estimators': 250, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
# Try again with new params
xgr = xgb.XGBRegressor(random_state=2, **params)
# Fit the regressor to the training set
xgr.fit(X_train, y_train)
y_pred = xgr.predict(X_test)
mean_squared_error(y_test, y_pred)
# Well, it -is- better than the base model by about 30,000, but still not ideal.
315220.13834017253
As seen above, the data in its current state doesn't seem to be enough for this model to perform well. Let's see if we can improve things with a little feature engineering: we'll train some classifiers to separate the articles into two main groups, Duds (0 or 1 shares) vs. Not Duds.
The idea is that if we can give the regressor a feature (the probability that the article will have extremely low shares), it may perform more favorably on predicting the more highly-shared articles, thus lowering the residual values for those articles and reducing mean squared error.
Let's take a look at the distribution of FB popularity as a whole:
fb_data_only_df['Facebook'].describe()
count    81398.0000
mean       129.7399
std        662.1148
min          0.0000
25%          1.0000
50%          8.0000
75%         44.0000
max      49211.0000
Name: Facebook, dtype: float64
This tells us that about a quarter of the articles receive one share or fewer. Also, from the log-transformed plots we made earlier, we can see that the articles fall into two rough groups: a cluster with essentially no shares, and a long tail running from 1 share onwards. We can train a few classifiers to predict whether an article will be a "dud" (0 or 1 shares), and then use the predicted probabilities from those models as features for the final regressor. This is called model stacking.
# Define a quick function that will return 1 (true) if the article has 0-1 share(s)
def dud_finder(popularity):
    if popularity <= 1:
        return 1
    else:
        return 0
# Create target column using the function
fb_data_only_df['is_dud'] = fb_data_only_df['Facebook'].apply(dud_finder)
fb_data_only_df[['Facebook', 'is_dud']].head()
 | Facebook | is_dud
---|---|---
726 | 0.0 | 1 |
727 | 0.0 | 1 |
728 | 0.0 | 1 |
729 | 0.0 | 1 |
730 | 1.0 | 1 |
fb_data_only_df['is_dud'].value_counts()
0    58352
1    23046
Name: is_dud, dtype: int64
# 28% of articles can be classified as "duds"
fb_data_only_df['is_dud'].sum() / len(fb_data_only_df)
0.2831273495663284
Now that we have our dud feature made, we'll initialize the classifiers. We'll use a Random Forest, an optimized XGBClassifier, and a K-Nearest Neighbors classifier. First, the XGB, since it requires a lot of fiddling.
X = fb_data_only_df.drop('is_dud', axis=1)
y = fb_data_only_df['is_dud']
# 80% of data goes to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)
# Hyper-parameters to tune
xgc = xgb.XGBClassifier()
parameters = {'nthread':[4],
'learning_rate': [.03, 0.05, .07],
'max_depth': [5, 6, 7],
'min_child_weight': [4],
'silent': [1],
'subsample': [0.7],
'colsample_bytree': [0.7],
'n_estimators': [100]}
xgb_grid = GridSearchCV(xgc,
parameters,
cv = 2,
n_jobs = 5,
verbose=True)
xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
Fitting 2 folds for each of 9 candidates, totalling 18 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers. [Parallel(n_jobs=5)]: Done 18 out of 18 | elapsed: 22.8s finished
1.0
{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100, 'nthread': 4, 'silent': 1, 'subsample': 0.7}
# Give it 200 estimators, 100 is just a restriction to make the grid search run faster
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4,
'n_estimators': 200, 'nthread': 4, 'silent': 1, 'subsample': 0.7}
# Try again with new params
xgc = xgb.XGBClassifier(random_state=10, **params)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
X = fb_data_only_df.drop(['is_dud', 'Facebook'], axis=1)
y = fb_data_only_df['is_dud']
# 80% of data goes to training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)
#lr = LogisticRegression(solver='liblinear', multi_class='auto', random_state=10)
#xgc = xgb.XGBClassifier(random_state=10)
rfc = RandomForestClassifier(n_estimators=100, random_state=10)
knn = KNeighborsClassifier()
preds = {}
for model_name, model in zip(['XGClassifier', 'RandomForestClassifier', 'KNearestNeighbors'], [xgc, rfc, knn]):
    model.fit(X_train, y_train)
    preds[model_name] = model.predict(X_test)
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
for k in preds:
    print("{} performance:".format(k))
    print()
    print(classification_report(y_test, preds[k]), sep='\n')
XGClassifier performance:

              precision    recall  f1-score   support

           0       0.72      1.00      0.84     11694
           1       0.58      0.00      0.00      4586

   micro avg       0.72      0.72      0.72     16280
   macro avg       0.65      0.50      0.42     16280
weighted avg       0.68      0.72      0.60     16280

RandomForestClassifier performance:

              precision    recall  f1-score   support

           0       0.72      0.97      0.83     11694
           1       0.39      0.05      0.09      4586

   micro avg       0.71      0.71      0.71     16280
   macro avg       0.56      0.51      0.46     16280
weighted avg       0.63      0.71      0.62     16280

KNearestNeighbors performance:

              precision    recall  f1-score   support

           0       0.73      0.85      0.78     11694
           1       0.35      0.21      0.26      4586

   micro avg       0.67      0.67      0.67     16280
   macro avg       0.54      0.53      0.52     16280
weighted avg       0.62      0.67      0.64     16280
# Plot ROC curves
for model in [xgc, rfc, knn]:
    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.show()
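Since `roc_auc_score` is already imported, we can also condense each curve into a single number; a small follow-up sketch:

```python
# Summarize each ROC curve with its area under the curve
for model_name, model in zip(['XGClassifier', 'RandomForestClassifier', 'KNearestNeighbors'],
                             [xgc, rfc, knn]):
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print('{} AUC: {:.3f}'.format(model_name, auc))
```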
# Now we can add the averaged probabilities that each article is a dud to help out the regressor
averaged_probs = (xgc.predict_proba(X)[:, 1] +
knn.predict_proba(X)[:, 1] +
rfc.predict_proba(X)[:, 1]) / 3
X['prob_dud'] = averaged_probs
y = fb_data_only_df['Facebook']
# Reinitialize the train_test_split since we've changed back to regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)
# Another round of hyperparameter tuning
xgb1 = xgb.XGBRegressor()
parameters = {'nthread':[4],
'objective':['reg:linear'],
'learning_rate': [.03, .05, .07],
'max_depth': [5, 6, 7],
'min_child_weight': [4],
'silent': [1],
'subsample': [0.7],
'colsample_bytree': [0.7],
'n_estimators': [250]}
xgb_grid = GridSearchCV(xgb1,
parameters,
cv = 2,
n_jobs = 5,
verbose=True)
xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
Fitting 2 folds for each of 9 candidates, totalling 18 fits
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers. [Parallel(n_jobs=5)]: Done 18 out of 18 | elapsed: 1.5min finished
0.032297286141127646
{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 250, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
params = {'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4,
'n_estimators': 250, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
# Try again with new params
xgr = xgb.XGBRegressor(random_state=2, **params)
# Fit the regressor to the training set
xgr.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=0.7, gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=5, min_child_weight=4, missing=None, n_estimators=250, n_jobs=1, nthread=4, objective='reg:linear', random_state=2, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=1, subsample=0.7)
y_pred = xgr.predict(X_test)
mean_squared_error(y_test, y_pred)
# Oops...the performance is essentially the same as it was before any model stacking.
314551.5939678392
for feature, importance in zip(list(X.columns), xgr.feature_importances_):
    print('Model weight for feature {}: {}'.format(feature, importance))
Model weight for feature 0: 0.04595760628581047
Model weight for feature 1: 0.03413993492722511
Model weight for feature 2: 0.02532357908785343
Model weight for feature 3: 0.054211217910051346
Model weight for feature 4: 0.02626148983836174
Model weight for feature 5: 0.05233539640903473
Model weight for feature 6: 0.03245169669389725
Model weight for feature 7: 0.038829486817121506
Model weight for feature 8: 0.035453010350465775
Model weight for feature 9: 0.035453010350465775
Model weight for feature 10: 0.08291127532720566
Model weight for feature 11: 0.03901707008481026
Model weight for feature 12: 0.0438942052423954
Model weight for feature 13: 0.028887638822197914
Model weight for feature 14: 0.03413993492722511
Model weight for feature SentimentTitle: 0.027949728071689606
Model weight for feature SentimentHeadline: 0.048583757132291794
Model weight for feature Topic_economy: 0.006565372459590435
Model weight for feature Topic_microsoft: 0.008441193029284477
Model weight for feature Topic_obama: 0.0380791611969471
Model weight for feature Topic_palestine: 0.0060026259161531925
Model weight for feature StandardizedDays: 0.07840930670499802
Model weight for feature prob_dud: 0.17670230567455292
We can at least claim one victory: the model thought that `prob_dud` was the most important feature!
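To double-check that claim, a quick follow-up sketch sorts the importances instead of scanning the printout:

```python
# Rank the features by importance; prob_dud should come out on top
sorted(zip(X.columns, xgr.feature_importances_), key=lambda pair: pair[1], reverse=True)[:5]
```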