version 1.1, July 2018
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Special thanks go to Kevin Markham.
NLP requires an understanding of the language and the world.
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
df = pd.read_csv('../datasets/mashable_texts.csv', index_col=0)
df.head()
| | author | author_web | shares | text | title | facebook | google_plus | linkedin | twitter | twitter_followers |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Seth Fiegerman | http://mashable.com/people/seth-fiegerman/ | 4900 | \nApple's long and controversial ebook case ha... | The Supreme Court smacked down Apple today | http://www.facebook.com/sfiegerman | NaN | http://www.linkedin.com/in/sfiegerman | https://twitter.com/sfiegerman | 14300 |
| 1 | Rebecca Ruiz | http://mashable.com/people/rebecca-ruiz/ | 1900 | Analysis\n\n\n\n\n\nThere is a reason that Don... | Every woman has met a man like Donald Trump | NaN | NaN | NaN | https://twitter.com/rebecca_ruiz | 3738 |
| 2 | Davina Merchant | http://mashable.com/people/568bdab351984019310... | 7000 | LONDON - Last month we reported on a dog-sized... | Adorable dog-sized rabbit finally finds his fo... | NaN | https://plus.google.com/105525238342980116477?... | NaN | NaN | 0 |
| 3 | Scott Gerber | [] | 5000 | Today's digital marketing experts must have a ... | 15 essential skills all digital marketing hire... | NaN | NaN | NaN | NaN | 0 |
| 4 | Josh Dickey | http://mashable.com/people/joshdickey/ | 1600 | LOS ANGELES — For big, fun, populist popcorn m... | Mashable top 10: 'The Force Awakens' is the be... | NaN | https://plus.google.com/109213469090692520544?... | NaN | https://twitter.com/JLDlite | 11200 |
y = df.shares
y.describe()
count       82.000000
mean      3090.487805
std       8782.031594
min        437.000000
25%        893.500000
50%       1200.000000
75%       2275.000000
max      63100.000000
Name: shares, dtype: float64
# bin shares into 4 classes of roughly equal size, using the quartile boundaries from describe() above
y = pd.cut(y, [0, 893, 1200, 2275, 63200], labels=[0, 1, 2, 3])
y.value_counts()
1    22
3    21
0    21
2    18
Name: shares, dtype: int64
df['y'] = y
X = df.text
# use CountVectorizer to create document-term matrices from X
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)
temp=X_dtm.todense()
vect.vocabulary_
{'apple': 682, 'long': 4303, 'and': 617, 'controversial': 1747, 'ebook': 2401, 'case': 1307, 'has': 3367, 'reached': 5734, 'its': 3884, 'final': 2893, 'chapter': 1383, 'it': 3878, 'not': 4883, 'the': 7054, 'happy': 3352, 'ending': 2527, 'company': 1612, 'wanted': 7620, 'supreme': 6865, 'court': 1809, 'on': 4969, 'monday': 4687, 'rejected': 5841, 'an': 603, 'appeal': 673, 'filed': 2882, 'by': 1224, 'to': 7150, 'overturn': 5075, 'stinging': 6723, 'ruling': 6087, 'that': 7051, 'led': 4181, 'broad': 1147, 'conspiracy': 1706, 'with': 7748, 'several': 6303, 'major': 4374, 'publishers': 5610, 'fix': 2927, 'price': 5483, 'of': 4935, 'books': 1088, 'sold': 6528, 'through': 7106, 'online': 4979, 'bookstore': 1089, 'decision': 2009, 'means': 4496, 'now': 4895, 'no': 4858, 'choice': 1437, 'but': 1215, 'pay': 5178, 'out': 5037, '400': 223, 'million': 4611, 'consumers': 1714, 'additional': 446, '50': 252, 'in': 3664, 'legal': 4187, 'fees': 2846, 'according': 400, 'original': 5021, 'settlement': 6301, '2014': 153, 'see': 6237, 'also': 575, 'here': 3440, 'how': 3559, 'marshalled': 4437, 'entire': 2564, 'tech': 6989, 'industry': 3712, 'fight': 2876, 'fbi': 2826, 'for': 2996, 'verdict': 7519, 'is': 3863, 'more': 4700, 'damaging': 1939, 'reputation': 5917, 'as': 734, 'consumer': 1713, 'friendly': 3072, 'brand': 1120, 'mention': 4539, 'legacy': 4186, 'beloved': 959, 'founder': 3036, 'steve': 6715, 'jobs': 3938, 'than': 7045, 'actual': 432, 'bottom': 1102, 'line': 4252, 'put': 5636, 'fine': 2903, 'context': 1727, 'total': 7188, '450': 237, 'payout': 5183, 'equal': 2581, 'about': 374, 'little': 4276, 'half': 3328, 'sales': 6118, 'generates': 3151, 'average': 821, 'each': 2382, 'day': 1972, 'based': 895, '75': 314, 'billion': 998, 'revenue': 5987, 'reported': 5897, 'most': 4707, 'recent': 5773, 'quarter': 5651, 'fixing': 2929, 'episode': 2579, 'dates': 1962, 'back': 854, 'late': 4130, '2009': 148, 'just': 3986, 'ahead': 513, 'ipad': 3856, 'launch': 4141, 'recognizing': 5780, 'would': 7787, 'likely': 4244, 'be': 924, 'big': 990, 'selling': 6253, 'point': 5339, 'tablet': 6925, 'began': 942, 'courting': 1812, 'what': 7686, 'were': 7681, 'then': 7065, 'five': 2926, 'book': 1084, 'series': 6283, 'mails': 4367, 'later': 4132, 'released': 5854, 'government': 3237, 'personally': 5232, 'persuaded': 5234, 'publishing': 5611, 'executives': 2680, 're': 5732, 'think': 7078, 'flat': 2941, '99': 344, 'pricing': 5487, 'previously': 5481, 'imposed': 3652, 'amazon': 584, 'giant': 3181, 'world': 7779, 'all': 558, 'tell': 7004, 'us': 7462, 'new': 4830, 'releases': 5855, 'eroding': 2593, 'value': 7496, 'perception': 5210, 'their': 7058, 'products': 5525, 'customer': 1911, 'minds': 4619, 'they': 7072, 'do': 2262, 'want': 7619, 'this': 7082, 'practice': 5417, 'continue': 1729, 'wrote': 7801, 'one': 4972, 'email': 2473, 'james': 3898, 'murdoch': 4746, 'executive': 2679, 'at': 765, 'news': 4833, 'corp': 1777, 'which': 7699, 'owns': 5083, 'harper': 3362, 'collins': 1552, 'ceo': 1348, 'sent': 6268, 'exec': 2675, 'image': 3625, 'screengrab': 6201, 'mashablethe': 4452, 'unhappy': 7404, 'unfavorable': 7400, 'terms': 7024, 'agreed': 507, 'signed': 6404, 'plan': 5303, 'used': 7466, 'competition': 1621, 'pressure': 5469, 'into': 3821, 'changing': 1375, 'own': 5079, 'structure': 6774, 'while': 7700, 'some': 6539, 'argued': 707, 'move': 4724, 'helped': 3432, 'break': 1125, 'up': 7441, 'potential': 5404, 'monopoly': 4692, 'market': 4427, 'accused': 406, 'colluding': 1553, 'keep': 4015, 'prices': 5486, 'high': 3461, 'hachette': 3318, 
'harpercollins': 3363, 'macmillan': 4355, 'penguin': 5200, 'simon': 6420, 'schuster': 6181, 'settled': 6300, 'department': 2087, 'justice': 3987, 'before': 941, 'going': 3214, 'trial': 7274, 'only': 4980, 'armed': 713, 'unwavering': 7438, 'belief': 953, 'rightness': 6014, 'courts': 1814, 'we': 7653, 'are': 700, 'ready': 5747, 'distribute': 2249, 'mandated': 4396, 'funds': 3097, 'kindle': 4046, 'customers': 1912, 'soon': 6551, 'instructed': 3783, 'forward': 3029, 'spokesperson': 6623, 'said': 6111, 'statement': 6686, 'provided': 5585, 'mashable': 4448, 'reps': 5913, 'did': 2160, 'immediately': 3637, 'respond': 5948, 'our': 5036, 'request': 5918, 'comment': 1584, 'however': 3560, 'after': 490, 'loss': 4319, '2013': 152, 'says': 6158, 'when': 7694, 'introduced': 3825, 'ibookstore': 3599, '2010': 149, 'gave': 3138, 'injecting': 3745, 'much': 4740, 'needed': 4808, 'innovation': 3753, 'breaking': 1126, 'monopolistic': 4691, 'grip': 3280, 'time': 7129, 've': 7511, 'done': 2293, 'nothing': 4887, 'wrong': 7799, 'have': 3380, 'once': 4971, 'again': 493, 'determined': 2135, 'otherwise': 5033, 'analysis': 608, 'there': 7069, 'reason': 5760, 'donald': 2289, 'trump': 7296, 'outrageous': 5052, 'statements': 6687, 'behavior': 948, 'feel': 2842, 'familiar': 2789, 'many': 4410, 'women': 7757, 'because': 933, 'know': 4065, 'his': 3484, 'declarative': 2014, 'style': 6796, 'trademark': 7221, 'shrug': 6390, 'from': 3077, 'reality': 5751, 'television': 7002, 'or': 5005, 'political': 5350, 'debates': 1995, 'nor': 4873, 'outsized': 5055, 'role': 6051, 'american': 590, 'business': 1211, 'made': 4357, 'unforgettable': 7402, 'impression': 3655, 'them': 7061, 'targets': 6961, 'republicans': 5916, 'say': 6156, 'he': 3389, 'could': 1792, 'cost': 1786, 'party': 5150, 'everything': 2643, 'eerie': 2424, 'familiarity': 2790, 'personal': 5230, 'encountered': 2522, 'man': 4388, 'like': 4242, 'him': 3473, 'home': 3507, 'work': 7768, 'social': 6518, 'media': 4505, 'relationship': 5848, 'extols': 2739, 'virtues': 7563, 'problem': 5508, 'reducing': 5799, 'sex': 6305, 'objects': 4915, 'casts': 1314, 'himself': 3474, 'unflappable': 7401, 'blames': 1030, 'woman': 7756, 'weaknesses': 7655, 'revealed': 5982, 'insists': 3767, 'responsibility': 5954, 'denies': 2078, 'deflects': 2046, 'perhaps': 5221, 'even': 2627, 'turns': 7322, 'violent': 7557, 'wrongdoing': 7800, 'so': 6513, 'me': 4489, 'wow': 7791, 'tough': 7195, 'nobody': 4860, 'respect': 5946, 'realdonaldtrump': 5750, 'march': 4417, '26': 180, '2016': 155, 'psychological': 5595, 'warfare': 7623, 'gaslighting': 3134, 'subtle': 6807, 'form': 3013, 'emotional': 2504, 'abuse': 382, 'puts': 5637, 'victim': 7536, 'defensive': 2039, 'go': 3206, 'strategy': 6752, 'sees': 6246, 'actions': 422, 'arguably': 706, 'projects': 5540, 'voters': 7597, 'tuesday': 7305, 'florida': 2962, 'police': 5346, 'charged': 1387, 'campaign': 1250, 'manager': 4392, 'corey': 1775, 'lewandowski': 4213, 'battery': 908, 'female': 2853, 'reporter': 5899, 'issued': 3876, 'tweets': 7333, 'refuting': 5817, 'video': 7542, 'evidence': 2648, 'discrediting': 2221, 'journalist': 3960, 'implying': 3648, 'she': 6332, 'been': 939, 'dangerous': 1945, 'deadly': 1984, 'threat': 7094, 'my': 4760, 'very': 7527, 'decent': 2005, 'was': 7632, 'assaulting': 749, 'look': 4306, 'tapes': 6956, '29': 189, 'powerful': 5415, 'left': 4185, 'right': 6013, 'forcefully': 3000, 'criticized': 1863, 'reaction': 5740, 'charges': 1389, 'too': 7172, 'often': 4952, 'victims': 7538, 'violence': 7556, 'stay': 6694, 'silent': 6411, 'remarks': 5870, 
'today': 7152, 'demonstrate': 2072, 'reasons': 5762, 'why': 7709, 'dawn': 1970, 'laguens': 4104, 'vice': 7533, 'president': 5458, 'planned': 5305, 'parenthood': 5130, 'action': 420, 'fund': 3094, 'underscore': 7386, 'fiction': 2867, 'supportive': 6862, 'any': 659, 'country': 1802, 'reporters': 5900, 'found': 3033, 'tape': 6954, 'facility': 2765, 'changed': 1373, 'her': 3439, 'tune': 7308, 'pic': 5265, 'twitter': 7336, 'com': 1560, 'n5815rs1at': 4764, 'touching': 7194, 'leave': 4176, 'conference': 1664, 'hand': 3336, 'hqb8dl0fhn': 3562, 'wednesday': 7664, 'group': 3283, 'conservative': 1695, 'journalists': 3961, 'commentators': 1588, 'called': 1238, 'firing': 2916, 'track': 7216, 'record': 5783, 'making': 4381, 'defenders': 2036, 'seem': 6241, 'foolish': 2991, 'ones': 4974, 'nichole': 4845, 'bauer': 914, 'assistant': 755, 'professor': 5529, 'science': 6183, 'university': 7414, 'alabama': 541, 'response': 5951, 'classic': 1482, 'practiced': 5418, 'denying': 2084, 'previous': 5480, 'incontrovertible': 3684, 'exists': 2686, 'isn': 3871, 'exclusive': 2673, 'commentary': 1586, 'course': 1808, 'wasn': 7637, 'reference': 5803, 'menstruation': 4535, 'joked': 3950, 'fox': 3040, 'anchor': 614, 'megyn': 4522, 'kelly': 4019, 'having': 3382, 'blood': 1050, 'coming': 1579, 'wherever': 7697, 'during': 2363, 'debate': 1994, 'certainly': 1351, 'use': 7465, 'schlonged': 6177, 'vulgar': 7603, 'term': 7022, 'describing': 2104, 'badly': 865, 'pres': 5450, 'barack': 885, 'obama': 4909, 'beat': 928, 'hillary': 3470, 'clinton': 1503, '2008': 147, 'democratic': 2069, 'presidential': 5459, 'primary': 5489, 'press': 5461, 'way': 7650, 'convince': 1755, 'people': 5205, 'opposite': 5000, 'definitely': 2042, 'accepted': 388, 'among': 596, 'republican': 5915, 'mainstream': 4369, 'sort': 6556, 'outmoded': 5050, 'sexism': 6306, 'demonstrates': 2074, 'real': 5749, 'place': 5298, 'discourse': 2218, 'yet': 7832, 'instead': 3779, 'apologizing': 669, 'acknowledging': 413, 'question': 5660, 'matter': 4469, 'might': 4590, 'find': 2899, 'offensive': 4939, 'doubled': 2305, 'down': 2308, 'obfuscated': 4910, 'whether': 7698, 'coworker': 1821, 'who': 7703, 'meaning': 4493, 'sexist': 6307, 'partner': 5144, 'quick': 5664, 'end': 2525, 'argument': 709, 'using': 7471, 'word': 7765, 'telling': 7005, 'don': 2287, 'deserve': 2106, 'same': 6123, 'agency': 498, 'men': 4533, 'denials': 2076, 'may': 4477, 'make': 4376, 'appear': 674, 'unassailable': 7366, 'send': 6260, 'entirely': 2565, 'different': 2171, 'message': 4551, 'nature': 4788, 'tend': 7014, 'support': 6858, 'traditional': 7226, 'roles': 6052, 'believing': 957, 'deference': 2040, 'male': 4383, 'authority': 807, 'figure': 2879, 'husband': 3594, 'faultless': 2817, 'effectively': 2430, 'takes': 6941, 'autonomy': 815, 'away': 833, 'creating': 1840, 'climate': 1498, 'always': 580, 'explain': 2708, 'performed': 5217, 'poorly': 5363, 'survey': 6879, 'nbc': 4795, 'wsj': 7802, 'poll': 5354, '47': 242, 'cannot': 1267, 'themselves': 7064, 'voting': 7599, 'favorability': 2820, 'ratings': 5726, 'higher': 3462, '59': 282, 'amongst': 597, 'registered': 5823, 'cnn': 1520, 'orc': 5006, 'though': 7087, 'ask': 739, 'intended': 3795, 'vote': 7594, 'don_vito_08': 2288, 'picture': 5274, 'worth': 7786, 'thousand': 7091, 'words': 7767, 'lyingted': 4347, 'nevercruz': 4828, 'melaniatrump': 4524, '5bvvewmvf8': 285, '24': 174, 'these': 7071, 'polls': 5356, 'fully': 3089, 'reflect': 5808, 'unseemly': 7429, 'attacks': 776, 'heidi': 3423, 'cruz': 1883, 'wife': 7718, 'opponent': 4995, 'sen': 6258, 'ted': 6997, 'if': 
3613, 'abortion': 373, 'banned': 881, 'illegally': 3620, 'should': 6371, 'face': 2758, 'punishment': 5623, 'clarified': 1478, 'hypothetical': 3598, 'reversed': 5989, 'position': 5385, 'clear': 1488, 'willing': 7724, 'publicly': 5605, 'embarrass': 2478, 'degrade': 2050, 'consider': 1696, 'depriving': 2096, 'physical': 5263, 'freedom': 3054, 'such': 6814, 'power': 5413, 'dynamics': 2372, 'aren': 703, 'abusive': 383, 'workplaces': 7776, 'relationships': 5849, 'fathom': 2815, 'mirrors': 4637, 'private': 5500, 'hell': 3428, 'doing': 2280, 'control': 1744, 'others': 5032, 'elevate': 2454, 'insulate': 3785, 'attack': 772, 'jackie': 3890, 'white': 7702, 'emerita': 2491, 'psychology': 5597, 'senior': 6262, 'research': 5925, 'scientist': 6186, 'center': 1342, 'health': 3401, 'wellness': 7678, 'north': 4878, 'carolina': 1291, 'greensboro': 3269, 'approach': 690, 'victimizes': 7537, 'playing': 5323, 'part': 5138, 'degradation': 2049, 'manipulation': 4401, 'present': 5453, 'self': 6251, 'assured': 763, 'knowledgable': 4067, 'person': 5228, 'meanwhile': 4498, 'silences': 6410, 'depressed': 2094, 'passive': 5161, 'paralyzed': 5127, 'feelings': 2844, 'doubt': 2307, 'insecurity': 3758, 'set': 6295, 'experience': 2701, 'well': 7677, 'worries': 7782, 'tactics': 6931, 'display': 2236, 'consciously': 1690, 'subconsciously': 6800, 'lead': 4155, 'endorse': 2530, 'harmful': 3361, 'stereotypes': 6714, 'engage': 2539, 'particularly': 5141, 'threatened': 7095, 'demographic': 2071, 'change': 1372, 'evolving': 2652, 'gender': 3147, 'inspire': 3771, 'justified': 3989, 'clinging': 1501, 'deserved': 2107, 'elevated': 2455, 'status': 6693, 'makes': 4380, 'offering': 4942, 'public': 5604, 'performance': 5215, 'fallout': 2785, 'unconscionable': 7374, 'both': 1101, 'london': 4302, 'last': 4127, 'month': 4695, 'dog': 2276, 'sized': 6444, 'rabbit': 5678, 'desperate': 2117, 'need': 4807, 'under': 7379, 'atlas': 770, 'permanent': 5223, 'story': 6739, 'went': 7680, 'global': 3199, 'over': 5061, 'including': 3678, 'canada': 1255, 'france': 3043, 'started': 6679, 'reaching': 5736, 'scottish': 6195, 'society': 6520, 'prevention': 5477, 'cruelty': 1879, 'animals': 633, 'thanks': 7049, 'jen': 3920, 'hislop': 3485, 'ayrshire': 839, 'adorable': 459, 'bunny': 1196, 'will': 7721, 'get': 3173, 'native': 4785, 'scotland': 6193, 'buggy': 1178, 'facebook': 2759, 'spcajen': 6581, 'financial': 2898, 'fraud': 3049, 'investigator': 3836, 'told': 7157, 'charity': 1391, 'burst': 1205, 'tears': 6985, 'got': 3230, 'phone': 5251, 'call': 1237, 'saying': 6157, 'had': 3322, 'chosen': 1448, 'cried': 1854, 'collected': 1547, '43': 231, 'year': 7822, 'old': 4962, 'two': 7339, 'bunnies': 1195, 'currently': 1906, 'rex': 5997, 'named': 4772, 'coconut': 1528, 'looking': 4309, 'still': 6722, 'growing': 3287, 'summer': 6839, 'house': 3552, 'heating': 3414, 'air': 525, 'conditioning': 1659, 'accommodation': 396, 'large': 4122, 'garden': 3128, 'enclosure': 2519, 'run': 6089, 'perfect': 5211, 'addition': 445, 'family': 2792, 'thing': 7076, 'name': 4771, 'hello': 3430, 'atilla': 767, 'bun': 1193, 'binky': 1003, 'master': 4458, 'jazz': 3913, 'paws': 5177, 'worry': 7783, 'you': 7836, 'can': 1254, 'atty': 789, 'short': 6360, 'digital': 2179, 'marketing': 4430, 'experts': 2707, 'must': 4754, 'diverse': 2254, 'skill': 6447, 'sophisticated': 6553, 'grasp': 3257, 'available': 816, 'channels': 1378, 'ability': 370, 'identify': 3608, 'opportunities': 4997, 'top': 7177, 'basic': 897, 'skills': 6448, 'brilliant': 1140, 'marketer': 4428, 'possess': 5389, 'balance': 871, 
'critical': 1860, 'creative': 1843, 'thinking': 7079, 'order': 5008, 'drive': 2336, 'measurable': 4499, 'success': 6811, 'asked': 740, '15': 72, 'members': 4528, 'young': 7837, 'entrepreneur': 2570, 'council': 1794, 'yec': 7824, 'hiring': 3483, 'marketers': 4429, 'best': 974, 'answers': 653, 'below': 960, 'paid': 5106, 'advertising': 471, 'expertise': 2706, 'hire': 3480, 'versed': 7523, 'especially': 2595, 'similar': 6419, 'platform': 5312, 'uses': 7470, 'regularly': 5826, 'able': 371, 'understand': 7387, 'implement': 3647, 'analytics': 611, 'insights': 3764, 'create': 1837, 'lookalike': 4307, 'custom': 1910, 'audiences': 793, 'experiment': 2704, 'test': 7033, 'images': 3627, 'secure': 6234, 'knowledge': 4068, 'overall': 5062, 'landscape': 4112, 'budget': 1173, 'saving': 6153, 'within': 7750, 'space': 6576, 'sure': 6866, 'talent': 6943, 'knows': 4070, 'ins': 3757, 'outs': 5053, 'popular': 5369, 'easy': 2398, 'miles': 4599, 'jennings': 3922, 'recruiter': 5790, 'web': 7661, 'developers': 2142, 'data': 1959, 'scientists': 6187, 'ai': 514, 'hired': 3481, 'years': 7823, 'outsourced': 5056, 'successfully': 6813, 'better': 979, 'turn': 7317, 'your': 7840, 'closing': 1511, 'deals': 1989, 'directly': 2198, 'sell': 6252, 'll': 4281, 'wasting': 7640, 'valuable': 7495, 'dollars': 2282, 'without': 7751, 'generating': 3152, 'qualified': 5646, 'team': 6982, 'mark': 4425, 'cenicola': 1341, 'bannerview': 882, 'specific': 6594, 'channel': 1376, 'extol': 2738, 'every': 2639, 'conceivable': 1646, 'seo': 6270, 'sem': 6254, 'etc': 2609, 'key': 4026, 'successful': 6812, 'focusing': 2977, 'few': 2861, 'really': 5756, 'understanding': 7388, 'deeply': 2027, 'leveraging': 4211, 'those': 7086, 'example': 2659, 'helping': 3434, 'local': 4288, 'businesses': 1212, 'pinterest': 5282, 'less': 4200, 'interesting': 3806, 'although': 578, 'drivers': 2338, 'ecommerce': 2403, 'traffic': 7230, 'rather': 5723, 'focus': 2975, 'intricate': 3823, 'client': 1495, 'google': 3225, 'maps': 4416, 'yelp': 7827, 'trevor': 7273, 'sumner': 6841, 'localvox': 4291, 'objectively': 4914, 'usually': 7475, ...}
# rows are documents, columns are terms (aka "tokens" or "features")
X_dtm.shape
(82, 7969)
# 50 features near the end of the alphabetically sorted feature list
print(vect.get_feature_names()[-150:-100])
['ydwnm50jlu', 'ye', 'yeah', 'year', 'years', 'yec', 'yeezy', 'yellow', 'yelp', 'yep', 'yes', 'yesterday', 'yesweather', 'yet', 'yoga', 'yong', 'york', 'you', 'young', 'younger', 'youngest', 'your', 'yourself', 'youth', 'youtube', 'youtubeduck', 'yup', 'yuyuan', 'yücel', 'zach', 'zaxoqbv487', 'zero', 'zgkymde1lzewlza0l2zkl1n0yxj0dxayljq0mdvhlmpwzwpwcxrodw1ictexnxgxmtujcmujanbn', 'zgkymde1lzewlza0l2zkl1n0yxj0dxayljq0mdvhlmpwzwpwcxrodw1icteymdb4nji3iwplcwpwzw', 'zgkymde1lzewlza0l2zkl1n0yxj0dxayljq0mdvhlmpwzwpwcxrodw1icti4ohgxnjijcmujanbn', 'zgkymde1lzewlza0l2zkl1n0yxj0dxayljq0mdvhlmpwzwpwcxrodw1ictk1mhg1mzqjcmujanbn', 'zgkymde1lzewlza0l2zkl1n0yxj0dxayljq0mdvhlmpwzwpwcxrodw1ictu2mhg3ntakzqlqcgc', 'zgkymde1lzewlza0l2zkl1n0yxj0dxayljq0mdvhlmpwzwpwcxrodw1ictywmhgzmzgjcmujanbn', 'zgkymde1lzewlza0lzm1l2jpcmrfdgfudhj1lmu3zwmzlmpwzwpwcxrodw1ictexnxgxmtujcmujanbn', 'zgkymde1lzewlza0lzm1l2jpcmrfdgfudhj1lmu3zwmzlmpwzwpwcxrodw1icteymdb4nji3iwplcwpwzw', 'zgkymde1lzewlza0lzm1l2jpcmrfdgfudhj1lmu3zwmzlmpwzwpwcxrodw1icti4ohgxnjijcmujanbn', 'zgkymde1lzewlza0lzm1l2jpcmrfdgfudhj1lmu3zwmzlmpwzwpwcxrodw1ictk1mhg1mzqjcmujanbn', 'zgkymde1lzewlza0lzm1l2jpcmrfdgfudhj1lmu3zwmzlmpwzwpwcxrodw1ictu2mhg3ntakzqlqcgc', 'zgkymde1lzewlza0lzm1l2jpcmrfdgfudhj1lmu3zwmzlmpwzwpwcxrodw1ictywmhgzmzgjcmujanbn', 'zgkymde1lzewlzaxlzhhl1rttfnjcmvlblnolmnkmgjklnbuzwpwcxrodw1ictexnxgxmtujcmujanbn', 'zgkymde1lzewlzaxlzhhl1rttfnjcmvlblnolmnkmgjklnbuzwpwcxrodw1icteymdb4nji3iwplcwpwzw', 'zgkymde1lzewlzaxlzhhl1rttfnjcmvlblnolmnkmgjklnbuzwpwcxrodw1icti4ohgxnjijcmujanbn', 'zgkymde1lzewlzaxlzhhl1rttfnjcmvlblnolmnkmgjklnbuzwpwcxrodw1ictk1mhg1mzqjcmujanbn', 'zgkymde1lzewlzaxlzhhl1rttfnjcmvlblnolmnkmgjklnbuzwpwcxrodw1ictu2mhg3ntakzqlqcgc', 'zgkymde1lzewlzaxlzhhl1rttfnjcmvlblnolmnkmgjklnbuzwpwcxrodw1ictywmhgzmzgjcmujanbn']
# show vectorizer options
vect
CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)
vect = CountVectorizer(lowercase=False)
X_dtm = vect.fit_transform(X)
X_dtm.shape
(82, 8759)
X_dtm.todense()[0].argmax()
8097
vect.get_feature_names()[8097]
'the'
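Disabling lowercasing keeps case variants (e.g. 'Apple' vs 'apple') as separate features, which is why the vocabulary grows from 7,969 to 8,759 terms. A quick sketch (an assumption for illustration, not part of the original notebook) to count how many lowercased forms now map to more than one feature:

# count feature names that collide when lowercased (case variants are kept separately)
lower_counts = pd.Series([f.lower() for f in vect.get_feature_names()]).value_counts()
(lower_counts > 1).sum()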
# include 1-grams through 4-grams
vect = CountVectorizer(ngram_range=(1, 4))
X_dtm = vect.fit_transform(X)
X_dtm.shape
(82, 115172)
# 50 features near the end of the feature list
print(vect.get_feature_names()[-1000:-950])
['you to fly', 'you to fly your', 'you to know', 'you to know quite', 'you to robertdowneyjr', 'you to robertdowneyjr for', 'you to sit', 'you to sit back', 'you to stand', 'you to stand away', 'you to watch', 'you to watch out', 'you twisty', 'you twisty the', 'you twisty the clown', 'you ve', 'you ve created', 'you ve created for', 'you ve destroyed', 'you ve destroyed just', 'you ve done', 'you ve done this', 'you ve experienced', 'you ve experienced similar', 'you ve got', 'you ve got seven', 'you ve gotten', 'you ve gotten yourself', 'you ve made', 'you ve made that', 'you ve sown', 'you ve sown across', 'you venture', 'you venture out', 'you venture out into', 'you want', 'you want to', 'you want to create', 'you want to drive', 'you want to lock', 'you want to rent', 'you want to talk', 'you what', 'you what they', 'you what they are', 'you what they look', 'you when', 'you when done', 'you when done properly', 'you which']
# Default CountVectorizer
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)
# use Naive Bayes to predict the shares class
nb = MultinomialNB()
pd.Series(cross_val_score(nb, X_dtm, y, cv=10)).describe()
count    10.000000
mean      0.420094
std       0.117514
min       0.250000
25%       0.366477
50%       0.409722
75%       0.500000
max       0.571429
dtype: float64
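For context, a useful baseline for these cross-validated scores is the null accuracy: always predicting the most frequent shares class (class 1, 22 of 82 articles, roughly 27%). A minimal sketch:

# null accuracy: accuracy from always predicting the most frequent shares class
y.value_counts(normalize=True).max()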
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    X_dtm = vect.fit_transform(X)
    print('Features: ', X_dtm.shape[1])
    nb = MultinomialNB()
    print(pd.Series(cross_val_score(nb, X_dtm, y, cv=10)).describe())
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))
tokenize_test(vect)
Features:  37905
count    10.000000
mean      0.405808
std       0.087028
min       0.250000
25%       0.375000
50%       0.375000
75%       0.440476
max       0.571429
dtype: float64
# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)
Features:  7710
count    10.000000
mean      0.355411
std       0.085808
min       0.250000
25%       0.270833
50%       0.369318
75%       0.415179
max       0.500000
dtype: float64
# set of stop words
print(vect.get_stop_words())
frozenset({'full', 'please', 'anyhow', 'cant', 'everyone', 'been', 'there', 'behind', 'or', 'such', 'through', 'once', 'anyone', 'becoming', 'perhaps', 'why', 'yourselves', 'call', 'her', 'twenty', 'while', 'enough', 'amoungst', 'anywhere', 'many', 'above', 'an', 'elsewhere', 'see', 'than', 'my', 'no', 'who', 'nobody', 'do', 'thereafter', 'always', 'down', 'wherever', 'due', 'empty', 'hereby', 'others', 'become', 'well', 'last', 'afterwards', 'during', 'co', 'be', 'almost', 'on', 'are', 'same', 'must', 'another', 'into', 'name', 'nothing', 'itself', 'every', 'back', 'beforehand', 'hasnt', 'may', 're', 'should', 'though', 'towards', 'our', 'could', 'upon', 'thin', 'here', 'you', 'together', 'onto', 'none', 'became', 'myself', 'third', 'throughout', 'its', 'will', 'noone', 'six', 'has', 'thus', 'somehow', 'among', 'seems', 'serious', 'mostly', 'when', 'done', 'between', 'out', 'someone', 'two', 'me', 'put', 'everything', 'thick', 'one', 'and', 'because', 'by', 'often', 'although', 'this', 'go', 'seemed', 'hers', 'most', 'until', 'herein', 'very', 'bill', 'couldnt', 'what', 'whatever', 'wherein', 'which', 'whole', 'top', 'else', 'whereby', 'give', 'fifty', 'where', 'about', 'former', 'a', 'eight', 'front', 'beyond', 'hence', 'show', 'his', 'might', 'take', 'per', 'four', 'so', 'whereas', 'him', 'whose', 'fill', 'all', 'nevertheless', 'con', 'their', 'some', 'sometimes', 'but', 'have', 'he', 'himself', 'latter', 'these', 'we', 'etc', 'ever', 'whom', 'had', 'ourselves', 'interest', 'how', 'still', 'toward', 'that', 'whether', 'somewhere', 'find', 'those', 'whenever', 'am', 'hereupon', 'from', 'cannot', 'own', 'ie', 'namely', 'something', 'your', 'already', 'yourself', 'eleven', 'yours', 'everywhere', 'found', 'next', 'other', 'thru', 'detail', 'side', 'themselves', 'below', 'whereafter', 'becomes', 'would', 'was', 'besides', 'bottom', 'mine', 'inc', 'whoever', 'except', 'seeming', 'before', 'first', 'sometime', 'to', 'ten', 'for', 'even', 'being', 'across', 'ltd', 'whereupon', 'amongst', 'i', 'anything', 'both', 'rather', 'the', 'alone', 'thereupon', 'least', 'system', 'un', 'now', 'moreover', 'then', 'she', 'indeed', 'only', 'de', 'as', 'fifteen', 'them', 'yet', 'neither', 'otherwise', 'after', 'hundred', 'too', 'either', 'therefore', 'keep', 'along', 'beside', 'five', 'hereafter', 'anyway', 'however', 'latterly', 'describe', 'much', 'over', 'meanwhile', 'part', 'us', 'whither', 'within', 'not', 'fire', 'sixty', 'thereby', 'less', 'therein', 'mill', 'cry', 'of', 'amount', 'twelve', 'also', 'few', 'off', 'under', 'more', 'further', 'they', 'three', 'forty', 'move', 'nor', 'since', 'without', 'in', 'nowhere', 'any', 'sincere', 'with', 'were', 'can', 'eg', 'formerly', 'again', 'herself', 'nine', 'up', 'each', 'made', 'whence', 'if', 'never', 'via', 'it', 'ours', 'seem', 'at', 'is', 'get', 'around', 'thence', 'several', 'against'})
# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)
Features:  100
count    10.000000
mean      0.375126
std       0.168480
min       0.125000
25%       0.250000
50%       0.401786
75%       0.486111
max       0.625000
dtype: float64
# all 100 features
print(vect.get_feature_names())
['01', '10', '11', '15', '1cd', '2015', '2016', '28', 'article', 'australian', 'author', 'best', 'big', 'business', 'campaign', 'com', 'company', 'conversion', 'cystic', 'daniel', 'day', 'description', 'digital', 'don', 'downey', 'entertainment', 'facebook', 'false', 'fibrosis', 'function', 'good', 'hot', 'http', 'https', 'image', 'initpage', 'instagram', 'internal', 'iron', 'jpg', 'jr', 'js', 'just', 'know', 'life', 'like', 'make', 'man', 'marketing', 'mashable', 'media', 'movie', 'movies', 'mshcdn', 'new', 'null', 'oct', 'og', 'old', 'open', 'paris', 'people', 'photo', 'pic', 'platform', 'police', 'posted', 'premiere', 'pu', 'rack', 'rdj', 'return', 'rights', 'rising', 'robert', 'said', 'sailthru', 'says', 'season', 'short_url', 'state', 'time', 'timer', 'title', 'topics', 'travel', 'true', 'trump', 'twitter', 'twttr', 'uncategorized', 'url', 've', 'watercooler', 'way', 'window', 'work', 'world', 'year', 'years']
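Several of these 100 "features" are leftover page metadata rather than article words (e.g. jpg, js, null, og, sailthru, mshcdn, and bare numbers). As a hedged sketch (not part of the original notebook), a stricter token_pattern that keeps only alphabetic tokens of three or more characters filters most of them out:

# sketch: restrict tokens to alphabetic words of length >= 3 to drop numeric/markup tokens
vect = CountVectorizer(stop_words='english', max_features=100,
                       token_pattern=r'(?u)\b[a-zA-Z]{3,}\b')
tokenize_test(vect)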
# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=1000)
tokenize_test(vect)
Features:  1000
count    10.000000
mean      0.405574
std       0.130813
min       0.250000
25%       0.270833
50%       0.414773
75%       0.500000
max       0.571429
dtype: float64
# include 1-grams and 2-grams, and only include terms that appear at least 2 times
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)
Features:  7620
count    10.000000
mean      0.407594
std       0.141763
min       0.125000
25%       0.366477
50%       0.409722
75%       0.500000
max       0.571429
dtype: float64
Stemming
# initialize stemmer
stemmer = SnowballStemmer('english')
# words
vect = CountVectorizer()
vect.fit(X)
CountVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None, strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)
words = list(vect.vocabulary_.keys())[:100]
# stem each word
print([stemmer.stem(word) for word in words])
['appl', 'long', 'and', 'controversi', 'ebook', 'case', 'has', 'reach', 'it', 'final', 'chapter', 'it', 'not', 'the', 'happi', 'end', 'compani', 'want', 'suprem', 'court', 'on', 'monday', 'reject', 'an', 'appeal', 'file', 'by', 'to', 'overturn', 'sting', 'rule', 'that', 'led', 'broad', 'conspiraci', 'with', 'sever', 'major', 'publish', 'fix', 'price', 'of', 'book', 'sold', 'through', 'onlin', 'bookstor', 'decis', 'mean', 'now', 'no', 'choic', 'but', 'pay', 'out', '400', 'million', 'consum', 'addit', '50', 'in', 'legal', 'fee', 'accord', 'origin', 'settlement', '2014', 'see', 'also', 'here', 'how', 'marshal', 'entir', 'tech', 'industri', 'fight', 'fbi', 'for', 'verdict', 'is', 'more', 'damag', 'reput', 'as', 'consum', 'friend', 'brand', 'mention', 'legaci', 'belov', 'founder', 'steve', 'job', 'than', 'actual', 'bottom', 'line', 'put', 'fine', 'context']
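The same stemmer can be plugged straight into CountVectorizer through a custom analyzer, mirroring the split_into_lemmas approach used in the Lemmatization section below. split_into_stems is a hypothetical helper shown only as a sketch:

# define a function that accepts text and returns a list of stems (hypothetical helper)
def split_into_stems(text):
    words = text.lower().split()
    return [stemmer.stem(word) for word in words]

# use split_into_stems as the feature extraction function (also slow)
vect = CountVectorizer(analyzer=split_into_stems)
tokenize_test(vect)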
Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
import nltk
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /home/al/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
True
# assume every word is a noun
print([wordnet_lemmatizer.lemmatize(word) for word in words])
['apple', 'long', 'and', 'controversial', 'ebook', 'case', 'ha', 'reached', 'it', 'final', 'chapter', 'it', 'not', 'the', 'happy', 'ending', 'company', 'wanted', 'supreme', 'court', 'on', 'monday', 'rejected', 'an', 'appeal', 'filed', 'by', 'to', 'overturn', 'stinging', 'ruling', 'that', 'led', 'broad', 'conspiracy', 'with', 'several', 'major', 'publisher', 'fix', 'price', 'of', 'book', 'sold', 'through', 'online', 'bookstore', 'decision', 'mean', 'now', 'no', 'choice', 'but', 'pay', 'out', '400', 'million', 'consumer', 'additional', '50', 'in', 'legal', 'fee', 'according', 'original', 'settlement', '2014', 'see', 'also', 'here', 'how', 'marshalled', 'entire', 'tech', 'industry', 'fight', 'fbi', 'for', 'verdict', 'is', 'more', 'damaging', 'reputation', 'a', 'consumer', 'friendly', 'brand', 'mention', 'legacy', 'beloved', 'founder', 'steve', 'job', 'than', 'actual', 'bottom', 'line', 'put', 'fine', 'context']
# assume every word is a verb
print([wordnet_lemmatizer.lemmatize(word,pos='v') for word in words])
['apple', 'long', 'and', 'controversial', 'ebook', 'case', 'have', 'reach', 'its', 'final', 'chapter', 'it', 'not', 'the', 'happy', 'end', 'company', 'want', 'supreme', 'court', 'on', 'monday', 'reject', 'an', 'appeal', 'file', 'by', 'to', 'overturn', 'sting', 'rule', 'that', 'lead', 'broad', 'conspiracy', 'with', 'several', 'major', 'publishers', 'fix', 'price', 'of', 'book', 'sell', 'through', 'online', 'bookstore', 'decision', 'mean', 'now', 'no', 'choice', 'but', 'pay', 'out', '400', 'million', 'consumers', 'additional', '50', 'in', 'legal', 'fee', 'accord', 'original', 'settlement', '2014', 'see', 'also', 'here', 'how', 'marshal', 'entire', 'tech', 'industry', 'fight', 'fbi', 'for', 'verdict', 'be', 'more', 'damage', 'reputation', 'as', 'consumer', 'friendly', 'brand', 'mention', 'legacy', 'beloved', 'founder', 'steve', 'job', 'than', 'actual', 'bottom', 'line', 'put', 'fine', 'context']
# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = text.lower()
    words = text.split()
    return [wordnet_lemmatizer.lemmatize(word) for word in words]
# use split_into_lemmas as the feature extraction function (WARNING: SLOW!)
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)
Features:  10208
count    10.000000
mean      0.423990
std       0.112463
min       0.250000
25%       0.375000
50%       0.436508
75%       0.500000
max       0.571429
dtype: float64
# example documents
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
# Term Frequency
vect = CountVectorizer()
tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
tf
| | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| 1 | 1 | 1 | 1 | 0 | 0 | 0 |
| 2 | 0 | 1 | 1 | 2 | 0 | 0 |
# Document Frequency
vect = CountVectorizer(binary=True)
df_ = vect.fit_transform(simple_train).toarray().sum(axis=0)
pd.DataFrame(df_.reshape(1, 6), columns=vect.get_feature_names())
| | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 1 | 3 | 2 | 1 | 1 | 1 |
# Term Frequency-Inverse Document Frequency (simple version)
tf/df_
| | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 0.0 | 0.333333 | 0.0 | 0.0 | 1.0 | 1.0 |
| 1 | 1.0 | 0.333333 | 0.5 | 0.0 | 0.0 | 0.0 |
| 2 | 0.0 | 0.333333 | 0.5 | 2.0 | 0.0 | 0.0 |
# TfidfVectorizer
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
| | cab | call | me | please | tonight | you |
|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.385372 | 0.000000 | 0.000000 | 0.652491 | 0.652491 |
| 1 | 0.720333 | 0.425441 | 0.547832 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 0.000000 | 0.266075 | 0.342620 | 0.901008 | 0.000000 | 0.000000 |
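These numbers differ from the simple tf/df table above because scikit-learn's TfidfVectorizer, by default, uses a smoothed IDF (ln((1 + n_docs) / (1 + df)) + 1) and then L2-normalizes each row. A minimal sketch reproducing those defaults from the tf DataFrame and df_ array built earlier:

# reproduce TfidfVectorizer's defaults: smoothed IDF followed by L2 row normalization
n_docs = tf.shape[0]
idf = np.log((1 + n_docs) / (1 + df_)) + 1
tfidf = tf * idf
tfidf.div(np.sqrt((tfidf ** 2).sum(axis=1)), axis=0)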
More details: TF-IDF is about what matters
# create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(X)
features = vect.get_feature_names()
dtm.shape
(82, 7710)
# choose one article to examine (here, index 40)
review_id = 40
review_text = X[review_id]
review_length = len(review_text)
# create a dictionary of words and their TF-IDF scores
word_scores = {}
for word in vect.vocabulary_.keys():
    word = word.lower()
    if word in features:
        word_scores[word] = dtm[review_id, features.index(word)]
# print words with the top 5 TF-IDF scores
print('TOP SCORING WORDS:')
top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
for word, score in top_scores:
    print(word)
TOP SCORING WORDS:
sanders
iowa
precinct
coin
des
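The loop above is slow because features.index(word) performs a linear search for every vocabulary word. An equivalent sketch that ranks the document's TF-IDF row directly with argsort:

# faster alternative: take the top 5 TF-IDF scores straight from the document's row
row = dtm[review_id].toarray().ravel()
top_indices = row.argsort()[::-1][:5]
print([features[i] for i in top_indices])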
# print 5 random words
print('\n' + 'RANDOM WORDS:')
random_words = np.random.choice(list(word_scores.keys()), size=5, replace=False)
for word in random_words:
    print(word)
RANDOM WORDS:
fann
simplereach
hey
dubai
28z