from google.colab import drive
drive.mount('/data/')
data_dir = '/data/My Drive/Colab Notebooks/Experiment'
!ls '/data/My Drive/Colab Notebooks/Experiment'
!pip install matplotlib
Mounted at /data/ diamonds.csv Iris.csv m_data.csv news_data.csv TSLA.csv w_data.csv Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.3.1) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1) Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5) Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.15.0)
df = pd.read_csv(data_dir + '/news_data.csv')
print(df.shape)
df = df.drop_duplicates('description') # drop dupes
print("drop dupes: " + str(df.shape))
df = df[~df['description'].isnull()] # drop null values
print("drop null values: " + str(df.shape))
df = df[(df.description.map(len) > 120) & (df.description.map(len) <= 350)] # limit to descriptions between 120 and 350 characters
df.reset_index(inplace=True, drop=True)
print("filter on desc lengths: " + str(df.shape))
(50126, 9) drop dupes: (44774, 9) drop null values: (44773, 9) filter on desc lengths: (19467, 9)
df['description'].head()
0 Researchers discover what could be one of the ... 1 Yemen is now classified as the world's worst h... 2 Malcolm Turnbull and Joko Widodo hold talks in... 3 KUALA LUMPUR, Malaysia (AP) — Malaysia's healt... 4 HANOI, Vietnam (AP) — Two women — a Vietnamese... Name: description, dtype: object
import nltk
from nltk.stem import *
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
from functools import reduce
import re
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date!
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')
en_stops = set(stopwords.words('english'))
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
### Cleaning the data set
def clean_text(text):
text = text.lower()
text = re.sub(r"what's", "what is ", text)
text = text.replace('(ap)', '')
text = re.sub(r"\'s", " is ", text)
text = re.sub(r"\'ve", " have ", text)
text = re.sub(r"can't", "cannot ", text)
text = re.sub(r"n't", " not ", text)
text = re.sub(r"i'm", "i am ", text)
text = re.sub(r"\'re", " are ", text)
text = re.sub(r"\'d", " would ", text)
text = re.sub(r"\'ll", " will ", text)
text = re.sub(r'\W+', ' ', text)
text = re.sub(r'\s+', ' ', text)
text = re.sub(r"\\", "", text)
text = re.sub(r"\'", "", text)
text = re.sub(r"\"", "", text)
text = re.sub('[^a-zA-Z ?!]+', '', text)
text = text.strip()
return text
df['text_clean'] = df['description'].progress_map(lambda d: clean_text(d))
HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))
df['text_clean'].head()
0 researchers discover what could be one of the ... 1 yemen is now classified as the world is worst ... 2 malcolm turnbull and joko widodo hold talks in... 3 kuala lumpur malaysia malaysia is health minis... 4 hanoi vietnam two women a vietnamese and an in... Name: text_clean, dtype: object
text = df['text_clean'].to_list()
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation
def tokenizer(text):
tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]
tokens = list(reduce(lambda x,y: x+y, tokens))
tokens = list(filter(lambda token: token not in (en_stops), tokens))
return tokens
df['token'] = df['text_clean'].progress_map(lambda d: tokenizer(d))
HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))
for descripition, tokens in zip(df['description'].head(5), df['token'].head(5)):
print('description:', descripition)
print('tokens:', tokens)
print()
description: Researchers discover what could be one of the worst cases of mine pollution in the world in the heart of New South Wales' pristine heritage-listed Blue Mountains. tokens: ['researchers', 'discover', 'could', 'one', 'worst', 'cases', 'mine', 'pollution', 'world', 'heart', 'new', 'south', 'wales', 'pristine', 'heritage', 'listed', 'blue', 'mountains'] description: Yemen is now classified as the world's worst humanitarian disaster but Australia has committed no funding to help save lives there. tokens: ['yemen', 'classified', 'world', 'worst', 'humanitarian', 'disaster', 'australia', 'committed', 'funding', 'help', 'save', 'lives'] description: Malcolm Turnbull and Joko Widodo hold talks in Sydney, reviving cooperation halted after the discovery of insulting posters at a military base, and reaching deals on trade and a new consulate in east Java. tokens: ['malcolm', 'turnbull', 'joko', 'widodo', 'hold', 'talks', 'sydney', 'reviving', 'cooperation', 'halted', 'discovery', 'insulting', 'posters', 'military', 'base', 'reaching', 'deals', 'trade', 'new', 'consulate', 'east', 'java'] description: KUALA LUMPUR, Malaysia (AP) — Malaysia's health minister said Sunday that the dose of nerve agent given to North Korean ruler Kim Jong Un's exiled half brother was so high that it killed him within 20 minutes and caused… tokens: ['kuala', 'lumpur', 'malaysia', 'malaysia', 'health', 'minister', 'said', 'sunday', 'dose', 'nerve', 'agent', 'given', 'north', 'korean', 'ruler', 'kim', 'jong', 'un', 'exiled', 'half', 'brother', 'high', 'killed', 'within', 'minutes', 'caused'] description: HANOI, Vietnam (AP) — Two women — a Vietnamese and an Indonesian — have been arrested for allegedly coating their hands with the immensely toxic chemical agent VX and wiping them on the face of the North Korean leader's… tokens: ['hanoi', 'vietnam', 'two', 'women', 'vietnamese', 'indonesian', 'arrested', 'allegedly', 'coating', 'hands', 'immensely', 'toxic', 'chemical', 'agent', 'vx', 'wiping', 'face', 'north', 'korean', 'leader']
from collections import Counter
from nltk.probability import FreqDist
vf = pd.DataFrame(df.head(1000)['category'])
vectors = pd.DataFrame()
for row in df.head(1000)['token']:
vectors = vectors.append(dict(FreqDist(row)),ignore_index=True)
vectors.fillna(0,inplace=True)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=30, random_state=123).fit(vectors)
centers=kmeans.cluster_centers_
{k:v for k,v in dict(zip(vectors.columns,centers[6])).items() if v >= 0.1}
{'administration': 0.11999999999999993, 'attend': 0.10000000000000019, 'attorney': 0.11999999999999984, 'correspondents': 0.11999999999999987, 'dinner': 0.1600000000000003, 'donald': 0.56, 'general': 0.1200000000000001, 'house': 0.6799999999999996, 'media': 0.12000000000000015, 'new': 0.10000000000000002, 'news': 0.1400000000000002, 'president': 0.23999999999999969, 'presidential': 0.13999999999999985, 'press': 0.13999999999999987, 'sessions': 0.11999999999999993, 'sunday': 0.1, 'trump': 0.84, 'twitter': 0.10000000000000006, 'us': 0.10000000000000006, 'white': 0.6599999999999995}
kmeans.inertia_
16532.01116078362
vec = {k:v for k,v in dict(FreqDist(tokens)).items() if k in vectors.columns}
vectors = vectors.append(vec,ignore_index=True)
vectors.fillna(0,inplace=True)
kmeans.predict([vectors.iloc[-1]])
array([10], dtype=int32)
centers=kmeans.cluster_centers_
{k:v for k,v in dict(zip(vectors.columns,centers[1])).items() if v >= 0.1}
{'accusations': 1.0, 'carmaker': 1.0, 'ceo': 1.0, 'conditions': 1.0, 'elon': 1.0, 'employee': 1.0, 'factory': 1.0, 'following': 1.0, 'investigation': 1.0, 'musk': 1.0, 'results': 1.0, 'shared': 1.0, 'tesla': 1.0, 'working': 1.0}
from sklearn.metrics import pairwise_distances
from scipy.spatial import distance
dist = pd.DataFrame(pairwise_distances(vectors, metric='cosine'))
vectors[dist.iloc[2]<0.8]
blue | cases | could | discover | heart | heritage | listed | mine | mountains | new | one | pollution | pristine | researchers | south | wales | world | worst | australia | classified | committed | disaster | funding | help | humanitarian | lives | save | yemen | base | consulate | cooperation | deals | discovery | east | halted | hold | insulting | java | joko | malcolm | ... | slovacko | attacking | backyard | flying | neighbor | sheriff | summons | ballots | husted | ohio | registered | uncovered | pentagon | belong | hint | lunardi | approaches | jayhawks | sits | lahore | punjab | adrien | midfielder | rabiot | stature | alvaro | impressive | isco | morata | outings | substitute | allows | applications | browser | optimized | qt | remote | webgl | defence | ramp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
146 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 rows × 6144 columns