In [ ]:

from google.colab import drive
drive.mount('/data/')
data_dir = '/data/My Drive/Colab Notebooks/Experiment'
!ls '/data/My Drive/Colab Notebooks/Experiment'
!pip install matplotlib

Mounted at /data/
diamonds.csv  Iris.csv	m_data.csv  news_data.csv  TSLA.csv  w_data.csv
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.3.1)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1)
Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.15.0)

In [ ]:

df = pd.read_csv(data_dir + '/news_data.csv')
print(df.shape)

df = df.drop_duplicates('description') # drop dupes
print("drop dupes: " + str(df.shape))

df = df[~df['description'].isnull()] # drop null values
print("drop null values: " + str(df.shape))

df = df[(df.description.map(len) > 120) & (df.description.map(len) <= 350)] # limit to descriptions between 120 and 350 characters

df.reset_index(inplace=True, drop=True)
print("filter on desc lengths: " + str(df.shape))

(50126, 9)
drop dupes: (44774, 9)
drop null values: (44773, 9)
filter on desc lengths: (19467, 9)

In [ ]:

df['description'].head()

Out[ ]:

0    Researchers discover what could be one of the ...
1    Yemen is now classified as the world's worst h...
2    Malcolm Turnbull and Joko Widodo hold talks in...
3    KUALA LUMPUR, Malaysia (AP) — Malaysia's healt...
4    HANOI, Vietnam (AP) — Two women — a Vietnamese...
Name: description, dtype: object

In [ ]:

import nltk
from nltk.stem import *
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
from functools import reduce
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

In [ ]:

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stopwords.words('english')

en_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

In [ ]:

### Cleaning the data set 

def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = text.replace('(ap)', '')
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)
    text = re.sub('[^a-zA-Z ?!]+', '', text)
    text = text.strip()
    return text

df['text_clean'] = df['description'].progress_map(lambda d: clean_text(d))

HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))

In [ ]:

df['text_clean'].head()

Out[ ]:

0    researchers discover what could be one of the ...
1    yemen is now classified as the world is worst ...
2    malcolm turnbull and joko widodo hold talks in...
3    kuala lumpur malaysia malaysia is health minis...
4    hanoi vietnam two women a vietnamese and an in...
Name: text_clean, dtype: object

In [ ]:

text = df['text_clean'].to_list()

In [ ]:

from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation

def tokenizer(text):

    tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]
    tokens = list(reduce(lambda x,y: x+y, tokens))
    tokens = list(filter(lambda token: token not in (en_stops), tokens))

    return tokens

df['token'] = df['text_clean'].progress_map(lambda d: tokenizer(d))

HBox(children=(FloatProgress(value=0.0, max=19467.0), HTML(value='')))

In [ ]:

for descripition, tokens in zip(df['description'].head(5), df['token'].head(5)):
    print('description:', descripition)
    print('tokens:', tokens)
    print()

description: Researchers discover what could be one of the worst cases of mine pollution in the world in the heart of New South Wales' pristine heritage-listed Blue Mountains.
tokens: ['researchers', 'discover', 'could', 'one', 'worst', 'cases', 'mine', 'pollution', 'world', 'heart', 'new', 'south', 'wales', 'pristine', 'heritage', 'listed', 'blue', 'mountains']

description: Yemen is now classified as the world's worst humanitarian disaster but Australia has committed no funding to help save lives there.
tokens: ['yemen', 'classified', 'world', 'worst', 'humanitarian', 'disaster', 'australia', 'committed', 'funding', 'help', 'save', 'lives']

description: Malcolm Turnbull and Joko Widodo hold talks in Sydney, reviving cooperation halted after the discovery of insulting posters at a military base, and reaching deals on trade and a new consulate in east Java.
tokens: ['malcolm', 'turnbull', 'joko', 'widodo', 'hold', 'talks', 'sydney', 'reviving', 'cooperation', 'halted', 'discovery', 'insulting', 'posters', 'military', 'base', 'reaching', 'deals', 'trade', 'new', 'consulate', 'east', 'java']

description: KUALA LUMPUR, Malaysia (AP) — Malaysia's health minister said Sunday that the dose of nerve agent given to North Korean ruler Kim Jong Un's exiled half brother was so high that it killed him within 20 minutes and caused…
tokens: ['kuala', 'lumpur', 'malaysia', 'malaysia', 'health', 'minister', 'said', 'sunday', 'dose', 'nerve', 'agent', 'given', 'north', 'korean', 'ruler', 'kim', 'jong', 'un', 'exiled', 'half', 'brother', 'high', 'killed', 'within', 'minutes', 'caused']

description: HANOI, Vietnam (AP) — Two women — a Vietnamese and an Indonesian — have been arrested for allegedly coating their hands with the immensely toxic chemical agent VX and wiping them on the face of the North Korean leader's…
tokens: ['hanoi', 'vietnam', 'two', 'women', 'vietnamese', 'indonesian', 'arrested', 'allegedly', 'coating', 'hands', 'immensely', 'toxic', 'chemical', 'agent', 'vx', 'wiping', 'face', 'north', 'korean', 'leader']

In [ ]:

from collections import Counter
from nltk.probability import FreqDist

vf = pd.DataFrame(df.head(1000)['category'])

vectors = pd.DataFrame()
for row in df.head(1000)['token']:
    vectors = vectors.append(dict(FreqDist(row)),ignore_index=True)

In [ ]:

vectors.fillna(0,inplace=True)

In [ ]:

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=30, random_state=123).fit(vectors)

In [ ]:

centers=kmeans.cluster_centers_
{k:v for k,v in dict(zip(vectors.columns,centers[6])).items() if v >= 0.1}

Out[ ]:

{'administration': 0.11999999999999993,
 'attend': 0.10000000000000019,
 'attorney': 0.11999999999999984,
 'correspondents': 0.11999999999999987,
 'dinner': 0.1600000000000003,
 'donald': 0.56,
 'general': 0.1200000000000001,
 'house': 0.6799999999999996,
 'media': 0.12000000000000015,
 'new': 0.10000000000000002,
 'news': 0.1400000000000002,
 'president': 0.23999999999999969,
 'presidential': 0.13999999999999985,
 'press': 0.13999999999999987,
 'sessions': 0.11999999999999993,
 'sunday': 0.1,
 'trump': 0.84,
 'twitter': 0.10000000000000006,
 'us': 0.10000000000000006,
 'white': 0.6599999999999995}

In [ ]:

kmeans.inertia_

Out[ ]:

16532.01116078362

In [ ]:

vec = {k:v for k,v in dict(FreqDist(tokens)).items() if k in vectors.columns}

vectors = vectors.append(vec,ignore_index=True)
vectors.fillna(0,inplace=True)
kmeans.predict([vectors.iloc[-1]])

Out[ ]:

array([10], dtype=int32)

In [ ]:

centers=kmeans.cluster_centers_
{k:v for k,v in dict(zip(vectors.columns,centers[1])).items() if v >= 0.1}

Out[ ]:

{'accusations': 1.0,
 'carmaker': 1.0,
 'ceo': 1.0,
 'conditions': 1.0,
 'elon': 1.0,
 'employee': 1.0,
 'factory': 1.0,
 'following': 1.0,
 'investigation': 1.0,
 'musk': 1.0,
 'results': 1.0,
 'shared': 1.0,
 'tesla': 1.0,
 'working': 1.0}

In [ ]:

from sklearn.metrics import pairwise_distances
from scipy.spatial import distance

dist = pd.DataFrame(pairwise_distances(vectors, metric='cosine'))
vectors[dist.iloc[2]<0.8]

Out[ ]:

	blue	cases	could	discover	heart	heritage	listed	mine	mountains	new	one	pollution	pristine	researchers	south	wales	world	worst	australia	classified	committed	disaster	funding	help	humanitarian	lives	save	yemen	base	consulate	cooperation	deals	discovery	east	halted	hold	insulting	java	joko	malcolm	...	slovacko	attacking	backyard	flying	neighbor	sheriff	summons	ballots	husted	ohio	registered	uncovered	pentagon	belong	hint	lunardi	approaches	jayhawks	sits	lahore	punjab	adrien	midfielder	rabiot	stature	alvaro	impressive	isco	morata	outings	substitute	allows	applications	browser	optimized	qt	remote	webgl	defence	ramp
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
146	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

2 rows × 6144 columns

In [ ]:

	blue	cases	could	discover	heart	heritage	listed	mine	mountains	new	one	pollution	pristine	researchers	south	wales	world	worst	australia	classified	committed	disaster	funding	help	humanitarian	lives	save	yemen	base	consulate	cooperation	deals	discovery	east	halted	hold	insulting	java	joko	malcolm	...	slovacko	attacking	backyard	flying	neighbor	sheriff	summons	ballots	husted	ohio	registered	uncovered	pentagon	belong	hint	lunardi	approaches	jayhawks	sits	lahore	punjab	adrien	midfielder	rabiot	stature	alvaro	impressive	isco	morata	outings	substitute	allows	applications	browser	optimized	qt	remote	webgl	defence	ramp
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
146	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	blue	cases	could	discover	heart	heritage	listed	mine	mountains	new	one	pollution	pristine	researchers	south	wales	world	worst	australia	classified	committed	disaster	funding	help	humanitarian	lives	save	yemen	base	consulate	cooperation	deals	discovery	east	halted	hold	insulting	java	joko	malcolm	...	slovacko	attacking	backyard	flying	neighbor	sheriff	summons	ballots	husted	ohio	registered	uncovered	pentagon	belong	hint	lunardi	approaches	jayhawks	sits	lahore	punjab	adrien	midfielder	rabiot	stature	alvaro	impressive	isco	morata	outings	substitute	allows	applications	browser	optimized	qt	remote	webgl	defence	ramp
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
146	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	blue	cases	could	discover	heart	heritage	listed	mine	mountains	new	one	pollution	pristine	researchers	south	wales	world	worst	australia	classified	committed	disaster	funding	help	humanitarian	lives	save	yemen	base	consulate	cooperation	deals	discovery	east	halted	hold	insulting	java	joko	malcolm	...	slovacko	attacking	backyard	flying	neighbor	sheriff	summons	ballots	husted	ohio	registered	uncovered	pentagon	belong	hint	lunardi	approaches	jayhawks	sits	lahore	punjab	adrien	midfielder	rabiot	stature	alvaro	impressive	isco	morata	outings	substitute	allows	applications	browser	optimized	qt	remote	webgl	defence	ramp
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
146	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0