!pip install gensim
Requirement already satisfied: gensim in /usr/local/lib/python3.6/dist-packages (3.6.0) Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.12.0) Requirement already satisfied: numpy>=1.11.3 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.17.3) Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.8.4) Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.3.1) Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (1.10.2) Requirement already satisfied: boto>=2.32 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (2.49.0) Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (2.21.0) Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (0.9.4) Requirement already satisfied: botocore<1.14.0,>=1.13.2 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (1.13.2) Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (0.2.1) Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (1.24.3) Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (3.0.4) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (2019.9.11) Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (2.8) Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= "2.7" in 
/usr/local/lib/python3.6/dist-packages (from botocore<1.14.0,>=1.13.2->boto3->smart-open>=1.2.1->gensim) (2.6.1) Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.14.0,>=1.13.2->boto3->smart-open>=1.2.1->gensim) (0.15.2)
import numpy as np
from scipy.linalg import norm
import matplotlib as mpl
import matplotlib.pyplot as plt
import gensim.downloader as api
from gensim.models import Word2Vec
# Fetch the text8 corpus through gensim's downloader, then train a Word2Vec
# model on it with every hyperparameter left at its library default.
dataset = api.load("text8")
model = Word2Vec(sentences=dataset)
[==================================================] 100.0% 31.6/31.6MB downloaded
/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:398: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
# Look up one trained vector and print its Euclidean norm. The raw vectors
# are not scaled to unit length, so the value is not expected to be 1.
word_vector = model.wv['organisational']
print(norm(word_vector))
1.3941699266433716
# Euclidean norm of every word vector in the model's vocabulary.
# Iterating the vocab dict yields its keys (the words) directly.
freq_norm = np.array([norm(model.wv[word]) for word in model.wv.vocab])

# Plot the distribution of vector norms. All plotting calls, including
# show(), stay inside the style context so the 'seaborn' style applies to
# the rendered figure only and is reverted afterwards.
# NOTE: recent matplotlib releases renamed this style to 'seaborn-v0_8'.
with mpl.style.context('seaborn'):
    plt.hist(freq_norm, bins=20)
    plt.xlabel('norm')
    plt.ylabel('frequency')
    plt.show()
from sklearn.datasets import fetch_20newsgroups
# Restrict the 20 Newsgroups corpus to four topics.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the training split for those categories; the shuffle is seeded so the
# document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
Downloading 20news dataset. This may take a few minutes. Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
# Tokenize each post by plain whitespace splitting (case and punctuation are
# left untouched), giving one list of tokens per document.
# Alternative toy corpus: from gensim.test.utils import common_texts
common_texts = [post.split() for post in twenty_train.data]
print(common_texts[0])
['From:', 'sd345@city.ac.uk', '(Michael', 'Collier)', 'Subject:', 'Converting', 'images', 'to', 'HP', 'LaserJet', 'III?', 'Nntp-Posting-Host:', 'hampton', 'Organization:', 'The', 'City', 'University', 'Lines:', '14', 'Does', 'anyone', 'know', 'of', 'a', 'good', 'way', '(standard', 'PC', 'application/PD', 'utility)', 'to', 'convert', 'tif/img/tga', 'files', 'into', 'LaserJet', 'III', 'format.', 'We', 'would', 'also', 'like', 'to', 'do', 'the', 'same,', 'converting', 'to', 'HPGL', '(HP', 'plotter)', 'files.', 'Please', 'email', 'any', 'response.', 'Is', 'this', 'the', 'correct', 'group?', 'Thanks', 'in', 'advance.', 'Michael.', '--', 'Michael', 'Collier', '(Programmer)', 'The', 'Computer', 'Unit,', 'Email:', 'M.P.Collier@uk.ac.city', 'The', 'City', 'University,', 'Tel:', '071', '477-8000', 'x3769', 'London,', 'Fax:', '071', '477-8565', 'EC1V', '0HB.']
from gensim.models import FastText
model = FastText(size=4, window=3, min_count=1) # instantiate
model.build_vocab(sentences=common_texts)
%time model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)
CPU times: user 43 s, sys: 110 ms, total: 43.1 s Wall time: 24.7 s
# Euclidean norm of every in-vocabulary word vector of the current model
# (the FastText model at this point in the notebook).
# Iterating the vocab dict yields its keys (the words) directly.
freq_norm = np.array([norm(model.wv[word]) for word in model.wv.vocab])

# Plot the distribution of vector norms. All plotting calls, including
# show(), stay inside the style context so the 'seaborn' style applies to
# the rendered figure only and is reverted afterwards.
# NOTE: recent matplotlib releases renamed this style to 'seaborn-v0_8'.
with mpl.style.context('seaborn'):
    plt.hist(freq_norm, bins=20)
    plt.xlabel('norm')
    plt.ylabel('frequency')
    plt.show()