In [1]:
import warnings
import matplotlib
import pandas

# Hide library warnings so they do not clutter the presentation output.
warnings.filterwarnings('ignore')

# Keep displayed tables compact for the slides.
pandas.set_option('display.max_columns', 11)
pandas.set_option('display.max_rows', 5)

# Large fonts and figures so plots stay readable when projected.
matplotlib.rcParams.update({
    'font.size': 20,
    'figure.figsize': (15, 9),
    'savefig.dpi': 227,
})


# Python for scientists and anarchists¶

## Dmitrijs Milajevs @dimazest. Pygrunn 2014¶

1. Data structures
2. Algorithms
3. Reporting

# Arrays, arrays are everywhere¶

In [12]:
import numpy as np

# A small integer vector to demonstrate element-wise (vectorised) arithmetic.
a = np.arange(1, 5)

In [13]:
a + 1  # broadcasting: the scalar is added to every element -> array([2, 3, 4, 5])

Out[13]:
array([2, 3, 4, 5])
In [23]:
a * 2  # element-wise multiplication, no explicit loop needed

Out[23]:
array([2, 4, 6, 8])
In [24]:
b = np.array([2, 2, 2, 2])
a + b  # element-wise addition of two equal-length vectors

Out[24]:
array([3, 4, 5, 6])

# Sparse matrices¶

In [26]:
from scipy.sparse import csc_matrix

# Only the four non-zero entries are stored in the CSC representation.
dense_rows = [
    [1, 0, 0],
    [0, 2, 3],
    [4, 0, 0],
]
m = csc_matrix(dense_rows)

In [27]:
m  # repr shows the shape, dtype and number of stored (non-zero) elements

Out[27]:
<3x3 sparse matrix of type '<class 'numpy.int64'>'
with 4 stored elements in Compressed Sparse Column format>
In [32]:
m.T.todense()  # transpose, then materialise as a dense matrix for display

Out[32]:
matrix([[1, 0, 4],
[0, 2, 0],
[0, 3, 0]], dtype=int64)

# English word frequencies¶

Zipf's law states that given some corpus of natural language utterances, the frequency of any word is inversely proportional to its rank in the frequency table.

[1] provides word frequencies from the British National Corpus.

The format for the list is:

sort-order, frequency, word, word-class

and a sample of the list is:

1 6187267 the det
2 4239632 be v
3 3093444 of prep
4 2687863 and conj
5 2186369 a det

[1] Kilgarriff, Adam. "Putting frequencies in the dictionary." International Journal of Lexicography 10.2 (1997): 135-155.

# Does Zipf's law hold for the BNC?¶

The easiest way to check is to plot the sorted word frequencies on the log-log scale. We should observe a straight line.

In [2]:
import pandas as pd
import seaborn as sns

In [3]:
# Word-frequency list of the British National Corpus (Kilgarriff's lemma list).
# Fields are space-separated: rank, frequency, lemma, part-of-speech.
# NOTE(review): fetched over the network on every run — consider caching locally.
frame = pd.read_csv(
    'http://www.kilgarriff.co.uk/BNClists/lemma.num',
    names=('sort-order', 'frequency', 'word', 'word-class'),
    sep=' ',
    index_col='sort-order',
)

In [4]:
frame  # rich display; pandas truncates to the 5 rows configured at the top

Out[4]:
frequency word word-class
sort-order
1 6187267 the det
2 4239632 be v
3 3093444 of prep
4 2687863 and conj
5 2186369 a det
... ... ...

6318 rows × 3 columns

In [23]:
# Zipf's law check: rank vs. frequency should be roughly linear on log-log axes.
# Draw directly on log scales via the pandas `loglog` option instead of plotting
# on linear axes first and rescaling with `.loglog()` (which also left `p` bound
# to an empty line list rather than the axes).
p = frame.plot(loglog=True)


# Estimating the slope¶

In [11]:
# Work in log(rank) / log(frequency) space: a Zipfian relation is linear in logs.
log_frame = frame[['frequency']].reset_index().apply(np.log)
log_frame

Out[11]:
sort-order frequency
0 0.000000 15.638004
1 0.693147 15.259987
2 1.098612 14.944796
3 1.386294 14.804257
4 1.609438 14.597753
... ...

6318 rows × 2 columns

In [67]:
from scipy.stats import linregress

In [71]:
# Fit log(frequency) ~ log(rank); a Zipfian corpus gives a slope close to -1.
fit = linregress(
    log_frame['sort-order'].values,
    log_frame['frequency'].values,
)
slope, intercept, r_value, p_value, std_err = fit
slope, intercept

Out[71]:
(-1.1842652934251638, 17.255044729014401)
In [69]:
# Predicted log-frequency from the fitted line, for visual comparison below.
log_frame['estimate'] = slope * log_frame['sort-order'] + intercept

In [70]:
# Observed vs. estimated log-frequency against log-rank on the same axes.
log_frame.set_index('sort-order').plot()

Out[70]:
<matplotlib.axes.AxesSubplot at 0x1047339d0>

# Word class frequency¶

In [78]:
# Total number of corpus tokens per part-of-speech class.
frame.groupby('word-class').sum()

Out[78]:
frequency
word-class
a 5397318
conj 5782063
det 13425305
infinitive-marker 1620850
interjection 253323
modal 1457053
n 17737655
prep 11487866
pron 6104169
v 16251437

11 rows × 1 columns

In [34]:
# The same per-class token totals, shown as a bar chart.
class_totals = frame.groupby('word-class').sum()
class_totals.plot.bar()

Out[34]:
<matplotlib.axes.AxesSubplot at 0x6440fc0>

# How diverse are word classes?¶

In [35]:
# Number of distinct lemmas per part-of-speech class (vocabulary diversity).
class_sizes = frame.groupby('word-class')['word-class'].count()
class_sizes.plot.bar()

Out[35]:
<matplotlib.axes.AxesSubplot at 0x64f46d0>

# An evening project idea: compare languages¶

• Take The Bible in English, Dutch and Italian
• Count word frequencies
• Rank words
• Plot on the log-log scale
• What will you see?

# A hint: morphology in general and verb conjugation in particular¶

English Dutch Italian
I go Ik ga Io vado
You go Je gaat Tu vai
He/she/it goes Hij/ze/het gaat Lui/lei va
We go Wij gaan Noi andiamo
You go Jullie gaan Voi andate
They go Zij gaan Essi vanno

# Distributional semantics¶

• You shall know a word by the company it keeps.

John R. Firth. 1957. A Synopsis of Linguistic Theory, 1930-1955. Studies in Linguistic Analysis, pages 1–32.

• Semantically similar words tend to appear in similar contexts.

Harris, Zellig Sabbettai 1968. Mathematical structures of language.

# Proximity co-occurrence¶

A boy might carry sweet apples.

In [37]:
from urllib.request import urlretrieve

# Download a precomputed word/context co-occurrence matrix (HDF5 format).
frame_file, _ = urlretrieve(
    'http://www.eecs.qmul.ac.uk/~dm303/static/eecs_open14/space_frame_eecs14.h5'
)

# Bug fix: the downloaded file was never read, so `space_frame` below was
# undefined. Load it with pandas (requires the PyTables package for HDF5).
space_frame = pd.read_hdf(frame_file)

interesting_words = ['idea', 'notion', 'boy', 'girl']
space_frame.loc[interesting_words]

Out[37]:
time year people way man day thing child government part work
idea 258 33 324 128 84 58 50 29 52 60 77 ...
notion 41 4 27 22 9 5 3 5 22 10 13 ...
boy 102 102 21 62 126 48 34 53 1 16 21 ...
girl 110 134 30 76 121 51 30 64 3 23 26 ...

4 rows × 2000 columns

# Distances between 'words'¶

In [18]:
from sklearn.metrics.pairwise import pairwise_distances

# Cosine distance between the co-occurrence vectors of the selected words.
vectors = space_frame.loc[interesting_words].values
distances = pairwise_distances(vectors, metric='cosine')
pd.DataFrame(distances, index=interesting_words, columns=interesting_words)

Out[18]:
idea notion boy girl
idea 0.000000 0.237791 0.579199 0.552227
notion 0.237791 0.000000 0.695892 0.682684
boy 0.579199 0.695892 0.000000 0.063881
girl 0.552227 0.682684 0.063881 0.000000

4 rows × 4 columns

# Word similarity¶

In [19]:
# Turn distances into similarities in (0, 1]: exp(-d) equals 1 for identical vectors.
pd.DataFrame(np.exp(-distances), index=interesting_words, columns=interesting_words)

Out[19]:
idea notion boy girl
idea 1.000000 0.788367 0.560347 0.575667
notion 0.788367 1.000000 0.498630 0.505259
boy 0.560347 0.498630 1.000000 0.938117
girl 0.575667 0.505259 0.938117 1.000000

4 rows × 4 columns

In [25]:
from sklearn import manifold
from sklearn.preprocessing import MinMaxScaler

# Project the words onto 2-D with metric MDS over precomputed cosine distances,
# then rescale each coordinate to [0, 1] for plotting.
# random_state pins the otherwise stochastic embedding so re-runs reproduce it.
clf = manifold.MDS(n_components=2, dissimilarity='precomputed', random_state=0)
X = MinMaxScaler().fit_transform(
    clf.fit_transform(pairwise_distances(space_frame.values, metric='cosine'))
)

In [26]:
# The scaled 2-D coordinates, one row per word in the semantic space.
pd.DataFrame(X, index=space_frame.index)

Out[26]:
0 1
boy 0.715728 0.481538
man 0.561349 0.454951
car 0.301277 0.881609
brother 0.870478 0.687468
uncle 0.970657 0.642094
... ...

33 rows × 2 columns

In [36]:
# Use the explicit pyplot interface instead of the discouraged `pylab`
# namespace (pylab dumps numpy and pyplot into one flat namespace).
# The `pl` alias is kept so the calls below are unchanged.
import matplotlib.pyplot as pl

pl.figure()

# Place each word label at its 2-D MDS coordinate. MinMaxScaler keeps both
# coordinates in [0, 1], which matches the default axes limits, so every
# label lands inside the visible area.
for word, (x, y) in zip(space_frame.index, X):
    pl.text(x, y, word)
pl.tight_layout()


# Conclusion¶

• Combination of domain knowledge and programming skills gives great results!

• simple linguistic ideas (what)
• simple implementation (how)
• Scientists and developers should cooperate. Contact me if you get any ideas

• @dimazest
• dimazest@gmail.com
In [1]:
from IPython.display import HTML

HTML(
"""
<script type="text/javascript">
/* * * CONFIGURATION VARIABLES: EDIT BEFORE PASTING INTO YOUR WEBPAGE * * */
var disqus_shortname = 'notebookcomments'; // required: replace example with your forum shortname

/* * * DON'T EDIT BELOW THIS LINE * * */
(function() {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';