from pathlib import Path
import fastText
import sklearn
import sklearn.metrics
import numpy as np
import re
Folder structure of this project:
We will use twitter_las_vegas_shooting
for training, which contains 50,000 tweets crawled during the Las Vegas mass-shooting massacre.
# Project directory layout; all paths are relative to the notebook directory.
root_dir = Path("..")
data_dir = root_dir / "data" / "3-entity-extraction"
notebook_dir = root_dir / "notebooks"
model_dir = data_dir / "model"
# parents=True also creates missing intermediate directories; exist_ok=True
# replaces the race-prone exists()-then-mkdir pattern.
model_dir.mkdir(parents=True, exist_ok=True)
# corpus
data_path = data_dir / "twitter_las_vegas_shooting"
# Training corpus filename
input_filename = str(data_path)
# Model filename
model_filename = str(model_dir / "twitter.bin")
Preprocess the tweets to obtain a corpus better suited for training the language model.
# Preprocessing Config: which cleanup steps preprocess() applies.
preprocess_config = {
    "hashtag": True,      # strip "#hashtag" tokens
    "mentioned": True,    # strip "@user" mentions
    "punctuation": True,  # replace punctuation with spaces
    "url": True,          # strip http(s) URLs
}
# Patterns — raw strings so backslashes reach the regex engine verbatim
# (non-raw "\w" raises invalid-escape warnings on modern Python).
hashtag_pattern = r"#\w+"
mentioned_pattern = r"@\w+"
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Punctuation to blank out; '#' and '@' are intentionally absent because the
# hashtag/mention patterns above already handle them.
trans_str = "!\"$%&\'()*+,-./:;<=>?[\\]^_`{|}~" + "…"
translate_table = str.maketrans(trans_str, " " * len(trans_str))

def preprocess(s: str) -> str:
    """Normalize a raw tweet for language-model training.

    Lower-cases the text, then (depending on preprocess_config) removes
    hashtags, @-mentions and URLs, replaces punctuation with spaces, and
    collapses runs of whitespace into single spaces.
    """
    s = s.lower()
    if preprocess_config["hashtag"]:
        s = re.sub(hashtag_pattern, "", s)
    if preprocess_config["mentioned"]:
        s = re.sub(mentioned_pattern, "", s)
    if preprocess_config["url"]:
        s = re.sub(url_pattern, "", s)
    if preprocess_config["punctuation"]:
        # translate() maps punctuation to spaces; split()/join() collapses them.
        s = " ".join(s.translate(translate_table).split())
    return s
Preprocessing Example
Here is an example output of preprocessing.
# example of preprocessing: show a tweet before and after cleanup
example_tweet = "RT @TheLeadCNN: Remembering Keri Lynn Galvan, from Thousand Oaks, California. #LasVegasLost https://t.co/QuvXa6WvlE https://t.co/hDF2d3Owgn"
print("Original Tweet:", example_tweet, "", sep="\n")
print("Preprocessed Tweet:", preprocess(example_tweet), sep="\n")
Original Tweet: RT @TheLeadCNN: Remembering Keri Lynn Galvan, from Thousand Oaks, California. #LasVegasLost https://t.co/QuvXa6WvlE https://t.co/hDF2d3Owgn Preprocessed Tweet: rt remembering keri lynn galvan from thousand oaks california
Preprocessing corpus
# Preprocessing: write a cleaned copy of the corpus next to the original.
preprocessed_data_path = data_dir / "twitter_las_vegas_shooting.preprocessed"
# Stream line by line instead of materializing the whole corpus in memory.
with data_path.open() as src, preprocessed_data_path.open("w") as dst:
    for line in src:
        dst.write(preprocess(line.strip()))
        dst.write("\n")
# use preprocessed data as input
input_filename = str(preprocessed_data_path)
Use the preprocessed corpus to train a 100-dimensional embedding model.
# fastText Config
# Hyper-parameters for unsupervised fastText training; the names mirror the
# keyword arguments of fastText.train_unsupervised below.
embedding_model = "skipgram"  # training mode: "skipgram" or "cbow"
lr = 0.05  # learning rate
dim = 100  # dimensionality of the word vectors
ws = 5  # context window size
epoch = 5  # number of passes over the corpus
minCount = 5  # discard words that occur fewer than this many times
minCountLabel = 0  # minimal label occurrences (relevant to supervised mode)
minn = 3  # min length of character n-grams (subword units)
maxn = 6  # max length of character n-grams
neg = 5  # number of negatives sampled per positive example
wordNgrams = 1  # max length of word n-grams
loss = "ns"  # loss function: negative sampling
bucket = 2000000  # number of hash buckets for the n-gram table
thread = 12  # number of training threads
lrUpdateRate = 100  # update the learning rate every this many tokens
t = 1e-4  # sub-sampling threshold for frequent words
verbose = 2  # verbosity level
# Collect the hyper-parameters and launch unsupervised fastText training.
train_kwargs = {
    "input": input_filename,
    "model": embedding_model,
    "lr": lr,
    "dim": dim,
    "ws": ws,
    "epoch": epoch,
    "minCount": minCount,
    "minCountLabel": minCountLabel,
    "minn": minn,
    "maxn": maxn,
    "neg": neg,
    "wordNgrams": wordNgrams,
    "loss": loss,
    "bucket": bucket,
    "thread": thread,
    "lrUpdateRate": lrUpdateRate,
    "t": t,
    "verbose": verbose,
}
model = fastText.train_unsupervised(**train_kwargs)
# Report basic statistics of the trained model.
print("Training finished.")
print("Dimension: {}".format(model.get_dimension()))
print("Number of words: {}".format(len(model.get_words())))
# Output model to disk if needed
model.save_model(model_filename)
# Load saved model if needed (here it round-trips the model just saved above)
model = fastText.load_model(model_filename)
Training finished. Dimension: 100 Number of words: 6040
Get word vectors of corpus
# Vocabulary and the corresponding embedding matrix (row i matches words[i]).
vocab_list = model.get_words()
words = np.array(vocab_list)
word_vectors = np.array([model.get_word_vector(token) for token in vocab_list])
Similarity of word vectors. In a text embedding space, cosine similarity can be used to measure the similarity between words.
# Calculate N neighbors based on cosine similarity
def calc_n_cosine_neighbor(inX, X, N):
    """Return the indices of the N rows of X closest to inX by cosine distance.

    inX: query vector, shape (dim,) or (1, dim) — a single query only.
    X: matrix of candidate vectors, shape (n, dim).
    N: number of neighbor indices to return.

    Returns (indices of the N nearest rows of X, full distance array).
    """
    # Promote a 1-D query to shape (1, dim); the original list-wrap
    # ([inX]) loses the ndarray interface.
    inX = np.atleast_2d(inX)
    distances = sklearn.metrics.pairwise.pairwise_distances(
        X, inX, metric="cosine")
    # ravel() instead of a hard-coded reshape: tolerant of the (n, 1) shape.
    nearest = distances.ravel().argsort()
    return nearest[:N], distances
# calculate nearest neighbours based on cosine similarity
def nn(query, words=words, word_vectors=word_vectors, k=10):
    """Return the k words nearest to `query` in the embedding space.

    query: word to look up.
    words: numpy array of words, aligned row-for-row with word_vectors.
    word_vectors: matrix of word vectors.
    k: (optional, 10 by default) top k labels.
    """
    # `model` is only read here, so no `global` declaration is needed.
    v = model.get_word_vector(query)
    idx, _ = calc_n_cosine_neighbor(v, word_vectors, k)
    return words[idx]
# Query the 20 nearest neighbours of a word.
q = "lasvegasshooting"
# Reuse `q` rather than duplicating the literal (kept in sync automatically).
neighbours = nn(q, k=20)
print("Neighbours of word \"{}\":".format(q))
for word in neighbours:
    print(word)
Neighbours of word "lasvegasshooting": shooting lasvegas vegas” las vegas rt vega “shooting” shootin </s> shooting” shooti shootings ❤ 🙏🙏🙏 👍 cc 🙏🏾 😢💔 😓
Use the API get_sentence_vector
to get a vector representation of a sentence.
# Embed a whole tweet and list the vocabulary words closest to it.
example_tweet = "RT @TheLeadCNN: Remembering Keri Lynn Galvan, from Thousand Oaks, California. #LasVegasLost https://t.co/QuvXa6WvlE https://t.co/hDF2d3Owgn"
tweet_vector = model.get_sentence_vector(example_tweet)
print("Tweet vector in embedding space:", example_tweet, "", sep="\n")
print(tweet_vector, "", sep="\n")
print("Words similar this tweet")
neighbor_idx, _ = calc_n_cosine_neighbor(tweet_vector, word_vectors, 20)
print([words[i] for i in neighbor_idx])
Tweet vector in embedding space: RT @TheLeadCNN: Remembering Keri Lynn Galvan, from Thousand Oaks, California. #LasVegasLost https://t.co/QuvXa6WvlE https://t.co/hDF2d3Owgn [-0.02461572 0.04784836 -0.05343785 0.00153351 0.04367601 0.10020498 -0.01127366 -0.00975734 -0.01951972 0.07512145 0.03622668 -0.00580111 0.08758368 0.031007 -0.00507403 0.07074952 -0.05185707 -0.11242248 -0.03888126 -0.01926897 0.08175821 -0.01120457 -0.07555435 -0.04022888 0.00478477 -0.0012044 0.05348494 0.0350855 0.0982817 0.01342872 0.00545024 0.00250413 0.03077969 -0.0874893 -0.03390906 0.14996992 -0.01272367 -0.02368226 -0.01887075 -0.02408492 -0.03291685 -0.05095126 -0.04614896 0.10122891 0.07110424 -0.12804917 -0.05888803 -0.03085945 -0.01463612 0.11134949 -0.08774657 -0.01715528 -0.08862083 0.00346183 0.09192748 0.05510866 -0.04465136 -0.0433164 0.02116909 -0.06731256 -0.00497376 -0.02442945 0.04918417 -0.03386533 0.05390133 0.01210842 -0.03669443 0.00295777 -0.00802929 0.05568004 0.03773327 0.02532181 -0.00200854 -0.02188686 -0.09255282 0.01222703 0.02790884 -0.03890502 -0.08059786 -0.02257247 -0.00428823 -0.00145929 0.07885873 0.07522529 0.02859196 0.06673713 0.04707326 -0.04525249 0.04185518 0.05594757 0.03690847 0.0279574 0.05838018 0.034359 0.02512365 -0.06622847 0.00108856 -0.06983574 0.02984929 -0.01983222] Words similar this tweet ['➜', '😢💔', 'nw', 'pittsburgh', '🇨🇦', '31', '🙏🏾', 'novato', 'md', '38', '💜', '👍', 'quartz', 'pasadena', '🙏🙏🙏', 'jusswaggtv', '→', '😓', 'hrc', 'umc']