Master's Thesis: Social Media & Text Mining Using Telegram as an Example
Maximilian Bundscherer
Master's Program in Computer Science
Note: The sections "Initialize working environment" and "Load and prepare chats"
are already described in detail in the notebook Telegram.ipynb and are therefore skipped here.
See the description in the notebook Telegram.ipynb.
# Import default libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import re
import os
import sys
import demjson
import requests
import networkx as nx
import warnings
from pprint import pprint
from urllib.parse import urlparse
from collections import Counter
from pathlib import Path
from lxml.html import fromstring
# Hide DeprecationWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
!{sys.executable} -m pip install demoji
Requirement already satisfied: demoji in /opt/conda/lib/python3.8/site-packages (0.4.0)
Requirement already satisfied: colorama in /opt/conda/lib/python3.8/site-packages (from demoji) (0.4.4)
Requirement already satisfied: requests<3.0.0 in /opt/conda/lib/python3.8/site-packages (from demoji) (2.25.1)
Requirement already satisfied: chardet<5,>=3.0.2 in /opt/conda/lib/python3.8/site-packages (from requests<3.0.0->demoji) (4.0.0)
Requirement already satisfied: idna<3,>=2.5 in /opt/conda/lib/python3.8/site-packages (from requests<3.0.0->demoji) (2.10)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.8/site-packages (from requests<3.0.0->demoji) (1.26.4)
Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.8/site-packages (from requests<3.0.0->demoji) (2020.12.5)
import nltk
import demoji
# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
dictGloStopwatches = dict()
# Start timer (for reporting)
def gloStartStopwatch(key):
    print("[Stopwatch started >>" + str(key) + "<<]")
    dictGloStopwatches[key] = time.time()

# Stop timer (for reporting)
def gloStopStopwatch(key):
    endTime = time.time()
    startTime = dictGloStopwatches[key]
    print("[Stopwatch stopped >>" + str(key) + "<< (" + '{:5.3f}s'.format(endTime - startTime) + ")]")
nltk.download("stopwords")
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
True
demoji.download_codes()
Downloading emoji data ... ... OK (Got response in 0.40 seconds)
Writing emoji data to /home/jovyan/.demoji/codes.json ... ... OK
# Show all columns (pandas hides columns by default)
pd.set_option('display.max_columns', None)
# Set plot style
plt.style.use('ggplot')
font = {'size' : 13}
plt.rc('font', **font)
dir_var = "./work/notebooks/"
dir_var_output = dir_var + "output/"
dir_var_pandas_cache = dir_var + "cache/pandas/"
def gloReplaceGermanChars(inputText):
    inputText = inputText.replace("ö", "oe")
    inputText = inputText.replace("ü", "ue")
    inputText = inputText.replace("ä", "ae")
    inputText = inputText.replace("Ö", "Oe")
    inputText = inputText.replace("Ü", "Ue")
    inputText = inputText.replace("Ä", "Ae")
    inputText = inputText.replace("ß", "ss")
    return inputText
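The same transliteration could also be expressed with a translation table; a behavior-equivalent sketch (the name gloReplaceGermanCharsAlt is hypothetical and not used elsewhere in this notebook):

# Hypothetical alternative using str.translate with the same mapping as above
_GERMAN_CHAR_MAP = str.maketrans({
    "ö": "oe", "ü": "ue", "ä": "ae",
    "Ö": "Oe", "Ü": "Ue", "Ä": "Ae", "ß": "ss",
})

def gloReplaceGermanCharsAlt(inputText):
    return inputText.translate(_GERMAN_CHAR_MAP)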
# Remove unsafe characters: strip emojis, transliterate umlauts,
# and drop everything except alphanumerics and whitespace
def gloConvertToSafeString(text):
    text = demoji.replace(text, "")
    text = gloReplaceGermanChars(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text
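A quick illustration of the cleaning chain (the input string is made up; trailing whitespace may remain because \s is preserved):

# Emoji removed, umlauts transliterated, punctuation stripped
print(gloConvertToSafeString("Schöne Grüße! 🚀"))  # roughly: Schoene Gruesse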
# Generate a safe chat name (sanitized and truncated to 30 characters)
def gloConvertToSafeChatName(chatName):
    chatName = gloConvertToSafeString(chatName)
    return chatName[:30]
# Build a stop-word list from the German and English NLTK corpora,
# an additional word file, and a caller-supplied filter list
def gloGetStopWordsList(filterList):
    stopWordsList = []
    deWordsList = nltk.corpus.stopwords.words('german')
    enWordsList = nltk.corpus.stopwords.words('english')
    aStopwords = []
    with open(dir_var + "additionalStopwords.txt") as file:
        for line in file:
            line = line.strip()
            if(line != ""):
                aStopwords.append(line)
    for s in filterList:
        s = gloReplaceGermanChars(s)
        stopWordsList.append(s)
    for s in deWordsList:
        s = gloReplaceGermanChars(s)
        stopWordsList.append(s)
    for s in enWordsList:
        stopWordsList.append(s)
    for s in aStopwords:
        s = gloReplaceGermanChars(s)
        stopWordsList.append(s)
    return stopWordsList
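A usage sketch, assuming additionalStopwords.txt exists under dir_var; the filter terms are illustrative:

# Merge custom filter terms with the NLTK and file-based stop words
stopWords = gloGetStopWordsList(["Telegram", "Kanal"])
print(len(stopWords))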
See the description in the notebook Telegram.ipynb
See the description in Thesis.pdf
C_USE_CACHE_FILE = "final-run-24-03.pkl"
gloStartStopwatch("Cache einlesen")
dfAllDataMessages = pd.read_pickle(dir_var_pandas_cache + C_USE_CACHE_FILE)
gloStopStopwatch("Cache einlesen")
[Stopwatch started >>Cache einlesen<<]
[Stopwatch stopped >>Cache einlesen<< (28.593s)]
dfAllDataMessages = dfAllDataMessages[dfAllDataMessages['ftFilePath'].isin(
[
"DS-05-01-2021/ChatExport_2021-01-05-hildmann",
"DS-05-01-2021/ChatExport_2021-01-05-janich",
"DS-05-01-2021/ChatExport_2021-01-05-xavier",
"DS-05-01-2021/ChatExport_2021-01-05-evaherman"
]
)]
dfAllDataMessages = dfAllDataMessages[dfAllDataMessages.ftQrIsValidText == True]
dfAllDataMessages = dfAllDataMessages[dfAllDataMessages.ftTdCleanText != ""]
dfAllDataMessages = dfAllDataMessages[dfAllDataMessages.ftTdTextLength > 5]
dfAllDataMessages["from"] = dfAllDataMessages["from"].apply(gloConvertToSafeChatName)
dfAllDataMessages.head(3)
[Output of dfAllDataMessages.head(3): the first three rows across 80+ columns (id, type, date, actor, title, text, from, from_id, media fields, ftFilePath, ftChatType, the ftTd* text-derivation columns, the ftQr* flag columns, ftTrNerRoberta, ftTrNerBert, ftTrSenBert, ftSenTb, and further contact/location fields). All three sample rows are messages by "ATTILA HILDMANN " from DS-05-01-2021/ChatExport_2021-01-05-hildmann, e.g. "Dann schreiben wir mal Geschichte!" and "SCHAUT ES EUCH AN! 🆘".]
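For readability, a handful of key columns can be inspected instead of the full column set; a small sketch (the column choice is illustrative):

dfAllDataMessages[["date", "from", "ftTdCleanText", "ftTdTextLength"]].head(3)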
test = CountVectorizer(ngram_range=(1, 3))
d1 = test.fit_transform(["wordA wordB wordC wordA wordB wordC"])
d1.toarray()
array([[2, 2, 2, 2, 2, 1, 2, 1, 1]])
test.get_feature_names()
['worda', 'worda wordb', 'worda wordb wordc', 'wordb', 'wordb wordc', 'wordb wordc worda', 'wordc', 'wordc worda', 'wordc worda wordb']
d2 = test.transform(["wordA wordB"])
d2.toarray()
array([[1, 1, 0, 1, 0, 0, 0, 0, 0]])
test.get_params()
{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': numpy.int64, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 3), 'preprocessor': None, 'stop_words': None, 'strip_accents': None, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'vocabulary': None}
test = TfidfTransformer()
test.fit_transform(d1).toarray()
array([[0.38490018, 0.38490018, 0.38490018, 0.38490018, 0.38490018, 0.19245009, 0.38490018, 0.19245009, 0.19245009]])
test.transform(d2).toarray()
array([[0.57735027, 0.57735027, 0. , 0.57735027, 0. , 0. , 0. , 0. , 0. ]])
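The values for the test document follow from the transformer's default L2 row normalization: d2 matches three vocabulary entries ('worda', 'wordb', 'worda wordb') with equal weight, so each component becomes 1/sqrt(3). A quick check:

# Each of the three matched n-grams gets weight 1/sqrt(3) after L2 normalization
print(1 / np.sqrt(3))  # 0.5773502691896258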
# Work on a copy to avoid pandas SettingWithCopyWarning on the filtered frame
targetDf = dfAllDataMessages.copy()
targetDf['clText'] = targetDf['ftTdCleanText']
targetDf['clFrom'] = targetDf['from']
targetDf['clFromId'] = targetDf['from'].factorize()[0]
_ = targetDf['clFrom'].value_counts().plot.bar()
targetDf['clText'][:5]
1    Dann schreiben wir mal Geschichte!
4    Das Video sollte jeder von euch schauen und ve...
5    SCHAUT ES EUCH AN!
6    Billyboy spricht schon heute, APRIL 2020, von ...
7    Beachtet sein Grinsen am Anfang und Ende des V...
Name: clText, dtype: object
targetDf['clFromId'].value_counts()
1    30740
0    22885
2    14067
3     9103
Name: clFromId, dtype: int64
# Draw at most k messages per author; k = 9103 is the size of
# the smallest class (see the value counts above)
def getSamples(df, k=9103):
    if len(df) < k:
        return df
    return df.sample(k)
targetDf = targetDf.groupby('clFromId').apply(getSamples).reset_index(drop=True)
_ = targetDf['clFrom'].value_counts().plot.bar()
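With pandas 1.1 or newer, the sampling step could equivalently be written as a one-liner; note that, unlike getSamples, this sketch would raise if any group had fewer than n rows (here every group now has exactly 9103, so it merely reshuffles):

# Alternative balanced sampling (pandas >= 1.1 assumed)
targetDfAlt = targetDf.groupby('clFromId').sample(n=9103).reset_index(drop=True)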
dfFromId = targetDf[['clFrom', 'clFromId']].drop_duplicates().sort_values('clFromId')
dictFrom_to_id = dict(dfFromId.values)
dictId_to_from = dict(dfFromId[['clFromId', 'clFrom']].values)
dictId_to_from
{0: 'ATTILA HILDMANN ', 1: 'Oliver Janich oeffentlich', 2: 'Eva Herman Offiziell', 3: 'Xavier Naidoo inoffiziell'}
X_train, X_test, y_train, y_test = train_test_split(
    targetDf['clText'], targetDf['clFrom'],
    random_state=42, test_size=0.20, stratify=targetDf['clFrom'])
print("Train size:\t" + str(len(X_train.index)))
print("Test size:\t" + str(len(X_test.index)))
Train size:	29129
Test size:	7283
_ = y_train.value_counts().plot.bar()
_ = y_test.value_counts().plot.bar()
gloStartStopwatch("Transform messages")
count_vect = CountVectorizer(ngram_range=(1, 3))
tfidf_transformer = TfidfTransformer()
# Transform and fit train
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Transform test
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
gloStopStopwatch("Transform messages")
[Stopwatch started >>Transform messages<<]
[Stopwatch stopped >>Transform messages<< (14.245s)]
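The two transformation steps can also be bundled into a single sklearn Pipeline; a behavior-equivalent sketch (the variable names are illustrative):

from sklearn.pipeline import Pipeline

# Count n-grams, then apply TF-IDF weighting, as one estimator
vectorizerPipe = Pipeline([
    ("counts", CountVectorizer(ngram_range=(1, 3))),
    ("tfidf", TfidfTransformer()),
])
X_train_tfidf_alt = vectorizerPipe.fit_transform(X_train)
X_test_tfidf_alt = vectorizerPipe.transform(X_test)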
# Train the given model, probe it with a few example phrases,
# report train/test accuracy, and plot the confusion matrix
def trainAndEvalModel(model, outputFilename):
    gloStartStopwatch("Train now model " + str(model))
    model.fit(X_train_tfidf, y_train)
    gloStopStopwatch("Train now model " + str(model))
    # Predict the author of some hand-picked phrases
    searchStrings = ["Folge Attila Hildmann", "Liebe Eva", "Premium Kanal", "OneLove"]
    for sS in searchStrings:
        sS = str(sS)
        print()
        print("Who has written '" + sS + "'?")
        t = tfidf_transformer.transform(count_vect.transform([sS]))
        r = model.predict(t)
        print(str(r))
    y_pred_train = model.predict(X_train_tfidf)
    y_pred_test = model.predict(X_test_tfidf)
    print()
    print("Train Score:\t" + str(accuracy_score(y_true=y_train, y_pred=y_pred_train)))
    print("Test Score:\t" + str(accuracy_score(y_true=y_test, y_pred=y_pred_test)))
    print()
    print("Confusion Matrix on test:")
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_pred_test)
    fig, ax = plt.subplots(figsize=(9, 9))
    plt.title(str(model).replace("()", ""))
    sns.heatmap(conf_mat, annot=True, fmt='d',
                xticklabels=dfFromId.clFrom.values, yticklabels=dfFromId.clFrom.values)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    if(outputFilename != ""):
        plt.savefig(dir_var_output + outputFilename)
    plt.show()
trainAndEvalModel(LinearSVC(), "class-linearsvc.svg")
[Stopwatch started >>Train now model LinearSVC()<<]
[Stopwatch stopped >>Train now model LinearSVC()<< (5.147s)]

Who has written 'Folge Attila Hildmann'?
['ATTILA HILDMANN ']

Who has written 'Liebe Eva'?
['Eva Herman Offiziell']

Who has written 'Premium Kanal'?
['Oliver Janich oeffentlich']

Who has written 'OneLove'?
['Xavier Naidoo inoffiziell']

Train Score:	0.9754883449483333
Test Score:	0.690924069751476

Confusion Matrix on test:
trainAndEvalModel(MultinomialNB(), "class-multinomialnb.svg")
[Stopwatch started >>Train now model MultinomialNB()<<]
[Stopwatch stopped >>Train now model MultinomialNB()<< (0.361s)]

Who has written 'Folge Attila Hildmann'?
['ATTILA HILDMANN ']

Who has written 'Liebe Eva'?
['Eva Herman Offiziell']

Who has written 'Premium Kanal'?
['Oliver Janich oeffentlich']

Who has written 'OneLove'?
['Xavier Naidoo inoffiziell']

Train Score:	0.927529266366851
Test Score:	0.6321570781271454

Confusion Matrix on test:
trainAndEvalModel(LogisticRegression(), "class-logisticregression.svg")
[Stopwatch started >>Train now model LogisticRegression()<<]
/opt/conda/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Stopwatch stopped >>Train now model LogisticRegression()<< (151.582s)]

Who has written 'Folge Attila Hildmann'?
['ATTILA HILDMANN ']

Who has written 'Liebe Eva'?
['Eva Herman Offiziell']

Who has written 'Premium Kanal'?
['Oliver Janich oeffentlich']

Who has written 'OneLove'?
['Xavier Naidoo inoffiziell']

Train Score:	0.9421881973291222
Test Score:	0.6780173005629548

Confusion Matrix on test:
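The ConvergenceWarning above suggests raising the iteration limit; a possible follow-up, not executed here (max_iter=1000 is an illustrative, untuned value):

# Hypothetical re-run with a higher iteration budget
trainAndEvalModel(LogisticRegression(max_iter=1000), "class-logisticregression-maxiter.svg")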
trainAndEvalModel(MLPClassifier(), "class-mlp.svg")
[Stopwatch started >>Train now model MLPClassifier()<<]
[Stopwatch stopped >>Train now model MLPClassifier()<< (50427.917s)]

Who has written 'Folge Attila Hildmann'?
['ATTILA HILDMANN ']

Who has written 'Liebe Eva'?
['Eva Herman Offiziell']

Who has written 'Premium Kanal'?
['Oliver Janich oeffentlich']

Who has written 'OneLove'?
['Xavier Naidoo inoffiziell']

Train Score:	0.97699886710838
Test Score:	0.6877660304819443

Confusion Matrix on test:
trainAndEvalModel(DecisionTreeClassifier(), "class-decisiontree.svg")
[Stopwatch started >>Train now model DecisionTreeClassifier()<<]
[Stopwatch stopped >>Train now model DecisionTreeClassifier()<< (345.281s)]

Who has written 'Folge Attila Hildmann'?
['ATTILA HILDMANN ']

Who has written 'Liebe Eva'?
['Eva Herman Offiziell']

Who has written 'Premium Kanal'?
['ATTILA HILDMANN ']

Who has written 'OneLove'?
['ATTILA HILDMANN ']

Train Score:	0.9771361873047478
Test Score:	0.546340793628999

Confusion Matrix on test:
trainAndEvalModel(RandomForestClassifier(), "class-randomforest.svg")
[Stopwatch started >>Train now model RandomForestClassifier()<<]
[Stopwatch stopped >>Train now model RandomForestClassifier()<< (7156.269s)]

Who has written 'Folge Attila Hildmann'?
['ATTILA HILDMANN ']

Who has written 'Liebe Eva'?
['Eva Herman Offiziell']

Who has written 'Premium Kanal'?
['ATTILA HILDMANN ']

Who has written 'OneLove'?
['Xavier Naidoo inoffiziell']

Train Score:	0.9771361873047478
Test Score:	0.6328436084031306

Confusion Matrix on test:
trainAndEvalModel(DummyClassifier(strategy="uniform"), "class-dummy.svg")
[Stopwatch started >>Train now model DummyClassifier(strategy='uniform')<<]
[Stopwatch stopped >>Train now model DummyClassifier(strategy='uniform')<< (0.025s)]

Who has written 'Folge Attila Hildmann'?
['Xavier Naidoo inoffiziell']

Who has written 'Liebe Eva'?
['Eva Herman Offiziell']

Who has written 'Premium Kanal'?
['ATTILA HILDMANN ']

Who has written 'OneLove'?
['Eva Herman Offiziell']

Train Score:	0.2508839987641182
Test Score:	0.25003432651379925

Confusion Matrix on test:
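As a sanity check, the dummy scores match expectation: with four balanced classes, uniform random guessing yields an accuracy of about 1/4.

# Expected accuracy of uniform guessing over the four balanced classes
print(1 / len(dictId_to_from))  # 0.25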