"""
Set flag to true, if you work in visual studio code (connected to custom docker)
Set flag to false, if you work in browser (jupyter notebook ui from custom docker)
"""
C_LOCAL = True
"""
Set flag to true, if you want process no long term running tasks and take spot check from data
"""
C_SHORT_RUN = False
C_NUMBER_SAMPLES = 1000 # only used if C_SHORT_RUN is True and no cache is used
"""
Resolve new urls?
Set flag to false, if you dont want to resolve new urls
"""
C_RESOLVE_NEW_URLS = False
"""
Load DataSets (global)
Ava: ["dataSet0", "dataSet1", "dataSet1a", "dataSet2"]
Htdocs: ["dataSet0", "dataSet1a", "dataSet2"]
Req: ["dataSet0]
"""
C_LOAD_DATASET = ["dataSet0", "dataSet1", "dataSet1a", "dataSet2"]
"""
Load Piplelines? (HuggingFace transformers)
"""
C_LOAD_PIPELINES = True
C_PIPELINE_DATASET = ["dataSet0"]
"""
Time Plot Freq
e.g. 1M = 1 Month
e.g. 1W = 1 Week
e.g. 1D = 1 Day
"""
C_TIME_PLOT_FREQ = "1D"
"""
Cache?
Set C_USE_CACHE_FILE to "" if you do not want to use a cache!
Set C_NEW_CACHE_FILE to "" if you do not want to create a cache!
Please set only one of the two values!
Please create a new cache if you change the parameters above
# e.g.
# - local-run-temp.pkl (Short run, with hf, with htdocs-datasets) - deprecated
# - local-run-ht-temp.pkl (Long run, without hf, with htdocs-datasets) - deprecated
# - long-run-server-21-01.pkl (Long run, with hf, with htdocs-datasets) - deprecated
#
# - long-run-server-28-01.pkl (Long run, with hf, with htdocs-datasets, updated with sen-pipe-2)
# - long-run-server-07-02.pkl (Long run, with hf, with all datasets, updated with sen-pipe-2)
# - local-run-28-01.pkl (Short run, with hf, with htdocs-datasets, updated with sen-pipe-2)
# - test.pkl (Test file)
"""
#e.g. data-cache.pkl
C_USE_CACHE_FILE = "long-run-server-07-02.pkl"
C_NEW_CACHE_FILE = ""
# Import default libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
# Import url libs
from urllib.parse import urlparse
from collections import Counter
import requests
# File Handler Lib
from pathlib import Path
# Set graph widget (used by jupyter notebook)
#%matplotlib notebook #interactive graphs
%matplotlib inline
# Install and import Graph Lib
import networkx as nx
#! pip install pydot
# Install and import JSON Lib
#! pip install demjson
import demjson
#import json
# Install and import Natural Language Toolkit
#! pip install nltk
import nltk
# Ngrams
import re
from nltk.util import ngrams
# Stopwords
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Gensim and pyLDAvis
#! pip install gensim
#! pip install pyLDAvis
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from pprint import pprint
import pyLDAvis.gensim
import pickle
import pyLDAvis
import os
# TODO Set to ignore?
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("once")
# SET TOKENIZERS_PARALLELISM
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Install and import demoji
# Does not exist on conda
import sys
!{sys.executable} -m pip install demoji
import demoji
demoji.download_codes()
# Install and import WordCloud
#! pip install wordcloud
from wordcloud import WordCloud
#!{sys.executable} -m pip install lxml
from lxml.html import fromstring
# Bert and co.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# Install Hanover Tagger
# Does not exist on conda
!{sys.executable} -m pip install HanTa
from HanTa import HanoverTagger as ht
# TODO: Install via conda
!{sys.executable} -m pip install textblob-de
from textblob_de import TextBlobDE as TextBlob
# Show all columns (pandas hides columns by default)
pd.set_option('display.max_columns', None)
# Set plot style
# TODO: Test different style
plt.style.use('ggplot')
# Set env vars
if(C_LOCAL == True):
dir_var = "./work/notebooks/"
else:
dir_var = "./"
dir_var_output = dir_var + "output/"
dir_var_cache= dir_var + "cache/"
dir_var_pandas_cache = dir_var + "cache/pandas/"
# Debug output
! echo "- Workdir -"
! ls -al $dir_var
! echo
! echo "- Outputdir -"
! ls -al $dir_var_output
! echo
! echo "- Cachedir -"
! ls -al $dir_var_cache
! echo
! echo "- Pandas -"
! ls -al $dir_var_pandas_cache
dictGloStopwatches = dict()
# Start timer (for reporting)
def gloStartStopwatch(key):
print("[Stopwatch started >>" + str(key) + "<<]")
dictGloStopwatches[key] = time.time()
# Stop timer (for reporting)
def gloStopStopwatch(key):
endTime = time.time()
startTime = dictGloStopwatches[key]
print("[Stopwatch stopped >>" + str(key) + "<< (" + '{:5.3f}s'.format(endTime-startTime) + ")]")
"""
Check if text is json formatted
param text InputText
param singleMode Boolean (set to true, if text is part of a message)
"""
def gloCheckIsTextJsonFormatted(text, singleMode):
textString = str(text)
if (singleMode == False and textString.startswith("[") == True and textString.endswith("]") == True):
return True
elif (singleMode == True and textString.startswith("{") == True and textString.endswith("}") == True):
return True
else:
return False
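# Quick sanity checks for gloCheckIsTextJsonFormatted (illustrative sketch): a full message
# text is a JSON-like list, a single entity inside a message is a JSON-like object.
assert gloCheckIsTextJsonFormatted('[{"type": "bold", "text": "Hi"}]', singleMode = False) == True
assert gloCheckIsTextJsonFormatted('{"type": "bold", "text": "Hi"}', singleMode = True) == True
assert gloCheckIsTextJsonFormatted("plain text", singleMode = False) == False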
def gloReplaceGermanChars(inputText):
inputText = inputText.replace("ö", "oe")
inputText = inputText.replace("ü", "ue")
inputText = inputText.replace("ä", "ae")
inputText = inputText.replace("Ö", "Oe")
inputText = inputText.replace("Ü", "Ue")
inputText = inputText.replace("Ä", "Ae")
inputText = inputText.replace("ß", "ss")
return inputText
# Rm unsafe chars
def gloConvertToSafeString(text):
text = demoji.replace(text, "")
text = gloReplaceGermanChars(text)
text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
#text = text.encode('ascii', 'ignore')
#text = text.decode('ascii')
return text
# Generate unique chat name
def gloConvertToSafeChatName(chatName):
chatName = gloConvertToSafeString(chatName)
return chatName[:30]
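# Illustrative example (sketch): gloConvertToSafeString strips emojis, transliterates German
# umlauts and removes the remaining non-alphanumeric characters; gloConvertToSafeChatName
# additionally truncates to 30 characters.
print(gloConvertToSafeString("Grüße aus München! 😀"))  # -> roughly "Gruesse aus Muenchen"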
def gloGetStopWordsList(filterList):
stopwWorldsList = []
deWordsList = nltk.corpus.stopwords.words('german')
enWordsList = nltk.corpus.stopwords.words('english')
aStopwords = []
with open(dir_var + "additionalStopwords.txt") as file:
for line in file:
line = line.strip()
if(line != ""):
aStopwords.append(line)
for s in filterList:
s = gloReplaceGermanChars(s)
stopwWorldsList.append(s)
for s in deWordsList:
s = gloReplaceGermanChars(s)
stopwWorldsList.append(s)
for s in enWordsList:
stopwWorldsList.append(s)
for s in aStopwords:
s = gloReplaceGermanChars(s)
stopwWorldsList.append(s)
return stopwWorldsList
# Dict File Cache
dictFileCache = {}
# Write dict to file (CSV)
def gloWriteDictToFile(filename, targetDict):
dictFileCache.clear() # Clear the in-memory cache in place (re-assigning would only create a local variable)
d = pd.DataFrame.from_dict(targetDict, orient="index")
d.to_csv(dir_var_cache + filename, header=False)
# Read dict from file (CSV)
def gloReadDictFromFile(filename):
# Cache?
if(filename in dictFileCache):
return dictFileCache[filename]
d = pd.read_csv(dir_var_cache + filename, header=None, index_col=0, squeeze=True)
retDict = d.to_dict()
dictFileCache[filename] = retDict #Add to cache
return retDict
# Init csv file if not exists
def gloInitFileDict(filename):
f = Path(dir_var_cache + filename)
if(f.exists() == False):
print("Init cache file >>" + filename + "<<")
f.touch()
gloWriteDictToFile(filename, {"initKey": "initValue"})
else:
print("Cache already exists >>" + filename + "<<")
# Check if is already cached
def gloCheckIsAlreadyCached(filename, targetKey):
targetDict = gloReadDictFromFile(filename)
if(targetKey in targetDict.keys()):
return True
else:
return False
# Add key to cache
def gloAddToCache(filename, targetKey, targetValue):
targetDict = gloReadDictFromFile(filename)
targetDict[targetKey] = targetValue
gloWriteDictToFile(filename, targetDict)
# Get key from cache
def gloGetCached(filename, targetKey):
targetDict = gloReadDictFromFile(filename)
return targetDict[targetKey]
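# Illustrative usage of the CSV-backed cache helpers (sketch; "demo-cache.csv" is a
# hypothetical file name and not used anywhere else in this notebook):
# gloInitFileDict("demo-cache.csv")
# gloAddToCache("demo-cache.csv", "someKey", "someValue")
# gloCheckIsAlreadyCached("demo-cache.csv", "someKey")  # -> True
# gloGetCached("demo-cache.csv", "someKey")             # -> "someValue"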
# param rowID e.g. procTDSafeText
def gloGenerateTextFromChat(df, rowID):
df = df.copy()
df = df[df.procEvalIsValidText == True]
# Iterate over text (global text from group)
textList = []
for index, row in df.iterrows():
textList.append(" " + row[rowID])
textString = ''.join(textList)
return textString
gloInitFileDict("resolved-urls.csv")
gloInitFileDict("resolved-youtube.csv")
dictPipelines = {}
def loadPipelines():
if(C_LOAD_PIPELINES == False):
print("Skip loading pipelines")
return list()
gloStartStopwatch("Load Pipelines")
gloStartStopwatch("Load ner-xlm-Roberta")
dictPipelines["ner-xlm-roberta"] = pipeline(
'ner',
model='xlm-roberta-large-finetuned-conll03-german',
tokenizer='xlm-roberta-large-finetuned-conll03-german'
)
gloStopStopwatch("Load ner-xlm-Roberta")
gloStartStopwatch("Load ner-Bert")
dictPipelines["ner-bert"] = pipeline(
'ner',
model='fhswf/bert_de_ner',
tokenizer='fhswf/bert_de_ner'
)
gloStopStopwatch("Load ner-Bert")
gloStartStopwatch("Load sen-Bert")
dictPipelines["sen-bert"] = pipeline(
'sentiment-analysis',
model='nlptown/bert-base-multilingual-uncased-sentiment',
tokenizer='nlptown/bert-base-multilingual-uncased-sentiment'
)
gloStopStopwatch("Load sen-Bert")
gloStartStopwatch("Load text-gen-gpt2")
dictPipelines["text-gen-gpt2"] = pipeline(
'text-generation',
model='dbmdz/german-gpt2',
tokenizer='dbmdz/german-gpt2'
)
gloStopStopwatch("Load text-gen-gpt2")
gloStartStopwatch("Load text-gen-gpt2-faust")
dictPipelines["text-gen-gpt2-faust"] = pipeline(
'text-generation',
model='dbmdz/german-gpt2-faust',
tokenizer='dbmdz/german-gpt2-faust'
)
gloStopStopwatch("Load text-gen-gpt2-faust")
gloStopStopwatch("Load Pipelines")
return dictPipelines.keys()
pipelineKeys = loadPipelines()
print()
print(str(pipelineKeys))
gloStartStopwatch("Global notebook")
# Read jobs from file
dfInputFiles = pd.read_csv(dir_var + "inputFiles.csv", sep=";")
dfFilter = pd.DataFrame()
for dS in C_LOAD_DATASET:
dfFilter = dfFilter.append(dfInputFiles[dfInputFiles.inputDesc == dS])
dfInputFiles = dfFilter
dfInputFiles
# Convert to DataFrame Meta (Chat Meta)
def convertToDataFrameMeta(filePath):
dF = pd.read_json(dir_var + "data/" + filePath + "/result.json", encoding='utf-8')
return dF
# Convert to DataFrame Messages (Chat Messages)
def convertToDataFrameMessages(filePath):
dF = pd.json_normalize(dictMeta[filePath].messages)
return dF
# https://stackoverflow.com/questions/6718633/python-regular-expression-again-match-url
def getUrlRegex():
return "((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)"
def getHashtagRegex():
return "#(\w+)"
def hashTagExtractHashTags(inputText):
inputText = str(inputText)
inputText = re.sub('\n', ' ', inputText) # Replace \n
inputText = demoji.replace(inputText, " ") # Rm emoji
inputText = gloReplaceGermanChars(inputText) # Replace german chars
return re.findall(getHashtagRegex(), inputText)
def urlExtractUrls(inputText):
return re.findall(getUrlRegex(), str(inputText))
def urlRemoveUrls(inputText):
return re.sub(getUrlRegex(), " ", str(inputText))
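# Illustrative example (sketch): extract hashtags and URLs from a throwaway sample text
# and strip the URLs from it.
sampleText = "Schaut mal hier #Beispiel https://t.me/somechannel"
print(hashTagExtractHashTags(sampleText))  # e.g. ['Beispiel']
print(urlExtractUrls(sampleText))          # e.g. ['https://t.me/somechannel']
print(urlRemoveUrls(sampleText))           # URL replaced by a space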
"""
Get params from extractedTextData
See cell below (key)
"""
def getExtractedTextDataParam(key, extractedTextData):
a,b,c,d,e,f,g = extractedTextData
if(key == 0):
return urlRemoveUrls(a)
elif(key == 1):
before = b
extracted = urlExtractUrls(a)
after = before.copy() # copy, so the URL list inside extractedTextData is not mutated when extending
after.extend(extracted)
return after
elif(key == 2):
# TODO: Refactor dont take it from extractedTextData
return hashTagExtractHashTags(a)
else:
switcher = {
3: d,
4: e,
5: f,
6: g
}
return switcher.get(key)
"""
Extract text data
See cell above (key mapping)
param procIsJsonFormatted Boolean (is text json formatted?)
param text String (text from message)
return
a procText Plain Text
b processedURLs Array of URLs in Text
c processedHashtags Array of Hashtags in Text #TODO: RM
d processedBolds Array of Bold Items in Text
e processedItalics Array of Italic Items in Text
f processedUnderlines Array of Underlined Items in Text
g processedEmails Array of E-Mails in Text
"""
def extractTextData(procIsJsonFormatted, text):
# 3 returns in this function...
processedURLs = list()
processedHashtags = list() # TODO: RM
processedBolds = list()
processedItalics = list()
processedUnderlines = list()
processedEmails = list()
if(procIsJsonFormatted != True):
#Is not JSON formatted (return normal text)
return (text, processedURLs, processedHashtags, processedBolds, processedItalics, processedUnderlines, processedEmails)
else:
#It is JSON formatted (try to parse)
try:
returnList = []
jsonList = demjson.decode(str(text), encoding='utf8')
# Do for each item in list
for lItem in jsonList:
messageString = str(lItem)
isJsonSubString = gloCheckIsTextJsonFormatted(messageString, singleMode = True)
if(isJsonSubString):
# Is Json Sub String
subJsonString = demjson.decode(str(messageString), encoding='utf8')
subJsonType = subJsonString["type"]
if(subJsonType == "bold"):
#text included
processedBolds.append(subJsonString["text"])
returnList.append(subJsonString["text"])
elif(subJsonType == "italic"):
#text included
processedItalics.append(subJsonString["text"])
returnList.append(subJsonString["text"])
elif(subJsonType == "underline"):
#text included
processedUnderlines.append(subJsonString["text"])
returnList.append(subJsonString["text"])
elif(subJsonType == "email"):
#text included
processedEmails.append(subJsonString["text"])
elif(subJsonType == "text_link"):
#text and href included
processedURLs.append(subJsonString["href"])
#returnList.append(subJsonString["text"])
elif(subJsonType == "link"):
#text included
processedURLs.append(subJsonString["text"])
elif(subJsonType == "hashtag"):
#text included
#processedHashtags.append(subJsonString["text"]) # TODO: Refactor: Dont add hashtags here!
returnList.append(subJsonString["text"])
elif(subJsonType == "mention"):
#text included
returnList.append(subJsonString["text"])
elif(subJsonType == "mention_name"):
#text and user_id included
returnList.append(subJsonString["text"])
elif(subJsonType == "bot_command"):
#text included
returnList = returnList
elif(subJsonType == "code"):
#text included
returnList = returnList
elif(subJsonType == "phone"):
#text included
returnList = returnList
elif(subJsonType == "strikethrough"):
#text included
returnList.append(subJsonString["text"])
elif(subJsonType == "pre"):
#text and language included
returnList.append(subJsonString["text"])
elif(subJsonType == "bank_card"):
#text included
returnList = returnList
else:
print("- Error: Unkown json type >>" + str(subJsonType) + "<< (ignore) >>" + str(text) + "<<")
else:
# Is no json formatted sub string (append text)
returnList.append(messageString)
return (''.join(returnList), processedURLs, processedHashtags, processedBolds, processedItalics, processedUnderlines, processedEmails)
except:
# Parser error (set inputText to returnText)
print("- Warn: Json parser error (set inputText to returnText) >>" + str(text) + "<<")
return (text, processedURLs, processedHashtags, processedBolds, processedItalics, processedUnderlines, processedEmails)
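# Illustrative example (sketch): a message text that mixes plain strings and entity objects,
# in the same shape as the exported Telegram "text" field; this should yield the reconstructed
# plain text plus the link in the URL list.
demoText = "['Siehe ', {'type': 'bold', 'text': 'hier'}, {'type': 'link', 'text': 'https://example.org'}]"
demoResult = extractTextData(procIsJsonFormatted = True, text = demoText)
print(demoResult[0])  # plain text, e.g. "Siehe hier"
print(demoResult[1])  # URLs, e.g. ['https://example.org']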
# returns dict (empty dict if disabled, dict with empty lists on error)
listUnknownTypes = list()
def processNerPipeline(inputText, pipelineKey, configMinScore):
if(pipelineKey in pipelineKeys):
listPer = list()
listMisc = list()
listOrg = list()
listLoc = list()
try:
data = dictPipelines[pipelineKey](inputText)
for d in data:
jsonData = demjson.decode(str(d), encoding='utf8')
if(jsonData["score"] >= configMinScore):
# Is Valid
if (jsonData["entity"] == "I-PER" or jsonData["entity"] == "B-PER"):
listPer.append(jsonData["word"])
elif (jsonData["entity"] == "I-MISC" or jsonData["entity"] == "B-MISC"):
listMisc.append(jsonData["word"])
elif (jsonData["entity"] == "I-ORG" or jsonData["entity"] == "B-ORG"):
listOrg.append(jsonData["word"])
elif (jsonData["entity"] == "I-LOC" or jsonData["entity"] == "B-LOC"):
listLoc.append(jsonData["word"])
else:
uT = str(jsonData["entity"])
if(uT not in listUnknownTypes):
print("- Warn - Got unknown type >>" + uT + "<<")
listUnknownTypes.append(uT)
except:
pass
#print("Error in processNerPipeline (ignore) >>" + str(inputText) + "<<")
return {
"per": listPer,
"misc": listMisc,
"org": listOrg,
"loc": listLoc
}
else:
return dict()
# returns
# 1 - 5 (1 = bad / 5 = good)
# -1 disabled or error
def processSenPipeline(inputText, pipelineKey, configMinScore):
if(pipelineKey in pipelineKeys):
sen = -1
try:
data = dictPipelines[pipelineKey](inputText)
for d in data:
jsonData = demjson.decode(str(d), encoding='utf-8')
if(jsonData["score"]) > configMinScore:
# Is Valid
labelData = str(jsonData["label"])
if("stars" in labelData):
labelData = re.sub(" stars", "", labelData)
else:
labelData = re.sub(" star", "", labelData)
sen = int(labelData)
except:
pass
#print("Error in processSenPipeline (ignore) >>" + str(inputText) + "<<")
return sen
else:
return -1
# returns
# dict (polarity, subjectivity) or none (fail or disabled)
def processSentimentAnalysisPython(inputText):
try:
t = TextBlob(inputText)
return {
"polarity": t.polarity,
"subjectivity": t.subjectivity
}
except:
return None
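# Illustrative example (sketch): run the sentiment helpers on a short throwaway sentence.
# processSenPipeline returns a star rating of 1-5 (or -1 if the pipeline is not loaded),
# processSentimentAnalysisPython returns TextBlobDE polarity/subjectivity (or None on failure).
demoSentence = "Das ist ein wirklich schoener Tag."
print(processSenPipeline(demoSentence, "sen-bert", configMinScore = 0))
print(processSentimentAnalysisPython(demoSentence))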
def evalIsValidText(procTDTextLength):
if(procTDTextLength > 0):
return True
else:
return False
def evalContainsSomething(att):
if(str(att) == "nan"):
return False
else:
return True
def evalNonEmptyList(att):
if(str(att) == "[]"):
return False
else:
return True
# TODO: https://github.com/wartaal/HanTa/blob/master/Demo.ipynb
hanoverTagger = ht.HanoverTagger('morphmodel_ger.pgz')
def getTokenFromText(inputText):
return nltk.word_tokenize(inputText, language="german")
def getLemmaAndTaggingFromText(inputText):
return hanoverTagger.tag_sent(getTokenFromText(inputText))
# param outputFilename, if "" - no output
def plotFreqNouns(inputText, outputFilename, mostCommon, flagRemoveStopwords):
# https://textmining.wp.hs-hannover.de/Preprocessing.html
nouns = []
sentences_tok = [nltk.tokenize.word_tokenize(sent) for sent in getTokenFromText(inputText)]
for sent in sentences_tok:
tags = hanoverTagger.tag_sent(sent)
nouns_from_sent = [lemma for (word,lemma,pos) in tags if pos == "NN" or pos == "NE"]
nouns.extend(nouns_from_sent)
pNouns = list()
if(flagRemoveStopwords):
print("- Warn: remove stopWords")
stopWords = gloGetStopWordsList(filterList = list())
for n in nouns:
if n.lower() not in stopWords:
pNouns.append(n)
else:
pNouns = nouns
# Thank you https://stackoverflow.com/questions/52908305/how-to-save-a-nltk-freqdist-plot
fig = plt.figure(figsize = (16,9))
plt.gcf().subplots_adjust(bottom=0.15)
fdist = nltk.FreqDist(pNouns)
fdist.plot(mostCommon,cumulative=False)
_ = plt.show()
if(outputFilename != ""):
fig.savefig(dir_var_output + outputFilename, bbox_inches="tight")
dictMeta = {}
# Add Key = filePath / Value = DataFrame (Chat Meta)
for fP in dfInputFiles.inputPath:
dictMeta[fP] = convertToDataFrameMeta(fP)
# return dictMessages and dfAllDataMessages
def initProcessData():
dictMessages = {}
dfAllDataMessages = pd.DataFrame()
gloStartStopwatch("Extract Text Data")
# Add Key = filePath / Value = DataFrame (Chat Message)
for fP in dfInputFiles.inputPath:
gloStartStopwatch("TD-Extract " + fP)
dfMessages = convertToDataFrameMessages(fP)
tmpMeta = convertToDataFrameMeta(fP)
# Short run
if(C_SHORT_RUN):
print("Short run active!")
dfMessages = dfMessages.head(C_NUMBER_SAMPLES)
# Get chat attributes and check if message is json formatted
dfMessages["procChatFilePath"] = fP
dfMessages["procChatType"] = tmpMeta.type.iloc[0]
dfMessages["procIsJsonFormatted"] = dfMessages["text"].apply(gloCheckIsTextJsonFormatted, singleMode = False)
# Extract Text Data
dfMessages["tmpExtractedTD"] = dfMessages.apply(lambda x: extractTextData(x.procIsJsonFormatted, x.text), axis=1)
# Extract Text Data (params)
dfMessages["procTDText"] = dfMessages.apply(lambda x: getExtractedTextDataParam(0, x.tmpExtractedTD), axis=1)
dfMessages["procTDURLs"] = dfMessages.apply(lambda x: getExtractedTextDataParam(1, x.tmpExtractedTD), axis=1)
dfMessages["procTDHashtags"] = dfMessages.apply(lambda x: getExtractedTextDataParam(2, x.tmpExtractedTD), axis=1)
dfMessages["procTDBolds"] = dfMessages.apply(lambda x: getExtractedTextDataParam(3, x.tmpExtractedTD), axis=1)
dfMessages["procTDItalics"] = dfMessages.apply(lambda x: getExtractedTextDataParam(4, x.tmpExtractedTD), axis=1)
dfMessages["procTDUnderlines"] = dfMessages.apply(lambda x: getExtractedTextDataParam(5, x.tmpExtractedTD), axis=1)
dfMessages["procTDEmails"] = dfMessages.apply(lambda x: getExtractedTextDataParam(6, x.tmpExtractedTD), axis=1)
# Process text again
dfMessages['procTDCleanText'] = dfMessages['procTDText'].map(lambda x: re.sub('\n', ' ', x)) # Replace \n
dfMessages['procTDEmojis'] = dfMessages['procTDCleanText'].map(lambda x: demoji.findall_list(x, desc = False)) # Filter out emoji
dfMessages['procTDEmojisDesc'] = dfMessages['procTDCleanText'].map(lambda x: demoji.findall_list(x, desc = True)) # Filter out emoji with desc
dfMessages['procTDCleanText'] = dfMessages['procTDCleanText'].map(lambda x: demoji.replace(x, " ")) # Rm emoji
dfMessages['procTDCleanText'] = dfMessages['procTDCleanText'].map(lambda x: gloReplaceGermanChars(x)) # Replace german chars
dfMessages['procTDSafeText'] = dfMessages['procTDCleanText'].map(lambda x: re.sub(r'[^a-zA-Z0-9\s]', ' ', x)) # Filter out . ! ? ... (get only safe chars)
dfMessages['procTDSafeLowercaseText'] = dfMessages['procTDSafeText'].map(lambda x: x.lower()) # To lower
# Calc text size
dfMessages["procTDTextLength"] = dfMessages["procTDCleanText"].str.len()
# Add columns (if not exists)
if "photo" not in dfMessages:
print("- Debug: Add column >>photo<<")
dfMessages["photo"] = np.nan
if "file" not in dfMessages:
print("- Debug: Add column >>file<<")
dfMessages["file"] = np.nan
if "edited" not in dfMessages:
print("- Debug: Add column >>edited<<")
dfMessages["edited"] = np.nan
if "forwarded_from" not in dfMessages:
print("- Debug: Add column >>forwarded_from<<")
dfMessages["forwarded_from"] = np.nan
# Evaluate attributes
dfMessages["procEvalIsValidText"] = dfMessages.procTDTextLength.apply(evalIsValidText)
dfMessages["procEvalContainsPhoto"] = dfMessages.photo.apply(evalContainsSomething)
dfMessages["procEvalContainsFile"] = dfMessages.file.apply(evalContainsSomething)
dfMessages["procEvalIsEdited"] = dfMessages.edited.apply(evalContainsSomething)
dfMessages["procEvalIsForwarded"] = dfMessages.forwarded_from.apply(evalContainsSomething)
dfMessages["procEvalContainsUrl"] = dfMessages.procTDURLs.apply(evalNonEmptyList)
dfMessages["procEvalContainsHashtag"] = dfMessages.procTDHashtags.apply(evalNonEmptyList)
dfMessages["procEvalContainsBoldItem"] = dfMessages.procTDBolds.apply(evalNonEmptyList)
dfMessages["procEvalContainsItalicItem"] = dfMessages.procTDItalics.apply(evalNonEmptyList)
dfMessages["procEvalContainsUnderlineItem"] = dfMessages.procTDUnderlines.apply(evalNonEmptyList)
dfMessages["procEvalContainsEmailItem"] = dfMessages.procTDEmails.apply(evalNonEmptyList)
dfMessages['procEvalContainsEmojiItem'] = dfMessages.procTDEmojis.apply(evalNonEmptyList)
# Pipelines
if dfInputFiles[dfInputFiles.inputPath == fP].iloc[0].inputDesc in C_PIPELINE_DATASET:
gloStartStopwatch("Process pipeline ner-xlm-roberta")
dfMessages['procPipeline-ner-xlm-roberta'] = dfMessages['procTDCleanText'].map(lambda x: processNerPipeline(x, "ner-xlm-roberta", configMinScore=0))
gloStopStopwatch("Process pipeline ner-xlm-roberta")
gloStartStopwatch("Process pipeline ner-bert")
dfMessages['procPipeline-ner-bert'] = dfMessages['procTDCleanText'].map(lambda x: processNerPipeline(x, "ner-bert", configMinScore=0))
gloStopStopwatch("Process pipeline ner-bert")
gloStartStopwatch("Process pipeline sen-bert")
dfMessages['procPipeline-sen-bert'] = dfMessages['procTDCleanText'].map(lambda x: processSenPipeline(x, "sen-bert", configMinScore=0))
gloStopStopwatch("Process pipeline sen-bert")
# Sentiment Analysis
dfMessages['procPipeline-sentiment'] = dfMessages['procTDCleanText'].map(lambda x: processSentimentAnalysisPython(x))
# Add to dict
dictMessages[fP] = dfMessages
gloStopStopwatch("TD-Extract " + fP)
gloStopStopwatch("Extract Text Data")
# All Messages to DataFrame
gloStartStopwatch("Generate global DataFrame")
for fP in dfInputFiles.inputPath:
dfMessages = dictMessages[fP].copy()
dfAllDataMessages = dfAllDataMessages.append(dfMessages)
gloStopStopwatch("Generate global DataFrame")
return (dictMessages, dfAllDataMessages)
# return dictMessages and dfAllDataMessages
def initCacheData(dfAllDataMessages):
dictMessages = {}
for fP in dfInputFiles.inputPath:
dictMessages[fP] = dfAllDataMessages[dfAllDataMessages.procChatFilePath == fP]
return (dictMessages, dfAllDataMessages)
if(C_USE_CACHE_FILE == ""):
print("Should not use cache (build new cache)")
dictMessages, dfAllDataMessages = initProcessData()
if(C_NEW_CACHE_FILE != ""):
print("Write cache to file >>" + str(C_NEW_CACHE_FILE) + "<<")
dfAllDataMessages.to_pickle(dir_var_pandas_cache + C_NEW_CACHE_FILE)
else:
print("Should use cache (load cache)")
dictMessages, dfAllDataMessages = initCacheData(pd.read_pickle(dir_var_pandas_cache + C_USE_CACHE_FILE))
sText = "Das ist ein Beispielstext. An diesem Text werde ich nun einige Verfahren anwenden. Ich komme aus dem Großraum München und ich mag Text."
# Token Text
print()
print("- Token from text")
print(getTokenFromText(sText))
# Tagging (english)
print()
print("- POS english")
print(nltk.pos_tag(getTokenFromText(sText)))
# Lemma and tagging
print()
print("- Lemma and tagging")
print(getLemmaAndTaggingFromText(sText))
print()
# Freq Nouns
print("- Freq nouns")
plotFreqNouns(sText, outputFilename = "", mostCommon = 10, flagRemoveStopwords = True)
dfInputFiles.inputType.value_counts()
def queryChatId(filePath):
dfMeta = dictMeta[filePath].copy()
return str(dfMeta["id"].iloc[0])
def queryChatName(filePath):
dfMeta = dictMeta[filePath].copy()
chatName = str(dfMeta["name"].iloc[0])
chatName = gloConvertToSafeChatName(chatName)
return chatName
def queryChatType(filePath):
dfMeta = dictMeta[filePath].copy()
return str(dfMeta["type"].iloc[0])
def queryNumberOfMessages(filePath):
dfMessages = dictMessages[filePath].copy()
return len(dfMessages.index)
def queryNumberOfMessagesByAttEqTrue(filePath, attKey):
dfMessages = dictMessages[filePath].copy()
dfMessages = dfMessages[dfMessages[attKey] == True]
return len(dfMessages.index)
dfQueryMeta = pd.DataFrame(dfInputFiles.inputPath)
dfQueryMeta["qryChatId"] = dfQueryMeta.inputPath.apply(queryChatId)
dfQueryMeta["qryChatName"] = dfQueryMeta.inputPath.apply(queryChatName)
dfQueryMeta["qryChatType"] = dfQueryMeta.inputPath.apply(queryChatType)
dfQueryMeta["qryNumberOfMessages"] = dfQueryMeta.inputPath.apply(queryNumberOfMessages)
dfQueryMeta["qryNumberOfFormattedTextMessages"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procIsJsonFormatted"), axis=1)
dfQueryMeta["qryNumberOfValidTextMessages"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalIsValidText"), axis=1)
dfQueryMeta["qryNumberOfPhotos"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsPhoto"), axis=1)
dfQueryMeta["qryNumberOfFiles"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsFile"), axis=1)
dfQueryMeta["qryNumberOfEditedMessages"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalIsEdited"), axis=1)
dfQueryMeta["qryNumberOfForwardedMessages"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalIsForwarded"), axis=1)
dfQueryMeta["qryNumberOfMessagesWithUrl"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsUrl"), axis=1)
dfQueryMeta["qryNumberOfMessagesWithHashtag"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsHashtag"), axis=1)
dfQueryMeta["qryNumberOfMessagesWithBold"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsBoldItem"), axis=1)
dfQueryMeta["qryNumberOfMessagesWithItalic"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsItalicItem"), axis=1)
dfQueryMeta["qryNumberOfMessagesWithUnderline"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsUnderlineItem"), axis=1)
dfQueryMeta["qryNumberOfMessagesWithEmail"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsEmailItem"), axis=1)
dfQueryMeta["qryNumberOfMessagesWithEmoji"] = dfQueryMeta.apply(lambda x: queryNumberOfMessagesByAttEqTrue(x.inputPath, "procEvalContainsEmojiItem"), axis=1)
dfQueryMeta.sort_values(by="qryNumberOfMessages", ascending=False)
# Auto label query plot
def autolabelAx(rects, ax):
"""
Attach a text label above each bar in *rects*, displaying its height.
Copied from https://matplotlib.org/3.1.1/gallery/lines_bars_and_markers/barchart.html (22.12.2020)
"""
for rect in rects:
height = rect.get_height()
ax.annotate('{}'.format(height),
xy=(rect.get_x() + rect.get_width() / 2, height),
xytext=(0, 3), # 3 points vertical offset
textcoords="offset points",
ha='center', va='bottom')
# param inputDescFilter set "" == no filter
# param outputFilename set "" = no output
def queryMetaPlotter(inputDescFilter, configPlotWidth, configPlotHeight, configBarWidth, outputFilename):
# Init data
dataLabels = list()
dataNumberOfMesssages = list()
dataNumberOfFormattedTextMessages = list()
dataNumberOfValidTextMessages = list()
dataNumberOfEditedMessages = list()
dataNumberOfForwardedMessages = list()
dataNumberOfPhotos = list()
dataNumberOfFiles = list()
dataNumberOfMessagesWUrl = list()
dataNumberOfMessagesWHashtag = list()
dataNumberOfMessagesWBold = list()
dataNumberOfMessagesWItalic = list()
dataNumberOfMessagesWUnderline = list()
dataNumberOfMessagesWEmail = list()
dataNumberOfMessagesWEmoji = list()
# Iterate over Meta DataFrame
for index, row in dfQueryMeta.sort_values(by="qryNumberOfMessages", ascending=False).iterrows():
# Get attributes (check filter)
if(inputDescFilter == "" or dfInputFiles[dfInputFiles.inputPath == row.inputPath].inputDesc.iloc[0] == inputDescFilter):
dataLabels .append(row.qryChatName)
dataNumberOfMesssages .append(row.qryNumberOfMessages)
dataNumberOfFormattedTextMessages .append(row.qryNumberOfFormattedTextMessages)
dataNumberOfValidTextMessages .append(row.qryNumberOfValidTextMessages)
dataNumberOfEditedMessages .append(row.qryNumberOfEditedMessages)
dataNumberOfForwardedMessages .append(row.qryNumberOfForwardedMessages)
dataNumberOfPhotos .append(row.qryNumberOfPhotos)
dataNumberOfFiles .append(row.qryNumberOfFiles)
dataNumberOfMessagesWUrl .append(row.qryNumberOfMessagesWithUrl)
dataNumberOfMessagesWHashtag .append(row.qryNumberOfMessagesWithHashtag)
dataNumberOfMessagesWBold .append(row.qryNumberOfMessagesWithBold)
dataNumberOfMessagesWItalic .append(row.qryNumberOfMessagesWithItalic)
dataNumberOfMessagesWUnderline .append(row.qryNumberOfMessagesWithUnderline)
dataNumberOfMessagesWEmail .append(row.qryNumberOfMessagesWithEmail)
dataNumberOfMessagesWEmoji .append(row.qryNumberOfMessagesWithEmoji)
# Convert list to array
dataLabels = np.array(dataLabels)
dataNumberOfMesssages = np.array(dataNumberOfMesssages)
dataNumberOfFormattedTextMessages = np.array(dataNumberOfFormattedTextMessages)
dataNumberOfValidTextMessages = np.array(dataNumberOfValidTextMessages)
dataNumberOfEditedMessages = np.array(dataNumberOfEditedMessages)
dataNumberOfForwardedMessages = np.array(dataNumberOfForwardedMessages)
dataNumberOfPhotos = np.array(dataNumberOfPhotos)
dataNumberOfFiles = np.array(dataNumberOfFiles)
dataNumberOfMessagesWUrl = np.array(dataNumberOfMessagesWUrl)
dataNumberOfMessagesWHashtag = np.array(dataNumberOfMessagesWHashtag)
dataNumberOfMessagesWBold = np.array(dataNumberOfMessagesWBold)
dataNumberOfMessagesWItalic = np.array(dataNumberOfMessagesWItalic)
dataNumberOfMessagesWUnderline = np.array(dataNumberOfMessagesWUnderline)
dataNumberOfMessagesWEmail = np.array(dataNumberOfMessagesWEmail)
dataNumberOfMessagesWEmoji = np.array(dataNumberOfMessagesWEmoji)
# Draw
with sns.color_palette("tab10", 11):
fig, ax = plt.subplots()
x = np.arange(len(dataLabels))
barWidth = configBarWidth
fig.set_figwidth(configPlotWidth)
fig.set_figheight(configPlotHeight)
r1 = x
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]
r5 = [x + barWidth for x in r4]
r6 = [x + barWidth for x in r5]
r7 = [x + barWidth for x in r6]
r8 = [x + barWidth for x in r7]
r9 = [x + barWidth for x in r8]
r10 = [x + barWidth for x in r9]
r11 = [x + barWidth for x in r10]
r12 = [x + barWidth for x in r11]
r13 = [x + barWidth for x in r12]
r14 = [x + barWidth for x in r13]
rects1 = ax.bar(r1, dataNumberOfMesssages, barWidth, label='Messages')
rects2 = ax.bar(r2, dataNumberOfFormattedTextMessages, barWidth, label='Formatted Messsages')
rects3 = ax.bar(r3, dataNumberOfValidTextMessages, barWidth, label='Valid Text Messages')
rects4 = ax.bar(r4, dataNumberOfEditedMessages, barWidth, label='Edited Messages')
rects5 = ax.bar(r5, dataNumberOfForwardedMessages, barWidth, label='Forwarded Messages')
rects6 = ax.bar(r6, dataNumberOfPhotos, barWidth, label='with Photo')
rects7 = ax.bar(r7, dataNumberOfFiles, barWidth, label='with File')
rects8 = ax.bar(r8, dataNumberOfMessagesWUrl, barWidth, label='with Url')
rects9 = ax.bar(r9, dataNumberOfMessagesWHashtag, barWidth, label='with Hashtag')
rects10 = ax.bar(r10, dataNumberOfMessagesWBold, barWidth, label='with Bold Items')
rects11 = ax.bar(r11, dataNumberOfMessagesWItalic, barWidth, label='with Italic Items')
rects12 = ax.bar(r12, dataNumberOfMessagesWUnderline, barWidth, label='with Underlined Items')
rects13 = ax.bar(r13, dataNumberOfMessagesWEmail, barWidth, label='with E-Mails')
rects14 = ax.bar(r14, dataNumberOfMessagesWEmoji, barWidth, label='with Emojis')
chartTitle = ""
if(inputDescFilter != ""):
chartTitle = " (" + inputDescFilter + ")"
ax.set_ylabel("Number of")
ax.set_title("Meta Overview" + chartTitle)
ax.set_xticks(x)
ax.set_xticklabels(dataLabels)
ax.legend()
rects = [rects1, rects2, rects3, rects4, rects5, rects6, rects7, rects8, rects9, rects10, rects11, rects12, rects13, rects14]
for rect in rects:
autolabelAx(rect, ax)
fig.tight_layout()
#plt.xticks(rotation=30)
if(outputFilename != ""):
plt.savefig(dir_var_output + outputFilename)
plt.show()
queryMetaPlotter(
inputDescFilter = "dataSet0",
configPlotWidth = 32,
configPlotHeight = 9,
configBarWidth = 0.065,
outputFilename = "meta-overview-dataSet0.svg"
)
if("dataSet1" in C_LOAD_DATASET):
queryMetaPlotter(
inputDescFilter = "dataSet1",
configPlotWidth = 100,
configPlotHeight = 9,
configBarWidth = 0.065,
outputFilename = "meta-overview-dataSet1.svg"
)
if("dataSet1a" in C_LOAD_DATASET):
queryMetaPlotter(
inputDescFilter = "dataSet1a",
configPlotWidth = 16,
configPlotHeight = 9,
configBarWidth = 0.065,
outputFilename = "meta-overview-dataSet1a.svg"
)
if("dataSet2" in C_LOAD_DATASET):
queryMetaPlotter(
inputDescFilter = "dataSet2",
configPlotWidth = 34,
configPlotHeight = 9,
configBarWidth = 0.065,
outputFilename = "meta-overview-dataSet2.svg"
)
def removeTextLengthOutliersFromDataFrame(df, interval, maxTextLength):
df = df.copy()
df = df[df.procTDTextLength < maxTextLength]
# https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame
# keep only the ones that are within <interval> to -<interval> standard deviations in the column 'Data'.
return df[np.abs(df.procTDTextLength-df.procTDTextLength.mean()) <= (interval*df.procTDTextLength.std())]
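# Illustrative example (sketch, hypothetical toy data): rows whose text length lies more than
# <interval> standard deviations from the mean are dropped.
demoDf = pd.DataFrame({"procTDTextLength": [10, 12, 11, 9, 500]})
print(len(removeTextLengthOutliersFromDataFrame(demoDf, interval = 1, maxTextLength = 1000)))  # -> 4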
# param outputFilename set "" == no output file
def textLengthHistPlotter(outputFilename):
dfMessages = dfAllDataMessages.copy()
print("Number of all messages:\t\t\t\t\t\t" + str(len(dfMessages.index)))
dfMessages = dfMessages[dfMessages.procEvalIsValidText == True]
print("Number of valid text messages:\t\t\t\t\t" + str(len(dfMessages.index)))
dfMessages = removeTextLengthOutliersFromDataFrame(
dfMessages,
interval = 3, #Default is 3
maxTextLength = 999999999 #TODO: Maybe enable max text length
)
print("Number of valid text messages (after outliers filtering):\t" + str(len(dfMessages.index)))
print()
print("Text Length Hist (after outliers filtering)")
plt.figure(figsize=(8,4.5))
_ = dfMessages.procTDTextLength.hist(bins=40)
plt.title('Histogram Text Length')
if(outputFilename != ""):
plt.savefig(dir_var_output + outputFilename)
textLengthHistPlotter(outputFilename = "meta-text-length-hist.svg")
def compareIdsAndLabels(df):
gloStartStopwatch("Compare ids and labels")
dictFromTranslator = {}
dictActorTranslator = {}
df = df.copy()
df["date"] = pd.to_datetime(df["date"])
df = df.set_index("date")
df = df.sort_index()
for index, row in df.iterrows():
n_from = row["from"]
n_from_id = row["from_id"]
n_from = str(n_from)
n_from_id = str(n_from_id)
n_actor = row["actor"]
n_actor_id = row["actor_id"]
n_actor = str(n_actor)
n_actor_id = str(n_actor_id)
if(str(n_from) != "nan"):
if(n_from_id not in dictFromTranslator):
# Add new key
dictFromTranslator[n_from_id] = [n_from]
else:
# Has changed?
oValueL = dictFromTranslator[n_from_id]
if(n_from not in oValueL):
newList = oValueL.copy()
newList.append(n_from)
print("- Add changed attribute in from (prev=" + str(oValueL) + "/new=" + str(newList) + ")")
dictFromTranslator[n_from_id] = newList
if(str(n_actor) != "nan"):
if(n_actor_id not in dictActorTranslator):
# Add new key
dictActorTranslator[n_actor_id] = [n_actor]
else:
# Has changed?
oValueL = dictActorTranslator[n_actor_id]
if(n_actor not in oValueL):
newList = oValueL.copy()
newList.append(n_actor)
print("- Add changed attribute in actor (prev=" + str(oValueL) + "/new=" + str(newList) + ")")
dictActorTranslator[n_actor_id] = newList
gloStopStopwatch("Compare ids and labels")
return dictFromTranslator
if(C_SHORT_RUN == False):
compareIdsAndLabels(dfAllDataMessages)
def extractImportantHashtags(df):
dfMessages = df.copy()
dfMessages = dfMessages[dfMessages.procEvalContainsHashtag == True]
hashTagList = list()
for index, row in dfMessages.iterrows():
for hashtagItem in row["procTDHashtags"]:
hashTagList.append(hashtagItem)
return hashTagList
# return combinations
def extractImportantEmojis(df):
dfMessages = df.copy()
dfMessages = dfMessages[dfMessages.procEvalContainsEmojiItem == True]
li = dfMessages.procTDEmojisDesc.values.tolist()
retLi = list()
for l in li:
aString = ""
for e in l:
aString = aString + ":" + e
retLi.append(aString)
return retLi
# param flagResolveNewUrls Flag (see config above)
def resolveUrl(completeUrl, flagResolveNewUrls):
if "bit.ly" in completeUrl:
if(gloCheckIsAlreadyCached("resolved-urls.csv", completeUrl)):
return gloGetCached("resolved-urls.csv", completeUrl)
else:
if(flagResolveNewUrls == False):
return completeUrl
print("(Resolve now >>" + completeUrl + "<<)")
try:
r = requests.get(completeUrl, timeout = 5)
u = r.url
gloAddToCache("resolved-urls.csv", completeUrl, u)
return u
except:
print("(- Warn: Can not resolve (return completeUrl))")
return completeUrl
else:
return completeUrl
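# Illustrative usage (sketch): only bit.ly links are looked up (cache first, live request only
# if flagResolveNewUrls is True); all other URLs are returned unchanged.
print(resolveUrl("https://example.org/page", flagResolveNewUrls = False))  # not bit.ly -> unchanged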
# Return
# a = urlList,
# b = refList
# c = hostList
def extractImportantUrls(df):
dfMessages = df.copy()
dfMessages = dfMessages[dfMessages.procEvalContainsUrl == True]
hostList = list()
urList = list()
refList = list()
counterSucHostname = 0
counterErrHostname = 0
for index, row in dfMessages.iterrows():
for urlItem in row["procTDURLs"]:
urlData = urlparse(str(urlItem))
completeUrl = urlData.geturl()
rUrl = resolveUrl(completeUrl, flagResolveNewUrls=C_RESOLVE_NEW_URLS)
rUrlData = urlparse(rUrl)
rCompleteUrl = rUrlData.geturl()
rCompleteHostname = rUrlData.hostname
if(str(rCompleteHostname) != "None"):
counterSucHostname = counterSucHostname + 1
hostList.append(str(rCompleteHostname))
urList.append(str(rCompleteUrl))
if "t.me" in str(rCompleteHostname):
refList.append(str(rCompleteUrl))
else:
counterErrHostname = counterErrHostname + 1
print("Got Hostnames (suc=" + str(counterSucHostname) + "/err=" + str(counterErrHostname) + ")")
return (urList, refList, hostList)
# param flagResolveNewUrls Flag (see config above)
def resolveImportantYoutubeVideos(urlList, flagResolveNewUrls):
# Thanks https://gist.github.com/rodrigoborgesdeoliveira/987683cfbfcc8d800192da1e73adc486
ytList = list()
for url in urlList:
url = str(url)
if("youtube.com" in url or "youtu.be" in url or "youtube-nocookie.com" in url):
if(gloCheckIsAlreadyCached("resolved-youtube.csv", url)):
ytList.append(gloGetCached("resolved-youtube.csv", url))
else:
if(flagResolveNewUrls == False):
print("(Disable resolve new youtube urls (return completeUrl) >>" + url + "<<)")
ytList.append(url)
else:
print("Resolve now youtube >>" + url + "<<")
try:
r = requests.get(url, timeout = 5)
t = fromstring(r.content)
a = str(t.findtext('.//title'))
ytList.append(a)
gloAddToCache("resolved-youtube.csv", url, a)
except:
print("(- Warn: Can not resolve youtube url (return completeUrl))")
ytList.append(url)
return ytList
# TODO: Bug: urlparse detects no hostname if the string does not start with "http"
# TODO: Check: Refs in both directions
# Returns
# a = Counter forwardedFromList
# b = Counter refList
# c = Counter hashtagList
# d = Counter hostList
# e = Counter emojiList
# f = Counter fromList
def extractSocialGraph(filePath, debugPrint, debugPrintCount):
dfMessages = dictMessages[filePath].copy()
hashtagList = extractImportantHashtags(dfMessages)
emojiList = extractImportantEmojis(dfMessages)
urlList, refList, hostList = extractImportantUrls(dfMessages)
ytList = resolveImportantYoutubeVideos(urlList, flagResolveNewUrls = C_RESOLVE_NEW_URLS)
forwardedFromList = list()
if("forwarded_from" in dfMessages.columns):
df = dfMessages.copy()
df = df[df.procEvalIsForwarded == True]
for index, row in df.iterrows():
forwardedFromList.append(str(row["forwarded_from"]))
actorList = list()
if("actor" in dfMessages.columns):
for index, row in dfMessages.iterrows():
actorList.append(str(row["actor"]))
memberList = list()
if("members" in dfMessages.columns):
for index, row in dfMessages.iterrows():
if(str(row["members"]) != "nan"):
for memberItem in row["members"]:
memberList.append(str(memberItem))
fromList = list()
if("from" in dfMessages.columns):
for index, row in dfMessages.iterrows():
s = str(row["from"])
s = gloConvertToSafeString(s)
if(s != "None"):
fromList.append(s)
savedFromList = list()
if("saved_from" in dfMessages.columns):
for index, row in dfMessages.iterrows():
savedFromList.append(str(row["saved_from"]))
configTopN = debugPrintCount
if(debugPrint):
print()
print("Set top n to " + str(debugPrintCount))
print()
print("- Top Hosts (resovled) -")
print ("\n".join(map(str, Counter(hostList).most_common(configTopN))))
print()
print("- Top URLs (resolved) -")
print ("\n".join(map(str, Counter(urlList).most_common(configTopN))))
print()
print("- Top Refs from text (resolved) -")
print ("\n".join(map(str, Counter(refList).most_common(configTopN))))
print()
print("- Top Refs (forwarded_from) -")
print ("\n".join(map(str, Counter(forwardedFromList).most_common(configTopN))))
print()
print("- Top Refs (actor) -")
print ("\n".join(map(str, Counter(actorList).most_common(configTopN))))
print()
print("- Top Refs (members) -")
print ("\n".join(map(str, Counter(memberList).most_common(configTopN))))
print()
print("- Top Refs (from) -")
print ("\n".join(map(str, Counter(fromList).most_common(configTopN))))
print()
print("- Top Refs (saved_from) -")
print ("\n".join(map(str, Counter(savedFromList).most_common(configTopN))))
print()
print("- Top hashtags -")
print ("\n".join(map(str, Counter(hashtagList).most_common(configTopN))))
print()
print("- Top emojis -")
print ("\n".join(map(str, Counter(emojiList).most_common(configTopN))))
print()
print("- Top yt (resolved) -")
print ("\n".join(map(str, Counter(ytList).most_common(configTopN))))
print()
return (Counter(forwardedFromList), Counter(refList), Counter(hashtagList), Counter(hostList), Counter(emojiList), Counter(fromList))
def printSocialGraphDebug(filePathList):
for fP in filePathList:
print("Analyse now >>" + fP + "<<")
_ = extractSocialGraph(fP, debugPrint=True, debugPrintCount=10)
if(C_SHORT_RUN == False):
printSocialGraphDebug(dfInputFiles[dfInputFiles.inputDesc == "dataSet0"].inputPath)
if(C_SHORT_RUN == False and False): # TODO: Enable - Disable (read)
printSocialGraphDebug(dfInputFiles[dfInputFiles.inputDesc == "dataSet1a"].inputPath)
if(C_SHORT_RUN == False) and False: # TODO: Enable - Disable (read)
printSocialGraphDebug(dfInputFiles[dfInputFiles.inputDesc == "dataSet2"].inputPath)
dictSGD_ForwardedFrom = {}
dictSGD_Ref = {}
dictSGD_Hashtag = {}
dictSGD_Host = {}
dictSGD_Emoji = {}
dictSGD_From = {}
gloStartStopwatch("Extract Social Graph Data")
for fP in dfInputFiles.inputPath:
gloStartStopwatch("Extract Social Graph Data >>" + fP + "<<")
a, b, c, d, e, f = extractSocialGraph(fP, debugPrint=False, debugPrintCount = 0)
dictSGD_ForwardedFrom[fP] = a
dictSGD_Ref[fP] = b
dictSGD_Hashtag[fP] = c
dictSGD_Host[fP] = d
dictSGD_Emoji[fP] = e
dictSGD_From[fP] = f
gloStopStopwatch("Extract Social Graph Data >>" + fP + "<<")
gloStopStopwatch("Extract Social Graph Data")
# Get Top Influencer
# param fPList filePath List
# param configTopN Get Top n influencer e.g. 10
def getTopInfluencer(fPList, configTopN):
for fP in fPList:
chatName = queryChatName(fP)
print()
print("Analyse Chat (Forwarded From) >>" + chatName + "<<")
socialGraphData = dictSGD_ForwardedFrom[fP]
socialGraphData = socialGraphData.most_common(configTopN)
counter = 1
# Iterate over data
for oChatName, oChatRefs in socialGraphData:
# Query other params
oChatName = gloConvertToSafeChatName(str(oChatName))
oChatRefs = oChatRefs
# Already downloaded?
flagDownloaded = False
if oChatName in dfQueryMeta.qryChatName.values:
flagDownloaded = True
if(oChatName != "nan"):
print(str(counter) + ": (downloaded=" + str(flagDownloaded) + ") (refs=" + str(oChatRefs) + ")\t\t>>" + str(oChatName) + "<<")
counter = counter + 1
print()
print("Analyse Chat (Refs) >>" + chatName + "<<")
socialGraphData = dictSGD_Ref[fP]
socialGraphData = socialGraphData.most_common(configTopN)
counter = 1
# Iterate over data
for oChatName, oChatRefs in socialGraphData:
# Query other params
oChatName = str(oChatName)
oChatRefs = oChatRefs
if(oChatName != "nan"):
print(str(counter) + " (refs=" + str(oChatRefs) + ")\t\t>>" + str(oChatName) + "<<")
counter = counter + 1
# TODO: Cannot get all items in dataSet1
"""
# Attila Hildmann #
- Anonymous Germany - not found
- https://t.me/DEMOKRATENCHAT - no entries
- https://t.me/ChatDerFreiheit - no entries
- https://t.me/FREIHEITSCHAT2020 - not found
# Oliver Janich #
- Oliver Janich Premium - not found
# Xavier Naidoo #
- Xavier(Der VereiNiger)Naidoo😎 - not found
- https://t.me/PostAppender_bot - bot chat
"""
getTopInfluencer(list(dfInputFiles[dfInputFiles.inputDesc == "dataSet0"].inputPath), 10)
"""
Social Graph Layout Selector
param G Graph
param layoutSelector:
1 = Kamada-Kawai Layout
2 = Spring Layout
3 = Graphviz Layout
"""
def getSocialGraphLayout(layoutSelector, G):
if(layoutSelector == 1):
return nx.kamada_kawai_layout(G.to_undirected())
elif(layoutSelector == 2):
return nx.spring_layout(G.to_undirected(), k = 0.15, iterations=200)
elif(layoutSelector == 3):
return nx.nx_pydot.graphviz_layout(G)
# TODO: Try different arrows (see below): https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.patches.ArrowStyle.html
# TODO: Check distances between nodes
"""
Draw social graph
param G graph
param layoutSelector see above
param configFactorEdge e.g. 100 => weight / 100
param configFactorNode e.g. 10 => weight / 10
param configArrowSize e.g. 5
param configPlotWidth e.g. 16
param configPlotHeight e.g. 9
param outputFilename e.g. test.png (set "" == no output file)
param outputTitle e.g. Graph (required)
"""
def drawSocialGraph(G, layoutSelector, configFactorEdge, configFactorNode, configArrowSize, configPlotWidth, configPlotHeight, outputFilename, outputTitle):
gloStartStopwatch("Social Graph Plot")
plt.figure(figsize=(configPlotWidth,configPlotHeight))
pos = getSocialGraphLayout(layoutSelector = layoutSelector, G = G)
# Clean edges
edges = nx.get_edge_attributes(G, "weight")
edgesTLabel = nx.get_edge_attributes(G, "tLabel")
clean_edges = dict()
clean_edges_labels = dict()
for key in edges:
#Set edge weight
clean_edges[key] = (100 - edges[key]) / configFactorEdge
#Set edge label
clean_edges_labels[key] = edgesTLabel[key]
# Clean nodes
nodes = nx.get_node_attributes(G,'weight')
nodesTLabel = nx.get_node_attributes(G,'tLabel')
nodesTColor = nx.get_node_attributes(G,'tColor')
clean_nodes = dict()
clean_nodes_labels = dict()
clean_nodes_color = dict()
for key in nodes:
#Set node weight
clean_nodes[key] = nodes[key] / configFactorNode
#Set node label and color
clean_nodes_labels[key] = nodesTLabel[key]
clean_nodes_color[key] = nodesTColor[key]
# Revert DiGraph (arrows direction)
G_rev = nx.DiGraph.reverse(G)
# Draw
nx.draw(G_rev,
pos,
with_labels=True,
width=list(clean_edges.values()),
node_size=list(clean_nodes.values()),
labels=clean_nodes_labels,
node_color=list(clean_nodes_color.values()),
arrowsize=configArrowSize,
arrowstyle="wedge"
#connectionstyle="arc3, rad = 0.1"
)
# Set labels
_ = nx.draw_networkx_edge_labels(G_rev, pos, edge_labels=clean_edges_labels)
plt.title(outputTitle)
# Save and show fig
if(outputFilename != ""):
plt.savefig(dir_var_output + outputFilename)
plt.show()
gloStopStopwatch("Social Graph Plot")
# Generates Test Graph
def generateTestGraph():
G_weighted = nx.DiGraph()
G_weighted.add_edge("N1", "N2", weight=100-30, tLabel = "(≙" + str(100-30) + ")")
G_weighted.add_edge("N1", "N3", weight=100-10, tLabel = "(≙" + str(100-10) + ")")
G_weighted.add_edge("N1", "N4", weight=100-60, tLabel = "(≙" + str(100-60) + ")")
G_weighted.add_edge("N4", "N5", weight=100-80, tLabel = "(≙" + str(100-80) + ")")
G_weighted.add_edge("N4", "N6", weight=100-10, tLabel = "(≙" + str(100-10) + ")")
G_weighted.add_edge("N4", "N7", weight=100-30, tLabel = "(≙" + str(100-30) + ")")
G_weighted.add_edge("N7", "N4", weight=100-70, tLabel = "(≙" + str(100-70) + ")")
G_weighted.add_node("N1", weight=500.0, tLabel = "N1-T", tColor="red")
G_weighted.add_node("N2", weight=500.0, tLabel = "N2-T", tColor="blue")
G_weighted.add_node("N3", weight=500.0, tLabel = "N3-T", tColor="blue")
G_weighted.add_node("N4", weight=500.0, tLabel = "N4-T", tColor="red")
G_weighted.add_node("N5", weight=500.0, tLabel = "N5-T", tColor="red")
G_weighted.add_node("N6", weight=500.0, tLabel = "N6-T", tColor="red")
G_weighted.add_node("N7", weight=500.0, tLabel = "N7-T", tColor="blue")
return G_weighted
# Add node weight to dict
# Only adds new weight if newWeight > oldWeight
def addSocialGraphNodeWeight(chatName, chatWeight, targetDict):
if(chatName in targetDict):
oldWeight = targetDict[chatName]
if(chatWeight > oldWeight):
targetDict[chatName] = chatWeight
else:
targetDict[chatName] = chatWeight
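# Illustrative example (sketch): the weight dict keeps the maximum weight seen per node.
demoWeights = dict()
addSocialGraphNodeWeight("chatA", 10, demoWeights)
addSocialGraphNodeWeight("chatA", 5, demoWeights)  # smaller weight is ignored
print(demoWeights)  # -> {'chatA': 10}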
"""
Generate social graph
param configTopNInfluencer e.g. For top 10 = 10
param configMinRefs e.g. 1 = keep only forwarders that account for more than 1 % of the target messages
param listFilePaths List process filePaths
param socialGraphTargetDict e.g. forwarded from dict or hashtag dict
param socialGraphTargetAttribute e.g. procEvalIsForwarded (for calc percent)
param configFlagDebugLabel e.g. show debug info on label
"""
def generateSocialGraph(configTopNInfluencer, configMinRefs, listFilePaths, socialGraphTargetDict, socialGraphTargetAttribute, configFlagDebugLabel):
# Save node weights to dict
dictSocialNodeWeights = dict()
# Flag downloaded nodes (exact node weight)
dictExactNodesLabels = {}
gloStartStopwatch("Social Graph")
# Generate directed graph
G_weighted = nx.DiGraph()
print("- Add edges")
for fP in listFilePaths:
# Query own params
chatName = queryChatName(fP)
chatNumberOfMessages = queryNumberOfMessages(fP)
chatNumberOfTargetMessages = queryNumberOfMessagesByAttEqTrue(fP, socialGraphTargetAttribute)
gloStartStopwatch("SG-Extract " + chatName + "(" + str(chatNumberOfTargetMessages) + "/" + str(chatNumberOfMessages) + " messages)")
# Add exact node size (chat downloaded) and flag node
addSocialGraphNodeWeight(chatName, chatNumberOfMessages, dictSocialNodeWeights)
dictExactNodesLabels[chatName] = str(chatName) + "\n=[" + str(chatNumberOfTargetMessages) + "/" + str(chatNumberOfMessages) + "]"
# Extract social graph data and get top influencer
socialGraphData = socialGraphTargetDict[fP]
socialGraphData = socialGraphData.most_common(configTopNInfluencer)
# Iterate over forwarder
for oChatName, oChatRefs in socialGraphData:
# Query other params
oChatName = gloConvertToSafeChatName(str(oChatName))
oChatRefs = oChatRefs
# If has forwarder
if(oChatName != "nan"):
# Calc percent (forwarded_messages)
per = (oChatRefs/chatNumberOfTargetMessages) * 100
# Filter unimportant forwarders
if(per > configMinRefs):
# Add estimated node size (chat not downloaded)
addSocialGraphNodeWeight(oChatName, oChatRefs, dictSocialNodeWeights)
# Invert percent (distance)
wei = 100 - per
# Label
if(configFlagDebugLabel):
lab = str(round(per, 3)) + "% (" + str(oChatRefs) + "/" + str(chatNumberOfTargetMessages) + "≙" + str(round(wei, 3)) + ")"
else:
lab = str(round(per, 3)) + "% (" + str(oChatRefs) + "/" + str(chatNumberOfTargetMessages) + ")"
# Add edge
G_weighted.add_edge(
chatName,
oChatName,
weight=wei,
tLabel = lab
)
gloStopStopwatch("SG-Extract " + chatName + "(" + str(chatNumberOfTargetMessages) + "/" + str(chatNumberOfMessages) + " messages)")
print("- Add different nodes")
for aNode in dictSocialNodeWeights:
# Query node params
nodeName = str(aNode)
nodeWeight = dictSocialNodeWeights[aNode]
# Set defaults
tValueColor = "#ff8000"
tLabel = str(nodeName) + "\n≈[" + str(nodeWeight) + "]"
# Overwrite (if chat downloaded = exact weight)
if(nodeName in dictExactNodesLabels):
tValueColor = "#0080ff"
tLabel = dictExactNodesLabels[nodeName]
G_weighted.add_node(
nodeName,
weight=nodeWeight,
tLabel = tLabel,
tColor=tValueColor
)
gloStopStopwatch("Social Graph")
return G_weighted
generatedTestGraph = generateTestGraph()
drawSocialGraph(
G = generatedTestGraph,
layoutSelector=1,
configFactorEdge = 10,
configFactorNode = 1,
configArrowSize = 15,
configPlotWidth = 8,
configPlotHeight = 4.5,
outputFilename = "",
outputTitle = "Test Graph Kamda Kawai Layout"
)
drawSocialGraph(
G = generatedTestGraph,
layoutSelector=2,
configFactorEdge = 10,
configFactorNode = 1,
configArrowSize = 15,
configPlotWidth = 8,
configPlotHeight = 4.5,
outputFilename = "",
outputTitle = "Test Graph Spring Layout"
)