%load_ext watermark
%watermark -a "Romell D.Z." -u -d -p tweepy,scipy,nltk,gensim,sklearn,networkx,textblob,spacy
The watermark extension is already loaded. To reload it, use: %reload_ext watermark Romell D.Z. last updated: 2019-08-25 tweepy 3.6.0 scipy 1.1.0 nltk 3.2.5 gensim 3.4.0 sklearn 0.20.0 networkx 2.3 textblob 0.15.1 spacy 2.0.12
import os
import tweepy
from tweepy import Stream
import netrc
from unidecode import unidecode
import re
from tweepy import StreamListener
%matplotlib inline
from pprint import pprint
import pyprind
import pandas as pd
import random
from scipy.stats import beta as beta_distribution
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.data import load
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim # don't skip this
from gensim.models.ldamodel import LdaModel
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from textblob import Word
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.simplefilter('ignore')
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import advertools as adv
import networkx as nx
import spacy
from spacy import displacy
nlp = spacy.load('es_core_news_sm')
auth = netrc.netrc()
ckey,_,csecret=auth.authenticators('tweet_api')
atoken,_,asecret=auth.authenticators('tweet_secret')
# LIMA_GEO_LOCATION_BOUNDING_BOX = [-77.1785277831,-12.1531578397,-76.8967618806,-11.9288928156]
LIMA_GEO_LOCATION_BOUNDING_BOX = [ -81.802362,-17.525482, -69.774343,-3.226278]
auth = tweepy.OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)
NUMBER_OF_TWEETS = 1000
pbar = pyprind.ProgBar(NUMBER_OF_TWEETS)
class FiniteStreamListener(StreamListener):
def __init__(self, number_of_tweets):
self.number_of_tweets = number_of_tweets
self.tweets = []
self.tweets_dict = []
super(FiniteStreamListener,self).__init__()
def on_status(self, status):
if len(self.tweets) < self.number_of_tweets:
self.tweets_dict.append(status._json)
place = status._json['place']['name'] if(status._json['place']) else ""
self.tweets.append({'date':status.created_at,
'text':status.text,
'location':place,
'followers':status._json['user']['followers_count']})
pbar.update()
else:
return False
finite_stream_listener = FiniteStreamListener(number_of_tweets=NUMBER_OF_TWEETS)
streaming_api = Stream(auth=auth, listener=finite_stream_listener,timeout=60)
EMOTICONS = ">:] :-) :) :o) :] :3 :c) :> =] 8) =) :} :^) "
EMOTICONS = EMOTICONS.strip().split(' ')
# streaming_api.filter(track=EMOTICONS,async=True)
streaming_api.filter(locations=LIMA_GEO_LOCATION_BOUNDING_BOX,async=True)
print(len(finite_stream_listener.tweets))
56
0% [##############################] 100% | ETA: 00:00:00 Total time elapsed: 01:05:42
np.save('tweets_dict',finite_stream_listener.tweets_dict)
def make_lowercase(tweet):
return tweet.lower()
def remove_diacritics(tweet):
return unidecode(tweet)
def remove_non_alpha_characters(tweet):
return ''.join(character for character in tweet if character.isalpha() or character == ' ')
def remove_web_site(tweet):
return re.sub(r'http\w+', '', tweet)#, flags=re.MULTILINE)
tweets_df = pd.DataFrame.from_dict(finite_stream_listener.tweets)
tweets_df.rename(columns={'text':'Tweets'},inplace=True)
tweets_df['word_count'] = tweets_df['Tweets'].apply(lambda x: len(str(x).split(" ")))
tweets_df['char_count'] = tweets_df['Tweets'].str.len()
def avg_word(sentence):
words = sentence.split()
return (sum(len(word) for word in words)/len(words))
tweets_df['avg_word'] = tweets_df['Tweets'].apply(lambda x: avg_word(x))
tweets_df['hastags'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
tweets_df['numerics'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
cleaned_tweets = list(tweets_df['Tweets'])
for cleaning_function in \
[make_lowercase,
# remove_diacritics,
remove_non_alpha_characters,
remove_web_site]:
cleaned_tweets = [cleaning_function(tweet) for tweet in cleaned_tweets]
random.sample(cleaned_tweets,5)
['ultima noche de semanademayordomia en la iasdesperanza aprendiendo sobre la importancia de ser fiel a dios con l ', 'não existe boy legal em salvador asklgbt', 'movistarvoleype vamos perú no podemos flaquear más adelante los equipos son más fuertes sabemos de su espíritu de lucha arriba perú', 'noticiamerica tanto lío por ese pequeño lapsus', 'pobre chica la cuerda siempre se rompe por el lado mas débil']
KEYWORD='lima'
number_of_occurences = sum(KEYWORD in tweet for tweet in cleaned_tweets)
print('Nuestra palabra clave aparece: {} veces'.format(number_of_occurences))
print('Nuestra palabra clave apareció en: {}% de los tweets'.format(100 * number_of_occurences/NUMBER_OF_TWEETS))
Nuestra palabra clave aparece: 41 veces Nuestra palabra clave apareció en: 4.1% de los tweets
pprint([tweet for tweet in cleaned_tweets if KEYWORD in tweet][:5])
['cooking with fire in peru today cookingclass seerundo lima peru ' 'southamerica lima peru ', 'shopping for our cooking class today mercado seerundo lima peru ' 'southamerica mercado n de surquillo ', 'im at the beer place in lima ', 'im at centro cultural ricardo palma munimiraflores in miraflores lima ', 'estoy listo y esperándote barbon amado una nueva jornada juntos hsmn ' 'octubremesmorado en sheraton lima hot ']
indices_of_tweets_containing_keyword=[index for index, tweet in enumerate(cleaned_tweets) if KEYWORD in tweet]
print('index de los 10 tweets:%s'%(indices_of_tweets_containing_keyword))
index de los 10 tweets:[6, 23, 103, 107, 116, 144, 149, 173, 211, 219, 237, 252, 254, 260, 277, 330, 373, 389, 392, 411, 421, 512, 522, 550, 653, 667, 684, 726, 727, 736, 743, 749, 821, 867, 877, 879, 884, 907, 912, 927, 969]
distances_between_indices_of_tweets_containing_keyword = [
second_index - first_index for first_index, second_index in \
zip(indices_of_tweets_containing_keyword[:-1], indices_of_tweets_containing_keyword[1:])
]
pd.Series(distances_between_indices_of_tweets_containing_keyword).hist()
plt.savefig('snapshot/lima_tweets_hist.png')
alpha = 1 + number_of_occurences
beta = 1 + (NUMBER_OF_TWEETS - number_of_occurences)
x_values = np.linspace(0, 1, 1002)[1:-1]
pdf_y_values = beta_distribution(alpha, beta).pdf(x_values)
cdf_y_values = np.cumsum(pdf_y_values) / np.sum(pdf_y_values)
plt.figure(figsize=(18, 6))
plt.subplot(121)
plt.plot(x_values, pdf_y_values, label=(r'$\alpha=%.1f,\ \beta=%.1f$' % (alpha, beta)))
plt.xlim(0, 1)
plt.xlabel('Probability of tweet containing keyword')
plt.ylabel('Probability density')
plt.title('Beta Distribution PDF')
plt.legend(loc=1)
plt.subplot(122)
plt.plot(x_values, cdf_y_values)
plt.xlim(0, 1)
plt.ylim(0, 1.005)
plt.yticks(np.linspace(0, 1, 21))
plt.xlabel('Probability of tweet containing keyword')
plt.ylabel('Cumulative probability')
plt.title('Beta Distribution CDF')
plt.savefig('snapshot/Beta Distribution CDF.png');
ix = [n for n,b in enumerate((cdf_y_values>.5)&(cdf_y_values<.95)) if b]
range_ =cdf_y_values[ix]
a=np.c_[x_values[ix],cdf_y_values[ix]]
max_ix,_ = np.unravel_index(a.argmax(), a.shape)
min_ix,_ = np.unravel_index(a.argmin(), a.shape)
fifth_percentile, ninety_fifth_percentile = x_values[max_ix],x_values[min_ix]
print('Con 90% de certeza digamos que la verdadera probabilidad se encuentra entre: {} y {}'.format(
round(fifth_percentile, 10), round(ninety_fifth_percentile, 10)))
Con 90% de certeza digamos que la verdadera probabilidad se encuentra entre: 0.010989011 y 0.000999001
def compute_total_probability_that_probability_less_than_p(p):
return max(cumulative_prob for cumulative_prob, x_value in zip(cdf_y_values, x_values) if x_value < p)
print('La probabilidad que la verdadera probabilidad es > .1 es: {}'.format(
1 - compute_total_probability_that_probability_less_than_p(.1)))
La probabilidad que la verdadera probabilidad es > .1 es: 1.9441115384211116e-12
tweets_df.groupby('location').agg({'location':len}).sort_values('location').iloc[-15:].plot(kind='barh')
plt.title('Most Frequent locations')
plt.xlabel('Cantidad')
plt.tight_layout()
plt.savefig('snapshot/most Frequent locations.png');
tweets_df.to_csv('lima_tweets.csv',index=None)
tweets_df = pd.read_csv('lima_tweets.csv')
stop =stopwords.words('spanish')
doc_complete = tweets_df.Tweets.values
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
return normalized
doc_clean = [clean(doc).split() for doc in doc_complete]
tweets_df.dropna(inplace=True)
tweets_df['Tweets_clean'] = pd.Series(doc_clean).apply(lambda x:' '.join(x))
tweets_df['word_count'] = tweets_df['Tweets_clean'].apply(lambda x: len(str(x).split(" ")))
tweets_df['char_count'] = tweets_df['Tweets_clean'].str.len()
def avg_word(sentence):
words = sentence.split()
if len(words)==0:
return 0
return (sum(len(word) for word in words)/len(words))
tweets_df['avg_word'] = tweets_df['Tweets_clean'].apply(lambda x: avg_word(x))
tweets_df['hastags'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
tweets_df['numerics'] = tweets_df['Tweets'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
tweets_df.head()
date | followers | location | Tweets | word_count | char_count | avg_word | hastags | numerics | Tweets_clean | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-10-20 01:57:03 | 92 | Lima | @elpanfletope @AlanGarciaPeru chancho no come ... | 5 | 48 | 8.800000 | 0 | 0 | elpanfletope alangarciaperu chancho come chancho |
1 | 2018-10-20 01:57:10 | 32 | Lince | @exitosape @JulianaOxenford A DONDE DICE ... | 3 | 30 | 9.333333 | 0 | 0 | exitosape julianaoxenford dice |
2 | 2018-10-20 01:57:12 | 195 | Brasil | @anajuliachs kkkk 1 mês | 4 | 22 | 4.750000 | 0 | 1 | anajuliachs kkkk 1 mês |
3 | 2018-10-20 01:57:17 | 228 | Chimbote | Ultima noche de #SemanaDeMayordomia en la @IAS... | 11 | 105 | 8.636364 | 1 | 0 | ultima noche semanademayordomia iasdesperanza ... |
4 | 2018-10-20 01:57:18 | 123 | San Miguel | @fernando_roman1 Jajaja | 2 | 21 | 10.000000 | 0 | 0 | fernandoroman1 jajaja |
freq = pd.Series(' '.join(tweets_df['Tweets_clean']).split()).value_counts()[:10]
freq.plot(kind='barh')
plt.title('Most Frequent words')
plt.xlabel('Count')
plt.tight_layout()
plt.savefig('snapshot/most Frequent words.png');
pprint(tweets_df['Tweets_clean'][:2])
0 elpanfletope alangarciaperu chancho come chancho 1 exitosape julianaoxenford dice Name: Tweets_clean, dtype: object
dictionary = corpora.Dictionary(tweets_df['Tweets_clean'].apply(lambda x:x.split()))
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
pprint(doc_term_matrix[:2])
[[(0, 1), (1, 2), (2, 1), (3, 1)], [(4, 1), (5, 1), (6, 1)]]
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)
from pprint import pprint
pprint(ldamodel.print_topics(num_topics=3, num_words=3))
[(0, '0.006*"peru" + 0.006*"lima" + 0.005*"–"'), (1, '0.004*"lima" + 0.004*"in" + 0.003*"fiscal"'), (2, '0.005*"si" + 0.003*"canaln" + 0.002*"amor"')]
# from gensim.test.utils import datapath
# fname = datapath("lda_lima_tweet_model")
ldamodel.save("lda_lima_tweet_model")
from gensim.models.ldamodel import LdaModel
ldamodel = LdaModel.load("lda_lima_tweet_model")
doc_lda = ldamodel[doc_term_matrix]
print('Perplexity: ', ldamodel.log_perplexity(doc_term_matrix)) # a measure of how good the model is. lower the better
Perplexity: -8.825555626039103
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
vis
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
texts_out = []
for sent in texts:
doc_ = nlp(sent)
texts_out.append(list(set([str(c.head) for c in doc_ if c.head.tag_.startswith(tuple(allowed_postags))])))
return texts_out
lemmatization(tweets_df['Tweets_clean'][:5],['VERB'])
[[], ['dice'], [], ['aprendiendo'], []]
def join_comma(row_list):
if row_list == []:
return np.NaN
else:
return ', '.join(row_list)
tweets_df['ACTIONS']=pd.Series(lemmatization(tweets_df['Tweets_clean'],['VERB'])).apply(join_comma)
tweets_df['NOUNS']=pd.Series(lemmatization(tweets_df['Tweets_clean'],['NOUN'])).apply(join_comma)
tweets_df[['Tweets_clean','NOUNS','ACTIONS']].head()
Tweets_clean | NOUNS | ACTIONS | |
---|---|---|---|
0 | elpanfletope alangarciaperu chancho come chancho | elpanfletope | NaN |
1 | exitosape julianaoxenford dice | exitosape | dice |
2 | anajuliachs kkkk 1 mês | mês | NaN |
3 | ultima noche semanademayordomia iasdesperanza ... | noche | aprendiendo |
4 | fernandoroman1 jajaja | jajaja | NaN |
tweets_df.head()
date | followers | location | Tweets | word_count | char_count | avg_word | hastags | numerics | Tweets_clean | ACTIONS | NOUNS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-10-20 01:57:03 | 92 | Lima | @elpanfletope @AlanGarciaPeru chancho no come ... | 5 | 48 | 8.800000 | 0 | 0 | elpanfletope alangarciaperu chancho come chancho | NaN | elpanfletope |
1 | 2018-10-20 01:57:10 | 32 | Lince | @exitosape @JulianaOxenford A DONDE DICE ... | 3 | 30 | 9.333333 | 0 | 0 | exitosape julianaoxenford dice | dice | exitosape |
2 | 2018-10-20 01:57:12 | 195 | Brasil | @anajuliachs kkkk 1 mês | 4 | 22 | 4.750000 | 0 | 1 | anajuliachs kkkk 1 mês | NaN | mês |
3 | 2018-10-20 01:57:17 | 228 | Chimbote | Ultima noche de #SemanaDeMayordomia en la @IAS... | 11 | 105 | 8.636364 | 1 | 0 | ultima noche semanademayordomia iasdesperanza ... | aprendiendo | noche |
4 | 2018-10-20 01:57:18 | 123 | San Miguel | @fernando_roman1 Jajaja | 2 | 21 | 10.000000 | 0 | 0 | fernandoroman1 jajaja | NaN | jajaja |
tweets_df.to_csv('tweets_solutions.csv',index=None)
tweets_df = pd.read_csv('tweets_solutions.csv')
tweets_df.dropna(inplace=True)
tweets_df['Tweets_clean']
1 exitosape julianaoxenford dice 3 ultima noche semanademayordomia iasdesperanza ... 6 cooking with fire in peru today cookingclass s... 7 show música vivo tecnópolis cierra temporada 2... 8 maldita perra perdón alteré 💁🏻♀️ 9 mininterperu pcmperu vizcarrhagan algo denle s... 10 🇪🇨🇪🇨quito ecuador 🇪🇨🇪🇨 vemos mañana eslae http... 11 sasha71396634 ernestojx hninurta gabospeed94 k... 12 ¡conoce proceso exhumación caso mascarilla com... 15 ronaldomendes triste fim da baleia bora fazer ... 17 mt colrichardkemp pt quinze mulheres disseram ... 19 kerch mourns victim of college massacre a surv... 21 pensar convento san francisco reposan restos f... 22 reaccionando ❌te puse perder ❌ javierramireze ... 23 shopping for our cooking class today mercado s... 26 acompañamos autores alvarobisama marcelo mella... 28 sturt0208 canaln martinvizcarrac arrugador fuj... 31 🎗️la prevención mejor tratamiento🎗️🙆♀️ cáncer... 32 tatahcomenta onde tendo live sigam anapaularen... 34 richardacunan tarde reacción sensatez tiempo h... 35 tô levando tempo necessário p entender preciso... 36 sí devolvió libropero parte cumpliócon debíaah... 39 cara palo jc rodríguez preguntando álvaro sala... 40 capital967 mauriciomulder vergüenza dan person... 41 story time i almost made my mom amp i late for... 43 malibulox pra mim não aparece sigam anapaulare... 44 210002ws2080 wind225°at00kmh gust00kmh t176°cd... 45 sentimientos encontrados respecto quedo 3 cosa... 47 emelecmax fefecuador pueden esperar hijoputa n... 48 varios días observado sesgo fujimorista optand... ... 946 increíble cantidad bobadas twiteaba ex😂😂😂 947 conozco hace poquito tomé cariño solcito 948 vale verga gente alguien importa convierto pri... 949 gracias queridas amigas rcreadores romanticism... 950 heyjeans buen viaje 951 sirenita 💕🐚🐠 sirena famosa toda posando nosotr... 952 diariocorreo do hermanitos hermanita menor car... 954 jhueb bradgehlosu troncarternlu exactly right ... 955 acordo perfeito httpstcogcgczwtd4z belissima m... 959 gabrielitaaa10 rosamariabartra alvarosarco mil... 960 tu date cuenta vale va salir toda si quiero 961 meyastos salvajedigital pánico sabe cochinaditas 962 repost mheremer ・・・ name something hot and wet... 964 larryportera idlr puedes adelantar poquito fiz... 965 tiowalo9 labandadel86 edad colombia estan viaj... 971 kelitagrand2 xileone si pues “tipa” q veas exc... 974 acordo perfeito httpstcogcgczwtd4z maisvoce ma... 977 fernandarrocha toda hora nem xuxa 978 tubinocarlos ja cómo orinas gente amoral digni... 980 dando argumentos solicite asilo político 982 ayacucho casi lleva 3 pto huancayo empató 33 v... 985 quem diria 20102018 sérgio lobo sessentou ning... 987 tiowalo9 labandadel86 tener miedo poner jugado... 988 convocape milagrosleivag mulderrctm aún ustede... 989 noticiastvperu richardacunan ustedes sarta cor... 990 gente do céu cuzco é muito longe 993 xileone luzsalgador tvperupe imagínate q indig... 994 abrazo inmenso aquí cielo ¡feliz cumpleaños ab... 996 feliz siento orgullosa pertenecer gran familia... 997 padre caraquista pedro quiere magallanero pped... Name: Tweets_clean, Length: 523, dtype: object
tweets_df['Tweets_clean'] = tweets_df['Tweets_clean'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
print(tweets_df.shape)
tweets_df['Tweets_clean'].head()
(523, 12)
1 exitosape julianaoxenford dice 3 ultima noche semanademayordomia iasdesperanza ... 6 cooking with fire in peru today cookingclass s... 7 show música vivo tecnópolis cierra temporada 2... 8 maldita perra perdón alteré 💁🏻♀️ Name: Tweets_clean, dtype: object
# displacy.serve(doc, style="dep")
doc = nlp(tweets_df['Tweets_clean'][1])
from IPython.display import HTML, Image, display
displacy.render(doc, style="dep",jupyter=True,options={'distance':100})
displacy.render(doc, style="ent",jupyter=True,)
TextBlob(tweets_df['Tweets_clean'][1]).ngrams(2)
[WordList(['exitosape', 'julianaoxenford']), WordList(['julianaoxenford', 'dice'])]
tf1 = (tweets_df['Tweets_clean']).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
for i,word in enumerate(tf1['words']):
tf1.loc[i, 'idf'] = np.log(tweets_df.shape[0]/(len(tweets_df[tweets_df['Tweets_clean'].str.contains(word)])))
tf1['tfidf'] = tf1['tf'] * tf1['idf']
print(tf1.shape)
tf1.head(10)
(3630, 4)
words | tf | idf | tfidf | |
---|---|---|---|---|
0 | dice | 7.0 | 3.774675 | 26.422724 |
1 | julianaoxenford | 1.0 | 6.259581 | 6.259581 |
2 | exitosape | 2.0 | 5.566434 | 11.132869 |
3 | httpstcogmo19ienh6 | 1.0 | 6.259581 | 6.259581 |
4 | iasdesperanza | 1.0 | 6.259581 | 6.259581 |
5 | l… | 1.0 | 3.551531 | 3.551531 |
6 | fiel | 1.0 | 6.259581 | 6.259581 |
7 | noche | 3.0 | 4.873287 | 14.619861 |
8 | semanademayordomia | 1.0 | 6.259581 | 6.259581 |
9 | dios | 3.0 | 4.650144 | 13.950431 |
stop = set(stopwords.words('spanish'))
stop |= set(['lima','si','ser'])
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= stop,ngram_range=(1,1))
train_vect = tfidf.fit_transform(tweets_df['Tweets_clean'])
train_vect
<523x1000 sparse matrix of type '<class 'numpy.float64'>' with 2517 stored elements in Compressed Sparse Row format>
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(tweets_df['Tweets_clean'])
print(train_bow.shape)
train_bow
(523, 1000)
<523x1000 sparse matrix of type '<class 'numpy.int64'>' with 2618 stored elements in Compressed Sparse Row format>
from sklearn.metrics.pairwise import linear_kernel
def find_similar(tfidf_matrix, index, top_n = 5):
cosine_similarities = linear_kernel(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
related_docs_indices = [i for i in cosine_similarities.argsort()[::-1] if i != index]
return [(index, cosine_similarities[index]) for index in related_docs_indices][0:top_n]
tweet = tweets_df.sample(1)
tweet
date | followers | location | Tweets | word_count | char_count | avg_word | hastags | numerics | Tweets_clean | ACTIONS | NOUNS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
542 | 2018-10-20 02:32:30 | 159 | Brasil | @RogerioVilela @pauloap O problema é, quantos serão mortos??? | 7 | 53 | 6.714286 | 0 | 0 | rogeriovilela pauloap problema é quantos serão mortos | serão | problema, quantos |
print(tweet['Tweets'].values)
['@RogerioVilela @pauloap O problema é, quantos serão mortos???']
tweet.reset_index(drop=True,inplace=True)
pd.options.display.max_colwidth = 120
vals = pd.DataFrame()
for index, score in find_similar(train_vect, tweet.index[0],top_n = 5):
vals = vals.append(tweets_df.iloc[index:index+1,:])
vals.loc[index,'score'] = score
vals[['Tweets','score']].head()
Tweets | score | |
---|---|---|
845 | @sigridbazan Se agranda mi admiración Sigrid. Lo dices porque lo dices. | NaN |
440 | NaN | 0.565055 |
452 | @exitosape \nNo sé qué es lo que sucede en #Exitosa que invitan a adefesios como ese seudoanalista politico de apell... | NaN |
234 | NaN | 0.535451 |
900 | Para mí aquí siempre dice "MI ALMA" | NaN |
corpus = nlp('\n'.join(tweets_df['NOUNS'].dropna()))
visited = {}
nouns = []
for word in corpus:
if word.pos_.startswith('N') and len(word.string) < 15 and len(word.string) > 2:
token = word.string.strip().lower()
if token in visited:
visited[token] += 1
continue
else:
visited[token] = 1
nouns.append(word)
nouns = sorted(nouns, key=lambda w: -visited[w.string.strip().lower()])[:150]
pd.DataFrame([[w.text, visited[w.string.strip().lower()]] for w in nouns], columns=['Noun', 'Freq'])
Noun | Freq | |
---|---|---|
0 | canaln | 13 |
1 | pra | 6 |
2 | gente | 6 |
3 | amor | 6 |
4 | patas | 6 |
5 | casa | 5 |
6 | franpetrozzi | 5 |
7 | juez | 5 |
8 | seguridad | 4 |
9 | caso | 4 |
10 | mama | 4 |
11 | cosas | 4 |
12 | país | 4 |
13 | día | 4 |
14 | vizcarra | 4 |
15 | persona | 4 |
16 | procesion | 4 |
17 | pueblo | 3 |
18 | señor | 3 |
19 | hora | 3 |
20 | tema | 3 |
21 | fútbol | 3 |
22 | fuerza | 3 |
23 | gracias | 3 |
24 | from | 3 |
25 | equipos | 3 |
26 | años | 3 |
27 | veces | 3 |
28 | presidente | 3 |
29 | milibrujita | 3 |
... | ... | ... |
120 | tratamiento | 1 |
121 | tumor | 1 |
122 | tatahcomenta | 1 |
123 | reacción | 1 |
124 | voz | 1 |
125 | perdoar | 1 |
126 | tempo | 1 |
127 | minhas | 1 |
128 | orejas | 1 |
129 | rodríguez | 1 |
130 | personajes | 1 |
131 | made | 1 |
132 | they | 1 |
133 | sentimientos | 1 |
134 | narradores | 1 |
135 | emelecmax | 1 |
136 | ghibellini | 1 |
137 | linares | 1 |
138 | chantaje | 1 |
139 | momentos | 1 |
140 | derecho | 1 |
141 | scattered | 1 |
142 | elcomerciocom | 1 |
143 | panoramaptv | 1 |
144 | niñito | 1 |
145 | página | 1 |
146 | video | 1 |
147 | prisión | 1 |
148 | verdad | 1 |
149 | josé | 1 |
150 rows × 2 columns
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
plt.figure(figsize=(18, 18)) # in inches
for i, label in enumerate(labels):
x, y = low_dim_embs[i, :]
plt.scatter(x, y, s=2.0)
plt.annotate(label, xy=(x, y), xytext=(5, 2),
textcoords='offset points',ha='right',va='bottom')
plt.tight_layout()
plt.savefig('snapshot/lima_words_TSNE.png')
plt.show()
# Creating the tsne plot [Warning: will take time]
tsne = TSNE(perplexity=50.0, n_components=2, init='pca', n_iter=10000)
low_dim_embedding = tsne.fit_transform(np.array([word.vector for word in nouns]))
# Finally plotting and saving the fig
plot_with_labels(low_dim_embedding, [word.text for word in nouns])
tweets_df = pd.read_csv('tweets_solutions.csv')
tweets_df.head(1)
date | followers | location | Tweets | word_count | char_count | avg_word | hastags | numerics | Tweets_clean | ACTIONS | NOUNS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2018-10-20 01:57:03 | 92 | Lima | @elpanfletope @AlanGarciaPeru chancho no come chancho | 5 | 48 | 8.8 | 0 | 0 | elpanfletope alangarciaperu chancho come chancho | NaN | elpanfletope |
hashtag_summary = adv.extract_hashtags(tweets_df['Tweets'])
hashtag_summary.keys()
dict_keys(['hashtags', 'hashtags_flat', 'hashtag_counts', 'hashtag_freq', 'top_hashtags', 'overview'])
hashtag_summary['overview']
{'num_posts': 1000, 'num_hashtags': 344, 'hashtags_per_post': 0.344, 'unique_hashtags': 238}
hashtag_summary['hashtags'][:20]
[[], [], [], ['#semanademayordomia'], [], [], ['#cookingclass', '#seerundo', '#lima', '#peru', '#southamerica'], [], [], [], ['#eslae'], [], ['#comprometidosconlaverdad', '#forensesec'], [], [], [], [], [], [], []]
hashtag_summary['hashtag_counts'][:20]
[0, 0, 0, 1, 0, 0, 5, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0]
hashtag_summary['hashtag_freq'][:20]
[(0, 841), (1, 93), (2, 30), (3, 7), (4, 6), (5, 5), (6, 10), (7, 5), (8, 1), (9, 2)]
plt.figure(facecolor='#ebebeb', figsize=(11, 8))
plt.bar([x[0] for x in hashtag_summary['hashtag_freq'][:15]],
[x[1] for x in hashtag_summary['hashtag_freq'][:15]])
plt.title('Hashtag frequency')
plt.xlabel('Hashtags per tweet')
plt.ylabel('Number of tweets')
plt.yscale('log')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
hashtag_summary['top_hashtags'][:10]
[('#lima', 9), ('#mulderrctm', 9), ('#22díasparaveracaché', 6), ('#cristomoreno', 6), ('#señordelosmilagros', 6), ('#elorigendelorigen', 5), ('#esviernesyyonecesito', 5), ('#mesmorado', 5), ('#turrondedoñapepa', 5), ('#anticuchos', 5)]
plt.figure(facecolor='#ebebeb', figsize=(8, 12))
plt.barh([x[0] for x in hashtag_summary['top_hashtags'][2:][:5]][::-1],
[x[1] for x in hashtag_summary['top_hashtags'][2:][:5]][::-1])
plt.title('Top Hashtags')
# plt.xticks(range(3))
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
emoji_summary = adv.extract_emoji(tweets_df['Tweets'])
emoji_summary.keys()
dict_keys(['emoji', 'emoji_text', 'emoji_flat', 'emoji_flat_text', 'emoji_counts', 'emoji_freq', 'top_emoji', 'top_emoji_text', 'overview'])
emoji_summary['overview']
{'num_posts': 1000, 'num_emoji': 511, 'emoji_per_post': 0.511, 'unique_emoji': 132}
emoji_summary['emoji'][50:80]
[['📻', '📣'], [], [], [], [], [], ['🤔'], [], [], [], [], [], [], [], [], [], [], [], ['😦'], [], [], [], [], [], [], ['😀'], [], [], [], []]
emoji_summary['emoji_text'][50:80]
[['radio', 'megaphone'], [], [], [], [], [], ['thinking face'], [], [], [], [], [], [], [], [], [], [], [], ['frowning face with open mouth'], [], [], [], [], [], [], ['grinning face'], [], [], [], []]
emoji_summary['emoji_flat'][:10]
['💁🏻\u200d♀️', '🇪🇨', '🇪🇨', '🇪🇨', '🇪🇨', '🤔', '😆', '😆', '😆', '😆']
emoji_summary['emoji_flat_text'][:10]
['woman tipping hand light skin tone', 'Ecuador', 'Ecuador', 'Ecuador', 'Ecuador', 'thinking face', 'grinning squinting face', 'grinning squinting face', 'grinning squinting face', 'grinning squinting face']
list(zip(emoji_summary['emoji_flat'][:10], emoji_summary['emoji_flat_text'][:10]))
[('💁🏻\u200d♀️', 'woman tipping hand light skin tone'), ('🇪🇨', 'Ecuador'), ('🇪🇨', 'Ecuador'), ('🇪🇨', 'Ecuador'), ('🇪🇨', 'Ecuador'), ('🤔', 'thinking face'), ('😆', 'grinning squinting face'), ('😆', 'grinning squinting face'), ('😆', 'grinning squinting face'), ('😆', 'grinning squinting face')]
emoji_summary['emoji_counts'][:15]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 1]
emoji_summary['emoji_freq'][:15]
[(0, 808), (1, 108), (2, 27), (3, 23), (4, 11), (5, 9), (6, 2), (8, 1), (9, 2), (10, 1), (11, 2), (13, 1), (15, 1), (18, 1), (23, 1)]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.bar([x[0] for x in emoji_summary['emoji_freq'][:15]],
[x[1] for x in emoji_summary['emoji_freq'][:15]])
plt.title('Emoji frequency')
plt.xlabel('Emoji per tweet')
plt.ylabel('Number of tweets')
plt.yscale('log')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
emoji_summary['top_emoji'][:8]
[('😭', 39), ('😂', 37), ('🤣', 32), ('❤', 21), ('♥', 20), ('😍', 17), ('🤤', 14), ('😻', 14)]
emoji_summary['top_emoji_text'][:8]
[('loudly crying face', 39), ('face with tears of joy', 37), ('rolling on the floor laughing', 32), ('red heart', 21), ('heart suit', 20), ('smiling face with heart-eyes', 17), ('drooling face', 14), ('smiling cat face with heart-eyes', 14)]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.barh([x[0] for x in emoji_summary['top_emoji_text'][:8]][::-1],
[x[1] for x in emoji_summary['top_emoji_text'][:8]][::-1])
plt.title('Top Emoji')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
mention_summary = adv.extract_mentions(tweets_df.Tweets)
mention_summary.keys()
dict_keys(['mentions', 'mentions_flat', 'mention_counts', 'mention_freq', 'top_mentions', 'overview'])
mention_summary['overview']
{'num_posts': 1000, 'num_mentions': 886, 'mentions_per_post': 0.886, 'unique_mentions': 503}
mention_summary['mentions'][:15]
[['@elpanfletope', '@alangarciaperu'], ['@exitosape', '@julianaoxenford'], ['@anajuliachs'], ['@iasdesperanza'], ['@fernando_roman1'], [], [], [], [], ['@mininterperu', '@pcmperu', '@vizcarrhagan'], [], ['@sasha71396634', '@ernesto_jx', '@hninurta', '@gabospeed94', '@karla_ugaz', '@vero_mendoza_f'], [], [], []]
mention_summary['mentions_flat'][:10]
['@elpanfletope', '@alangarciaperu', '@exitosape', '@julianaoxenford', '@anajuliachs', '@iasdesperanza', '@fernando_roman1', '@mininterperu', '@pcmperu', '@vizcarrhagan']
mention_summary['mention_counts'][:20]
[2, 2, 1, 1, 1, 0, 0, 0, 0, 3, 0, 6, 0, 0, 0, 1, 1, 1, 0, 0]
mention_summary['mention_freq'][:15]
[(0, 484), (1, 341), (2, 105), (3, 29), (4, 11), (5, 6), (6, 4), (7, 10), (8, 10)]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.bar([x[0] for x in mention_summary['mention_freq'][:15]],
[x[1] for x in mention_summary['mention_freq'][:15]])
plt.title('Mention frequency')
plt.xlabel('Mention per tweet')
plt.ylabel('Number of tweets')
plt.grid(alpha=0.5)
plt.yscale('log')
plt.gca().set_frame_on(False)
plt.savefig('snapshot/Mention Frequency.png');
mention_summary['top_mentions'][:10]
[('@canaln_', 31), ('@franpetrozzi', 11), ('@idl_r', 10), ('@rppnoticias', 10), ('@rosamariabartra', 9), ('@keikofujimori', 8), ('@ximena_casanova', 7), ('@policiaperu', 7), ('@milagrosleivag', 7), ('@abbu25', 7)]
plt.figure(facecolor='#ebebeb', figsize=(8, 8))
plt.barh([x[0] for x in mention_summary['top_mentions'][:15]][::-1],
[x[1] for x in mention_summary['top_mentions'][:15]][::-1])
plt.title('Top Mentions')
plt.grid(alpha=0.5)
plt.gca().set_frame_on(False)
plt.savefig('snapshot/Top Mentions.png');
tweets_df.columns
Index(['date', 'followers', 'location', 'Tweets', 'word_count', 'char_count', 'avg_word', 'hastags', 'numerics', 'Tweets_clean', 'ACTIONS', 'NOUNS'], dtype='object')
extracted_tweets = (tweets_df[['Tweets', 'followers']]
.assign(hashtags=hashtag_summary['hashtags'],
hashcounts=hashtag_summary['hashtag_counts'],
mentions=mention_summary['mentions'],
mention_count=mention_summary['mention_counts'],
emoji=emoji_summary['emoji'],
emoji_text=emoji_summary['emoji_text'],
emoji_count=emoji_summary['emoji_counts'],))
extracted_tweets.head()
Tweets | followers | hashtags | hashcounts | mentions | mention_count | emoji | emoji_text | emoji_count | |
---|---|---|---|---|---|---|---|---|---|
0 | @elpanfletope @AlanGarciaPeru chancho no come chancho | 92 | [] | 0 | [@elpanfletope, @alangarciaperu] | 2 | [] | [] | 0 |
1 | @exitosape @JulianaOxenford A DONDE DICE ... | 32 | [] | 0 | [@exitosape, @julianaoxenford] | 2 | [] | [] | 0 |
2 | @anajuliachs kkkk 1 mês | 195 | [] | 0 | [@anajuliachs] | 1 | [] | [] | 0 |
3 | Ultima noche de #SemanaDeMayordomia en la @IASDEsperanza, aprendiendo sobre la importancia de ser fiel a Dios con l…... | 228 | [#semanademayordomia] | 1 | [@iasdesperanza] | 1 | [] | [] | 0 |
4 | @fernando_roman1 Jajaja | 123 | [] | 0 | [@fernando_roman1] | 1 | [] | [] | 0 |
extracted_tweets.columns
Index(['Tweets', 'followers', 'hashtags', 'hashcounts', 'mentions', 'mention_count', 'emoji', 'emoji_text', 'emoji_count'], dtype='object')
word_freq_hash = adv.word_frequency(extracted_tweets['hashtags'].str.join(' '),
extracted_tweets['followers'].fillna(0))#.sort_values(['abs_freq'], ascending=False).head(20)
word_freq_hash.head(10)
word | abs_freq | wtd_freq | rel_value | |
---|---|---|---|---|
0 | #lima | 9 | 430823 | 47869.0 |
1 | #drogas | 2 | 295632 | 147816.0 |
2 | #microcomercializar | 2 | 295632 | 147816.0 |
3 | #chorrillos | 1 | 147819 | 147819.0 |
4 | #esviernesyyonecesito | 5 | 147727 | 29545.0 |
5 | #renuncia | 1 | 107446 | 107446.0 |
6 | #pérez | 1 | 107446 | 107446.0 |
7 | #tablitasexcel | 2 | 92850 | 46425.0 |
8 | #22díasparaveracaché | 6 | 40669 | 6778.0 |
9 | #burnthestagethemovieinvzla | 1 | 39846 | 39846.0 |
extracted_tweets[extracted_tweets['hashtags'].str.join(' ')
.str.contains('lima',case=False)]
Tweets | followers | hashtags | hashcounts | mentions | mention_count | emoji | emoji_text | emoji_count | |
---|---|---|---|---|---|---|---|---|---|
6 | Cooking with fire in Peru today #cookingclass #seerundo #lima #peru #southamerica @ Lima, Peru https://t.co/llmzIMjMUo | 13 | [#cookingclass, #seerundo, #lima, #peru, #southamerica] | 5 | [] | 0 | [] | [] | 0 |
23 | Shopping for our cooking class today #mercado #seerundo #lima #peru #southamerica @ Mercado N#1 de Surquillo https:/... | 13 | [#mercado, #seerundo, #lima, #peru, #southamerica, #1] | 6 | [] | 0 | [] | [] | 0 |
252 | Feels good to be back home 🇵🇪 #peru #larcomar #lima @ Lima, Peru https://t.co/agFso0gRcs | 247 | [#peru, #larcomar, #lima] | 3 | [] | 0 | [🇵🇪] | [Peru] | 1 |
330 | #esviernesyyonecesito es ahora una tendencia en #Lima\n\nhttps://t.co/UOmWB9sTSw https://t.co/oZBAUS7GPe | 107446 | [#esviernesyyonecesito, #lima] | 2 | [] | 0 | [] | [] | 0 |
389 | By @mariotestino 🙌🏼 \n.\n.\n.\n.\n.\n.\n.\n.\n#igersperu #lima #museum #musee #peru #mariotestino #photography #mode... | 213 | [#igersperu, #lima, #museum, #musee, #peru, #mariotestino, #photography, #mode] | 8 | [@mariotestino] | 1 | [🙌🏼] | [raising hands medium-light skin tone] | 1 |
512 | #pérez es ahora una tendencia en #Lima\n\nhttps://t.co/CXgZJw73Ty https://t.co/a7wfyKchQU | 107446 | [#pérez, #lima] | 2 | [] | 0 | [] | [] | 0 |
653 | Pequeña\n#traveler #explorer #flower #lima #pic #nice #photograpy https://t.co/82r1wgHu3w | 553 | [#traveler, #explorer, #flower, #lima, #pic, #nice, #photograpy] | 7 | [] | 0 | [] | [] | 0 |
749 | #renuncia es ahora una tendencia en #Lima\n\nhttps://t.co/DhxjeOKeix https://t.co/61jYXcNSwk | 107446 | [#renuncia, #lima] | 2 | [] | 0 | [] | [] | 0 |
821 | 'cerradura', 'desconocidos' y 'amedrentamiento' es ahora una tendencia en #Lima\n\nhttps://t.co/zpEunuSxrb https://t... | 107446 | [#lima] | 1 | [] | 0 | [] | [] | 0 |
word_freq_mention = adv.word_frequency(extracted_tweets['mentions'].str.join(' '),
extracted_tweets['followers'].fillna(0))
#.sort_values(['abs_freq'], ascending=False).head(20)
word_freq_mention.head(10)
word | abs_freq | wtd_freq | rel_value | |
---|---|---|---|---|
0 | @gissellereyes | 1 | 214460 | 214460.0 |
1 | @ucatolicaec | 3 | 52617 | 17539.0 |
2 | @macara_oficial | 1 | 46425 | 46425.0 |
3 | @dcm_online | 1 | 44161 | 44161.0 |
4 | @brasil247 | 1 | 44161 | 44161.0 |
5 | @terranoticiasbr | 1 | 44160 | 44160.0 |
6 | @canaln_ | 31 | 31715 | 1023.0 |
7 | @jacquelinabravo | 1 | 31299 | 31299.0 |
8 | @alokadalis | 2 | 27305 | 13652.0 |
9 | @colrichardkemp | 1 | 23611 | 23611.0 |
word_freq_emoji = adv.word_frequency(extracted_tweets['emoji'].str.join(' '),
extracted_tweets['followers'].fillna(0))#.sort_values(['abs_freq'], ascending=False).head(20)
word_freq_emoji.head(10)
word | abs_freq | wtd_freq | rel_value | |
---|---|---|---|---|
0 | 😭 | 39 | 434614 | 11144.0 |
1 | 🖤 | 1 | 377740 | 377740.0 |
2 | 🤔 | 7 | 304059 | 43437.0 |
3 | 📣 | 3 | 295880 | 98627.0 |
4 | 🚔 | 2 | 295632 | 147816.0 |
5 | 👮🏿♀️ | 2 | 295632 | 147816.0 |
6 | 👮🏿♂️ | 2 | 295632 | 147816.0 |
7 | 👉 | 7 | 221454 | 31636.0 |
8 | 😝 | 2 | 214546 | 107273.0 |
9 | 🇪🇨 | 4 | 153264 | 38316.0 |
[adv.emoji_dict.emoji_dict[k] for k in word_freq_emoji['word'][:10]]
[':loudly_crying_face:', ':black_heart:', ':thinking_face:', ':megaphone:', ':oncoming_police_car:', ':woman_police_officer_dark_skin_tone:', ':man_police_officer_dark_skin_tone:', ':backhand_index_pointing_right:', ':squinting_face_with_tongue:', ':Ecuador:']
word_freq_emoji[:10].assign(emoji_text=[adv.emoji_dict.emoji_dict[k] for k in word_freq_emoji['word'][:10]])
word | abs_freq | wtd_freq | rel_value | emoji_text | |
---|---|---|---|---|---|
0 | 😭 | 39 | 434614 | 11144.0 | :loudly_crying_face: |
1 | 🖤 | 1 | 377740 | 377740.0 | :black_heart: |
2 | 🤔 | 7 | 304059 | 43437.0 | :thinking_face: |
3 | 📣 | 3 | 295880 | 98627.0 | :megaphone: |
4 | 🚔 | 2 | 295632 | 147816.0 | :oncoming_police_car: |
5 | 👮🏿♀️ | 2 | 295632 | 147816.0 | :woman_police_officer_dark_skin_tone: |
6 | 👮🏿♂️ | 2 | 295632 | 147816.0 | :man_police_officer_dark_skin_tone: |
7 | 👉 | 7 | 221454 | 31636.0 | :backhand_index_pointing_right: |
8 | 😝 | 2 | 214546 | 107273.0 | :squinting_face_with_tongue: |
9 | 🇪🇨 | 4 | 153264 | 38316.0 | :Ecuador: |
sotu_retweets = np.load('tweets_dict.npy')
def buildDataFrameFromDict(mapping):
df=[]
for f in mapping:
f_n = {}
for k,item in f.items():
if isinstance(item,dict):
for i,j in item.items():
f_n[k+'-'+i] = j
else:
f_n[k] = f[k]
df.append(f_n)
df = pd.DataFrame(df)
return df
sotu = buildDataFrameFromDict(sotu_retweets)
sotu.head()
contributors | coordinates | coordinates-coordinates | coordinates-type | created_at | display_text_range | entities-hashtags | entities-media | entities-symbols | entities-urls | ... | user-profile_text_color | user-profile_use_background_image | user-protected | user-screen_name | user-statuses_count | user-time_zone | user-translator_type | user-url | user-utc_offset | user-verified | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | None | NaN | NaN | NaN | Sat Oct 20 01:57:03 +0000 2018 | [30, 53] | [] | NaN | [] | [] | ... | 333333 | True | False | geriitx | 293 | None | none | https://m.facebook.com/geriita.ab?__user=1571199766 | None | False |
1 | None | NaN | NaN | NaN | Sat Oct 20 01:57:10 +0000 2018 | [28, 44] | [] | NaN | [] | [] | ... | 333333 | True | False | MurgaRodolfo | 256 | None | none | None | None | False |
2 | None | NaN | NaN | NaN | Sat Oct 20 01:57:12 +0000 2018 | [13, 23] | [] | NaN | [] | [] | ... | 333333 | True | False | GusRodrigues4 | 752 | None | none | None | None | False |
3 | None | NaN | NaN | NaN | Sat Oct 20 01:57:17 +0000 2018 | [0, 140] | [{'text': 'SemanaDeMayordomia', 'indices': [16, 35]}] | NaN | [] | [{'url': 'https://t.co/gMo19IEnh6', 'expanded_url': 'https://twitter.com/i/web/status/1053465144875991045', 'display... | ... | 333333 | True | False | IASDEsperanza | 698 | None | none | None | None | False |
4 | None | NaN | NaN | NaN | Sat Oct 20 01:57:18 +0000 2018 | [17, 23] | [] | NaN | [] | [] | ... | 333333 | True | False | mcthuglife666 | 1168 | None | none | None | None | False |
5 rows × 125 columns
for c in sotu.columns:
print(c)
contributors coordinates coordinates-coordinates coordinates-type created_at display_text_range entities-hashtags entities-media entities-symbols entities-urls entities-user_mentions extended_entities-media extended_tweet-display_text_range extended_tweet-entities extended_tweet-extended_entities extended_tweet-full_text favorite_count favorited filter_level geo geo-coordinates geo-type id id_str in_reply_to_screen_name in_reply_to_status_id in_reply_to_status_id_str in_reply_to_user_id in_reply_to_user_id_str is_quote_status lang place-attributes place-bounding_box place-country place-country_code place-full_name place-id place-name place-place_type place-url possibly_sensitive quote_count quoted_status-contributors quoted_status-coordinates quoted_status-created_at quoted_status-display_text_range quoted_status-entities quoted_status-extended_entities quoted_status-extended_tweet quoted_status-favorite_count quoted_status-favorited quoted_status-filter_level quoted_status-geo quoted_status-id quoted_status-id_str quoted_status-in_reply_to_screen_name quoted_status-in_reply_to_status_id quoted_status-in_reply_to_status_id_str quoted_status-in_reply_to_user_id quoted_status-in_reply_to_user_id_str quoted_status-is_quote_status quoted_status-lang quoted_status-place quoted_status-possibly_sensitive quoted_status-quote_count quoted_status-quoted_status_id quoted_status-quoted_status_id_str quoted_status-reply_count quoted_status-retweet_count quoted_status-retweeted quoted_status-source quoted_status-text quoted_status-truncated quoted_status-user quoted_status_id quoted_status_id_str quoted_status_permalink-display quoted_status_permalink-expanded quoted_status_permalink-url reply_count retweet_count retweeted source text timestamp_ms truncated user-contributors_enabled user-created_at user-default_profile user-default_profile_image user-description user-favourites_count user-follow_request_sent user-followers_count user-following user-friends_count user-geo_enabled user-id user-id_str user-is_translator user-lang user-listed_count user-location user-name user-notifications user-profile_background_color user-profile_background_image_url user-profile_background_image_url_https user-profile_background_tile user-profile_banner_url user-profile_image_url user-profile_image_url_https user-profile_link_color user-profile_sidebar_border_color user-profile_sidebar_fill_color user-profile_text_color user-profile_use_background_image user-protected user-screen_name user-statuses_count user-time_zone user-translator_type user-url user-utc_offset user-verified
data = sotu.sample(10)['user-screen_name']
sotu['retweeted_status-user-screen_name'] = np.random.choice(data,len(sotu))
sotu[['user-screen_name','retweeted_status-user-screen_name']].head()
user-screen_name | retweeted_status-user-screen_name | |
---|---|---|
0 | geriitx | hhbacigalupo |
1 | MurgaRodolfo | hhbacigalupo |
2 | GusRodrigues4 | mpauta |
3 | IASDEsperanza | hhbacigalupo |
4 | mcthuglife666 | casidi13243 |
G_rt = nx.from_pandas_edgelist(
sotu,
source = 'user-screen_name',
target = 'retweeted_status-user-screen_name',
create_using = nx.DiGraph())
print('Nodes in RT network:', len(G_rt.nodes()))
print('Edges in RT network:', len(G_rt.edges()))
Nodes in RT network: 505 Edges in RT network: 855
G_reply = nx.from_pandas_edgelist(
sotu,
source = 'user-screen_name',
target = 'in_reply_to_screen_name',
create_using = nx.DiGraph())
print('Nodes in reply network:', len(G_reply.nodes()))
print('Edges in reply network:', len(G_reply.edges()))
Nodes in reply network: 796 Edges in reply network: 708
pos = nx.random_layout(G_rt)
sizes = [x[1] for x in G_rt.degree()]
nx.draw_networkx(G_rt, pos,
with_labels = False,
node_size = sizes,
width = 0.1, alpha = 0.7,
arrowsize = 2, linewidths = 0)
plt.savefig('snapshot/lima_tweets_influencing_graph.png')
plt.axis('off'); plt.show()
pos = nx.random_layout(G_reply)
sizes = [x[1] for x in G_reply.degree()]
nx.draw_networkx(G_reply, pos,
with_labels = False,
node_size = sizes,
width = 0.1, alpha = 0.7,
arrowsize = 2, linewidths = 0)
plt.axis('off'); plt.show()
column_names = ['screen_name', 'degree_centrality']
rt_centrality = nx.in_degree_centrality(G_rt)
reply_centrality = nx.in_degree_centrality(G_reply)
rt = pd.DataFrame(list(rt_centrality.items()), columns = column_names)
reply = pd.DataFrame(list(reply_centrality.items()), columns = column_names)
display(rt.sort_values('degree_centrality', ascending = False).head())
display(reply.sort_values('degree_centrality', ascending = False).head())
screen_name | degree_centrality | |
---|---|---|
1 | hhbacigalupo | 0.208333 |
12 | NOTIELMOMENTO | 0.178571 |
7 | casidi13243 | 0.176587 |
4 | mpauta | 0.174603 |
17 | Rockmanita | 0.168651 |
screen_name | degree_centrality | |
---|---|---|
7 | None | 0.441509 |
105 | canalN_ | 0.015094 |
83 | SolCn | 0.005031 |
400 | CesarNakazaki | 0.005031 |
448 | IDL_R | 0.003774 |
column_names = ['screen_name', 'betweenness_centrality']
# Generate betweenness centrality for retweets
rt_centrality = nx.betweenness_centrality(G_rt)
# Generate betweenness centrality for replies
reply_centrality = nx.betweenness_centrality(G_reply)
# Store centralities in data frames
rt = pd.DataFrame(list(rt_centrality.items()), columns = column_names)
reply = pd.DataFrame(list(reply_centrality.items()), columns = column_names)
# Print first five results in descending order of centrality
display(rt.sort_values('betweenness_centrality', ascending = False).head())
# Print first five results in descending order of centrality
display(reply.sort_values('betweenness_centrality', ascending = False).head())
screen_name | betweenness_centrality | |
---|---|---|
12 | NOTIELMOMENTO | 0.004589 |
27 | FedericoClemen8 | 0.004563 |
10 | marioporlavida | 0.003482 |
17 | Rockmanita | 0.003354 |
1 | hhbacigalupo | 0.003119 |
screen_name | betweenness_centrality | |
---|---|---|
413 | mindreaux | 0.000003 |
442 | famasem | 0.000003 |
405 | Ldavidesc | 0.000002 |
0 | geriitx | 0.000000 |
526 | rolo18al | 0.000000 |
column_names = ['screen_name', 'degree']
degree_rt = pd.DataFrame(list(G_rt.in_degree()), columns = column_names)
degree_reply = pd.DataFrame(list(G_reply.in_degree()), columns = column_names)
ratio = degree_rt.merge(degree_reply, on = 'screen_name', suffixes = ('_rt', '_reply'))
ratio['ratio'] = ratio['degree_reply'] / ratio['degree_rt']
ratio = ratio[ratio['degree_rt'] >= 5]
display(ratio.sort_values('ratio', ascending = False).head())
screen_name | degree_rt | degree_reply | ratio | |
---|---|---|---|---|
1 | hhbacigalupo | 105 | 0 | 0.0 |
4 | mpauta | 88 | 0 | 0.0 |
7 | casidi13243 | 89 | 0 | 0.0 |
10 | marioporlavida | 82 | 0 | 0.0 |
12 | NOTIELMOMENTO | 90 | 0 | 0.0 |