Exports nodes and edges from tweets (built from retweets, mentions, or replies) in a CSV file exported from SFM, and saves them in a file format compatible with various social network graph tools such as Gephi, Cytoscape, Kumu, etc. The resulting edges are directed (Source -> Target).
import sys
import json
import re
import numpy as np
from datetime import datetime
import pandas as pd
# Path of the tweet CSV that the rest of the script works from.
tweetfile = '/home/soominpark/sfmproject/Work/Network Graphs/food_security.csv'
# Load the whole CSV into a DataFrame; every later step filters or selects
# columns from this frame.
tweets = pd.read_csv(filepath_or_buffer=tweetfile)
# 1. Export edges from Retweets
# Keep only the rows flagged as retweets. The explicit .copy() makes
# `retweets` an independent frame, so adding a column to it below no longer
# triggers pandas' SettingWithCopyWarning.
retweets = tweets[tweets['is_retweet'] == 'Yes'].copy()
# The retweeted account is the handle captured from the "RT @handle:" marker
# in the tweet text. The pattern is a raw string so that \w is passed to the
# regex engine untouched; expand=False returns a Series (one capture group),
# which is what a single-column assignment wants. Rows whose text does not
# match get NaN.
retweets['original_twitter'] = retweets['text'].str.extract(
    r'RT @([a-zA-Z0-9]\w{0,}):', expand=False)
# Edge list: retweeter -> retweeted account. 'created_at' is carried along
# under the name 'Strength'; the later groupby counts its entries per pair.
edges = retweets[['screen_name', 'original_twitter', 'created_at']].copy()
edges.columns = ['Source', 'Target', 'Strength']
/home/soominpark/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy app.launch_new_instance()
# 2. Export edges from Mentions
# Only tweets with a non-null 'mentions' value can contribute edges.
mentions = tweets[tweets['mentions'].notnull()]
# Collect one [Source, Target, Strength] record per mentioned account and
# build the DataFrame once at the end. Appending to a DataFrame inside the
# loop copies the whole frame on every iteration, and DataFrame.append was
# removed in pandas 2.0.
edge_rows = []
for index, row in mentions.iterrows():
    # 'mentions' is split on ", " to get the individual mentioned handles.
    mention_list = row['mentions'].split(", ")
    for mention in mention_list:
        edge_rows.append([row['screen_name'], mention, row['created_at']])
# Note: this rebinds `edges`, replacing any edge list built earlier.
edges = pd.DataFrame(edge_rows, columns=['Source', 'Target', 'Strength'])
# 3. Export edges from Replies
# A tweet is a reply when 'in_reply_to_screen_name' is not null.
replies = tweets.loc[tweets['in_reply_to_screen_name'].notnull()]
# Edge list: replier -> account replied to, with 'created_at' carried along
# under the name 'Strength'. This rebinds `edges`.
edges = replies[['screen_name', 'in_reply_to_screen_name', 'created_at']].rename(
    columns={
        'screen_name': 'Source',
        'in_reply_to_screen_name': 'Target',
        'created_at': 'Strength',
    })
# Network connection strength level: the minimum number of edge rows a
# (Source, Target) pair must have to be kept.
# With 1, every pair that is connected at least once is kept; with 5, only
# pairs connected at least 5 times are kept, so only the strongest bonds
# remain.
strengthLevel = 3
# Count the non-null 'Strength' entries for each (Source, Target) pair and turn
# the grouped result back into a flat three-column frame.
edges2 = edges.groupby(['Source', 'Target'])['Strength'].count().reset_index()
# Drop the pairs that fall below the chosen threshold.
edges2 = edges2.loc[edges2['Strength'] >= strengthLevel]
# Export nodes from the edges and add node attributes for both Sources and Targets.
# One attribute row per account: sort so that, within each screen_name, the
# row with the largest followers_count comes first, then keep that first row.
users = tweets[['screen_name', 'followers_count', 'friends_count']]
users = users.sort_values(['screen_name', 'followers_count'], ascending=[True, False])
users = users.drop_duplicates(['screen_name'], keep='first')
# Every account appearing at either end of a kept edge becomes a node.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
ids = pd.concat([edges2['Source'], edges2['Target']]).to_frame(name='screen_name')
# The node label is simply the screen name.
ids['Label'] = ids['screen_name']
ids = ids.drop_duplicates(['screen_name'], keep='first')
# Left join: accounts that only appear as a Target and have no row in `tweets`
# are kept, with missing (NaN) follower/friend counts.
nodes = pd.merge(ids, users, on='screen_name', how='left')
# Quick size check: (rows, columns) of the node and edge tables.
print(nodes.shape)
print(edges2.shape)
(463, 4) (349, 3)
# Rename the edge endpoint columns for Kumu import (run this only when the
# output is going to Kumu); 'Strength' keeps its name.
edges2 = edges2.rename(columns={'Source': 'From', 'Target': 'To'})
# Show the first three node rows as a sanity check
nodes.iloc[:3]
 | screen_name | Label | followers_count | friends_count |
---|---|---|---|---|
0 | 104boild | 104boild | 3665.0 | 3641.0 |
1 | AapRajasthan_ | AapRajasthan_ | 30.0 | 391.0 |
2 | ActuallyD0NG | ActuallyD0NG | 415.0 | 457.0 |
# Show the first three edge rows as a sanity check
edges2.iloc[:3]
 | Source | Target | Strength |
---|---|---|---|
45 | 104boild | 104boild | 5 |
604 | AapRajasthan_ | neelesh403 | 3 |
606 | AapRajasthan_ | sardesairajdeep | 3 |
# Write the node and edge tables to CSV files in the current working
# directory, UTF-8 encoded and without the DataFrame index column.
for frame, csv_name in ((nodes, 'nodes.csv'), (edges2, 'edges.csv')):
    frame.to_csv(csv_name, encoding='utf-8', index=False)