import twitter
import os

# SECURITY: these OAuth credentials were hard-coded in source control, which
# means they must be treated as leaked and rotated. They are now read from the
# environment, falling back to the original literals so existing runs keep
# working until the keys are rotated.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "52Nu7ubm2szT1JyJEOB7V2lGM")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "mqA94defqjioyWeMxdJsSduthxdMMGd2vfOUKvOFpm0n7JTqfY")
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "16065520-USf3DBbQAh6ZA8CnSAi6NAUlkorXdppRXpC4cQCKk")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "DowMQeXqh5ZsGvZGrmUmkI0iCmI34ShFzKF3iOdiilpX5")

# Argument order required by this twitter library: token, token secret,
# consumer key, consumer secret. retry=True re-issues rate-limited calls.
authorization = twitter.OAuth(access_token, access_token_secret, consumer_key, consumer_secret)
t = twitter.Twitter(auth=authorization, retry=True)
import os

# Output location for collected tweets: ~/Data/twitter/python_tweets.json
data_folder = os.path.join(os.path.expanduser("~"), "Data", "twitter")
output_filename = os.path.join(data_folder, "python_tweets.json")

import json

# Accumulators filled by the search below:
#   original_users - screen names of tweet authors
#   tweets         - tweet texts
#   user_ids       - screen name -> numeric user id
original_users, tweets, user_ids = [], [], {}
# Pull the 100 most recent tweets matching "python" and record each author.
statuses = t.search.tweets(q="python", count=100)['statuses']
for status in statuses:
    if 'text' in status:
        screen_name = status['user']['screen_name']
        original_users.append(screen_name)
        user_ids[screen_name] = status['user']['id']
        tweets.append(status['text'])
len(tweets)  # notebook echo: how many tweets were kept

model_filename = os.path.join(os.path.expanduser("~"), "Models", "twitter", "python_context.pkl")
from sklearn.base import TransformerMixin
from nltk import word_tokenize
class NLTKBOW(TransformerMixin):
    """Scikit-learn transformer producing a bag-of-words dict per document.

    Each document is tokenised with NLTK and mapped to {token: True},
    suitable for feeding into a DictVectorizer.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, X):
        # dict.fromkeys collapses duplicate tokens exactly like the
        # original {word: True for word in ...} comprehension.
        return [dict.fromkeys(word_tokenize(document), True) for document in X]
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package, falling back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

context_classifier = joblib.load(model_filename)
y_pred = context_classifier.predict(tweets)
y_pred  # notebook echo of the predictions

# Keep tweets predicted as relevant (label 1), together with their authors.
# zip pairs each tweet/user with its prediction instead of indexing by
# range(len(...)).
relevant_tweets = [tw for tw, label in zip(tweets, y_pred) if label == 1]
relevant_users = [user for user, label in zip(original_users, y_pred) if label == 1]
print(len(relevant_tweets))
import time
import sys
def get_friends(t, user_id):
    """Collect up to ~10,000 friend ids for `user_id` via the Twitter API.

    Pages through t.friends.ids with cursors, sleeping out rate limits.
    Returns a list of friend ids (possibly empty if the API errors out).
    """
    friends = []
    cursor = -1  # -1 requests the first page; 0 signals no more pages
    results = None  # so the TypeError handler below can inspect the last response
    while cursor != 0:
        try:
            results = t.friends.ids(user_id=user_id, cursor=cursor, count=5000)
            # The original wrapped this in a listcomp that shadowed `friends`
            # to no effect; extend directly.
            friends.extend(results['ids'])
            cursor = results['next_cursor']
            if len(friends) >= 10000:
                break  # cap per-user collection
            if cursor != 0:
                print("Collected {} friends so far, but there are more".format(len(friends)))
                sys.stdout.flush()  # was `sys.stdout.flush` -- never actually called
        except TypeError as e:
            # A rate-limited call yields a None response, so results['ids']
            # raises TypeError with results still None.
            if results is None:
                print("You probably reached your API limit, waiting for 5 minutes")
                sys.stdout.flush()
                time.sleep(5 * 60)  # 5 minute wait
            else:
                raise e
        except twitter.TwitterHTTPError as e:
            # e.g. protected or deleted account -- return what we have
            break
        finally:
            time.sleep(60)  # Wait 1 minute before continuing, to respect rate limits
    return friends
# Sanity check: fetch the friends of the first relevant user.
test_friends = get_friends(t, user_ids[relevant_users[0]])
print(test_friends)

friends = {}
for screen_name in relevant_users:
    # The original printed `user` and indexed `user_id[user]` -- both
    # undefined names (NameError). The loop variable is screen_name and
    # the mapping is user_ids.
    print("Obtaining friends for user {}".format(screen_name))
    sys.stdout.flush()
    user_id = user_ids[screen_name]
    friends[user_id] = get_friends(t, user_id)

# Drop users for whom no friends could be collected.
friends = {user_id: friend_list for user_id, friend_list in friends.items()
           if len(friend_list) > 0}
from collections import defaultdict
def count_friends(friends):
    """Count how many of our users follow each friend id.

    `friends` maps user id -> iterable of friend ids. Returns a
    defaultdict mapping friend id -> number of users who follow them.
    """
    counts = defaultdict(int)
    for friend_ids in friends.values():
        for friend_id in friend_ids:
            counts[friend_id] += 1
    return counts
#TODO: Remove before production
import os
import json

data_folder = os.path.join(os.path.expanduser("~"), "Data", "twitter")
friends_filename = os.path.join(data_folder, "python_friends.json")

# Reload a previously saved friends mapping rather than re-crawling.
with open(friends_filename) as friends_file:
    friends = json.load(friends_file)
friend_count = count_friends(friends)

from operator import itemgetter
# Rank candidate users by how many of our users follow them.
best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)
best_friends[:10]  # notebook echo of the top candidates
# Snowball sampling: keep crawling the most-followed uncrawled user until we
# have friend lists for 150 users.
while len(friends) < 150:
    # Get the best friend that isn't already in our list
    # (user 467407284 is explicitly skipped).
    for user_id, count in best_friends:
        if user_id not in friends and str(user_id) != '467407284':
            break
    else:
        # No uncrawled candidate remains. The original fell through with the
        # last (already-crawled) user_id and looped forever; bail out instead.
        break
    print("Getting friends of user {}".format(user_id))
    sys.stdout.flush()
    friends[user_id] = get_friends(t, user_id)
    print("Received {} friends".format(len(friends[user_id])))
    print("We now have the friends of {} users".format(len(friends)))
    sys.stdout.flush()
    # Update friend_count with the newly collected friends
    for friend in friends[user_id]:
        friend_count[friend] += 1
    # Update the best friends list
    best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)
import json

# Persist the collected friends mapping for later sessions.
friends_filename = os.path.join(data_folder, "python_friends.json")
with open(friends_filename, 'w') as friends_file:
    json.dump(friends, friends_file)
import networkx as nx
G = nx.DiGraph()
main_users = friends.keys()
G.add_nodes_from(main_users)
for user_id in friends:
for friend in friends[user_id]:
if friend in main_users:
G.add_edge(user_id, friend)
G
%matplotlib inline
nx.draw(G)
from matplotlib import pyplot as plt
plt.figure(3,figsize=(10, 10))
nx.draw(G)
import os
import json
data_folder = os.path.join(os.path.expanduser("~"), "Data", "twitter")
friends_filename = os.path.join(data_folder, "python_friends.json")
with open(friends_filename) as inf:
friends = json.load(inf)
len(friends.keys())
import networkx as nx
G = nx.DiGraph()
main_users = [int(f) for f in friends.keys()]
G.add_nodes_from(main_users)
for user_id in friends:
for friend in friends[user_id]:
if friend in main_users:
G.add_edge(user_id, friend)
mu = set(main_users)
au = set([f for ff in friends.values() for f in ff])
len(mu), len(au), len(mu & au)
%matplotlib inline
nx.draw(G)
from matplotlib import pyplot as plt
plt.figure(3,figsize=(40,40))
nx.draw(G, alpha=0.1, edge_color='b', node_color='g', node_size=2000)
plt.axis('on')
plt.xlim(0.45, 0.55)
plt.ylim(0.45, 0.55)
from matplotlib import pyplot as plt
plt.figure(3,figsize=(10, 10))
# Degree distribution, largest first. dict(...) accepts both the plain dict
# returned by networkx 1.x and the DegreeView returned by networkx 2.x,
# where calling .values() directly on the 2.x view would fail.
connection_number = sorted(dict(nx.degree(G)).values(), reverse=True)
plt.plot(connection_number,'r-')

# Power law example
import numpy as np
import matplotlib.pyplot as plt
data = 1 - np.random.power(5, (20000,))
plt.hist(data, bins=10,facecolor="r")
# Convert each friend list to a set for the Jaccard computations below.
friends = {user: set(friend_ids) for user, friend_ids in friends.items()}
def compute_similarity(friends1, friends2):
    """Jaccard similarity between two sets of friend ids.

    Returns 0.0 when both sets are empty (the original raised
    ZeroDivisionError in that case).
    """
    union = friends1 | friends2
    if not union:
        return 0.0
    return len(friends1 & friends2) / len(union)
def create_graph(followers, threshold=0):
    """Build a weighted, undirected similarity graph over users.

    `followers` maps user id -> set of friend ids. An edge joins each pair
    of users whose Jaccard similarity (compute_similarity) is >= threshold,
    with the similarity stored as the edge weight.
    """
    G = nx.Graph()
    # The original iterated the global `friends`, silently ignoring the
    # `followers` parameter; use the parameter.
    for user1 in followers.keys():
        for user2 in followers.keys():
            if user1 == user2:
                continue
            weight = compute_similarity(followers[user1], followers[user2])
            if weight >= threshold:
                G.add_node(user1)
                G.add_node(user2)
                G.add_edge(user1, user2, weight=weight)
    return G
G = create_graph(friends)
plt.figure(figsize=(10,10))
# Force-directed layout; each edge is drawn with width equal to its
# similarity weight.
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos)
edgewidth = [attrs['weight'] for (_, _, attrs) in G.edges(data=True)]
nx.draw_networkx_edges(G, pos, width=edgewidth)
# Raise the similarity threshold and inspect the resulting components.
G = create_graph(friends, 0.1)
sub_graphs = nx.connected_component_subgraphs(G)
for i, sub_graph in enumerate(sub_graphs):
    print("Subgraph {0} has {1} nodes".format(i, len(sub_graph.nodes())))

G = create_graph(friends, 0.25)
sub_graphs = nx.connected_component_subgraphs(G)
for i, sub_graph in enumerate(sub_graphs):
    print("Subgraph {0} has {1} nodes".format(i, len(sub_graph.nodes())))

# NOTE(review): connected_component_subgraphs was removed in networkx 2.4,
# so this assumes an older networkx; also raises IndexError if there are
# fewer than 7 components.
sub_graphs = nx.connected_component_subgraphs(G)
nx.draw(list(sub_graphs)[6])
fig = plt.figure()
# IPython "?" help request (notebook-only syntax; invalid in a plain .py file)
fig.add_subplot?
# Draw each connected component in its own subplot, two per row.
sub_graphs = nx.connected_component_subgraphs(G)
n_subgraphs = nx.number_connected_components(G)
fig = plt.figure(figsize=(20, (n_subgraphs * 2)))
# Ceiling division so an odd component count still gets enough rows.
n_rows = (n_subgraphs + 1) // 2
for i, sub_graph in enumerate(sub_graphs):
    # add_subplot indices are 1-based; the original passed i, so the first
    # component (i == 0) raised an error.
    ax = fig.add_subplot(n_rows, 2, i + 1)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)
    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)
from sklearn.metrics import silhouette_score
def compute_silhouette(threshold, friends):
    """Silhouette score of the connected-component clustering at `threshold`.

    Builds the similarity graph, labels each node by its connected
    component, and scores the clustering on a 1 - similarity distance
    matrix. Returns -99 when the silhouette is undefined (fewer than two
    nodes, or an invalid number of components).
    """
    G = create_graph(friends, threshold=threshold)
    if len(G.nodes()) < 2:
        return -99
    if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1):
        return -99
    label_dict = {}
    for component_index, component in enumerate(nx.connected_components(G)):
        for node in component:
            label_dict[node] = component_index
    labels = np.array([label_dict[node] for node in G.nodes()])
    # Edge weights are similarities, but metric='precomputed' expects
    # distances: densify and use 1 - similarity, matching the corrected
    # version of this function defined later in the file. (Debug prints of
    # each component were removed.)
    X = 1 - nx.to_scipy_sparse_matrix(G).todense()
    return silhouette_score(X, labels, metric='precomputed')
compute_silhouette(0.25, friends)
import os
import json

# Reload the saved friends mapping and normalise each value to a set.
data_folder = os.path.join(os.path.expanduser("~"), "Data", "twitter")
friends_filename = os.path.join(data_folder, "python_friends.json")
with open(friends_filename) as friends_file:
    friends = json.load(friends_file)
friends = {user: set(friend_ids) for user, friend_ids in friends.items()}
def compute_similarity(friends1, friends2):
    """Jaccard similarity between two collections of friend ids.

    Accepts any iterables (both are converted to sets, so duplicates are
    ignored). Returns 0.0 when both are empty, where the original raised
    ZeroDivisionError.
    """
    set_friends1 = set(friends1)
    set_friends2 = set(friends2)
    union = set_friends1 | set_friends2
    if not union:
        return 0.0
    return len(set_friends1 & set_friends2) / len(union)
import networkx as nx
def create_graph(friends, threshold=0):
    """Build a weighted, undirected similarity graph over `friends`.

    `friends` maps user id -> set of friend ids. An edge joins each pair
    of users whose Jaccard similarity is >= threshold; the similarity is
    stored as the edge weight.
    """
    G = nx.Graph()
    for user1 in friends.keys():
        for user2 in friends.keys():
            if user1 == user2:
                continue
            weight = compute_similarity(friends[user1], friends[user2])
            # The original also accumulated every weight in a `weights`
            # list that was never used; that dead code is removed.
            if weight >= threshold:
                G.add_node(user1)
                G.add_node(user2)
                G.add_edge(user1, user2, weight=weight)
    return G
G = create_graph(friends, 0)
%matplotlib inline
from matplotlib import pyplot as plt
plt.figure(figsize=(10,10))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos, node_size=500)
edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]
nx.draw_networkx_edges(G, pos, width=edgewidth)
# Inspect component sizes at two candidate thresholds.
G = create_graph(friends, 0.1)
sub_graphs = nx.connected_component_subgraphs(G)
for i, sub_graph in enumerate(sub_graphs):
    print("Subgraph {0} has {1} nodes".format(i, len(sub_graph.nodes())))

G = create_graph(friends, 0.15)
sub_graphs = nx.connected_component_subgraphs(G)
for i, sub_graph in enumerate(sub_graphs):
    print("Subgraph {0} has {1} nodes".format(i, len(sub_graph.nodes())))
# Label each node by the index of its connected component, then colour the
# nodes by that label.
sub_graphs = nx.connected_component_subgraphs(G)
label_dict = {}
for component_index, component in enumerate(sub_graphs):
    for node in component.nodes():
        label_dict[node] = component_index
labels = [label_dict[node] for node in G.nodes()]
plt.figure(figsize=(10,10))
nx.draw(G,node_color=labels,cmap=plt.cm.Paired, node_size=500)
sub_graphs = nx.connected_component_subgraphs(G)
plt.figure(figsize=(10,10))
# One shared layout for the whole graph; draw each component over it so the
# default colour cycle distinguishes them.
pos = nx.spring_layout(G)
for component in sub_graphs:
    nx.draw_networkx_nodes(G, pos, component.nodes(),node_size=500)
    nx.draw_networkx_edges(G, pos, component.edges())
# Draw each connected component in its own subplot, two per row.
sub_graphs = nx.connected_component_subgraphs(G)
n_subgraphs = nx.number_connected_components(G)
fig = plt.figure(figsize=(20, (n_subgraphs * 3)))
# Ceiling division so an odd component count still gets enough rows.
n_rows = (n_subgraphs + 1) // 2
for i, sub_graph in enumerate(sub_graphs):
    # add_subplot indices are 1-based; the original passed i, so the first
    # component (i == 0) raised an error.
    ax = fig.add_subplot(n_rows, 2, i + 1)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    pos = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, sub_graph.nodes(), ax=ax, node_size=500)
    nx.draw_networkx_edges(G, pos, sub_graph.edges(), ax=ax)
#from sklearn.metrics import silhouette_score
import numpy as np
def compute_silhouette(threshold, friends):
    """Silhouette score for the components of the graph built at `threshold`.

    Returns -99 when the score is undefined: an empty graph, fewer than
    two components, or nearly as many components as nodes.
    """
    G = create_graph(friends, threshold=threshold)
    if len(G.nodes()) == 0:
        return -99  # Invalid graph
    if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1):
        return -99  # Invalid number of components, Silhouette not defined
    # Label each node by the index of its connected component.
    label_dict = {}
    for component_index, component in enumerate(nx.connected_component_subgraphs(G)):
        for node in component.nodes():
            label_dict[node] = component_index
    labels = np.array([label_dict[node] for node in G.nodes()])
    # Similarity -> distance. NOTE(review): this also turns the zero
    # diagonal into 1s; confirm silhouette_score tolerates nonzero
    # self-distances for metric='precomputed'.
    X = 1 - nx.to_scipy_sparse_matrix(G).todense()
    return silhouette_score(X, labels, metric='precomputed')
print(compute_silhouette(0.1, friends))
from scipy.optimize import minimize #(fun, x0, args=(),
def invert(func):
    """Wrap `func` so the wrapper returns the negation of its result.

    Lets scipy's minimize() be used to *maximise* compute_silhouette.
    """
    def negated(*args, **kwds):
        return -func(*args, **kwds)
    return negated
# Search for the threshold that maximises the silhouette by minimising its
# negation, starting from 0.1 with a derivative-free method.
result = minimize(invert(compute_silhouette), x0=0.1, method='nelder-mead', args=(friends,), options={'maxiter': 10})
print(result)
# Rebuild the graph at the threshold found above and inspect its components.
G = create_graph(friends, threshold=0.135)
sub_graphs = nx.connected_component_subgraphs(G)
for component_index, component in enumerate(sub_graphs):
    print("Subgraph {0} has {1} nodes".format(component_index, len(component.nodes())))
labels  # notebook echo (these labels were computed for an earlier graph)
# Dense distance matrix (1 - similarity) for the hand-rolled silhouette below.
X = 1-nx.to_scipy_sparse_matrix(G).todense()
def silhouette_score(X, labels, metric='precomputed'):
    """Mean of the per-sample silhouette coefficients.

    Shadows sklearn's silhouette_score; X is a precomputed distance matrix.
    """
    label_array = np.array(labels)
    print(label_array.shape)
    samples = silhouette_samples(X, label_array, metric=metric)
    return np.mean(samples)
def silhouette_samples(X, labels, metric='precomputed'):
    """Per-sample silhouette coefficient s = (b - a) / max(a, b).

    a is the mean intra-cluster distance and b the mean distance to the
    nearest other cluster. X is treated as a precomputed distance matrix.
    """
    print(X.shape)
    distances = X  # metric is assumed to be 'precomputed'
    n_samples = labels.shape[0]
    A = np.array([_intra_cluster_distance(distances[i], labels, i)
                  for i in range(n_samples)])
    B = np.array([_nearest_cluster_distance(distances[i], labels, i)
                  for i in range(n_samples)])
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)
def _intra_cluster_distance(distances_row, labels, i):
"""Calculate the mean intra-cluster distance for sample i.
Parameters
----------
distances_row : array, shape = [n_samples]
Pairwise distance matrix between sample i and each sample.
labels : array, shape = [n_samples]
label values for each sample
i : int
Sample index being calculated. It is excluded from calculation and
used to determine the current label
Returns
-------
a : float
Mean intra-cluster distance for sample i
"""
mask = (labels == labels[i])
mask[i] = False
mask = mask.reshape(distances_row.shape)
#print("Cluster {}".format(i))
#print(mask)
#print(distances_row.flatten())
#print(distances_row.flatten()[mask])
a = np.mean(distances_row[mask])
return a
def _nearest_cluster_distance(distances_row, labels, i):
"""Calculate the mean nearest-cluster distance for sample i.
Parameters
----------
distances_row : array, shape = [n_samples]
Pairwise distance matrix between sample i and each sample.
labels : array, shape = [n_samples]
label values for each sample
i : int
Sample index being calculated. It is used to determine the current
label.
Returns
-------
b : float
Mean nearest-cluster distance for sample i
"""
label = labels[i]
b = np.min([np.mean(distances_row[(labels == cur_label).reshape(distances_row.shape)])
for cur_label in set(labels) if not cur_label == label])
return b
# Score the threshold-0.135 graph with the hand-rolled silhouette above.
# NOTE(review): `labels` was computed for an earlier graph (threshold 0.15)
# while X comes from the threshold-0.135 graph -- confirm the shapes agree.
silhouette_score(X, labels, metric='precomputed')