import twitter

def oauth_login():
    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials, which you'll need to provide in place of the
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information
    # on Twitter's OAuth implementation.

    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)

    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api

# Sample usage
twitter_api = oauth_login()

# Nothing to see by displaying twitter_api except that it's now a
# defined variable
print twitter_api


import json
import pymongo # pip install pymongo

def insert_into_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):

    # Connects to the MongoDB server running on
    # localhost:27017 by default
    client = pymongo.MongoClient(**mongo_conn_kw)

    # Get a reference to a particular database
    db = client[mongo_db]

    # Reference a particular collection in the database
    coll = db[mongo_db_coll]

    # Perform a bulk insert and return the IDs
    return coll.insert(data)

# If a document with the same _id already exists, insert_into_mongo raises an
# error. save_to_mongo will create a new document if the _id does not exist,
# or replace the old document with the new one if it does.
def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):

    # Connects to the MongoDB server running on
    # localhost:27017 by default
    client = pymongo.MongoClient(**mongo_conn_kw)

    # Get a reference to a particular database
    db = client[mongo_db]

    # Reference a particular collection in the database
    coll = db[mongo_db_coll]

    return coll.save(data)

def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):

    # Optionally, use criteria and projection to limit the data that is
    # returned, as documented in
    # http://docs.mongodb.org/manual/reference/method/db.collection.find/

    # Consider leveraging MongoDB's aggregation framework for more
    # sophisticated queries.

    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    coll = db[mongo_db_coll]

    if criteria is None:
        criteria = {}

    if projection is None:
        cursor = coll.find(criteria)
    else:
        cursor = coll.find(criteria, projection)

    # Returning a cursor is recommended for large amounts of data
    if return_cursor:
        return cursor
    else:
        return [item for item in cursor]

# Some MongoDB utility functions, useful how-tos, etc.

def mongo_dbs(**mongo_conn_kw):
    mc = pymongo.MongoClient(**mongo_conn_kw)
    print mc.database_names()

#mongo_dbs()

def getCollections_in_mongo(mongo_db, **mongo_conn_kw):
    client = pymongo.MongoClient(**mongo_conn_kw)
    db = client[mongo_db]
    return db.collection_names()

# Sample usage
#getCollections_in_mongo('twitter')[:10]

## Drop a database
#from pymongo import Connection
#c = Connection()
#c.drop_database('twitter')
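# The save_to_mongo/insert_into_mongo distinction matters for the crawler
# below, which may revisit a user: save_to_mongo upserts on _id, whereas
# insert_into_mongo raises a DuplicateKeyError on a repeated _id. A minimal
# sketch (not from the original; assumes a local MongoDB server and a
# throwaway 'scratch' database):

#insert_into_mongo([{'_id': 1, 'screen_name': 'example'}], 'scratch', 'demo')
#insert_into_mongo([{'_id': 1, 'screen_name': 'example'}], 'scratch', 'demo') # DuplicateKeyError
#save_to_mongo({'_id': 1, 'screen_name': 'example2'}, 'scratch', 'demo') # replaces the old doc
#print load_from_mongo('scratch', 'demo', criteria={'_id': 1})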
import sys
import time
from urllib2 import URLError
from httplib import BadStatusLine
import json
import twitter

def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):

    # A nested helper function that handles common HTTPErrors. Returns an
    # updated value for wait_period if the problem is a 500-level error.
    # Blocks until the rate limit is reset if it's a rate-limiting issue
    # (429 error). Returns None for 401 and 404 errors, which require
    # special handling by the caller.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):

        if wait_period > 3600: # Seconds
            print >> sys.stderr, 'Too many retries. Quitting.'
            raise e

        # See https://dev.twitter.com/docs/error-codes-responses for common codes

        if e.e.code == 401:
            print >> sys.stderr, 'Encountered 401 Error (Not Authorized)'
            return None
        elif e.e.code == 404:
            print >> sys.stderr, 'Encountered 404 Error (Not Found)'
            return None
        elif e.e.code == 429:
            print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'
            if sleep_when_rate_limited:
                print >> sys.stderr, "Retrying in 15 minutes...ZzZ..."
                sys.stderr.flush()
                time.sleep(60*15 + 5)
                print >> sys.stderr, '...ZzZ...Awake now and trying again.'
                return 2
            else:
                raise e # Caller must handle the rate limiting issue
        elif e.e.code in (500, 502, 503, 504):
            print >> sys.stderr, 'Encountered %i Error. Retrying in %i seconds' % \
                (e.e.code, wait_period)
            time.sleep(wait_period)
            wait_period *= 1.5
            return wait_period
        else:
            raise e

    # End of nested helper function

    wait_period = 2
    error_count = 0

    while True:
        try:
            return twitter_api_func(*args, **kw)
        except twitter.api.TwitterHTTPError, e:
            error_count = 0
            wait_period = handle_twitter_http_error(e, wait_period)
            if wait_period is None:
                return
        except URLError, e:
            error_count += 1
            time.sleep(wait_period)
            wait_period *= 1.5
            print >> sys.stderr, "URLError encountered. Continuing."
            if error_count > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise
        except BadStatusLine, e:
            error_count += 1
            time.sleep(wait_period)
            wait_period *= 1.5
            print >> sys.stderr, "BadStatusLine encountered. Continuing."
            if error_count > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise

# Sample usage

#twitter_api = oauth_login()

# See https://dev.twitter.com/docs/api/1.1/get/users/lookup for
# twitter_api.users.lookup

#response = make_twitter_request(twitter_api.users.lookup,
#                                screen_name="SocialWebMining")
#print json.dumps(response, indent=1)
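# Because make_twitter_request returns None for 401/404 errors rather than
# raising, callers should check for None before using the response. A hedged
# sketch, not from the original; the screen name is just a placeholder:

#response = make_twitter_request(twitter_api.users.show,
#                                screen_name="someprotectedordeletedaccount")
#if response is None:
#    print >> sys.stderr, 'User is protected or does not exist; skipping.'
#else:
#    print json.dumps(response, indent=1)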
def get_user_profile(twitter_api, screen_names=None, user_ids=None):

    # Must have either screen_names or user_ids (logical xor)
    assert (screen_names != None) != (user_ids != None), \
        "Must have screen_names or user_ids, but not both"

    items_to_info = {}

    items = screen_names or user_ids

    print >> sys.stderr, 'Grabbing {0} user data records, up to 100 at a time...'.format(len(items))

    while len(items) > 0:

        # Process 100 items at a time per the API specifications for
        # /users/lookup. See
        # https://dev.twitter.com/docs/api/1.1/get/users/lookup for details.
        items_str = ','.join([str(item) for item in items[:100]])
        items = items[100:]

        if screen_names:
            response = make_twitter_request(twitter_api.users.lookup,
                                            screen_name=items_str)
        else: # user_ids
            response = make_twitter_request(twitter_api.users.lookup,
                                            user_id=items_str)

        for user_info in response:
            if screen_names:
                items_to_info[user_info['screen_name']] = user_info
            else: # user_ids
                items_to_info[user_info['id']] = user_info

    return items_to_info

# Sample usage

#twitter_api = oauth_login()
#print get_user_profile(twitter_api, screen_names=["SocialWebMining", "ptwobrussell"])
#print get_user_profile(twitter_api, user_ids=[132373965])


def get_list_members(twitter_api, owner_screen_name=None, slug=None):

    assert (owner_screen_name != None) and (slug != None), \
        "Must have both an owner_screen_name and a slug (the list name)"

    print >> sys.stderr, 'Grabbing members of list {0}/{1}'.format(owner_screen_name, slug)

    items_to_info = {}

    response = make_twitter_request(twitter_api.lists.members,
                                    owner_screen_name=owner_screen_name,
                                    slug=slug)

    for user_info in response['users']:
        items_to_info[user_info['screen_name']] = user_info

    return items_to_info

# Sample usage

#twitter_api = oauth_login()
#print get_list_members(twitter_api, "sidepodcast", "f1-drivers")


from functools import partial
from sys import maxint

def get_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
                              friends_limit=maxint, followers_limit=maxint):

    # Must have either screen_name or user_id (logical xor)
    assert (screen_name != None) != (user_id != None), \
        "Must have screen_name or user_id, but not both"

    # See https://dev.twitter.com/docs/api/1.1/get/friends/ids and
    # https://dev.twitter.com/docs/api/1.1/get/followers/ids for details
    # on API parameters

    get_friends_ids = partial(make_twitter_request, twitter_api.friends.ids,
                              count=5000)
    get_followers_ids = partial(make_twitter_request, twitter_api.followers.ids,
                                count=5000)

    friends_ids, followers_ids = [], []

    for twitter_api_func, limit, ids, label in [
            [get_friends_ids, friends_limit, friends_ids, "friends"],
            [get_followers_ids, followers_limit, followers_ids, "followers"]
        ]:

        if limit == 0:
            continue

        cursor = -1
        while cursor != 0:

            # Use make_twitter_request via the partially bound callable...
            if screen_name:
                response = twitter_api_func(screen_name=screen_name, cursor=cursor)
            else: # user_id
                response = twitter_api_func(user_id=user_id, cursor=cursor)

            if response is not None:
                ids += response['ids']
                cursor = response['next_cursor']

            print >> sys.stderr, 'Fetched {0} total {1} ids for {2}'.format(
                len(ids), label, (user_id or screen_name))

            # XXX: You may want to store data during each iteration to provide
            # an additional layer of protection from exceptional circumstances

            if len(ids) >= limit or response is None:
                break

    # Do something useful with the IDs, like store them to disk...
    return friends_ids[:friends_limit], followers_ids[:followers_limit]

# Sample usage

#twitter_api = oauth_login()
#friends_ids, followers_ids = get_friends_followers_ids(twitter_api,
#                                                       screen_name="SocialWebMining",
#                                                       friends_limit=10,
#                                                       followers_limit=10)
#print friends_ids
#print followers_ids
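# These helpers compose naturally. A hedged sketch (not in the original) that
# uses get_friends_followers_ids and get_user_profile together to list a
# user's reciprocal ("mutual") connections; the screen name is a placeholder:

#friends_ids, followers_ids = get_friends_followers_ids(twitter_api,
#                                                       screen_name="SocialWebMining")
#mutual_ids = list(set(friends_ids) & set(followers_ids))[:100]
#profiles = get_user_profile(twitter_api, user_ids=mutual_ids)
#for _id in profiles:
#    print profiles[_id]['screen_name'], profiles[_id]['followers_count']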
import random

# Rather than crawl all followers, crawl a sample...
def crawl_followers_sample(twitter_api, screen_name, limit=5000, depth=2, sample=50):

    print >> sys.stderr, 'Crawling depth {0} with sample size {1}'.format(depth, sample)

    # Resolve the ID for screen_name and start working with IDs for consistency
    # in storage

    ## THIS SECTION CAN BE REPLACED BY ESTABLISH USER?
    ## THOUGH IT WOULD NEED TO PASS ON APPROPRIATE VALS TO next_queue
    seed_id_ = twitter_api.users.show(screen_name=screen_name)

    # tmp holds the seed's friend ids; next_queue holds the follower ids
    tmp, next_queue = get_friends_followers_ids(twitter_api,
                                                user_id=seed_id_['id_str'],
                                                friends_limit=limit,
                                                followers_limit=limit)

    # Store a seed_id => follower_ids mapping in MongoDB.
    # Use the Twitter user id as the mongo document _id (native indexing;
    # prevents multiple records for one individual)
    save_to_mongo({'_id': seed_id_['id'], 'screen_name': screen_name,
                   'id_str': seed_id_['id_str'],
                   'follower_ids': [_id for _id in next_queue]},
                  'twitter', 'followers')
    save_to_mongo({'_id': seed_id_['id'], 'screen_name': screen_name,
                   'id_str': seed_id_['id_str'],
                   'friend_ids': [_id for _id in tmp]},
                  'twitter', 'friends')

    udata = get_user_profile(twitter_api, user_ids=[seed_id_['id_str']])
    for u in udata:
        save_to_mongo({'_id': udata[u]['id'],
                       'screen_name': udata[u]['screen_name'],
                       'id_str': udata[u]['id_str'],
                       'name': udata[u]['name'],
                       'description': udata[u]['description'],
                       'location': udata[u]['location'],
                       'followers_count': udata[u]['followers_count'],
                       'friends_count': udata[u]['friends_count'],
                       'created_at': udata[u]['created_at']},
                      'twitter', 'userdata')

    # We're going to try to minimise the number of calls we make to the
    # Twitter API.
    # HEURISTIC: if we already have follower data for a user, don't get
    # friend/follower data again
    sspool = set()
    mgd = load_from_mongo('twitter', 'userdata', projection={'_id': 1})
    namesdone = set([i['_id'] for i in mgd])

    d = 1
    while d < depth:
        d += 1
        (queue, next_queue) = (next_queue, [])

        # TH: only interested in grabbing data we haven't grabbed before
        diff = set(queue) - set([i['_id'] for i in
                                 load_from_mongo('twitter', 'followers',
                                                 projection={'_id': 1})])

        # TH: propagate the sampling measure
        queue = random.sample(list(diff), sample) if len(diff) > sample else list(diff)

        for fid in queue:

            friend_ids, follower_ids = get_friends_followers_ids(twitter_api,
                                                                 user_id=fid,
                                                                 friends_limit=limit,
                                                                 followers_limit=limit)

            # Get some user info while we're here...
            sspoolt = set(follower_ids).union(set(friend_ids)) - namesdone
            sspoolt = sspoolt.union(sspool) if len(sspoolt) < 100 else sspoolt
            ssize = 99 if len(sspoolt) > 99 else len(sspoolt)
            uids = [fid] + random.sample(list(sspoolt), ssize)
            namesdone = namesdone.union(set(uids))
            sspool = sspoolt.union(sspool) - namesdone

            udata = get_user_profile(twitter_api, user_ids=uids)
            for u in udata:
                save_to_mongo({'_id': udata[u]['id'],
                               'screen_name': udata[u]['screen_name'],
                               'id_str': udata[u]['id_str'],
                               'name': udata[u]['name'],
                               'description': udata[u]['description'],
                               'location': udata[u]['location'],
                               'followers_count': udata[u]['followers_count'],
                               'friends_count': udata[u]['friends_count'],
                               'created_at': udata[u]['created_at']},
                              'twitter', 'userdata')

            tmp = load_from_mongo('twitter', 'userdata', criteria={'_id': fid},
                                  projection={'screen_name': 1, '_id': 1})
            s_name = tmp[0]['screen_name']

            # Store a fid => follower_ids mapping in MongoDB
            save_to_mongo({'_id': fid, 'id_str': str(fid), 'screen_name': s_name,
                           'follower_ids': [_id for _id in follower_ids]},
                          'twitter', 'followers')
            save_to_mongo({'_id': fid, 'id_str': str(fid), 'screen_name': s_name,
                           'friend_ids': [_id for _id in friend_ids]},
                          'twitter', 'friends')

            next_queue += follower_ids

# Sample usage

#twitter_api = oauth_login()
#screen_name = "bbcinternetblog"
#crawl_followers_sample(twitter_api, screen_name, depth=2, limit=5000, sample=10)
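# After a crawl, everything needed downstream lives in the 'followers',
# 'friends' and 'userdata' collections, so progress can be inspected without
# touching the Twitter API. A minimal sketch (assumes a crawl has been run):

#for doc in load_from_mongo('twitter', 'followers',
#                           projection={'screen_name': 1, 'follower_ids': 1}):
#    print doc['screen_name'], len(doc.get('follower_ids', []))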
import networkx as nx

def get_common_friends_of_followers_grapher(twitter_api, screen_name, foid_list,
                                            toget, minsupport=5):

    # We're going to use networkx to construct the graph
    DG = nx.DiGraph()

    print >> sys.stderr, 'Getting friends of followers of {0}'.format(screen_name)

    # The toget folk should already have friends/followers in the db
    for fo in toget:
        tmp = load_from_mongo('twitter', 'friends', criteria={'_id': fo},
                              projection={'screen_name': 1, 'friend_ids': 1,
                                          '_id': 1})
        members2 = tmp[0]['friend_ids']
        if len(members2) > 0:
            for foid in foid_list:
                DG.add_edge(fo, foid)
            fedges = [(fo, u) for u in members2]
            DG.add_edges_from(fedges)

    print >> sys.stderr, 'Filtering network...'

    # Now we can filter the network, keeping only nodes with degree >= minsupport
    filterNodes = []
    for n in DG:
        if DG.degree(n) >= minsupport:
            filterNodes.append(n)
    H = DG.subgraph(set(filterNodes))

    # Label the filtered graph, fetching additional labels if we need them
    mgd = load_from_mongo('twitter', 'userdata', projection={'_id': 1})
    got = [i['_id'] for i in mgd]
    tofetch = [_id for _id in H.nodes() if _id not in got]

    for n in set(H.nodes()).intersection(got):
        mgd = load_from_mongo('twitter', 'userdata', criteria={'_id': n},
                              projection={'screen_name': 1, 'id_str': 1, '_id': 1})
        H.node[n]['label'] = mgd[0]['screen_name']

    udata = get_user_profile(twitter_api, user_ids=tofetch)
    for u in udata:
        save_to_mongo({'_id': udata[u]['id'],
                       'screen_name': udata[u]['screen_name'],
                       'id_str': udata[u]['id_str'],
                       'name': udata[u]['name'],
                       'description': udata[u]['description'],
                       'location': udata[u]['location'],
                       'followers_count': udata[u]['followers_count'],
                       'friends_count': udata[u]['friends_count'],
                       'created_at': udata[u]['created_at']},
                      'twitter', 'userdata')
        H.node[udata[u]['id']]['label'] = udata[u]['screen_name']

    print >> sys.stderr, 'Writing network to {0}_{1}.gexf'.format(screen_name,
                                                                  minsupport)

    # Write the resulting network to a GEXF file
    nx.write_gexf(H, '{0}_{1}.gexf'.format(screen_name, minsupport))

    #print tofetch
    print >> sys.stderr, 'Done...'
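# The GEXF file can be opened directly in Gephi, or read back with networkx
# for a quick sanity check. A hedged sketch (the filename assumes the
# screen_name/minsupport values used elsewhere in this file, and the
# dict-style degree API of networkx 1.x):

#G = nx.read_gexf('schoolofdata_5.gexf')
#print len(G), 'nodes,', G.number_of_edges(), 'edges'
#for n, d in sorted(G.degree().items(), key=lambda x: x[1], reverse=True)[:10]:
#    print G.node[n].get('label', n), d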
def get_common_friends_of_followers(twitter_api, screen_name, minsupport=5):

    print >> sys.stderr, 'Getting followers of {0}'.format(screen_name)

    ff = load_from_mongo('twitter', 'followers',
                         criteria={'screen_name': screen_name},
                         projection={'screen_name': 1, 'follower_ids': 1,
                                     '_id': 1})

    # Get the follower ids of the target individual
    members = ff[0]['follower_ids']

    # For now, find which followers we have friend data for and use that
    tmp = load_from_mongo('twitter', 'friends', projection={'_id': 1})
    fr = [i['_id'] for i in tmp]
    toget = [i for i in members if i in fr]

    # What we really need to do is:
    ## - set a sample size of followers
    ## - get the set of ids we have friend data for and see if the size of its
    ##   intersection with the user's followers is greater than the sample size
    ## - if it is, we can get the sample out of the database; if it isn't, we
    ##   need to crawl some more

    get_common_friends_of_followers_grapher(twitter_api, screen_name,
                                            [ff[0]['_id']], toget,
                                            minsupport=minsupport)

# Sample usage

#screen_name = "schoolofdata"
#get_common_friends_of_followers(twitter_api, screen_name)


def quickExpt(screen_name, sample=119, minsupport=5):
    save_to_mongo({'_id': screen_name, 'screen_name': screen_name},
                  'twitter', 'quickexpt_source')
    crawl_followers_sample(twitter_api, screen_name, depth=2, limit=5000,
                           sample=sample)
    get_common_friends_of_followers(twitter_api, screen_name,
                                    minsupport=minsupport)

## THIS IS WHERE YOU NEED TO ADD THE USERNAME OF THE ACCOUNT YOU WANT TO GRAB
## THE DATA FOR
twitter_username = 'schoolofdata'

quickExpt(twitter_username, sample=119, minsupport=5)
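# Because the crawl persists everything to MongoDB, the network can be
# regenerated at different minsupport thresholds without recrawling. A hedged
# sketch, not part of the original workflow:

#for ms in (3, 5, 10):
#    get_common_friends_of_followers(twitter_api, twitter_username, minsupport=ms)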