In [1]:
'''

This script scrapes tweets from the 'following' list of Twitter accounts produced by 00_Scrape_Following.ipynb.
A CSV of tweets is created for every account and saved as username_date.csv.

References:
div by 1000 - https://stackoverflow.com/questions/37494983/python-fromtimestamp-oserror

'''
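
# Assumed input format (illustrative; the actual file comes from 00_Scrape_Following.ipynb).
# 'Following list.csv' needs at least a 'Username' column, e.g.
#   Username
#   nasa
#   dog_rates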

# Import required libraries
import time
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
In [2]:
def get_driver():
    '''
    
    This function will create a headless browser driver object that we'll use to scrape data automatically
    
    '''
    
    # Initialize options
    options = webdriver.ChromeOptions()
    # Pass in the headless argument so no browser window is shown
    options.add_argument('--headless')
    # Initialize the driver (Selenium 3 style; see the Selenium 4 sketch below)
    driver = webdriver.Chrome(executable_path='chromedriver.exe', options=options)
    return driver
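
# Note: the setup above targets Selenium 3. Selenium 4 removed the positional
# executable path and the chrome_options keyword; a minimal equivalent sketch,
# assuming Selenium 4 and a local chromedriver.exe (alternative, not used below):
from selenium.webdriver.chrome.service import Service

def get_driver_v4():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    return webdriver.Chrome(service=Service('chromedriver.exe'), options=options)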
In [3]:
def scrape_user_tweets(browser, twitter_username, no_of_pagedowns):
    '''

    This function will
     - open the Twitter account's page
     - scroll down to load more tweets
     - scrape the tweets and save them as username_date.csv

    '''

    browser.get("https://twitter.com/" + twitter_username)
    time.sleep(1)

    print('Scraping ' + twitter_username)

    # Send PAGE_DOWN to the body to trigger Twitter's infinite scroll
    elem = browser.find_element_by_tag_name("body")
    while no_of_pagedowns:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)
        no_of_pagedowns -= 1
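    # Alternative (sketch): scroll via JavaScript instead of key events, which
    # also works when the body element does not receive keystrokes:
    #   browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")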

    # Collect the tweet elements that have been loaded into the page
    twitter_elm = browser.find_elements_by_class_name("tweet")

    # We'll iterate over the tweets found and append each tweet's details to this list
    tweet_details = []

    for post in twitter_elm:
        username = post.find_element_by_class_name("username")
        tweet_username = username.text

        tweet = post.find_element_by_class_name("tweet-text")
        tweet_text = tweet.text

        tweet_hashtag_mentions = [hashtag.text for hashtag in tweet.find_elements_by_class_name("twitter-hashtag")]

        # Per-tweet action counts, read by their position in the action bar
        rt_count = post.find_elements_by_class_name("ProfileTweet-actionCount")

        tweet_comment_count = rt_count[1].get_attribute("data-tweet-stat-count")
        tweet_retweet_count = rt_count[2].get_attribute("data-tweet-stat-count")
        tweet_fav_count = rt_count[3].get_attribute("data-tweet-stat-count")


        # data-time-ms is in milliseconds, so divide by 1000 before converting
        # (see the reference at the top of this notebook)
        time_stp = int(post.find_elements_by_class_name("_timestamp")[0]
                       .get_attribute("data-time-ms"))
        utc_time = datetime.utcfromtimestamp(time_stp / 1000)

        tweet_date_posted = utc_time.strftime("%Y-%m-%d %H:%M:%S.%f+00:00 (UTC)")
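        # Worked example (illustrative): 1577836800000 ms / 1000 = 1577836800 s,
        # which utcfromtimestamp converts to 2020-01-01 00:00:00 UTC.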

        # Grab the first attached photo's URL, if the tweet has one
        pic = post.find_elements_by_class_name('js-adaptive-photo')
        tweet_img_url = pic[0].get_attribute('data-image-url') if pic else None

        tweet_details.append({
            'tweet_username': tweet_username,
            'tweet_text': tweet_text,
            'tweet_hashtag_mentions': tweet_hashtag_mentions,
            'tweet_comment_count': tweet_comment_count,
            'tweet_retweet_count': tweet_retweet_count,
            'tweet_fav_count': tweet_fav_count,
            'tweet_img_url': tweet_img_url,
        })
    
    # Create a dataframe from the tweets scraped for this account and store it as a CSV
    scraped_tweets = pd.DataFrame(tweet_details)
    scraped_tweets.to_csv(twitter_username + "_" + datetime.today().strftime('%Y%m%d') + ".csv", index=False)
    print('Scraped ' + twitter_username)
    
In [ ]:
# Initiate the driver object
browser = get_driver()

# Read in the list of funny accounts and scrape them one by one
good_accounts = pd.read_csv('Following list.csv')

for index, row in good_accounts.iterrows():
    scrape_user_tweets(browser, row['Username'], 10)
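
# One failing account (renamed, suspended, or a timeout) currently aborts the
# whole run. A minimal sketch of the same loop with per-account error handling
# and driver cleanup (alternative, not executed here):
#
#     for index, row in good_accounts.iterrows():
#         try:
#             scrape_user_tweets(browser, row['Username'], 10)
#         except Exception as exc:
#             print('Failed to scrape ' + str(row['Username']) + ': ' + str(exc))
#
#     browser.quit()  # release the headless Chrome when done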