#!/usr/bin/env python
# coding: utf-8

# In[271]:

import os
import sys
import requests
import urllib.request
from pathlib import Path
import pandas as pd
import time
import tweepy
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tweepy import OAuthHandler
from IPython.display import display, Markdown
from IPython.display import Image
import matplotlib.rcsetup as rcsetup

# ## Gathering The WeRateDogs Twitter Archive

# In[222]:

# Reading in the csv file as a pandas dataframe, displaying a tuple of the array dimensions,
# and printing the first few rows.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
print("Array Dimensions = ", twitter_archive.shape)
twitter_archive.head()

# ### Gathering The Tweet Image Predictions

# In[218]:

# Using the requests library to download the tsv file.
# Putting in a check so I don't have to redownload the file each time.
my_file_pred = Path("image_predictions.tsv")
if my_file_pred.exists():
    print("File already exists")
else:
    url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
    response = requests.get(url)
    with open('image_predictions.tsv', 'wb') as file:
        file.write(response.content)

# In[223]:

# Read the tsv file into pandas
image_predictions = pd.read_csv('image_predictions.tsv', sep='\t')

# Looking at the shape and the first couple of rows of the dataframe
print("Array Dimensions = ", image_predictions.shape)
image_predictions.head()

# Resources: https://stackoverflow.com/questions/31126596/saving-response-from-requests-to-file

# ### Gathering Tweets from Twitter API

# In[9]:

# Storing and loading the API credentials locally
password_list = pd.read_csv('password_list.csv')
consumer_key = password_list.consumer_key[0]
consumer_secret = password_list.consumer_secret[0]
access_token = password_list.access_token[0]
access_secret = password_list.access_secret[0]

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if not api:
    print("Error: Unable to Authenticate")
    sys.exit(-1)

# Put in error handling
# Resource: https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./

# In[202]:

# Loop to append each found tweet's JSON to a new line of tweet_json.txt.
# I put in a file-exists check because I kept appending to the file every time I reran this cell.
missing_tweets = []  # Recording the tweet IDs that could not be retrieved
my_file = Path("tweet_json.txt")
if my_file.exists():
    print("File already exists")
else:
    with open('tweet_json.txt', 'a', encoding='utf8') as f:  # opening file in append mode
        for tweet_id in twitter_archive['tweet_id']:
            try:
                tweet = api.get_status(tweet_id, tweet_mode='extended')
                json.dump(tweet._json, f)
                f.write('\n')
            except Exception:
                print('{} Tweet not found'.format(tweet_id))
                missing_tweets.append(tweet_id)
    print("File Created / Task Completed")

# Resources: https://docs.python.org/3/tutorial/inputoutput.html
# Resources: https://stackoverflow.com/questions/44581647/retrieving-a-list-of-tweets-using-tweet-id-in-tweepy

# In[10]:

# Reading tweet_json.txt into api_info
with open('tweet_json.txt', 'r') as fp:
    api_info = pd.read_json(fp, lines=True)

# Resources: https://stackoverflow.com/questions/30088006/loading-a-file-with-more-than-one-line-of-json-into-pythons-pandas

# In[211]:

# Viewing the columns in the api_info dataframe.
api_info.info()

# In[14]:

# Viewing the 3 columns I am interested in.
api_info[["id", "retweet_count", "favorite_count"]] # In[208]: # Moving the 3 columns into a separate variable tweet_info = api_info[['id','retweet_count','favorite_count']] tweet_info.info() tweet_info.shape # # ## Exploring Data # # In[18]: # Looking at image prediction to get a feel for what I am looking at. I will probably look into this category in more detail. image_predictions.p1.value_counts() # In[19]: # Looking at the second image classification to see how it differs from the first. image_predictions.p2.value_counts() # In[20]: # Looking at the different names and noticing that some do not make sense. twitter_archive.name.value_counts() # In[21]: # Looking through the text to see how many lines contain & twitter_archive.text[twitter_archive.text.str.contains('&')] # ### Assessing Data for this Project # # After gathering each of the above pieces of data, assess them visually and programmatically for quality and tidiness issues. Detect and document at least eight (8) quality issues and two (2) tidiness issues in your wrangle_act.ipynb Jupyter Notebook. To meet specifications, the issues that satisfy the Project Motivation (see the Key Points header on the previous page) must be assessed. # # #### List of Quality issues: # # 1. Replace & amp; in text with just &. # # 2. Convert id to string in tweet info dataframe. # # 3. Rename tweet info id to tweet_id to merge it with the other two dataframes. # # 4. Convert tweet_id to a string in image predictions dataframe. # # 5. Convert tweet_id to a string in twitter archive dataframe. # # 6. Convert datetime from string to datetime. # # 7. Remove columns that contain no information, and the redundant dog stage columns. # # 8. Some of the name records in Twitter Archive contain articles (the, an, a) instead of actual names. I will rename them to None for consistency. # # 9. Remove retweets # # #### List of Tidiness Issues # # 1. Merge all lists into a master list. # # 2. Combine Dog Stages into one column. # # ## Cleaning Data # # In[25]: # Creating copies of the dataframe to clean twitter_archive_clean = twitter_archive.copy() image_predictions_clean = image_predictions.copy() tweet_info_clean = tweet_info.copy() # In[39]: # Viewing a summary of all of the copied dataframes print("=" * 50) twitter_archive_clean.info() print("=" * 50) image_predictions_clean.info() print("=" * 50) tweet_info_clean.info() print("=" * 50) # In[28]: # Quality Issue 1 # Replacing & with &. Then verifying code works as expected. twitter_archive_clean['text'] = twitter_archive_clean['text'].str.replace('&','&') twitter_archive_clean.text[twitter_archive_clean.text.str.contains('&')] # In[206]: # Quality Issue 2 and 3 # Changing id to tweet_id and converting it to string datatype tweet_info_clean['id'] = tweet_info_clean['id'].astype(str) tweet_info_clean.rename(columns={'id': 'tweet_id'}, inplace=True) # Resources: https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas tweet_info_clean["tweet_id"] # In[40]: # Quality Issue 4 # We need to convert tweet id from a number to a string. image_predictions_clean['tweet_id'] = image_predictions_clean['tweet_id'].astype(str) tweet_info_clean.info() # In[224]: # Quality Issue 5 # We need to convert tweet id from a number to a string. 
twitter_archive_clean['tweet_id'] = twitter_archive_clean['tweet_id'].astype(str)
twitter_archive_clean.info()

# In[207]:

# Tidiness Issue 1
# Merging all three dataframes into one master dataframe
tweet_merge_clean = pd.merge(tweet_info_clean, twitter_archive_clean, on='tweet_id', how='outer')
tweet_merge_clean = pd.merge(tweet_merge_clean, image_predictions_clean, on='tweet_id', how='outer')
tweet_merge_clean.info()

# In[204]:

# Quality Issue 6
# Converting timestamp to datetime
tweet_merge_clean['timestamp'] = pd.to_datetime(tweet_merge_clean['timestamp'], infer_datetime_format=True)
tweet_merge_clean.timestamp

# In[203]:

# Tidiness Issue 2
# Merging all the dog stage columns into one column.
tweet_merge_clean['dog_stage'] = tweet_merge_clean.apply(
    lambda row: row['doggo'] + row['floofer'] + row['pupper'] + row['puppo'], axis=1)

# Running a value count on the dog stage column.
tweet_merge_clean.dog_stage.value_counts()

# Resources: https://stackoverflow.com/questions/34023918/make-new-column-in-panda-dataframe-by-adding-values-from-other-columns
# Results in a new column named dog_stage that is the four original columns concatenated.

# In[69]:

# Tidiness Issue 2
# Replacing all the concatenated values in dog_stage with easier to understand terms.
dogstage_replace_values = {'NoneNoneNoneNone': "None",
                           "doggoNoneNoneNone": "Doggo",
                           "NoneflooferNoneNone": "Floofer",
                           "NoneNonepupperNone": "Pupper",
                           "NoneNoneNonepuppo": "Puppo",
                           "doggoNonepupperNone": "Doggo and Pupper",
                           "doggoflooferNoneNone": "Doggo and Floofer",
                           "doggoNoneNonepuppo": "Doggo and Puppo"}
tweet_merge_clean = tweet_merge_clean.replace({"dog_stage": dogstage_replace_values})
tweet_merge_clean.dog_stage.value_counts()

# Resources: https://stackoverflow.com/questions/22100130/pandas-replace-multiple-values-one-column

# In[205]:

# Quality Issue 9
# Clearing out the retweets. This is done before dropping retweeted_status_id in the next cell,
# since that column is needed to identify which rows are retweets.
tweet_merge_clean = tweet_merge_clean[tweet_merge_clean['retweeted_status_id'].isnull()]
tweet_merge_clean.info()

# In[71]:

# Quality Issue 7
# Dropping the 4 redundant dog stage columns, the 3 now-empty retweet columns, img_num, and both in-reply columns.
tweet_merge_clean = tweet_merge_clean.drop(['doggo', 'floofer', 'pupper', 'puppo',
                                            "retweeted_status_id", "retweeted_status_user_id",
                                            "retweeted_status_timestamp",
                                            'in_reply_to_status_id', 'in_reply_to_user_id',
                                            'img_num'], axis=1)

# In[65]:

# Quality Issue 8
tweet_merge_clean.name.value_counts()

# In[66]:

# Quality Issue 8
# Looking at name, I will change a, the, an, and very to None
name_replace_values = {'a': "None", "the": "None", "an": "None", "very": "None"}
tweet_merge_clean = tweet_merge_clean.replace({"name": name_replace_values})

# Resources: https://stackoverflow.com/questions/22100130/pandas-replace-multiple-values-one-column

# ### Storing, Analyzing, and Visualizing Data for this Project
#
# Store the clean DataFrame(s) in a CSV file with the main one named twitter_archive_master.csv. If additional files exist because multiple tables are required for tidiness, name these files appropriately. Additionally, you may store the cleaned data in a SQLite database (which is to be submitted as well if you do).
#

# In[76]:

# Saving the cleaned dataframe to a csv file.
# (A hedged SQLite alternative for this step is sketched in the supplementary cell below.)
tweet_merge_clean.to_csv('twitter_archive_master.csv')

# In[91]:

total_rating = tweet_merge_clean['rating_numerator'] / tweet_merge_clean['rating_denominator']

# ### Analyze and visualize your wrangled data in your wrangle_act.ipynb Jupyter Notebook. At least three (3) insights and one (1) visualization must be produced.
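# In[ ]:

# Supplementary cell (hedged sketch, not part of the original wrangling): the storing section
# above notes that the cleaned data may optionally be kept in a SQLite database in addition to
# the CSV file. The database file name 'twitter_archive_master.db' and table name 'master' are
# illustrative assumptions, not names required by the project.
import sqlite3

conn = sqlite3.connect('twitter_archive_master.db')
tweet_merge_clean.to_sql('master', conn, if_exists='replace', index=False)
conn.close()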
# ### Insights

# In[93]:

# Insight 1
# I wanted to find out which dog breeds were identified the most by the neural network.
breeds = tweet_merge_clean.groupby(by='p1')

# In[151]:

# Insight 1
# I then kept the top 5 identified breeds, along with their mean favorite and retweet counts.
top_breeds = breeds.agg({'p1': 'count', 'favorite_count': 'mean', 'retweet_count': 'mean'}).sort_values('p1')[-5:]
top_breeds = top_breeds.rename(columns={'p1': 'Count', 'favorite_count': 'Favorite Count', 'retweet_count': 'Retweet Count'})
top_breeds = top_breeds.reset_index()
top_breeds = top_breeds.rename(columns={'p1': 'Breed'})
breed_replace_values = {'pug': 'Pug', 'Labrador_retriever': 'Labrador', 'golden_retriever': 'Golden Retriever'}
top_breeds = top_breeds.replace({"Breed": breed_replace_values})
top_breeds

# Resources: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html

# In[266]:

# Insight 2
# As we see from the scatterplot below, there appears to be a positive linear relationship between the number
# of retweets and favorites that a picture has. (A numeric check of this relationship is sketched in the
# supplementary cell at the end of the notebook.)

# Scatterplot of favorite_count vs retweet_count
plt.style.use('default')  # applying the style before plotting so it takes effect on this figure
plt.scatter(x=tweet_merge_clean['favorite_count'], y=tweet_merge_clean['retweet_count'])
plt.xlabel('Favorite Count')
plt.ylabel('Retweet Count')
plt.title('Favorite Count vs Retweet Count')
plt.show()
#plt.savefig('favretweet.png')
plt.gcf().clear()

# In[275]:

# Insight 3
# I sorted by the 3 largest favorite counts and then looked up the picture of the dog with the largest
# favorite count and placed it below.
print(tweet_merge_clean.nlargest(3, 'favorite_count'))
Image("https://pbs.twimg.com/media/C2tugXLXgAArJO4.jpg")

# Resources: https://stackoverflow.com/questions/16958499/sort-pandas-dataframe-and-print-highest-n-values

# In[274]:

# Insight 4
# I sorted by the 3 largest retweet counts and then looked up the picture of the dog with the largest
# retweet count and placed it below.
print(tweet_merge_clean.nlargest(3, 'retweet_count'))
Image("https://pbs.twimg.com/ext_tw_video_thumb/744234667679821824/pu/img/1GaWmtJtdqzZV7jy.jpg")

# Resources: https://stackoverflow.com/questions/16958499/sort-pandas-dataframe-and-print-highest-n-values

# ### Visualization

# In[259]:

# Visualization from Insight 1
# I created the visualization below to display the 5 breeds most often identified by the neural network.
plt.style.use('bmh')
y_pos = np.arange(len(top_breeds))
x_pos = top_breeds.Count
plt.bar(y_pos, x_pos, align='center', alpha=0.5)
plt.xticks(y_pos, top_breeds.Breed)
plt.ylabel('Count')
plt.title('Top Identified Dog Breeds')
#plt.savefig('breed.png')
plt.show()
plt.gcf().clear()

# Resources: https://pythonspot.com/matplotlib-bar-chart/

# In[276]:

# Visualization from Insight 2
plt.style.use('default')
plt.scatter(x=tweet_merge_clean['favorite_count'], y=tweet_merge_clean['retweet_count'])
plt.xlabel('Favorite Count')
plt.ylabel('Retweet Count')
plt.title('Favorite Count vs Retweet Count')
plt.show()
#plt.savefig('favretweet.png')
plt.gcf().clear()

# ### Reporting for this Project
#
# Create a 300-600 word written report called wrangle_report.pdf that briefly describes your wrangling efforts. This is to be framed as an internal document.
#
# Create a 250-word-minimum written report called act_report.pdf that communicates the insights and displays the visualization(s) produced from your wrangled data. This is to be framed as an external document, like a blog post or magazine article, for example.
#
# Both of these documents can be created in separate Jupyter Notebooks using the Markdown functionality of Jupyter Notebooks, then downloading those notebooks as PDF files. You might prefer to use a word processor like Google Docs or Microsoft Word, however.

# In[ ]:


# In[ ]:

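# In[ ]:

# Supplementary check (hedged, not part of the original analysis): Insight 2 above describes a
# positive linear relationship between favorite and retweet counts. The Pearson correlation below
# quantifies that claim; pandas excludes row pairs where either value is missing.
print(tweet_merge_clean['favorite_count'].corr(tweet_merge_clean['retweet_count']))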