# Standard library
import datetime
import json
import os
import sys
import time
import urllib.request
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import matplotlib.rcsetup as rcsetup
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import tweepy
from tweepy import OAuthHandler
from IPython.display import display, Markdown
from IPython.display import Image
# Load the WeRateDogs twitter archive into a pandas DataFrame, report the
# frame's dimensions (rows, columns), and preview the first few rows.
archive_path = 'twitter-archive-enhanced.csv'
twitter_archive = pd.read_csv(archive_path)
print("Array Dimensions = ", twitter_archive.shape)
twitter_archive.head()
Array Dimensions = (2356, 17)
tweet_id | in_reply_to_status_id | in_reply_to_user_id | timestamp | source | text | retweeted_status_id | retweeted_status_user_id | retweeted_status_timestamp | expanded_urls | rating_numerator | rating_denominator | name | doggo | floofer | pupper | puppo | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 892420643555336193 | NaN | NaN | 2017-08-01 16:23:56 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Phineas. He's a mystical boy. Only eve... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892420643... | 13 | 10 | Phineas | None | None | None | None |
1 | 892177421306343426 | NaN | NaN | 2017-08-01 00:17:27 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Tilly. She's just checking pup on you.... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/892177421... | 13 | 10 | Tilly | None | None | None | None |
2 | 891815181378084864 | NaN | NaN | 2017-07-31 00:18:03 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Archie. He is a rare Norwegian Pouncin... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891815181... | 12 | 10 | Archie | None | None | None | None |
3 | 891689557279858688 | NaN | NaN | 2017-07-30 15:58:51 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Darla. She commenced a snooze mid meal... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891689557... | 13 | 10 | Darla | None | None | None | None |
4 | 891327558926688256 | NaN | NaN | 2017-07-29 16:00:24 +0000 | <a href="http://twitter.com/download/iphone" r... | This is Franklin. He would like you to stop ca... | NaN | NaN | NaN | https://twitter.com/dog_rates/status/891327558... | 12 | 10 | Franklin | None | None | None | None |
# Using requests library to download tsv file.
# Existence check so the file is not re-downloaded on every run of this cell.
my_file_pred = Path("image_predictions.tsv")
if my_file_pred.exists():
    print("File already exists")
else:
    url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
    response = requests.get(url)
    # Fail loudly on HTTP errors (4xx/5xx) instead of silently writing an
    # error page to disk as if it were the TSV.
    response.raise_for_status()
    with open('image_predictions.tsv', 'wb') as file:
        file.write(response.content)
File already exists
# Load the neural-network image predictions (tab-separated values) into
# pandas, then report the frame's dimensions and preview the first rows.
# Resources: https://stackoverflow.com/questions/31126596/saving-response-from-requests-to-file
predictions_path = 'image_predictions.tsv'
image_predictions = pd.read_csv(predictions_path, sep='\t')
print("Array Dimensions = ", image_predictions.shape)
image_predictions.head()
Array Dimensions = (2075, 12)
tweet_id | jpg_url | img_num | p1 | p1_conf | p1_dog | p2 | p2_conf | p2_dog | p3 | p3_conf | p3_dog | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 666020888022790149 | https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg | 1 | Welsh_springer_spaniel | 0.465074 | True | collie | 0.156665 | True | Shetland_sheepdog | 0.061428 | True |
1 | 666029285002620928 | https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg | 1 | redbone | 0.506826 | True | miniature_pinscher | 0.074192 | True | Rhodesian_ridgeback | 0.072010 | True |
2 | 666033412701032449 | https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg | 1 | German_shepherd | 0.596461 | True | malinois | 0.138584 | True | bloodhound | 0.116197 | True |
3 | 666044226329800704 | https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg | 1 | Rhodesian_ridgeback | 0.408143 | True | redbone | 0.360687 | True | miniature_pinscher | 0.222752 | True |
4 | 666049248165822465 | https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg | 1 | miniature_pinscher | 0.560311 | True | Rottweiler | 0.243682 | True | Doberman | 0.154629 | True |
# Storing and loading Twitter API credentials locally (password_list.csv)
# so the secrets are not hard-coded in the notebook.
password_list = pd.read_csv('password_list.csv')
consumer_key = password_list.consumer_key[0]
consumer_secret = password_list.consumer_secret[0]
access_token = password_list.access_token[0]
access_secret = password_list.access_secret[0]

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit makes tweepy sleep through rate-limit windows instead
# of raising; the notify flag logs when that happens (tweepy 3.x API).
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
# NOTE(review): tweepy.API always returns an object, so this branch is
# effectively unreachable; kept for parity, but `sys` was previously used
# here without being imported — now imported at the top of the file.
if (not api):
    print("Error: Unable to Authenticate")
    sys.exit(-1)
# Put in error handling
# Resource: https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./
# For loop which adds each found tweet's JSON to a new line of tweet_json.txt.
# File-existence check so re-running this cell does not keep appending to the file.
missing_tweets = []
# Recording the missing tweets
my_file = Path("tweet_json.txt")
if my_file.exists():
    print("File already exists")
else:
    # opening file in append mode
    with open('tweet_json.txt', 'a', encoding='utf8') as f:
        for tweet_id in twitter_archive['tweet_id']:
            try:
                tweet = api.get_status(tweet_id, tweet_mode='extended')
                json.dump(tweet._json, f)
                f.write('\n')
            except Exception:
                # Narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt)
                # still stops the loop; any API/lookup failure marks the
                # tweet as missing instead.
                print('{} Tweet not found'.format(tweet_id))
                missing_tweets.append(tweet_id)
    print("File Created / Task Completed")
# Resources: https://docs.python.org/3/tutorial/inputoutput.html
# Resources: https://stackoverflow.com/questions/44581647/retrieving-a-list-of-tweets-using-tweet-id-in-tweepy
File already exists
# Parse the line-delimited JSON gathered from the API into a DataFrame.
# Resources: https://stackoverflow.com/questions/30088006/loading-a-file-with-more-than-one-line-of-json-into-pythons-pandas
api_info = pd.read_json('tweet_json.txt', lines=True)
# Inspect the columns available in the api_info frame.
api_info.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2346 entries, 0 to 2345 Data columns (total 31 columns): contributors 0 non-null float64 coordinates 0 non-null float64 created_at 2346 non-null datetime64[ns] display_text_range 2346 non-null object entities 2346 non-null object extended_entities 2068 non-null object favorite_count 2346 non-null int64 favorited 2346 non-null bool full_text 2346 non-null object geo 0 non-null float64 id 2346 non-null int64 id_str 2346 non-null int64 in_reply_to_screen_name 78 non-null object in_reply_to_status_id 78 non-null float64 in_reply_to_status_id_str 78 non-null float64 in_reply_to_user_id 78 non-null float64 in_reply_to_user_id_str 78 non-null float64 is_quote_status 2346 non-null bool lang 2346 non-null object place 1 non-null object possibly_sensitive 2206 non-null float64 possibly_sensitive_appealable 2206 non-null float64 quoted_status 28 non-null object quoted_status_id 29 non-null float64 quoted_status_id_str 29 non-null float64 retweet_count 2346 non-null int64 retweeted 2346 non-null bool retweeted_status 171 non-null object source 2346 non-null object truncated 2346 non-null bool user 2346 non-null object dtypes: bool(4), datetime64[ns](1), float64(11), int64(4), object(11) memory usage: 504.1+ KB
# Restrict the view to the three columns of interest: the tweet id and
# the two engagement counts.
engagement_cols = ["id", "retweet_count", "favorite_count"]
api_info[engagement_cols]
id | retweet_count | favorite_count | |
---|---|---|---|
0 | 892420643555336193 | 8726 | 39179 |
1 | 892177421306343426 | 6406 | 33544 |
2 | 891815181378084864 | 4256 | 25283 |
3 | 891689557279858688 | 8820 | 42532 |
4 | 891327558926688256 | 9605 | 40715 |
5 | 891087950875897856 | 3188 | 20411 |
6 | 890971913173991426 | 2122 | 11968 |
7 | 890729181411237888 | 19325 | 66197 |
8 | 890609185150312448 | 4348 | 28026 |
9 | 890240255349198849 | 7579 | 32246 |
10 | 890006608113172480 | 7494 | 30908 |
11 | 889880896479866881 | 5069 | 28038 |
12 | 889665388333682689 | 10264 | 48331 |
13 | 889638837579907072 | 4647 | 27434 |
14 | 889531135344209921 | 2280 | 15231 |
15 | 889278841981685760 | 5559 | 25532 |
16 | 888917238123831296 | 4614 | 29346 |
17 | 888804989199671297 | 4462 | 25834 |
18 | 888554962724278272 | 3673 | 20121 |
19 | 888078434458587136 | 3595 | 21986 |
20 | 887705289381826560 | 5518 | 30490 |
21 | 887517139158093824 | 11916 | 46634 |
22 | 887473957103951883 | 18663 | 69817 |
23 | 887343217045368832 | 10612 | 33979 |
24 | 887101392804085760 | 6093 | 30840 |
25 | 886983233522544640 | 7940 | 35529 |
26 | 886736880519319552 | 3371 | 12189 |
27 | 886680336477933568 | 4561 | 22636 |
28 | 886366144734445568 | 3255 | 21336 |
29 | 886267009285017600 | 4 | 117 |
... | ... | ... | ... |
2316 | 666411507551481857 | 335 | 453 |
2317 | 666407126856765440 | 42 | 112 |
2318 | 666396247373291520 | 90 | 170 |
2319 | 666373753744588802 | 95 | 193 |
2320 | 666362758909284353 | 583 | 791 |
2321 | 666353288456101888 | 74 | 227 |
2322 | 666345417576210432 | 144 | 305 |
2323 | 666337882303524864 | 95 | 202 |
2324 | 666293911632134144 | 364 | 514 |
2325 | 666287406224695296 | 69 | 151 |
2326 | 666273097616637952 | 79 | 179 |
2327 | 666268910803644416 | 36 | 106 |
2328 | 666104133288665088 | 6764 | 14577 |
2329 | 666102155909144576 | 14 | 80 |
2330 | 666099513787052032 | 72 | 159 |
2331 | 666094000022159362 | 76 | 166 |
2332 | 666082916733198337 | 46 | 120 |
2333 | 666073100786774016 | 172 | 328 |
2334 | 666071193221509120 | 65 | 153 |
2335 | 666063827256086533 | 226 | 486 |
2336 | 666058600524156928 | 58 | 115 |
2337 | 666057090499244032 | 144 | 303 |
2338 | 666055525042405380 | 259 | 446 |
2339 | 666051853826850816 | 867 | 1240 |
2340 | 666050758794694657 | 59 | 135 |
2341 | 666049248165822465 | 40 | 110 |
2342 | 666044226329800704 | 143 | 305 |
2343 | 666033412701032449 | 46 | 126 |
2344 | 666029285002620928 | 47 | 131 |
2345 | 666020888022790149 | 522 | 2529 |
2346 rows × 3 columns
# Pull the id and the two engagement counts into their own frame,
# then summarize and report its shape.
tweet_info = api_info.loc[:, ['id', 'retweet_count', 'favorite_count']]
tweet_info.info()
tweet_info.shape
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2346 entries, 0 to 2345 Data columns (total 3 columns): id 2346 non-null int64 retweet_count 2346 non-null int64 favorite_count 2346 non-null int64 dtypes: int64(3) memory usage: 55.1 KB
(2346, 3)
# Tally the top (p1) predicted classes to get a feel for the predictions;
# this category will likely be explored in more detail later.
image_predictions['p1'].value_counts()
golden_retriever 150 Labrador_retriever 100 Pembroke 89 Chihuahua 83 pug 57 chow 44 Samoyed 43 toy_poodle 39 Pomeranian 38 malamute 30 cocker_spaniel 30 French_bulldog 26 Chesapeake_Bay_retriever 23 miniature_pinscher 23 seat_belt 22 Staffordshire_bullterrier 20 Siberian_husky 20 German_shepherd 20 web_site 19 Cardigan 19 Maltese_dog 18 beagle 18 Shetland_sheepdog 18 teddy 18 Eskimo_dog 18 Lakeland_terrier 17 Rottweiler 17 Shih-Tzu 17 Italian_greyhound 16 kuvasz 16 ... walking_stick 1 bakery 1 timber_wolf 1 African_grey 1 lorikeet 1 cheetah 1 bookshop 1 traffic_light 1 sea_urchin 1 binoculars 1 lion 1 bee_eater 1 three-toed_sloth 1 school_bus 1 sandbar 1 quilt 1 pitcher 1 African_crocodile 1 water_bottle 1 carousel 1 hare 1 sliding_door 1 harp 1 leaf_beetle 1 pool_table 1 hammer 1 book_jacket 1 zebra 1 mailbox 1 killer_whale 1 Name: p1, Length: 378, dtype: int64
# Tally the second-choice (p2) predicted classes to compare against p1.
image_predictions['p2'].value_counts()
Labrador_retriever 104 golden_retriever 92 Cardigan 73 Chihuahua 44 Pomeranian 42 Chesapeake_Bay_retriever 41 French_bulldog 41 toy_poodle 37 cocker_spaniel 34 miniature_poodle 33 Siberian_husky 33 beagle 28 Pembroke 27 Eskimo_dog 27 collie 27 kuvasz 26 Italian_greyhound 22 Pekinese 21 American_Staffordshire_terrier 21 miniature_pinscher 20 toy_terrier 20 malinois 20 chow 20 Samoyed 20 Boston_bull 19 Norwegian_elkhound 19 Staffordshire_bullterrier 18 Irish_terrier 17 pug 17 Shih-Tzu 16 ... trench_coat 1 home_theater 1 giant_panda 1 lawn_mower 1 komondor 1 seashore 1 snorkel 1 computer_keyboard 1 toaster 1 streetcar 1 pier 1 bucket 1 soccer_ball 1 water_buffalo 1 tiger 1 hair_spray 1 hummingbird 1 cowboy_boot 1 sock 1 purse 1 coral_fungus 1 jigsaw_puzzle 1 red_fox 1 cloak 1 quail 1 rifle 1 apron 1 triceratops 1 shovel 1 armadillo 1 Name: p2, Length: 405, dtype: int64
# Tally the extracted dog names — several entries (e.g. "a", "the", "an")
# are clearly not real names.
twitter_archive['name'].value_counts()
None 745 a 55 Charlie 12 Cooper 11 Lucy 11 Oliver 11 Penny 10 Lola 10 Tucker 10 Winston 9 Bo 9 the 8 Sadie 8 an 7 Toby 7 Bailey 7 Buddy 7 Daisy 7 Jax 6 Leo 6 Oscar 6 Scout 6 Dave 6 Milo 6 Koda 6 Bella 6 Jack 6 Rusty 6 Stanley 6 very 5 ... Baron 1 Obie 1 Katie 1 Blakely 1 Mookie 1 Aubie 1 Trevith 1 Nida 1 Napolean 1 Stewie 1 Schnozz 1 Alf 1 Vince 1 Tycho 1 Grey 1 Jebberson 1 Cora 1 Terrenth 1 Hall 1 Harrison 1 Brutus 1 Chloe 1 Skittle 1 Ralphus 1 Duchess 1 Remus 1 Gunner 1 Ben 1 Snoopy 1 Bodie 1 Name: name, Length: 957, dtype: int64
# Looking through the text to see how many rows contain the HTML entity
# "&amp;". NOTE: the notebook export rendered the entity, turning the
# original '&amp;' literal into a bare '&'; restored here (the markdown
# issue list below — "Replace &amp; in text with just &" — confirms the
# intent).
twitter_archive.text[twitter_archive.text.str.contains('&amp;')]
262 Meet Indie. She's not a fan of baths but she's... 273 RT @dog_rates: This is Pipsy. He is a fluffbal... 320 Meet Chester (bottom) & Harold (top). They... 461 Say hello to Eugene & Patti Melt. No matte... 485 RT @dog_rates: Meet Beau & Wilbur. Wilbur ... 516 Meet Sam. She smiles 24/7 & secretly aspir... 799 Meet Roosevelt. He's preparing for takeoff. Ma... 889 Meet Maggie & Lila. Maggie is the doggo, L... 898 This is Lilli Bee & Honey Bear. Unfortunat... 976 Meet Jax & Jil. Jil is yelling the pledge ... 1104 Meet Buckley. His family & some neighbors ... 1179 Meet Sid & Murphy. Murphy floats alongside... 1199 Meet Jennifur. She's supposed to be navigating... 1222 Meet Travis and Flurp. Travis is pretty chill ... 1274 From left to right:\nCletus, Jerome, Alejandro... 1366 Meet Rambo & Kiwi. Rambo's the pup with th... 1421 Meet Beau & Wilbur. Wilbur stole Beau's be... 1465 Meet Oliviér. He takes killer selfies. Has a d... 1481 This is Sadie and her 2 pups Shebang & Ruf... 1508 When bae says they can't go out but you see th... 1524 This is Lolo. She's America af. Behind in scie... 1538 Meet Fynn & Taco. Fynn is an all-powerful ... 1569 Meet Trooper & Maya. Trooper protects Maya... 1593 Say hello to Crimson. He's a Speckled Winnebag... 1621 Meet Bruiser & Charlie. They are the best ... 1646 Here we see a faulty pupper. Might need to rep... 1707 Great picture here. Dog on the right panicked ... 1763 Touching scene here. Really stirs up the emoti... 1795 Meet Tassy & Bee. Tassy is pretty chill, b... 1812 Say hello to Penny & Gizmo. They are pract... 1817 This is Godzilla pupper. He had a ruff childho... 1842 & this is Yoshi. Another world record cont... 1897 Meet Rufio. He is unaware of the pink legless ... 1899 Meet Jeb & Bush. Jeb is somehow stuck in t... 1901 Two gorgeous dogs here. Little waddling dog is... 1913 Meet Chesney. On the outside he stays calm &am... 1931 Meet Daisy. She has no eyes & her face has... 2031 When you try to recreate the scene from Lady &... 
2037 This is the best thing I've ever seen so sprea... 2064 Meet Holly. She's trying to teach small human-... 2084 Say hello to Andy. He can balance on one foot,... 2096 Say hello to Gin & Tonic. They're having a... 2137 This is Ben & Carson. It's impossible for ... 2177 Here we have Pancho and Peaches. Pancho is a C... 2190 Meet Jaycob. He got scared of the vacuum. Hide... 2196 Say hello to Bobb. Bobb is a Golden High Fescu... 2207 This is Timofy. He's a pilot for Southwest. It... 2210 Say hello to Kallie. There was a tornado in th... 2216 This is Spark. He's nervous. Other dog hasn't ... 2232 These two dogs are Bo & Smittens. Smittens... 2246 This is Tedrick. He lives on the edge. Needs s... 2268 This is Dook & Milo. Dook is struggling to... 2293 This is Pipsy. He is a fluffball. Enjoys trave... 2306 These are Peruvian Feldspars. Their names are ... Name: text, dtype: object
After gathering each of the above pieces of data, assess them visually and programmatically for quality and tidiness issues. Detect and document at least eight (8) quality issues and two (2) tidiness issues in your wrangle_act.ipynb Jupyter Notebook. To meet specifications, the issues that satisfy the Project Motivation (see the Key Points header on the previous page) must be assessed.
Replace the HTML entity `&amp;` in the text column with just `&`.
Convert id to string in tweet info dataframe.
Rename tweet info id to tweet_id to merge it with the other two dataframes.
Convert tweet_id to a string in image predictions dataframe.
Convert tweet_id to a string in twitter archive dataframe.
Convert datetime from string to datetime.
Remove columns that contain no information, and the redundant dog stage columns.
Some of the name records in Twitter Archive contain articles (the, an, a) instead of actual names. I will rename them to None for consistency.
Remove retweets
Merge all lists into a master list.
Combine Dog Stages into one column.
# Work on copies so the raw, gathered frames stay untouched during cleaning.
twitter_archive_clean = twitter_archive.copy()
image_predictions_clean = image_predictions.copy()
tweet_info_clean = tweet_info.copy()

# Summarize each copied frame, separated by horizontal rules for readability.
separator = "=" * 50
print(separator)
twitter_archive_clean.info()
print(separator)
image_predictions_clean.info()
print(separator)
tweet_info_clean.info()
print(separator)
================================================== <class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): tweet_id 2356 non-null int64 in_reply_to_status_id 78 non-null float64 in_reply_to_user_id 78 non-null float64 timestamp 2356 non-null object source 2356 non-null object text 2356 non-null object retweeted_status_id 181 non-null float64 retweeted_status_user_id 181 non-null float64 retweeted_status_timestamp 181 non-null object expanded_urls 2297 non-null object rating_numerator 2356 non-null int64 rating_denominator 2356 non-null int64 name 2356 non-null object doggo 2356 non-null object floofer 2356 non-null object pupper 2356 non-null object puppo 2356 non-null object dtypes: float64(4), int64(3), object(10) memory usage: 313.0+ KB ================================================== <class 'pandas.core.frame.DataFrame'> RangeIndex: 2075 entries, 0 to 2074 Data columns (total 12 columns): tweet_id 2075 non-null int64 jpg_url 2075 non-null object img_num 2075 non-null int64 p1 2075 non-null object p1_conf 2075 non-null float64 p1_dog 2075 non-null bool p2 2075 non-null object p2_conf 2075 non-null float64 p2_dog 2075 non-null bool p3 2075 non-null object p3_conf 2075 non-null float64 p3_dog 2075 non-null bool dtypes: bool(3), float64(3), int64(2), object(4) memory usage: 152.1+ KB ================================================== <class 'pandas.core.frame.DataFrame'> RangeIndex: 2346 entries, 0 to 2345 Data columns (total 3 columns): id 2346 non-null int64 retweet_count 2346 non-null int64 favorite_count 2346 non-null int64 dtypes: int64(3) memory usage: 55.1 KB ==================================================
# Quality Issue 1
# Replace the HTML entity "&amp;" with a literal "&", then verify no rows
# still contain the entity. NOTE: the notebook export rendered the entity,
# leaving a no-op replace('&','&'); the empty-Series output of the
# verification step confirms the original code targeted '&amp;'.
twitter_archive_clean['text'] = twitter_archive_clean['text'].str.replace('&amp;', '&')
twitter_archive_clean.text[twitter_archive_clean.text.str.contains('&amp;')]
Series([], Name: text, dtype: object)
# Quality Issue 2 and 3
# Cast the id column to string, then rename it to tweet_id so it lines up
# with the key used by the other two frames for merging.
# Resources: https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
tweet_info_clean = (tweet_info_clean
                    .astype({'id': str})
                    .rename(columns={'id': 'tweet_id'}))
tweet_info_clean["tweet_id"]
0 892420643555336193 1 892177421306343426 2 891815181378084864 3 891689557279858688 4 891327558926688256 5 891087950875897856 6 890971913173991426 7 890729181411237888 8 890609185150312448 9 890240255349198849 10 890006608113172480 11 889880896479866881 12 889665388333682689 13 889638837579907072 14 889531135344209921 15 889278841981685760 16 888917238123831296 17 888804989199671297 18 888554962724278272 19 888078434458587136 20 887705289381826560 21 887517139158093824 22 887473957103951883 23 887343217045368832 24 887101392804085760 25 886983233522544640 26 886736880519319552 27 886680336477933568 28 886366144734445568 29 886267009285017600 ... 2316 666411507551481857 2317 666407126856765440 2318 666396247373291520 2319 666373753744588802 2320 666362758909284353 2321 666353288456101888 2322 666345417576210432 2323 666337882303524864 2324 666293911632134144 2325 666287406224695296 2326 666273097616637952 2327 666268910803644416 2328 666104133288665088 2329 666102155909144576 2330 666099513787052032 2331 666094000022159362 2332 666082916733198337 2333 666073100786774016 2334 666071193221509120 2335 666063827256086533 2336 666058600524156928 2337 666057090499244032 2338 666055525042405380 2339 666051853826850816 2340 666050758794694657 2341 666049248165822465 2342 666044226329800704 2343 666033412701032449 2344 666029285002620928 2345 666020888022790149 Name: tweet_id, Length: 2346, dtype: object
# Quality Issue 4
# Convert image_predictions' tweet_id from a number to a string so it can
# merge with the other frames. Verify on the frame that was actually
# converted — the original cell inspected tweet_info_clean by mistake,
# which is why its output still showed an int64 'id' column.
image_predictions_clean['tweet_id'] = image_predictions_clean['tweet_id'].astype(str)
image_predictions_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2346 entries, 0 to 2345 Data columns (total 3 columns): id 2346 non-null int64 retweet_count 2346 non-null int64 favorite_count 2346 non-null int64 dtypes: int64(3) memory usage: 55.1 KB
# Quality Issue 5
# The archive's tweet_id is also numeric; cast it to string for merging
# and confirm the dtype change in the summary.
twitter_archive_clean = twitter_archive_clean.astype({'tweet_id': str})
twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2356 entries, 0 to 2355 Data columns (total 17 columns): tweet_id 2356 non-null object in_reply_to_status_id 78 non-null float64 in_reply_to_user_id 78 non-null float64 timestamp 2356 non-null object source 2356 non-null object text 2356 non-null object retweeted_status_id 181 non-null float64 retweeted_status_user_id 181 non-null float64 retweeted_status_timestamp 181 non-null object expanded_urls 2297 non-null object rating_numerator 2356 non-null int64 rating_denominator 2356 non-null int64 name 2356 non-null object doggo 2356 non-null object floofer 2356 non-null object pupper 2356 non-null object puppo 2356 non-null object dtypes: float64(4), int64(2), object(11) memory usage: 313.0+ KB
# Tidiness Issue 1
# Chain two outer merges on tweet_id so all three frames become one
# master frame, then summarize the result.
tweet_merge_clean = (
    tweet_info_clean
    .merge(twitter_archive_clean, on='tweet_id', how='outer')
    .merge(image_predictions_clean, on='tweet_id', how='outer')
)
tweet_merge_clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4528 entries, 0 to 4708 Data columns (total 22 columns): tweet_id 4528 non-null object retweet_count 2346 non-null float64 favorite_count 2346 non-null float64 timestamp 2175 non-null datetime64[ns] source 2175 non-null object text 2175 non-null object expanded_urls 2117 non-null object rating_numerator 2175 non-null float64 rating_denominator 2175 non-null float64 name 2175 non-null object jpg_url 2075 non-null object p1 2075 non-null object p1_conf 2075 non-null float64 p1_dog 2075 non-null object p2 2075 non-null object p2_conf 2075 non-null float64 p2_dog 2075 non-null object p3 2075 non-null object p3_conf 2075 non-null float64 p3_dog 2075 non-null object dog_stage 2175 non-null object total_rating 2175 non-null float64 dtypes: datetime64[ns](1), float64(8), object(13) memory usage: 973.6+ KB
# Quality Issue 6
# Parse the timestamp strings into proper datetime64 values, then display
# the column to verify the conversion.
tweet_merge_clean['timestamp'] = pd.to_datetime(
    tweet_merge_clean['timestamp'], infer_datetime_format=True)
tweet_merge_clean['timestamp']
0 NaT 1 NaT 2 NaT 3 NaT 4 NaT 5 NaT 6 NaT 7 NaT 8 NaT 9 NaT 10 NaT 11 NaT 12 NaT 13 NaT 14 NaT 15 NaT 16 NaT 17 NaT 18 NaT 19 NaT 20 NaT 21 NaT 22 NaT 23 NaT 24 NaT 25 NaT 26 NaT 27 NaT 28 NaT 29 NaT ... 4679 2015-11-16 19:31:45 4680 2015-11-16 16:37:02 4681 2015-11-16 16:11:11 4682 2015-11-16 15:14:19 4683 2015-11-16 14:57:41 4684 2015-11-16 04:02:55 4685 2015-11-16 03:55:04 4686 2015-11-16 03:44:34 4687 2015-11-16 03:22:39 4688 2015-11-16 02:38:37 4689 2015-11-16 01:59:36 4690 2015-11-16 01:52:02 4691 2015-11-16 01:22:45 4692 2015-11-16 01:01:59 4693 2015-11-16 00:55:59 4694 2015-11-16 00:49:46 4695 2015-11-16 00:35:11 4696 2015-11-16 00:30:50 4697 2015-11-16 00:24:50 4698 2015-11-16 00:04:52 4699 2015-11-15 23:21:54 4700 2015-11-15 23:05:30 4701 2015-11-15 22:32:08 4702 NaT 4703 NaT 4704 NaT 4705 NaT 4706 NaT 4707 NaT 4708 NaT Name: timestamp, Length: 4528, dtype: datetime64[ns]
# Tidiness Issue 2
# Collapse the four dog-stage columns into one dog_stage column via
# vectorized string concatenation (NaN rows propagate to NaN, matching the
# row-wise addition this replaces).
# Resources https://stackoverflow.com/questions/34023918/make-new-column-in-panda-dataframe-by-adding-values-from-other-columns
tweet_merge_clean['dog_stage'] = (tweet_merge_clean['doggo']
                                  + tweet_merge_clean['floofer']
                                  + tweet_merge_clean['pupper']
                                  + tweet_merge_clean['puppo'])
# Value counts on the combined column — one label per stage combination.
tweet_merge_clean.dog_stage.value_counts()
None 1831 Pupper 224 Doggo 75 Puppo 24 Doggo and Pupper 10 Floofer 9 Doggo and Puppo 1 Doggo and Floofer 1 Name: dog_stage, dtype: int64
# Tidiness Issue 2
# Map the concatenated stage strings onto human-readable labels.
# Resources: https://stackoverflow.com/questions/22100130/pandas-replace-multiple-values-one-column
stage_labels = {
    'NoneNoneNoneNone': "None",
    "doggoNoneNoneNone": "Doggo",
    "NoneflooferNoneNone": "Floofer",
    "NoneNonepupperNone": "Pupper",
    "NoneNoneNonepuppo": "Puppo",
    "doggoNonepupperNone": "Doggo and Pupper",
    "doggoflooferNoneNone": "Doggo and Floofer",
    "doggoNoneNonepuppo": "Doggo and Puppo",
}
tweet_merge_clean['dog_stage'] = tweet_merge_clean['dog_stage'].replace(stage_labels)
tweet_merge_clean.dog_stage.value_counts()
# Quality Issue 7
# Drop the 4 now-redundant dog-stage columns, both in_reply columns, and
# img_num. NOTE: the retweeted_status_* columns are deliberately KEPT here —
# the retweet filter below (Quality Issue 9) still needs
# retweeted_status_id, and dropping it first made that step raise KeyError.
tweet_merge_clean = tweet_merge_clean.drop(
    ['doggo', 'floofer', 'pupper', 'puppo',
     'in_reply_to_status_id', 'in_reply_to_user_id', 'img_num'],
    axis=1)
# Quality Issue 8
# Recount the names on the merged frame to see which junk "names" remain.
tweet_merge_clean['name'].value_counts()
None 680 a 55 Lucy 11 Charlie 11 Oliver 10 Cooper 10 Tucker 9 Penny 9 Lola 8 Winston 8 the 8 Sadie 8 Daisy 7 Toby 7 Bo 6 Bailey 6 Bella 6 an 6 Jax 6 Oscar 6 Koda 6 Stanley 6 Chester 5 Bentley 5 Milo 5 Leo 5 Dave 5 Louis 5 Scout 5 Rusty 5 ... Vinscent 1 by 1 Dunkin 1 Sailer 1 Chloe 1 Brutus 1 Hall 1 Napolean 1 Terrance 1 Robin 1 Cal 1 Shaggy 1 Derby 1 Kloey 1 Corey 1 Trip 1 Antony 1 Nida 1 Rizzy 1 Stewie 1 Schnozz 1 Alf 1 Vince 1 Tycho 1 Grey 1 Hercules 1 Cora 1 Terrenth 1 Leela 1 Bodie 1 Name: name, Length: 956, dtype: int64
# Quality Issue 8
# Articles and adverbs ("a", "the", "an", "very") are not names — map them
# to "None" for consistency with the other unnamed rows.
# Resources: https://stackoverflow.com/questions/22100130/pandas-replace-multiple-values-one-column
non_names = {'a': "None", "the": "None", "an": "None", "very": "None"}
tweet_merge_clean['name'] = tweet_merge_clean['name'].replace(non_names)
# Quality Issue 9
# Clear out retweets: keep only rows whose retweeted_status_id is null,
# then drop the retweet bookkeeping columns, which carry no information
# once retweets are gone. Guarded so the cell is safe to re-run after the
# columns have already been dropped.
if 'retweeted_status_id' in tweet_merge_clean.columns:
    tweet_merge_clean = tweet_merge_clean[tweet_merge_clean['retweeted_status_id'].isnull()]
    tweet_merge_clean = tweet_merge_clean.drop(
        ['retweeted_status_id', 'retweeted_status_user_id',
         'retweeted_status_timestamp'],
        axis=1)
tweet_merge_clean.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 4528 entries, 0 to 4708 Data columns (total 22 columns): tweet_id 4528 non-null object retweet_count 2346 non-null float64 favorite_count 2346 non-null float64 timestamp 2175 non-null datetime64[ns] source 2175 non-null object text 2175 non-null object expanded_urls 2117 non-null object rating_numerator 2175 non-null float64 rating_denominator 2175 non-null float64 name 2175 non-null object jpg_url 2075 non-null object p1 2075 non-null object p1_conf 2075 non-null float64 p1_dog 2075 non-null object p2 2075 non-null object p2_conf 2075 non-null float64 p2_dog 2075 non-null object p3 2075 non-null object p3_conf 2075 non-null float64 p3_dog 2075 non-null object dog_stage 2175 non-null object total_rating 2175 non-null float64 dtypes: datetime64[ns](1), float64(8), object(13) memory usage: 973.6+ KB
Store the clean DataFrame(s) in a CSV file with the main one named twitter_archive_master.csv. If additional files exist because multiple tables are required for tidiness, name these files appropriately. Additionally, you may store the cleaned data in a SQLite database (which is to be submitted as well if you do).
# Saving cleaned dataframe to csv file.
# total_rating normalizes each rating by its denominator. It is stored as a
# column BEFORE saving — the original computed it into a throwaway variable
# after to_csv, so it never reached the CSV (the earlier .info() outputs
# show total_rating was intended to be a column of the merged frame).
tweet_merge_clean['total_rating'] = (tweet_merge_clean['rating_numerator']
                                     / tweet_merge_clean['rating_denominator'])
tweet_merge_clean.to_csv('twitter_archive_master.csv')
# Insight 1
# Which dog breeds did the neural network (p1) identify most often, and how
# much engagement do they average? Keep the five most frequent breeds.
# Resources: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html
breeds = tweet_merge_clean.groupby(by='p1')
top_breeds = (
    breeds
    .agg({'p1': 'count', 'favorite_count': 'mean', 'retweet_count': 'mean'})
    .sort_values('p1')
    .tail(5)
    .rename(columns={'p1': 'Count',
                     'favorite_count': 'Favorite Count',
                     'retweet_count': 'Retweet Count'})
    .reset_index()
    .rename(columns={'p1': 'Breed'})
)
# Prettify the remaining raw class labels for display.
pretty_breeds = {'pug': 'Pug',
                 'Labrador_retriever': 'Labrador',
                 'golden_retriever': 'Golden Retriever'}
top_breeds = top_breeds.replace({"Breed": pretty_breeds})
top_breeds
Breed | Count | Favorite Count | Retweet Count | |
---|---|---|---|---|
0 | Pug | 57 | 5554.105263 | 1949.754386 |
1 | Chihuahua | 83 | 8484.135802 | 3022.543210 |
2 | Pembroke | 89 | 11299.897727 | 3157.795455 |
3 | Labrador | 100 | 11108.120000 | 4206.930000 |
4 | Golden Retriever | 150 | 11383.409396 | 3843.255034 |
# Insight 2
# Favorite count vs retweet count: the scatterplot below shows a positive,
# roughly linear relationship between the two engagement measures.
favorites = tweet_merge_clean['favorite_count']
retweets = tweet_merge_clean['retweet_count']
plt.scatter(x=favorites, y=retweets)
plt.xlabel('Favorite Count')
plt.ylabel('Retweet Count')
plt.title('Favorite Count vs Retweet Count')
plt.style.use('default')
plt.show()
#plt.savefig('favretweet.png')
plt.gcf().clear()
# Insight 3
# Show the three tweets with the highest favorite counts, then display the
# picture from the most-favorited one.
# Resources: https://stackoverflow.com/questions/16958499/sort-pandas-dataframe-and-print-highest-n-values
top_favorites = tweet_merge_clean.nlargest(3, 'favorite_count')
print(top_favorites)
Image("https://pbs.twimg.com/media/C2tugXLXgAArJO4.jpg")
tweet_id retweet_count favorite_count timestamp source text \ 407 822872901745569793 49733.0 143493.0 NaT NaN NaN 1029 744234799360020481 78386.0 129565.0 NaT NaN NaN 528 807106840509214720 61978.0 123976.0 NaT NaN NaN expanded_urls rating_numerator rating_denominator name ... \ 407 NaN NaN NaN NaN ... 1029 NaN NaN NaN NaN ... 528 NaN NaN NaN NaN ... p1_conf p1_dog p2 p2_conf p2_dog p3 \ 407 0.196015 True Labrador_retriever 0.160329 True Irish_terrier 1029 0.825333 True ice_bear 0.044681 False whippet 528 0.505370 True Pomeranian 0.120358 True toy_terrier p3_conf p3_dog dog_stage total_rating 407 0.069126 True NaN NaN 1029 0.018442 True NaN NaN 528 0.077008 True NaN NaN [3 rows x 22 columns]
# Insight 4
# Show the three tweets with the highest retweet counts, then display the
# picture from the most-retweeted one.
# Resources: https://stackoverflow.com/questions/16958499/sort-pandas-dataframe-and-print-highest-n-values
top_retweets = tweet_merge_clean.nlargest(3, 'retweet_count')
print(top_retweets)
Image("https://pbs.twimg.com/ext_tw_video_thumb/744234667679821824/pu/img/1GaWmtJtdqzZV7jy.jpg")
tweet_id retweet_count favorite_count timestamp source text \ 1029 744234799360020481 78386.0 129565.0 NaT NaN NaN 528 807106840509214720 61978.0 123976.0 NaT NaN NaN 808 770743923962707968 51607.0 0.0 NaT NaN NaN expanded_urls rating_numerator rating_denominator name ... \ 1029 NaN NaN NaN NaN ... 528 NaN NaN NaN NaN ... 808 NaN NaN NaN NaN ... p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf \ 1029 0.825333 True ice_bear 0.044681 False whippet 0.018442 528 0.505370 True Pomeranian 0.120358 True toy_terrier 0.077008 808 NaN NaN NaN NaN NaN NaN NaN p3_dog dog_stage total_rating 1029 True NaN NaN 528 True NaN NaN 808 NaN NaN NaN [3 rows x 22 columns]
# Visualization from Insight 1
# Bar chart of the five breeds the neural network identified most often.
# Resources: https://pythonspot.com/matplotlib-bar-chart/
positions = np.arange(len(top_breeds))
counts = top_breeds.Count
plt.bar(positions, counts, align='center', alpha=0.5)
plt.xticks(positions, top_breeds.Breed)
plt.ylabel('Count')
plt.title('Top Identified Dog Breed')
plt.style.use('bmh')
#plt.savefig('breed.png')
plt.show()
plt.gcf().clear()
# Visualization from Insight 2
# Reproduce the favorite-vs-retweet scatter for the report.
plt.scatter(x=tweet_merge_clean['favorite_count'],
            y=tweet_merge_clean['retweet_count'])
plt.xlabel('Favorite Count')
plt.ylabel('Retweet Count')
plt.title('Favorite Count vs Retweet Count')
plt.style.use('default')
plt.show()
#plt.savefig('favretweet.png')
plt.gcf().clear()
Reporting for this Project Create a 300-600 word written report called wrangle_report.pdf that briefly describes your wrangling efforts. This is to be framed as an internal document.
Create a 250-word-minimum written report called act_report.pdf that communicates the insights and displays the visualization(s) produced from your wrangled data. This is to be framed as an external document, like a blog post or magazine article, for example.
Both of these documents can be created in separate Jupyter Notebooks using the Markdown functionality of Jupyter Notebooks, then downloading those notebooks as PDF files (see image below). You might prefer to use a word processor like Google Docs or Microsoft Word, however.