In [271]:
import os
import requests
import urllib.request
from pathlib import Path
import pandas as pd
import time
import tweepy
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tweepy import OAuthHandler
from IPython.display import display, Markdown
from IPython.display import Image
import matplotlib.rcsetup as rcsetup

Gathering The WeRateDogs Twitter Archive

In [222]:
# Reading in csv file as pandas dataframe, displayng a tuple of the array dimensions, and printing the first few rows.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
print("Array Dimensions = ", twitter_archive.shape)
twitter_archive.head()
Array Dimensions =  (2356, 17)
Out[222]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None

Gathering The Tweet Image Predictions

In [218]:
# Using requests library to download tsv file
# putting a check so I don't have to redownload the file each time.

my_file_pred = Path("image_predictions.tsv")
if my_file_pred.exists():
    print("File already exists")
    
else:
    
    url="https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv"
    response = requests.get(url)

    with open('image_predictions.tsv', 'wb') as file:
        file.write(response.content)
File already exists
In [223]:
# Read tsv file into Pandas   
image_predictions = pd.read_csv('image_predictions.tsv', sep='\t')
# Looking at the shape and the first couple of rows of the dataframe
print("Array Dimensions = ",image_predictions.shape)
image_predictions.head()
# Resources:  https://stackoverflow.com/questions/31126596/saving-response-from-requests-to-file
Array Dimensions =  (2075, 12)
Out[223]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True

Gathering Tweets from Twitter API

In [9]:
# Storing and loading passwords locally
password_list = pd.read_csv('password_list.csv')
consumer_key = password_list.consumer_key[0]
consumer_secret = password_list.consumer_secret[0]
access_token = password_list.access_token[0]
access_secret = password_list.access_secret[0]

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print ("Error: Unable to Authenticate")
    sys.exit(-1)
    
# Put in error handling     
# Resource: https://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./
In [202]:
# For loop which to add each found tweet information to a new line of tweet_json.txt
# I put a file exist check because I kept adding to the file everytime I needed to rerun this cell.
missing_tweets = []
# Recording the missing tweets

my_file = Path("tweet_json.txt")
if my_file.exists():
    print("File already exists")
    
else:
    with open('tweet_json.txt', 'a', encoding='utf8') as f:
        # opening file in append mode
        for tweet_id in twitter_archive['tweet_id']:
            try:
                tweet = api.get_status(tweet_id, tweet_mode='extended')
                json.dump(tweet._json, f)
                f.write('\n')
            except:
                print('{} Tweet not found'.format(tweet_id))
                missing_tweets.append(tweet_id)

    print("File Created / Task Completed")
# Resources: https://docs.python.org/3/tutorial/inputoutput.html
# Resources:  https://stackoverflow.com/questions/44581647/retrieving-a-list-of-tweets-using-tweet-id-in-tweepy
File already exists
In [10]:
# Opening tweet_json to api_info

with open('tweet_json.txt', 'r') as fp:
    api_info = pd.read_json(fp, lines=True)
    
# Resources:  https://stackoverflow.com/questions/30088006/loading-a-file-with-more-than-one-line-of-json-into-pythons-pandas
In [211]:
# Viewing the columns in the api_info database.
api_info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2346 entries, 0 to 2345
Data columns (total 31 columns):
contributors                     0 non-null float64
coordinates                      0 non-null float64
created_at                       2346 non-null datetime64[ns]
display_text_range               2346 non-null object
entities                         2346 non-null object
extended_entities                2068 non-null object
favorite_count                   2346 non-null int64
favorited                        2346 non-null bool
full_text                        2346 non-null object
geo                              0 non-null float64
id                               2346 non-null int64
id_str                           2346 non-null int64
in_reply_to_screen_name          78 non-null object
in_reply_to_status_id            78 non-null float64
in_reply_to_status_id_str        78 non-null float64
in_reply_to_user_id              78 non-null float64
in_reply_to_user_id_str          78 non-null float64
is_quote_status                  2346 non-null bool
lang                             2346 non-null object
place                            1 non-null object
possibly_sensitive               2206 non-null float64
possibly_sensitive_appealable    2206 non-null float64
quoted_status                    28 non-null object
quoted_status_id                 29 non-null float64
quoted_status_id_str             29 non-null float64
retweet_count                    2346 non-null int64
retweeted                        2346 non-null bool
retweeted_status                 171 non-null object
source                           2346 non-null object
truncated                        2346 non-null bool
user                             2346 non-null object
dtypes: bool(4), datetime64[ns](1), float64(11), int64(4), object(11)
memory usage: 504.1+ KB
In [14]:
# Viewing the 3 columns I am interested in.
api_info[["id", "retweet_count", "favorite_count"]]
Out[14]:
id retweet_count favorite_count
0 892420643555336193 8726 39179
1 892177421306343426 6406 33544
2 891815181378084864 4256 25283
3 891689557279858688 8820 42532
4 891327558926688256 9605 40715
5 891087950875897856 3188 20411
6 890971913173991426 2122 11968
7 890729181411237888 19325 66197
8 890609185150312448 4348 28026
9 890240255349198849 7579 32246
10 890006608113172480 7494 30908
11 889880896479866881 5069 28038
12 889665388333682689 10264 48331
13 889638837579907072 4647 27434
14 889531135344209921 2280 15231
15 889278841981685760 5559 25532
16 888917238123831296 4614 29346
17 888804989199671297 4462 25834
18 888554962724278272 3673 20121
19 888078434458587136 3595 21986
20 887705289381826560 5518 30490
21 887517139158093824 11916 46634
22 887473957103951883 18663 69817
23 887343217045368832 10612 33979
24 887101392804085760 6093 30840
25 886983233522544640 7940 35529
26 886736880519319552 3371 12189
27 886680336477933568 4561 22636
28 886366144734445568 3255 21336
29 886267009285017600 4 117
... ... ... ...
2316 666411507551481857 335 453
2317 666407126856765440 42 112
2318 666396247373291520 90 170
2319 666373753744588802 95 193
2320 666362758909284353 583 791
2321 666353288456101888 74 227
2322 666345417576210432 144 305
2323 666337882303524864 95 202
2324 666293911632134144 364 514
2325 666287406224695296 69 151
2326 666273097616637952 79 179
2327 666268910803644416 36 106
2328 666104133288665088 6764 14577
2329 666102155909144576 14 80
2330 666099513787052032 72 159
2331 666094000022159362 76 166
2332 666082916733198337 46 120
2333 666073100786774016 172 328
2334 666071193221509120 65 153
2335 666063827256086533 226 486
2336 666058600524156928 58 115
2337 666057090499244032 144 303
2338 666055525042405380 259 446
2339 666051853826850816 867 1240
2340 666050758794694657 59 135
2341 666049248165822465 40 110
2342 666044226329800704 143 305
2343 666033412701032449 46 126
2344 666029285002620928 47 131
2345 666020888022790149 522 2529

2346 rows × 3 columns

In [208]:
# Moving the 3 columns into a separate variable
tweet_info = api_info[['id','retweet_count','favorite_count']]
tweet_info.info()
tweet_info.shape
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2346 entries, 0 to 2345
Data columns (total 3 columns):
id                2346 non-null int64
retweet_count     2346 non-null int64
favorite_count    2346 non-null int64
dtypes: int64(3)
memory usage: 55.1 KB
Out[208]:
(2346, 3)

Exploring Data

In [18]:
# Looking at image prediction to get a feel for what I am looking at.  I will probably look into this category in more detail.
image_predictions.p1.value_counts()
Out[18]:
golden_retriever             150
Labrador_retriever           100
Pembroke                      89
Chihuahua                     83
pug                           57
chow                          44
Samoyed                       43
toy_poodle                    39
Pomeranian                    38
malamute                      30
cocker_spaniel                30
French_bulldog                26
Chesapeake_Bay_retriever      23
miniature_pinscher            23
seat_belt                     22
Staffordshire_bullterrier     20
Siberian_husky                20
German_shepherd               20
web_site                      19
Cardigan                      19
Maltese_dog                   18
beagle                        18
Shetland_sheepdog             18
teddy                         18
Eskimo_dog                    18
Lakeland_terrier              17
Rottweiler                    17
Shih-Tzu                      17
Italian_greyhound             16
kuvasz                        16
                            ... 
walking_stick                  1
bakery                         1
timber_wolf                    1
African_grey                   1
lorikeet                       1
cheetah                        1
bookshop                       1
traffic_light                  1
sea_urchin                     1
binoculars                     1
lion                           1
bee_eater                      1
three-toed_sloth               1
school_bus                     1
sandbar                        1
quilt                          1
pitcher                        1
African_crocodile              1
water_bottle                   1
carousel                       1
hare                           1
sliding_door                   1
harp                           1
leaf_beetle                    1
pool_table                     1
hammer                         1
book_jacket                    1
zebra                          1
mailbox                        1
killer_whale                   1
Name: p1, Length: 378, dtype: int64
In [19]:
# Looking at the second image classification to see how it differs from the first.
image_predictions.p2.value_counts()
Out[19]:
Labrador_retriever                104
golden_retriever                   92
Cardigan                           73
Chihuahua                          44
Pomeranian                         42
Chesapeake_Bay_retriever           41
French_bulldog                     41
toy_poodle                         37
cocker_spaniel                     34
miniature_poodle                   33
Siberian_husky                     33
beagle                             28
Pembroke                           27
Eskimo_dog                         27
collie                             27
kuvasz                             26
Italian_greyhound                  22
Pekinese                           21
American_Staffordshire_terrier     21
miniature_pinscher                 20
toy_terrier                        20
malinois                           20
chow                               20
Samoyed                            20
Boston_bull                        19
Norwegian_elkhound                 19
Staffordshire_bullterrier          18
Irish_terrier                      17
pug                                17
Shih-Tzu                           16
                                 ... 
trench_coat                         1
home_theater                        1
giant_panda                         1
lawn_mower                          1
komondor                            1
seashore                            1
snorkel                             1
computer_keyboard                   1
toaster                             1
streetcar                           1
pier                                1
bucket                              1
soccer_ball                         1
water_buffalo                       1
tiger                               1
hair_spray                          1
hummingbird                         1
cowboy_boot                         1
sock                                1
purse                               1
coral_fungus                        1
jigsaw_puzzle                       1
red_fox                             1
cloak                               1
quail                               1
rifle                               1
apron                               1
triceratops                         1
shovel                              1
armadillo                           1
Name: p2, Length: 405, dtype: int64
In [20]:
# Looking at the different names and noticing that some do not make sense.
twitter_archive.name.value_counts()
Out[20]:
None         745
a             55
Charlie       12
Cooper        11
Lucy          11
Oliver        11
Penny         10
Lola          10
Tucker        10
Winston        9
Bo             9
the            8
Sadie          8
an             7
Toby           7
Bailey         7
Buddy          7
Daisy          7
Jax            6
Leo            6
Oscar          6
Scout          6
Dave           6
Milo           6
Koda           6
Bella          6
Jack           6
Rusty          6
Stanley        6
very           5
            ... 
Baron          1
Obie           1
Katie          1
Blakely        1
Mookie         1
Aubie          1
Trevith        1
Nida           1
Napolean       1
Stewie         1
Schnozz        1
Alf            1
Vince          1
Tycho          1
Grey           1
Jebberson      1
Cora           1
Terrenth       1
Hall           1
Harrison       1
Brutus         1
Chloe          1
Skittle        1
Ralphus        1
Duchess        1
Remus          1
Gunner         1
Ben            1
Snoopy         1
Bodie          1
Name: name, Length: 957, dtype: int64
In [21]:
# Looking through the text to see how many lines contain &amp
twitter_archive.text[twitter_archive.text.str.contains('&amp;')]
Out[21]:
262     Meet Indie. She's not a fan of baths but she's...
273     RT @dog_rates: This is Pipsy. He is a fluffbal...
320     Meet Chester (bottom) &amp; Harold (top). They...
461     Say hello to Eugene &amp; Patti Melt. No matte...
485     RT @dog_rates: Meet Beau &amp; Wilbur. Wilbur ...
516     Meet Sam. She smiles 24/7 &amp; secretly aspir...
799     Meet Roosevelt. He's preparing for takeoff. Ma...
889     Meet Maggie &amp; Lila. Maggie is the doggo, L...
898     This is Lilli Bee &amp; Honey Bear. Unfortunat...
976     Meet Jax &amp; Jil. Jil is yelling the pledge ...
1104    Meet Buckley. His family &amp; some neighbors ...
1179    Meet Sid &amp; Murphy. Murphy floats alongside...
1199    Meet Jennifur. She's supposed to be navigating...
1222    Meet Travis and Flurp. Travis is pretty chill ...
1274    From left to right:\nCletus, Jerome, Alejandro...
1366    Meet Rambo &amp; Kiwi. Rambo's the pup with th...
1421    Meet Beau &amp; Wilbur. Wilbur stole Beau's be...
1465    Meet Oliviér. He takes killer selfies. Has a d...
1481    This is Sadie and her 2 pups Shebang &amp; Ruf...
1508    When bae says they can't go out but you see th...
1524    This is Lolo. She's America af. Behind in scie...
1538    Meet Fynn &amp; Taco. Fynn is an all-powerful ...
1569    Meet Trooper &amp; Maya. Trooper protects Maya...
1593    Say hello to Crimson. He's a Speckled Winnebag...
1621    Meet Bruiser &amp; Charlie. They are the best ...
1646    Here we see a faulty pupper. Might need to rep...
1707    Great picture here. Dog on the right panicked ...
1763    Touching scene here. Really stirs up the emoti...
1795    Meet Tassy &amp; Bee. Tassy is pretty chill, b...
1812    Say hello to Penny &amp; Gizmo. They are pract...
1817    This is Godzilla pupper. He had a ruff childho...
1842    &amp; this is Yoshi. Another world record cont...
1897    Meet Rufio. He is unaware of the pink legless ...
1899    Meet Jeb &amp; Bush. Jeb is somehow stuck in t...
1901    Two gorgeous dogs here. Little waddling dog is...
1913    Meet Chesney. On the outside he stays calm &am...
1931    Meet Daisy. She has no eyes &amp; her face has...
2031    When you try to recreate the scene from Lady &...
2037    This is the best thing I've ever seen so sprea...
2064    Meet Holly. She's trying to teach small human-...
2084    Say hello to Andy. He can balance on one foot,...
2096    Say hello to Gin &amp; Tonic. They're having a...
2137    This is Ben &amp; Carson. It's impossible for ...
2177    Here we have Pancho and Peaches. Pancho is a C...
2190    Meet Jaycob. He got scared of the vacuum. Hide...
2196    Say hello to Bobb. Bobb is a Golden High Fescu...
2207    This is Timofy. He's a pilot for Southwest. It...
2210    Say hello to Kallie. There was a tornado in th...
2216    This is Spark. He's nervous. Other dog hasn't ...
2232    These two dogs are Bo &amp; Smittens. Smittens...
2246    This is Tedrick. He lives on the edge. Needs s...
2268    This is Dook &amp; Milo. Dook is struggling to...
2293    This is Pipsy. He is a fluffball. Enjoys trave...
2306    These are Peruvian Feldspars. Their names are ...
Name: text, dtype: object

Assessing Data for this Project

After gathering each of the above pieces of data, assess them visually and programmatically for quality and tidiness issues. Detect and document at least eight (8) quality issues and two (2) tidiness issues in your wrangle_act.ipynb Jupyter Notebook. To meet specifications, the issues that satisfy the Project Motivation (see the Key Points header on the previous page) must be assessed.

List of Quality issues:

  1. Replace & amp; in text with just &.

  2. Convert id to string in tweet info dataframe.

  3. Rename tweet info id to tweet_id to merge it with the other two dataframes.

  4. Convert tweet_id to a string in image predictions dataframe.

  5. Convert tweet_id to a string in twitter archive dataframe.

  6. Convert datetime from string to datetime.

  7. Remove columns that contain no information, and the redundant dog stage columns.

  8. Some of the name records in Twitter Archive contain articles (the, an, a) instead of actual names. I will rename them to None for consistency.

  9. Remove retweets

List of Tidiness Issues

  1. Merge all lists into a master list.

  2. Combine Dog Stages into one column.

Cleaning Data

In [25]:
# Creating copies of the dataframe to clean
twitter_archive_clean = twitter_archive.copy()
image_predictions_clean = image_predictions.copy()
tweet_info_clean = tweet_info.copy()
In [39]:
# Viewing a summary of all of the copied dataframes

print("=" * 50)
twitter_archive_clean.info()
print("=" * 50)
image_predictions_clean.info()
print("=" * 50)
tweet_info_clean.info()
print("=" * 50)
==================================================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
==================================================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
==================================================
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2346 entries, 0 to 2345
Data columns (total 3 columns):
id                2346 non-null int64
retweet_count     2346 non-null int64
favorite_count    2346 non-null int64
dtypes: int64(3)
memory usage: 55.1 KB
==================================================
In [28]:
# Quality Issue 1
# Replacing &amp; with &.  Then verifying code works as expected.
twitter_archive_clean['text'] = twitter_archive_clean['text'].str.replace('&amp;','&')
twitter_archive_clean.text[twitter_archive_clean.text.str.contains('&amp;')]
Out[28]:
Series([], Name: text, dtype: object)
In [206]:
# Quality Issue 2 and 3
# Changing id to tweet_id and converting it to string datatype

tweet_info_clean['id'] = tweet_info_clean['id'].astype(str)
tweet_info_clean.rename(columns={'id': 'tweet_id'}, inplace=True)
# Resources:  https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
tweet_info_clean["tweet_id"]
Out[206]:
0       892420643555336193
1       892177421306343426
2       891815181378084864
3       891689557279858688
4       891327558926688256
5       891087950875897856
6       890971913173991426
7       890729181411237888
8       890609185150312448
9       890240255349198849
10      890006608113172480
11      889880896479866881
12      889665388333682689
13      889638837579907072
14      889531135344209921
15      889278841981685760
16      888917238123831296
17      888804989199671297
18      888554962724278272
19      888078434458587136
20      887705289381826560
21      887517139158093824
22      887473957103951883
23      887343217045368832
24      887101392804085760
25      886983233522544640
26      886736880519319552
27      886680336477933568
28      886366144734445568
29      886267009285017600
               ...        
2316    666411507551481857
2317    666407126856765440
2318    666396247373291520
2319    666373753744588802
2320    666362758909284353
2321    666353288456101888
2322    666345417576210432
2323    666337882303524864
2324    666293911632134144
2325    666287406224695296
2326    666273097616637952
2327    666268910803644416
2328    666104133288665088
2329    666102155909144576
2330    666099513787052032
2331    666094000022159362
2332    666082916733198337
2333    666073100786774016
2334    666071193221509120
2335    666063827256086533
2336    666058600524156928
2337    666057090499244032
2338    666055525042405380
2339    666051853826850816
2340    666050758794694657
2341    666049248165822465
2342    666044226329800704
2343    666033412701032449
2344    666029285002620928
2345    666020888022790149
Name: tweet_id, Length: 2346, dtype: object
In [40]:
# Quality Issue 4
# We need to convert tweet id from a number to a string.
image_predictions_clean['tweet_id'] = image_predictions_clean['tweet_id'].astype(str)
tweet_info_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2346 entries, 0 to 2345
Data columns (total 3 columns):
id                2346 non-null int64
retweet_count     2346 non-null int64
favorite_count    2346 non-null int64
dtypes: int64(3)
memory usage: 55.1 KB
In [224]:
# Quality Issue 5
# We need to convert tweet id from a number to a string.
twitter_archive_clean['tweet_id'] = twitter_archive_clean['tweet_id'].astype(str)
twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null object
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(2), object(11)
memory usage: 313.0+ KB
In [207]:
# Tidiness Issue 1
# Merging all three dataframes
tweet_merge_clean = pd.merge(tweet_info_clean, twitter_archive_clean, on='tweet_id', how='outer')
tweet_merge_clean = pd.merge(tweet_merge_clean, image_predictions_clean, on='tweet_id', how='outer')
tweet_merge_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4528 entries, 0 to 4708
Data columns (total 22 columns):
tweet_id              4528 non-null object
retweet_count         2346 non-null float64
favorite_count        2346 non-null float64
timestamp             2175 non-null datetime64[ns]
source                2175 non-null object
text                  2175 non-null object
expanded_urls         2117 non-null object
rating_numerator      2175 non-null float64
rating_denominator    2175 non-null float64
name                  2175 non-null object
jpg_url               2075 non-null object
p1                    2075 non-null object
p1_conf               2075 non-null float64
p1_dog                2075 non-null object
p2                    2075 non-null object
p2_conf               2075 non-null float64
p2_dog                2075 non-null object
p3                    2075 non-null object
p3_conf               2075 non-null float64
p3_dog                2075 non-null object
dog_stage             2175 non-null object
total_rating          2175 non-null float64
dtypes: datetime64[ns](1), float64(8), object(13)
memory usage: 973.6+ KB
In [204]:
# Quality Issue 6
# Converting to datetime
tweet_merge_clean['timestamp'] =  pd.to_datetime(tweet_merge_clean['timestamp'], infer_datetime_format=True)
tweet_merge_clean.timestamp
Out[204]:
0                      NaT
1                      NaT
2                      NaT
3                      NaT
4                      NaT
5                      NaT
6                      NaT
7                      NaT
8                      NaT
9                      NaT
10                     NaT
11                     NaT
12                     NaT
13                     NaT
14                     NaT
15                     NaT
16                     NaT
17                     NaT
18                     NaT
19                     NaT
20                     NaT
21                     NaT
22                     NaT
23                     NaT
24                     NaT
25                     NaT
26                     NaT
27                     NaT
28                     NaT
29                     NaT
               ...        
4679   2015-11-16 19:31:45
4680   2015-11-16 16:37:02
4681   2015-11-16 16:11:11
4682   2015-11-16 15:14:19
4683   2015-11-16 14:57:41
4684   2015-11-16 04:02:55
4685   2015-11-16 03:55:04
4686   2015-11-16 03:44:34
4687   2015-11-16 03:22:39
4688   2015-11-16 02:38:37
4689   2015-11-16 01:59:36
4690   2015-11-16 01:52:02
4691   2015-11-16 01:22:45
4692   2015-11-16 01:01:59
4693   2015-11-16 00:55:59
4694   2015-11-16 00:49:46
4695   2015-11-16 00:35:11
4696   2015-11-16 00:30:50
4697   2015-11-16 00:24:50
4698   2015-11-16 00:04:52
4699   2015-11-15 23:21:54
4700   2015-11-15 23:05:30
4701   2015-11-15 22:32:08
4702                   NaT
4703                   NaT
4704                   NaT
4705                   NaT
4706                   NaT
4707                   NaT
4708                   NaT
Name: timestamp, Length: 4528, dtype: datetime64[ns]
In [203]:
# Tidiness Issue 2
# I will be merging all the dog stage columns into one column.
tweet_merge_clean['dog_stage'] = tweet_merge_clean.apply(lambda row: row['doggo'] + row['floofer'] + row['pupper'] + row['puppo'], axis=1)
# Running a value count on the dog stage column.
tweet_merge_clean.dog_stage.value_counts()

# Resources https://stackoverflow.com/questions/34023918/make-new-column-in-panda-dataframe-by-adding-values-from-other-columns
# Results in a new column named dog_stage that is a result of the 4 columns combined
Out[203]:
None                 1831
Pupper                224
Doggo                  75
Puppo                  24
Doggo and Pupper       10
Floofer                 9
Doggo and Puppo         1
Doggo and Floofer       1
Name: dog_stage, dtype: int64
In [69]:
# Tidiness Issue 2
# Replacing all the values in dog stage into easier to understand terms.
dogstage_replace_values = {'NoneNoneNoneNone' : "None", "doggoNoneNoneNone" : "Doggo", 
                           "NoneflooferNoneNone" : "Floofer", 
                           "NoneNonepupperNone" : "Pupper",  
                           "NoneNoneNonepuppo" : "Puppo",  
                           "doggoNonepupperNone" : "Doggo and Pupper", 
                           "doggoflooferNoneNone" : "Doggo and Floofer",  
                           "doggoNoneNonepuppo" : "Doggo and Puppo"}
tweet_merge_clean = tweet_merge_clean.replace({"dog_stage": dogstage_replace_values})  
tweet_merge_clean.dog_stage.value_counts()

# Resources:  https://stackoverflow.com/questions/22100130/pandas-replace-multiple-values-one-column
In [71]:
# Quality Issue 7
# Dropping the 4 redundant columns, 3 empty columns, img_num and both in reply columns
tweet_merge_clean = tweet_merge_clean.drop(['doggo', 'floofer', 'pupper', 
            'puppo', "retweeted_status_id", "retweeted_status_user_id", "retweeted_status_timestamp", 
            'in_reply_to_status_id', 'in_reply_to_user_id', 'img_num'], axis=1)
In [65]:
# Quality Issue 8
tweet_merge_clean.name.value_counts()
Out[65]:
None        680
a            55
Lucy         11
Charlie      11
Oliver       10
Cooper       10
Tucker        9
Penny         9
Lola          8
Winston       8
the           8
Sadie         8
Daisy         7
Toby          7
Bo            6
Bailey        6
Bella         6
an            6
Jax           6
Oscar         6
Koda          6
Stanley       6
Chester       5
Bentley       5
Milo          5
Leo           5
Dave          5
Louis         5
Scout         5
Rusty         5
           ... 
Vinscent      1
by            1
Dunkin        1
Sailer        1
Chloe         1
Brutus        1
Hall          1
Napolean      1
Terrance      1
Robin         1
Cal           1
Shaggy        1
Derby         1
Kloey         1
Corey         1
Trip          1
Antony        1
Nida          1
Rizzy         1
Stewie        1
Schnozz       1
Alf           1
Vince         1
Tycho         1
Grey          1
Hercules      1
Cora          1
Terrenth      1
Leela         1
Bodie         1
Name: name, Length: 956, dtype: int64
In [66]:
# Quality Issue 8
# Looking at name I will change a, the, an, and very to None
name_replace_values = {'a' : "None", "the" : "None", "an" : "None", "very" : "None"}
tweet_merge_clean = tweet_merge_clean.replace({"name": name_replace_values})  

# Resources:  https://stackoverflow.com/questions/22100130/pandas-replace-multiple-values-one-column
In [205]:
# Quality Issue 9
# Clearing out the retweets
tweet_merge_clean = tweet_merge_clean[tweet_merge_clean['retweeted_status_id'].isnull()]
tweet_merge_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4528 entries, 0 to 4708
Data columns (total 22 columns):
tweet_id              4528 non-null object
retweet_count         2346 non-null float64
favorite_count        2346 non-null float64
timestamp             2175 non-null datetime64[ns]
source                2175 non-null object
text                  2175 non-null object
expanded_urls         2117 non-null object
rating_numerator      2175 non-null float64
rating_denominator    2175 non-null float64
name                  2175 non-null object
jpg_url               2075 non-null object
p1                    2075 non-null object
p1_conf               2075 non-null float64
p1_dog                2075 non-null object
p2                    2075 non-null object
p2_conf               2075 non-null float64
p2_dog                2075 non-null object
p3                    2075 non-null object
p3_conf               2075 non-null float64
p3_dog                2075 non-null object
dog_stage             2175 non-null object
total_rating          2175 non-null float64
dtypes: datetime64[ns](1), float64(8), object(13)
memory usage: 973.6+ KB

Storing, Analyzing, and Visualizing Data for this Project

Store the clean DataFrame(s) in a CSV file with the main one named twitter_archive_master.csv. If additional files exist because multiple tables are required for tidiness, name these files appropriately. Additionally, you may store the cleaned data in a SQLite database (which is to be submitted as well if you do).

In [76]:
# Saving cleaned dataframe to csv file.
tweet_merge_clean.to_csv('twitter_archive_master.csv')
In [91]:
total_rating = tweet_merge_clean['rating_numerator'] / tweet_merge_clean['rating_denominator']

Analyze and visualize your wrangled data in your wrangle_act.ipynb Jupyter Notebook. At least three (3) insights and one (1) visualization must be produced.

Insights

In [93]:
# Insight 1 
# I wanted to find out which dogs breeds were identified the most via the neural network.
breeds = tweet_merge_clean.groupby(by='p1')
In [151]:
# Insight 1
# I then organized it by the top 5 identified breeds

top_breeds = breeds.agg({'p1': 'count', 'favorite_count': 'mean', 'retweet_count': 'mean'}).sort_values('p1')[-5:]
top_breeds = top_breeds.rename(columns={'p1':'Count', 'favorite_count':'Favorite Count', 'retweet_count':'Retweet Count'})
top_breeds = top_breeds.reset_index()
top_breeds = top_breeds.rename(columns={'p1':'Breed'})

breed_replace_values = {'pug': 'Pug', 'Labrador_retriever': 'Labrador', 'golden_retriever':'Golden Retriever'}
top_breeds = top_breeds.replace({"Breed": breed_replace_values})  
top_breeds

# Resources:  https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.DataFrameGroupBy.agg.html
Out[151]:
Breed Count Favorite Count Retweet Count
0 Pug 57 5554.105263 1949.754386
1 Chihuahua 83 8484.135802 3022.543210
2 Pembroke 89 11299.897727 3157.795455
3 Labrador 100 11108.120000 4206.930000
4 Golden Retriever 150 11383.409396 3843.255034
In [266]:
# Insight 2
# As we see from the scatterplot below there appears to be a positive linear relationship between the number 
# of retweets and favorites that a picture has.

# Scatterplot favorite_count vs retweet_count
plt.scatter(x=tweet_merge_clean['favorite_count'], y=tweet_merge_clean['retweet_count'])
plt.xlabel('Favorite Count');
plt.ylabel('Retweet Count');
plt.title('Favorite Count vs Retweet Count');
plt.style.use('default')
plt.show()
#plt.savefig('favretweet.png')
plt.gcf().clear()
In [275]:
# Insight 3
# I sorted via the 3 largest numbers in favorite count and then looked up the picture of the dog with the largest 
# favorite count and placed it below.

print (tweet_merge_clean.nlargest(3, 'favorite_count'))
Image("https://pbs.twimg.com/media/C2tugXLXgAArJO4.jpg")
#Resources: https://stackoverflow.com/questions/16958499/sort-pandas-dataframe-and-print-highest-n-values
                tweet_id  retweet_count  favorite_count timestamp source text  \
407   822872901745569793        49733.0        143493.0       NaT    NaN  NaN   
1029  744234799360020481        78386.0        129565.0       NaT    NaN  NaN   
528   807106840509214720        61978.0        123976.0       NaT    NaN  NaN   

     expanded_urls  rating_numerator  rating_denominator name     ...       \
407            NaN               NaN                 NaN  NaN     ...        
1029           NaN               NaN                 NaN  NaN     ...        
528            NaN               NaN                 NaN  NaN     ...        

       p1_conf p1_dog                  p2   p2_conf p2_dog             p3  \
407   0.196015   True  Labrador_retriever  0.160329   True  Irish_terrier   
1029  0.825333   True            ice_bear  0.044681  False        whippet   
528   0.505370   True          Pomeranian  0.120358   True    toy_terrier   

       p3_conf p3_dog  dog_stage total_rating  
407   0.069126   True        NaN          NaN  
1029  0.018442   True        NaN          NaN  
528   0.077008   True        NaN          NaN  

[3 rows x 22 columns]
Out[275]:
In [274]:
#Insight 4
#I sorted via the 3 largest numbers in retweet count and then looked up the picture of the dog with the largest 
#retweet count and placed it below. 

print (tweet_merge_clean.nlargest(3, 'retweet_count'))
Image("https://pbs.twimg.com/ext_tw_video_thumb/744234667679821824/pu/img/1GaWmtJtdqzZV7jy.jpg")
#Resources: https://stackoverflow.com/questions/16958499/sort-pandas-dataframe-and-print-highest-n-values
                tweet_id  retweet_count  favorite_count timestamp source text  \
1029  744234799360020481        78386.0        129565.0       NaT    NaN  NaN   
528   807106840509214720        61978.0        123976.0       NaT    NaN  NaN   
808   770743923962707968        51607.0             0.0       NaT    NaN  NaN   

     expanded_urls  rating_numerator  rating_denominator name     ...       \
1029           NaN               NaN                 NaN  NaN     ...        
528            NaN               NaN                 NaN  NaN     ...        
808            NaN               NaN                 NaN  NaN     ...        

       p1_conf p1_dog          p2   p2_conf p2_dog           p3   p3_conf  \
1029  0.825333   True    ice_bear  0.044681  False      whippet  0.018442   
528   0.505370   True  Pomeranian  0.120358   True  toy_terrier  0.077008   
808        NaN    NaN         NaN       NaN    NaN          NaN       NaN   

     p3_dog  dog_stage total_rating  
1029   True        NaN          NaN  
528    True        NaN          NaN  
808     NaN        NaN          NaN  

[3 rows x 22 columns]
Out[274]:

Visualization

In [259]:
# Visualization from Insight 1
# I created the visualization below to display the 5 most popular dog breeds identified by neural network. 
y_pos = np.arange(len(top_breeds))
x_pos = top_breeds.Count
 
plt.bar(y_pos, x_pos, align='center', alpha=0.5)
plt.xticks(y_pos, top_breeds.Breed)
plt.ylabel('Count')
plt.title('Top Identified Dog Breed')
plt.style.use('bmh')
#plt.savefig('breed.png') 
plt.show()
plt.gcf().clear()
# Resources: https://pythonspot.com/matplotlib-bar-chart/
In [276]:
# Visualization from Insight 2
plt.scatter(x=tweet_merge_clean['favorite_count'], y=tweet_merge_clean['retweet_count'])
plt.xlabel('Favorite Count');
plt.ylabel('Retweet Count');
plt.title('Favorite Count vs Retweet Count');
plt.style.use('default')
plt.show()
#plt.savefig('favretweet.png')
plt.gcf().clear()

Reporting for this Project Create a 300-600 word written report called wrangle_report.pdf that briefly describes your wrangling efforts. This is to be framed as an internal document.

Create a 250-word-minimum written report called act_report.pdf that communicates the insights and displays the visualization(s) produced from your wrangled data. This is to be framed as an external document, like a blog post or magazine article, for example.

Both of these documents can be created in separate Jupyter Notebooks using the Markdown functionality of Jupyter Notebooks, then downloading those notebooks as PDF files (see image below). You might prefer to use a word processor like Google Docs or Microsoft Word, however.

In [ ]:
 
In [ ]: