We'll pretend we're working as data analysts for a company that builds Android and iOS mobile apps. The goal of this project
## Open and explore the data
# Load both datasets; each ends up as a header row plus a list of data rows.
from csv import reader

# The context manager closes each file as soon as it has been read in full.
# newline='' is what the csv module asks for so that line breaks inside quoted
# fields are parsed correctly, and the explicit encoding keeps the result
# independent of the platform default.
# NOTE(review): assumes both CSV files are UTF-8 encoded - confirm.
with open('AppleStore.csv', encoding='utf8', newline='') as open_file:
    read_file = reader(open_file)
    appstore = list(read_file)
appstore_header = appstore[0]
appstore = appstore[1:]

with open('googleplaystore.csv', encoding='utf8', newline='') as open_file:
    read_file = reader(open_file)
    googlestore = list(read_file)
googlestore_header = googlestore[0]
googlestore = googlestore[1:]
## explore both datasets
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows in ``dataset[start:end]`` and, optionally, its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a sequence of fields.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of fields in the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # visual gap after every printed row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # The column count is read from the first row. An empty dataset has no
        # first row, so report 0 instead of raising IndexError.
        print('Number of columns', len(dataset[0]) if dataset else 0)
# Preview each store: its header, then its first two rows and its dimensions.
previews = [
    (appstore_header, appstore),
    (googlestore_header, googlestore),
]
for position, (header, rows) in enumerate(previews):
    if position:
        print('\n')  # gap between one store's preview and the next
    print(header)
    print('\n')
    explore_data(rows, 0, 2, True)
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'] ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'] Number of rows: 7197 Number of columns 16 ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up'] Number of rows: 10841 Number of columns 13
## Deleting wrong data: index 10472 is missing "category" data
# The row at this index has one field fewer than the header.
bad_row_index = 10472
print(len(googlestore_header))
print(len(googlestore[1]))
print(len(googlestore[bad_row_index]))
print(len(googlestore))
# Delete only while the row at that index is still malformed. Once it is gone,
# a well-formed row takes its place, so re-running this cell removes nothing
# further and the delete no longer has to be commented out by hand.
if len(googlestore[bad_row_index]) != len(googlestore_header):
    del googlestore[bad_row_index]
print(len(googlestore))
13 13 12 10841 10841
## Removing duplicate entries: part I
# The discussion section of the data source reports that the Google Play Store dataset has duplicate entries. For example, there are 3 Twitter entries.
# Show every Google Play row whose app name (first field) is exactly 'Twitter'.
for entry in googlestore:
    if entry[0] == 'Twitter':
        print(entry)
# Split app names into first occurrences (uniq_app) and repeats (dup_app).
dup_app = []
uniq_app = []
# Set mirror of uniq_app: membership tests against it are constant time,
# whereas testing against the list rescans it for every row.
seen_names = set()
for app in googlestore:
    name = app[0]
    if name in seen_names:
        dup_app.append(name)
    else:
        seen_names.add(name)
        uniq_app.append(name)
print(len(dup_app))
print(len(uniq_app))
print('number of duplicate apps:', len(dup_app))
print('examples of duplicate apps:', dup_app[:10])
# The most recent entry of each app will be kept, while the other redundant entries of the same app will be removed. The entry with the highest number of reviews (index 3) is considered the most recent.
['Twitter', 'NEWS_AND_MAGAZINES', '4.3', '11667403', 'Varies with device', '500,000,000+', 'Free', '0', 'Mature 17+', 'News & Magazines', 'August 6, 2018', 'Varies with device', 'Varies with device'] ['Twitter', 'NEWS_AND_MAGAZINES', '4.3', '11667403', 'Varies with device', '500,000,000+', 'Free', '0', 'Mature 17+', 'News & Magazines', 'August 6, 2018', 'Varies with device', 'Varies with device'] ['Twitter', 'NEWS_AND_MAGAZINES', '4.3', '11657972', 'Varies with device', '500,000,000+', 'Free', '0', 'Mature 17+', 'News & Magazines', 'July 30, 2018', 'Varies with device', 'Varies with device'] 1181 9660 number of duplicate apps: 1181 examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']
## Removing duplicate entries: part II
# 2 steps to remove duplicates:
# 1. create a dictionary where each dictionary key is a unique app name and the corresponding dictionary value is the highest number of reviews of that app.
# 2. use the information stored in the dictionary and create a new dataset, this dataset will have only 1 entry per app
# Map each app name to the highest review count found among its entries.
reviews_max = {}
expected_columns = len(googlestore_header)
for app in googlestore:
    # A row with a missing field has its later columns shifted, so index 3
    # would not hold the review count; skip it rather than fail in float().
    if len(app) != expected_columns:
        continue
    name = app[0]
    n_reviews = float(app[3])
    # Store the count under the app's key. Rebinding reviews_max itself to the
    # number would throw the dictionary away and make the next membership test
    # raise "TypeError: argument of type 'float' is not iterable".
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
print(len(reviews_max))
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-4-fa30a760993a> in <module> 11 n_reviews = float(app[3]) 12 ---> 13 if name in reviews_max and reviews_max[name] < n_reviews: 14 reviews_max = n_reviews 15 elif name not in reviews_max: TypeError: argument of type 'float' is not iterable
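## I copied the code below from the solution and only changed android to googlestore, and it worked. My first attempt failed because the line `reviews_max = n_reviews` rebinds the whole dictionary to a float instead of storing the value under `reviews_max[name]`, so the next `name in reviews_max` check raises the TypeError shown above.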
# reviews_max = {}
# for app in googlestore:
# name = app[0]
# n_reviews = float(app[3])
# if name in reviews_max and reviews_max[name] < n_reviews:
# reviews_max[name] = n_reviews
# elif name not in reviews_max:
# reviews_max[name] = n_reviews
# print(len(reviews_max))