Our aim in this project is to find mobile app profiles that are profitable for the App Store and Google Play markets. We're working as data analysts for a company that builds Android and iOS mobile apps, and our job is to enable our team of developers to make data-driven decisions with respect to the kind of apps they build.
At our company, we only build apps that are free to download and install, and our main source of revenue consists of in-app ads. This means that our revenue for any given app is mostly influenced by the number of users that use our app. Our goal for this project is to analyze data to help our developers understand what kinds of apps are likely to attract more users.
from csv import reader
### The Google Play data set ###
# Read the whole CSV into memory; the `with` block guarantees the file
# handle is closed as soon as the rows have been materialised.
with open('googleplaystore.csv') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
android_header = android[0]  # first row holds the column names
android = android[1:]        # remaining rows are the app records
### The App Store data set ###
# Read the whole CSV into memory; the `with` block guarantees the file
# handle is closed as soon as the rows have been materialised.
with open('AppleStore.csv') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]  # first row holds the column names
ios = ios[1:]        # remaining rows are the app records
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the data set's size.

    Args:
        dataset: A list of rows, where each row is itself a list.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the number of rows and the
            number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line between rows

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # The column count comes from the first row; an empty data set has
        # no first row, so report 0 instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)
# Show the column names, leave a gap, then preview the first three
# Google Play rows together with the data set's dimensions.
print(android_header, end='\n\n\n')
explore_data(android, start=0, end=3, rows_and_columns=True)
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up'] ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] Number of rows: 10841 Number of columns: 13
# Row counts of the Google Play and App Store data sets, one per line.
print(len(android), len(ios), sep='\n')
# Incorrect data: display the suspicious Google Play row at index 10472.
suspect_row = android[10472]
print(suspect_row)
10841 7197 ['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
# Remove the row with wrong data.
# The deletion is guarded: it only happens while the row at this index
# still has a different number of fields than the header. Re-running this
# code therefore no longer deletes a valid row.
BAD_ROW_INDEX = 10472
if (len(android) > BAD_ROW_INDEX
        and len(android[BAD_ROW_INDEX]) != len(android_header)):
    del android[BAD_ROW_INDEX]
# check total rows again
print(len(android))
10840
# Print every row whose app name is exactly 'Facebook' to see how many
# entries the data set holds for a single app.
for app in android:
    name = app[0]
    if name != 'Facebook':
        continue
    print(app)
['Facebook', 'SOCIAL', '4.1', '78158306', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'August 3, 2018', 'Varies with device', 'Varies with device'] ['Facebook', 'SOCIAL', '4.1', '78128208', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'August 3, 2018', 'Varies with device', 'Varies with device']
# Duplicates in the android data set.
duplicates = []     # one entry for every repeated occurrence of a name
unique = []         # each name once, in first-seen order
seen_names = set()  # O(1) membership test; `name in unique` scans the whole list per row
for app in android:
    name = app[0]
    if name in seen_names:
        duplicates.append(name)
    else:
        seen_names.add(name)
        unique.append(name)
print(len(duplicates))
print(len(unique))
print('Number of duplicate apps:', len(duplicates))
print('\n')
print('Examples of duplicate apps:', duplicates[:15])
1181 9659 Number of duplicate apps: 1181 Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']
# Dictionary used to remove duplicate entries: maps each app name to the
# highest review count found among its rows.
# my code
review_max = {}
for app in android:
    name = app[0]
    # Read the review count from the row currently being iterated. The
    # loop variable and the indexed variable must be the same name;
    # otherwise a stale row left over from an earlier loop is used for
    # every app and all names end up with the same count.
    n_reviews = float(app[3])
    if name not in review_max or review_max[name] < n_reviews:
        review_max[name] = n_reviews
#solution code
# reviews_max = {}
# for app in android:
# name = app[0]
# n_reviews = float(app[3])
# if name in reviews_max and reviews_max[name] < n_reviews:
# reviews_max[name] = n_reviews
# elif name not in reviews_max:
# reviews_max[name] = n_reviews
# Check length: the dictionary should hold one entry per distinct app name,
# i.e. the row count minus the number of duplicate occurrences found above
# (computed from `duplicates` rather than a hard-coded count).
print('Expected length:', len(android) - len(duplicates))
print('Actual length:', len(review_max))
Expected length: 9659 Actual length: 9659
#removing duplicate rows
#my code
# android_clean = []
# already_added = []
# for app in android:
# name = app[0]
# n_reviews = float(app[3])
# if (reviews_max[name] == n_reviews) and (name not in already_added):
# android_clean.append(app)
# already_added.append(name)
# print(len(android_clean))
#solution code
# Keep one row per app: the row whose review count equals the maximum
# recorded for that name in `review_max`.
android_clean = []
# Names already copied into android_clean. A set gives O(1) membership
# tests; it stops a name being added twice when more than one of its rows
# carries the maximum review count.
already_added = set()
for app in android:
    name = app[0]
    n_reviews = float(app[3])
    if review_max[name] == n_reviews and name not in already_added:
        android_clean.append(app)
        already_added.add(name)  # make sure this is inside the if block
print(len(android_clean))
1
# Preview the first three de-duplicated rows and report the cleaned size.
explore_data(android_clean, start=0, end=3, rows_and_columns=True)
['iHoroscope - 2018 Daily Horoscope & Astrology', 'LIFESTYLE', '4.5', '398307', '19M', '10,000,000+', 'Free', '0', 'Everyone', 'Lifestyle', 'July 25, 2018', 'Varies with device', 'Varies with device'] Number of rows: 1 Number of columns: 13