Notebook

Profitable App Profiles for the App Store and Google Play Markets¶

Our aim in this project is to find mobile app profiles that are profitable for the App Store and Google Play markets. We're working as data analysts for a company that builds Android and iOS mobile apps, and our job is to enable our team of developers to make data-driven decisions with respect to the kind of apps they build.

At our company, we only build apps that are free to download and install, and our main source of revenue consists of in-app ads. This means that our revenue for any given app is mostly influenced by the number of users that use our app. Our goal for this project is to analyze data to help our developers understand what kinds of apps are likely to attract more users.

In [19]:

from csv import reader

### The Google Play data set ###
opened_file = open('googleplaystore.csv')
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]

### The App Store data set ###
opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

In [20]:

def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line between rows
        
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

print(android_header)
print('\n')
explore_data(android, 0, 3, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13

In [21]:

print(len(android))
print(len(ios))
#incorrect data
print(android[10472])

10841
7197
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']

In [22]:

#remove the row with wrong data
del android[10472] #run this once only

In [23]:

#check total rows again
print(len(android))

In [24]:

#check app data and how many rows associated with the app
for app in android:
    name = app[0]
    if name == 'Facebook':
        print(app)
        

['Facebook', 'SOCIAL', '4.1', '78158306', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'August 3, 2018', 'Varies with device', 'Varies with device']
['Facebook', 'SOCIAL', '4.1', '78128208', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'August 3, 2018', 'Varies with device', 'Varies with device']

Check duplicates¶

In [25]:

#duplicates in android dataset
duplicates = []
unique = []

for app in android:
    name = app[0]
    if name not in unique:
        unique.append(name)
    else:
        duplicates.append(name)

print(len(duplicates))
print(len(unique))

print('Number of duplicate apps:', len(duplicates))
print('\n')
print('Examples of duplicate apps:', duplicates[:15])

1181
9659
Number of duplicate apps: 1181


Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']

In [26]:

#dictionary code to remove duplicate entries
#my code
review_max = {}

for apps in android:
    name = apps[0]
    n_reviews = float(app[3])

    if name in review_max and review_max[name] < n_reviews:
        review_max[name]=n_reviews

    elif name not in review_max:
        review_max[name] = n_reviews
        
        
#solution code
# reviews_max = {}

# for app in android:
#     name = app[0]
#     n_reviews = float(app[3])
    
#     if name in reviews_max and reviews_max[name] < n_reviews:
#         reviews_max[name] = n_reviews
        
#     elif name not in reviews_max:
#         reviews_max[name] = n_reviews
        

In [27]:

#check length
print('Expected length:', len(android) - 1181)
print('Actual length:', len(review_max))

Expected length: 9659
Actual length: 9659

In [28]:

#removing duplicate rows
#my code

# android_clean = []
# already_added = []

# for app in android:
#     name = app[0]
#     n_reviews = float(app[3])
    
#     if (reviews_max[name] == n_reviews) and (name not in already_added):
#         android_clean.append(app)
#         already_added.append(name) 

# print(len(android_clean))


#solution code
android_clean = []
already_added = []

for app in android:
    name = app[0]
    n_reviews = float(app[3])
    
    if (review_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name) # make sure this is inside the if block
        
print(len(android_clean))

In [29]:

explore_data(android_clean, 0, 3, True)

['iHoroscope - 2018 Daily Horoscope & Astrology', 'LIFESTYLE', '4.5', '398307', '19M', '10,000,000+', 'Free', '0', 'Everyone', 'Lifestyle', 'July 25, 2018', 'Varies with device', 'Varies with device']


Number of rows: 1
Number of columns: 13