#!/usr/bin/env python
# coding: utf-8

# # Profitable App Profiles for the App Store and Google Play Markets
#
# We'll pretend we're working as data analysts for a company that builds
# Android and iOS mobile apps.
# The goal of this project
# TODO: the sentence above is truncated in the original notebook; complete it.

from csv import reader


def load_dataset(path, encoding='utf-8'):
    """Read a CSV file and split it into a header row and data rows.

    Args:
        path: Location of the CSV file.
        encoding: Text encoding used to decode the file.
            NOTE(review): assumes the data files are UTF-8 -- confirm.

    Returns:
        A ``(header, rows)`` tuple. ``header`` is the first row and ``rows``
        is a list of the remaining rows (each row is a list of strings).
        Both are empty lists when the file contains no rows.
    """
    # newline='' lets the csv module handle line endings itself; the
    # ``with`` block guarantees the file handle is closed.
    with open(path, encoding=encoding, newline='') as csv_file:
        all_rows = list(reader(csv_file))
    if not all_rows:
        return [], []
    return all_rows[0], all_rows[1:]


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]``, each followed by a blank line.

    Args:
        dataset: List of rows (each row is a list).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the number of rows and the
            number of columns (taken from the first row; 0 if there is none).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # Guard against an empty dataset, where dataset[0] does not exist.
        print('Number of columns', len(dataset[0]) if dataset else 0)


def drop_malformed_rows(dataset, n_columns):
    """Return a new list with only the rows that have ``n_columns`` fields.

    Unlike deleting a row by its index, this can be run any number of times
    without removing valid data.
    """
    return [row for row in dataset if len(row) == n_columns]


def find_duplicates(dataset, name_index=0):
    """Split the app names of ``dataset`` into repeats and first occurrences.

    Args:
        dataset: List of rows.
        name_index: Column that holds the app name.

    Returns:
        A ``(duplicate_names, unique_names)`` tuple of lists. A name that
        appears k times contributes one entry to ``unique_names`` and k - 1
        entries to ``duplicate_names``; both lists keep dataset order.
    """
    seen = set()  # O(1) membership test instead of scanning a list
    duplicate_names = []
    unique_names = []
    for row in dataset:
        name = row[name_index]
        if name in seen:
            duplicate_names.append(name)
        else:
            seen.add(name)
            unique_names.append(name)
    return duplicate_names, unique_names


def max_reviews_by_app(dataset, name_index=0, reviews_index=3):
    """Map each app name to the highest review count among its rows.

    Args:
        dataset: List of rows.
        name_index: Column that holds the app name.
        reviews_index: Column that holds the number of reviews.

    Returns:
        A dict ``{app_name: highest_review_count_as_float}``.

    Raises:
        ValueError: If a review count cannot be converted to ``float``.
    """
    highest = {}
    for row in dataset:
        name = row[name_index]
        n_reviews = float(row[reviews_index])
        # Store the value under the app's key. The original code rebound the
        # whole dictionary to a float here (``reviews_max = n_reviews``).
        if name not in highest or highest[name] < n_reviews:
            highest[name] = n_reviews
    return highest


if __name__ == '__main__':
    # In[1]:

    ## Open and explore the data
    # Open the two datasets and save both as lists of lists.
    appstore_header, appstore = load_dataset('AppleStore.csv')
    googlestore_header, googlestore = load_dataset('googleplaystore.csv')

    ## Explore both datasets
    print(appstore_header)
    print('\n')
    explore_data(appstore, 0, 2, True)
    print('\n')
    print(googlestore_header)
    print('\n')
    explore_data(googlestore, 0, 2, True)

    # In[2]:

    ## Deleting wrong data
    # The notebook identified the row at index 10472 as missing its
    # "category" value. Instead of deleting by a hard-coded index (which is
    # not safe to run twice), report and drop every row whose column count
    # differs from the header's.
    expected_columns = len(googlestore_header)
    print(expected_columns)
    for index, row in enumerate(googlestore):
        if len(row) != expected_columns:
            print('malformed row at index', index, 'has', len(row), 'columns')
    print(len(googlestore))
    googlestore = drop_malformed_rows(googlestore, expected_columns)
    print(len(googlestore))

    # In[3]:

    ## Removing duplicate entries: part I
    # According to the data source's discussion section, the Google Play
    # Store dataset has duplicate entries. For example, there are 3 Twitter
    # entries.
    for app in googlestore:
        name = app[0]
        if name == 'Twitter':
            print(app)

    dup_app, uniq_app = find_duplicates(googlestore)
    print(len(dup_app))
    print(len(uniq_app))
    print('number of duplicate apps:', len(dup_app))
    print('examples of duplicate apps:', dup_app[:10])
    # The most recent entry of each app will be kept, while the other,
    # redundant entries of the same app will be removed. The entry with the
    # highest number of reviews (index 3) is considered the most recent one.

    # In[4]:

    ## Removing duplicate entries: part II
    # Two steps to remove duplicates:
    # 1. Create a dictionary where each key is a unique app name and the
    #    corresponding value is the highest number of reviews of that app.
    # 2. Use the information stored in the dictionary to create a new
    #    dataset that has only one entry per app.
    reviews_max = max_reviews_by_app(googlestore)
    print(len(reviews_max))

    # In[ ]:

    ## Why the first attempt at this cell failed: it assigned
    ## ``reviews_max = n_reviews`` instead of
    ## ``reviews_max[name] = n_reviews``, which replaced the dictionary with
    ## a float, so the next ``name in reviews_max`` test raised TypeError.