Title: Profitable App Profiles for the App Store and Google Play Markets
Introduction: The goal of this project is to analyse how many users our apps have and which kinds of apps attract the most users on Google Play and the App Store.
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows; it is sliced, and ``len()`` is taken of
            it and of its first row.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the number of rows and
            columns of the whole dataset (not just the printed slice).
    """
    for row in dataset[start:end]:
        print(row)
        # print() appends its own newline, so this separates rows visually.
        print('\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # The column count is read from the first row. An empty dataset has
        # no first row, so report 0 instead of raising IndexError.
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)
from csv import reader
# Load the App Store data set. The `with` block closes the file as soon as
# all rows have been read into memory; newline='' lets the csv module handle
# line endings itself, including newlines embedded in quoted fields.
# NOTE(review): no encoding is given, so the platform default is used --
# pass encoding='utf8' here if the CSV files are UTF-8 (confirm).
with open('AppleStore.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    apps_data = list(read_file)
appsapple_header = apps_data[0]  # first row holds the column names
appsapple_data = apps_data[1:]   # remaining rows are the app records

# Load the Google Play data set the same way.
with open('googleplaystore.csv', newline='') as opened_file2:
    read_file2 = reader(opened_file2)
    appsg_data = list(read_file2)
appsgoogle_header = appsg_data[0]  # first row holds the column names
appsgoogle_data = appsg_data[1:]   # remaining rows are the app records
#print(appsgoogle_header)
#print(appsgoogle_data)
#print(appsgoogle_header)
#print('\n')
#explore_data(appsapple_data, 0, 2, True)
#print(appsgoogle_header)
#print('\n')
#explore_data(appsgoogle_data, 0, 2, True)
#print(appsgoogle_data[10472])
#print(len(appsgoogle_data))
#del appsgoogle_data[10472] # don't run this more than once
#print(len(appsgoogle_data))
The Google Play data set contains duplicate entries. For example, Instagram appears more than once:
# Show every Google Play row whose app name (first column) is Instagram,
# to make the duplicate entries visible.
instagram_rows = [row for row in appsgoogle_data if row[0] == 'Instagram']
for row in instagram_rows:
    print(row)
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'] ['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'] ['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'] ['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
# Split the app names (first column) into first occurrences and repeats.
duplicate_apps = []  # one entry per repeated occurrence of a name
unique_apps = []     # each distinct name, in order of first appearance
# Mirrors unique_apps so membership is a constant-time set lookup instead
# of a scan of the whole list on every row.
seen_names = set()

for app in appsgoogle_data:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
Number of duplicate apps: 1181
We will remove the duplicates, keeping for each app only the record with the highest number of reviews, since that is probably the most recent one.
# Map each app name to the highest review count found among its rows.
reviews_max = {}

for app in appsgoogle_data:
    name = app[0]
    try:
        n_reviews = float(app[3])
    except ValueError:
        # A row whose fourth field is not numeric (e.g. '3.0M') cannot be
        # compared by review count; report it and move on instead of
        # aborting the whole pass.
        print('Skipping row with non-numeric review count:', app)
        continue

    # Record the first count seen for a name, then only larger ones.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
#print(reviews_max)
ValueErrorTraceback (most recent call last) <ipython-input-9-23c3ea3ce216> in <module>() 3 for app in appsgoogle_data: 4 name = app[0] ----> 5 n_reviews = float(app[3]) 6 7 if name in reviews_max and reviews_max[name] < n_reviews: ValueError: could not convert string to float: '3.0M'