#!/usr/bin/env python
# coding: utf-8

# Title: my app
#
# Intro: Our aim is to help our developers understand what type of apps are
# likely to attract more users on Google Play and the App Store. We'll need to
# collect and analyze data about mobile apps available on Google Play and the
# App Store.

from collections import Counter
from csv import reader


def _read_csv(path):
    """Return every row of the CSV file at *path* as a list of lists.

    The file is opened in a ``with`` block so the handle is closed even if
    parsing fails.
    """
    # NOTE(review): assumes the data files are UTF-8 encoded (the app names
    # tested further down contain CJK characters and emoji) -- confirm.
    # newline='' is what the csv module asks for so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(path, encoding='utf-8', newline='') as opened_file:
        return list(reader(opened_file))


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]``, each followed by a blank gap.

    Args:
        dataset: list of rows (each row a list), without the header row.
        start: index of the first row to print.
        end: index one past the last row to print.
        rows_and_columns: when true, also print the number of rows and the
            number of columns (taken from the first row; 0 if there are no
            rows at all).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')

    if rows_and_columns:
        print('number of rows:', len(dataset))
        # Guard the empty case: dataset[0] would raise IndexError.
        print('number of columns:', len(dataset[0]) if dataset else 0)


def check_english(string):
    """Return True only if every character of *string* is ASCII (<= 127)."""
    return all(ord(character) <= 127 for character in string)


def checking_english(string, max_non_ascii=3):
    """Return True if *string* has at most *max_non_ascii* non-ASCII chars.

    This is the lenient variant of ``check_english``: names such as
    'Docs To Go™ Free Office Suite' or 'Instachat 😜' contain a few
    characters above code point 127 but should still count as English.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= max_non_ascii


def freq_table(dataset, index):
    """Return ``{value: percentage}`` for column *index* of *dataset*.

    Percentages are relative to the number of rows and keys appear in
    first-seen order. An empty dataset yields an empty dict.
    """
    counts = Counter(row[index] for row in dataset)
    number = sum(counts.values())
    return {key: (count / number) * 100 for key, count in counts.items()}


def display_table(dataset, index):
    """Print the frequency table of column *index*, highest share first."""
    table = freq_table(dataset, index)
    # Sort (percentage, key) tuples so ties are broken by key, descending.
    table_sorted = sorted(((table[key], key) for key in table), reverse=True)
    for percentage, key in table_sorted:
        print(key, ':', percentage)


def _malformed_row_indexes(rows, n_columns):
    """Return the indexes of rows whose length differs from *n_columns*."""
    return [i for i, row in enumerate(rows) if len(row) != n_columns]


def _duplicate_names(rows):
    """Return the app name (column 0) of every repeated row, in order."""
    seen = set()  # set membership keeps this linear in the number of rows
    duplicates = []
    for row in rows:
        appname = row[0]
        if appname in seen:
            duplicates.append(appname)
        else:
            seen.add(appname)
    return duplicates


def _max_reviews_by_name(rows):
    """Return ``{app name: highest review count (column 3) seen}``."""
    reviews_max = {}
    for row in rows:
        name = row[0]
        n_reviews = float(row[3])
        if name not in reviews_max or reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews
    return reviews_max


def _keep_most_reviewed(rows, reviews_max):
    """Return one row per app: the first one carrying its max review count."""
    kept = []
    already_added = set()
    for row in rows:
        name = row[0]
        n_reviews = float(row[3])
        # The already_added check handles duplicates that tie on the maximum.
        if n_reviews == reviews_max[name] and name not in already_added:
            kept.append(row)
            already_added.add(name)
    return kept


def _average_by_group(dataset, key_index, value_of):
    """Return ``{group: mean of value_of(row)}`` grouped by column *key_index*.

    Groups appear in first-seen order; the whole dataset is scanned once.
    """
    totals = {}
    counts = {}
    for row in dataset:
        key = row[key_index]
        totals[key] = totals.get(key, 0) + value_of(row)
        counts[key] = counts.get(key, 0) + 1
    return {key: totals[key] / counts[key] for key in totals}


def _parse_installs(text):
    """Convert an install count such as '1,000,000+' to a float."""
    return float(text.replace('+', '').replace(',', ''))


def main():
    """Run the analysis cell by cell, printing the same reports as the notebook."""
    # In[1]:
    applestore = _read_csv('AppleStore.csv')
    apple = applestore[1:]
    appleheader = applestore[0]
    print(appleheader)

    googleplay = _read_csv('googleplaystore.csv')
    google = googleplay[1:]
    googleheader = googleplay[0]
    print(googleheader)

    # In[2]:
    explore_data(apple, 0, 3, rows_and_columns=True)
    explore_data(google, 0, 3, rows_and_columns=True)

    # In[3]:
    # The notebook printed google[10472] here; the row is now located by its
    # wrong length instead of a hard-coded position.
    malformed = _malformed_row_indexes(google, len(googleheader))
    if malformed:
        print(google[malformed[0]])

    # In[4]:
    for position in malformed:
        print(google[position])
        print(position)

    # In[5]:
    # Delete from the end so the earlier indexes stay valid.
    for position in reversed(malformed):
        del google[position]

    # In[6]:
    duplicategoogleapp = _duplicate_names(google)
    print(duplicategoogleapp[1:5])

    # In[7]:
    print(len(duplicategoogleapp))

    # We don't want to count certain apps more than once when we analyze data,
    # so we need to remove the duplicate entries and keep only one entry per
    # app.
    #
    # From the rows we printed for the Instagram app, the main difference
    # happens on the fourth position of each row, which corresponds to the
    # number of reviews. The different numbers show the data was collected at
    # different times. Rather than removing duplicates randomly, we'll only
    # keep the row with the highest number of reviews and remove the other
    # entries for any given app.

    # In[8]:
    reviews_max = _max_reviews_by_name(google)
    print(len(reviews_max))

    # In[9]:
    # Computed from the data rather than the hard-coded count of duplicates.
    print('Expected length:', len(google) - len(duplicategoogleapp))

    # In[10]:
    android_clean = _keep_most_reviewed(google, reviews_max)
    print(len(android_clean))

    # In[11]:
    print(check_english('Instagram'))
    print(check_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
    print(check_english('Docs To Go™ Free Office Suite'))
    print(check_english('Instachat 😜'))

    # In[12]:
    print(checking_english('Instagram'))
    print(checking_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
    print(checking_english('Docs To Go™ Free Office Suite'))
    print(checking_english('Instachat 😜'))

    # In[13]:
    google_en = [app for app in android_clean if checking_english(app[0])]
    apple_en = [app for app in apple if checking_english(app[1])]

    # explore_data prints its report and returns None, so it is called
    # directly instead of being wrapped in print().
    explore_data(google_en, 0, 3, rows_and_columns=True)
    print('\n')
    explore_data(apple_en, 0, 3, rows_and_columns=True)

    # In[14]:
    # Prices are compared as the strings used for free apps in each file.
    free_google_en = [app for app in google_en if app[7] == '0']
    free_apple_en = [app for app in apple_en if app[4] == '0.0']
    print(len(free_google_en))
    print(len(free_apple_en))

    # Our validation strategy for an app idea is comprised of three steps:
    #
    # 1. Build a minimal Android version of the app, and add it to Google Play.
    # 2. If the app has a good response from users, we develop it further.
    # 3. If the app is profitable after six months, we build an iOS version of
    #    the app and add it to the App Store.
    #
    # Because our end goal is to add the app on both Google Play and the App
    # Store, we need to find app profiles that are successful on both markets.

    # In[15]: (freq_table and display_table are defined at module level)

    # In[16]:
    display_table(free_apple_en, -5)

    # Games is the most common genre, and Entertainment the runner-up.
    # The top genres are for fun. Most of the apps in the App Store are
    # designed for entertainment.

    # In[17]:
    display_table(free_google_en, 1)

    # In[18]:
    display_table(free_google_en, -4)

    # In[20]:
    apple_ratings = _average_by_group(
        free_apple_en, -5, lambda app: float(app[5]))
    for genre, average_n_rating in apple_ratings.items():
        print(genre, ",", average_n_rating)

    # In[21]:
    google_installs = _average_by_group(
        free_google_en, 1, lambda app: _parse_installs(app[5]))
    for category, average_n_installs in google_installs.items():
        print(category, ",", average_n_installs)

    # In[ ]:


if __name__ == "__main__":
    main()