#!/usr/bin/env python
# coding: utf-8

# # GUIDED PROJECT APP STORE
#
# In this project I will analyze the data to help developers understand the
# most appealing apps.

# In[1]:

from csv import reader

# Load both store exports. The first CSV row is the header; the remaining rows
# are the app records. The `with` blocks close each file as soon as its rows
# are in memory, and newline='' lets the csv module handle line endings itself.
with open('AppleStore.csv', encoding='utf8', newline='') as Apple_store:
    data_apple = reader(Apple_store)
    list_apple = list(data_apple)
apple_header = list_apple[0]
Apple = list_apple[1:]

with open('googleplaystore.csv', encoding='utf8', newline='') as Google_Store:
    data_google = reader(Google_Store)
    list_google = list(data_google)
google_header = list_google[0]
Android = list_google[1:]


# In[2]:

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]``, each followed by a blank gap.

    Args:
        dataset: Sequence of rows (each row itself a sequence).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the number of rows and the
            number of columns (taken from the first row; 0 when the dataset
            is empty).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # Guard the first-row lookup so an empty dataset does not raise.
        n_columns = len(dataset[0]) if dataset else 0
        print('Number of columns:', n_columns)


print(apple_header)
print('\n')
explore_data(Apple, 0, 3, True)


# In[3]:

print(google_header)
print('\n')
explore_data(Android, 0, 3, True)


# In[6]:

# Show every row whose column count disagrees with the header, then the row
# count before any clean-up.
length_header = len(google_header)
for row in Android:
    if len(row) != length_header:
        print(row)
print(len(Android))


# In[4]:

# Drop the malformed row(s) reported above. Filtering on the column count,
# instead of deleting a hard-coded position, means that re-running this step
# can never remove a valid row.
Android = [row for row in Android if len(row) == length_header]


# In[5]:

print(len(Android))


# The Google Play store has duplicates that we are trying to find.

# In[7]:

duplicate_name = []
unique_name = []
seen_names = set()  # mirrors unique_name so membership tests are O(1)
for row in Android:
    name = row[0]
    if name in seen_names:
        duplicate_name.append(name)
    else:
        seen_names.add(name)
        unique_name.append(name)

number_duplicate = len(duplicate_name)
print(number_duplicate)


# We will not remove duplicates randomly.
# Instead, we will remove all duplicates except the entry with the highest
# number of reviews.

# In[8]:

# Map each app name to the largest review count seen for it.
reviews_max = {}
for row in Android:
    name = row[0]
    n_reviews = float(row[3])
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

print(len(reviews_max))


# In[9]:

# Keep one row per app: the first row whose review count equals the maximum.
android_clean = []
already_added = []
added_names = set()  # mirrors already_added so membership tests are O(1)
for row in Android:
    name = row[0]
    n_reviews = float(row[3])
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(row)
        already_added.append(name)
        added_names.add(name)

print(len(android_clean))


# In[10]:

def is_english(string):
    """Return True only when every character of ``string`` is ASCII."""
    for letter in string:
        if ord(letter) > 127:
            return False
    return True


print(is_english('Instagram'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))


# Our function is not perfect yet because it does not accept special
# characters such as emoji. We will modify it so that names containing a few
# of those characters are still recognized.

# In[11]:

def is_english(string, max_non_ascii=3):
    """Return True when ``string`` has at most ``max_non_ascii`` non-ASCII characters.

    Args:
        string: The text to check (here, an app name).
        max_non_ascii: Largest number of characters with a code point above
            127 that is still accepted. Defaults to 3.

    Returns:
        bool: False as soon as the count exceeds ``max_non_ascii``, else True.
    """
    non_ascii_count = 0
    for letter in string:
        if ord(letter) > 127:
            non_ascii_count += 1
    return non_ascii_count <= max_non_ascii


print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))


# In[12]:

# The app name is read from column 0 of the Android rows and column 1 of the
# Apple rows.
android_english = [app for app in android_clean if is_english(app[0])]
apple_english = [app for app in Apple if is_english(app[1])]

explore_data(android_english, 0, 3, True)
print('\n')
explore_data(apple_english, 0, 3, True)


# In[13]:

# Keep only the free apps. The two files spell a zero price differently
# ('0' versus '0.0'), so each store is compared against its own literal.
free_android = [app for app in android_english if app[7] == '0']
free_apple = [app for app in apple_english if app[4] == '0.0']

print(len(free_android))
print(len(free_apple))


# We are trying to find an app idea.
# For that, we build a minimal version on Google Play and, if the response
# from users is good, we develop it further. If it is profitable, we build an
# iOS version of it after 6 months.
#
# In order to generate frequency tables and find out the most common genres,
# we will use the genre columns of both stores.

# In[15]:

def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: Iterable of rows.
        index: Position of the column to tabulate within each row.

    Returns:
        dict: Maps each distinct value to its share of the rows, as a
        percentage, in first-seen order.
    """
    counts = {}
    n_rows = 0
    for record in dataset:
        n_rows += 1
        cell = record[index]
        counts[cell] = counts.get(cell, 0) + 1

    return {cell: (count / n_rows) * 100 for cell, count in counts.items()}


def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    Args:
        dataset: Iterable of rows.
        index: Position of the column to tabulate within each row.
    """
    shares = freq_table(dataset, index)
    # Put the percentage first in each pair so sorting orders by it.
    ranked = [(share, label) for label, share in shares.items()]
    ranked.sort(reverse=True)
    for share, label in ranked:
        print(label, ':', share)


# We will display the prime genre frequency table.

# In[16]:

display_table(free_apple, 11)


# We will use the display_table function to display the Genres frequency
# table.

# In[17]:

display_table(free_android, 9)


# We will then display the Category frequency table.

# In[18]:

display_table(free_android, 1)


# 1 - The most common genre is Games. The runner-up is Entertainment.
# - There are a lot of apps on education, and Photo and Video is the 3rd most
#   common genre on the Apple store.
# - Most apps are designed for entertainment.
# - No, I cannot recommend an app profile based on these data because they
#   represent just a sample of the App Store. Moreover, a particular genre
#   being more common does not mean a lot of people are using it.
#
# 2 - The most common genres are Tools, Entertainment, Education, Business and
# Productivity.
# - There are more genres than in the App Store. Also, apps that are supposed
#   to be popular are not the most common (social, gaming genres...).
# No, I cannot recommend an app profile based on this sample because it
# represents only a small part of the store.

# In[19]:

def _average_by_group(dataset, group_index, value_of):
    """Average ``value_of(row)`` per distinct value of one column, in one pass.

    Args:
        dataset: Iterable of rows.
        group_index: Position of the grouping column within each row.
        value_of: Callable that takes a row and returns the number to average.

    Returns:
        dict: Maps each group to the mean of its rows' values, in first-seen
        order.
    """
    totals = {}
    counts = {}
    for row in dataset:
        group = row[group_index]
        totals[group] = totals.get(group, 0) + value_of(row)
        counts[group] = counts.get(group, 0) + 1
    return {group: totals[group] / counts[group] for group in totals}


def _parse_installs(text):
    """Convert an install count such as '1,000+' to a float."""
    return float(text.replace('+', '').replace(',', ''))


# Average of column 5 for each Apple genre (grouping column -5).
# NOTE(review): column 5 is presumably the total rating count and column -5
# the same prime-genre column as index 11 used earlier -- confirm against
# apple_header.
prime_genre = freq_table(free_apple, -5)
genre_averages = _average_by_group(
    free_apple, -5, lambda app: float(app[5])
)
for genre in prime_genre:
    print(genre, ':', genre_averages[genre])


# Based on the data, I would recommend building a social networking or a
# navigation app because they are the most downloaded (they have the most
# user ratings).

# In[26]:

# Average install count (column 5, with '+' and ',' stripped) for each
# Android category (column 1).
Category = freq_table(free_android, 1)
install_averages = _average_by_group(
    free_android, 1, lambda app: _parse_installs(app[5])
)
for category in Category:
    print(category, ':', install_averages[category])


# I would recommend building a social networking, travelling or entertainment
# app.