#!/usr/bin/env python
# coding: utf-8

# **Profitable App Profiles for both App Store and Google Play Markets**

# *This project is to find mobile app profiles that are profitable for both App Store and Google Play markets.*

# In[1]:


from csv import reader

### The Google Play data set ###
# Each file is read inside a context manager so its handle is closed as soon
# as the rows are in memory. newline='' is what the csv module asks for, and
# the encoding is pinned so the result does not depend on the platform default
# (assumes both CSV files are UTF-8 encoded -- TODO confirm).
with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    android = list(read_file)
android_header = android[0]  # first row holds the column names
android = android[1:]        # remaining rows are the app records

### The App Store data set ###
with open('AppleStore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    ios = list(read_file)
ios_header = ios[0]  # first row holds the column names
ios = ios[1:]        # remaining rows are the app records


# Define a function explore_data() that can be used repeatedly to explore rows in a more readable way and also add an option for our function to show the number of rows and columns for any data set

# In[2]:


def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set and, optionally, its dimensions.

    Parameters:
        dataset: a list of rows, each row itself a list.
        start, end: bounds of the slice ``dataset[start:end]`` to print.
        rows_and_columns: when true, also print the number of rows and the
            number of columns (the length of the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # extra blank lines keep consecutive rows visually apart

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure, so report 0 columns
        # instead of failing with an IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)

# Column names of the Google Play data, then the first three rows and the
# table dimensions.
print(android_header, '\n', sep='\n')
explore_data(android, 0, 3, rows_and_columns=True)


# In[3]:


# Column names of the App Store data, then the first three rows and the
# table dimensions.
print(ios_header, '\n', sep='\n')
explore_data(ios, 0, 3, rows_and_columns=True)


# **Deleting wrong row**

# In[4]:


# Print row 10472 together with the header and the first row so the three
# can be compared by eye.
print(android[10472], android_header, android[0], sep='\n\n\n')


# In[5]:


# Row count before and after removing row 10472.
# NOTE: running this more than once removes a different row each time.
print(len(android))
android.pop(10472)
print(len(android))


# **Deleting Duplicate Entries**

# In[6]:


# Print every Google Play row whose app name is exactly 'Instagram'.
for row in android:
    if row[0] == 'Instagram':
        print(row)


# In[7]:


duplicate_apps = []  # one entry per surplus row (a name that was seen before)
unique_apps = []     # each app name once, in order of first appearance
seen_names = set()   # same names as unique_apps, kept for O(1) membership tests

for app in android:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:10])


# Duplicate entries need to be removed so that only one entry is kept per app. The duplicate rows could be removed at random, but there is a better alternative. In the rows printed two cells above for the Instagram app, the main difference is in the fourth position of each row, which corresponds to the number of reviews. The different numbers show that the data was collected at different times. We can use this to build a criterion for keeping rows: we keep the row with the highest number of reviews, because the higher the number of reviews, the more reliable the ratings. This can be done in two steps.
# First, create a dictionary where each key is a unique app name and the value is the highest number of reviews of that app. Second, use the dictionary to create a new data set that has only one entry per app (the entry with the highest number of reviews).

# In[8]:


reviews_max = {}  # app name -> highest review count found among its rows

# `android` no longer contains the header row (it was split off when the file
# was loaded), so every row is visited. Slicing with [1:] here would silently
# drop the first app from the analysis.
for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # Record the first count seen for a name, then only ever raise it.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews


# In[9]:


# duplicate_apps holds one entry per surplus row, so subtracting its length
# from the row count gives the number of distinct app names.
print('Expected length:', len(android) - len(duplicate_apps))
print('Actual length:', len(reviews_max))


# *We now use the reviews_max dictionary to remove the duplicates. For the duplicate cases, we'll only keep the entries with the highest number of reviews.
# We start by initializing two empty lists, android_clean and already_added.
# We loop through the android data set, and for every iteration:
# We isolate the name of the app and the number of reviews.
# We add the current row (app) to the android_clean list, and the app name (name) to the already_added list if:
# The number of reviews of the current app matches the number of reviews of that app as described in the reviews_max dictionary; and
# The name of the app is not already in the already_added list. We need to add this supplementary condition to account for those cases where the highest number of reviews of a duplicate app is the same for more than one entry (for example, the Box app has three entries, and the number of reviews is the same). If we just check for reviews_max[name] == n_reviews, we'll still end up with duplicate entries for some apps.*

# In[10]:


android_clean = []  # rows of the de-duplicated apps (full records)
already_added = []  # names of the apps already placed in android_clean
added_names = set()  # same names as already_added, for O(1) membership tests

# Iterate over all rows for the same reason as above: the header is already gone.
for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # Keep only the first row that carries the maximum review count.
    if reviews_max[name] == n_reviews and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)


# In[11]:


explore_data(android_clean, 0, 3, True)


# **Removing Non-English Apps**

# In[12]:


def definitely_english(string):
    """Return True only if every character of *string* has a code point of 127 or less."""
    return all(ord(symbol) <= 127 for symbol in string)

# Try the check on a few sample app names.
for sample_name in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(definitely_english(sample_name))


# *The function couldn't correctly identify certain English app names like 'Docs To Go™ Free Office Suite' and 'Instachat 😜'. This is because emojis and characters like ™ fall outside the ASCII range and have corresponding numbers over 127.
# To minimize the impact of data loss, we'll only remove an app if its name has more than three non-ASCII characters:*

# In[13]:


def definitely_english(string, max_non_ascii=3):
    """Return whether *string* contains few enough non-ASCII characters.

    Parameters:
        string: the text to check (for example an app name).
        max_non_ascii: the largest number of characters with a code point
            above 127 that is still accepted. Defaults to 3.

    Returns:
        False as soon as more than ``max_non_ascii`` such characters have
        been found, True otherwise.
    """
    non_ascii = 0

    for character in string:
        if ord(character) > 127:
            non_ascii += 1
            # The answer cannot change once the limit is exceeded.
            if non_ascii > max_non_ascii:
                return False

    return True

# Re-run the check on the sample names with the more tolerant rule.
for sample_name in (
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
):
    print(definitely_english(sample_name))


# In[14]:


# Keep only the rows whose app name passes definitely_english().
# The name is read from column 0 in the Google Play rows and from column 1
# in the App Store rows.
android_english = [row for row in android_clean if definitely_english(row[0])]
ios_english = [row for row in ios if definitely_english(row[1])]

explore_data(android_english, 0, 3, rows_and_columns=True)
print('\n')
explore_data(ios_english, 0, 3, rows_and_columns=True)


# **Isolating the Free Apps**

# In[15]:


# Keep the rows whose price field is the literal text for zero: '0' in
# column 7 of the Google Play rows, '0.0' in column 4 of the App Store rows.
fresh_android = [row for row in android_english if row[7] == '0']
fresh_ios = [row for row in ios_english if row[4] == '0.0']

for free_apps in (fresh_android, fresh_ios):
    print(len(free_apps))


# *As we mentioned in the introduction, our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps.*
# 
# *To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:*
# 
# *Build a minimal Android version of the app, and add it to Google Play.*
# *If the app has a good response from users, we then develop it further.*
# *If the app is profitable after six months, we also build an iOS version of the app and add it to the App Store.*
# *Because our end goal is to add the app on both the App Store and Google Play, we need to find app profiles that are successful on both markets. For instance, a profile that might work well for both markets might be a productivity app that makes use of gamification.*
# 
# *Let's begin the analysis by getting a sense of the most common genres for each market. For this, we'll build a frequency table for the prime_genre column of the App Store data set, and the Genres and Category columns of the Google Play data set.*

# *We'll build two functions we can use to analyze the frequency tables:*
# 
# *One function to generate frequency tables that show percentages*
# 
# *Another function that we can use to display the percentages in descending order*

# In[16]:


def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a data set.

    Parameters:
        dataset: an iterable of rows, each row indexable by *index*.
        index: position of the column to tabulate.

    Returns:
        A dict mapping each distinct column value to the percentage of rows
        holding it, with keys in order of first appearance.
    """
    counts = {}
    n_rows = 0

    for n_rows, row in enumerate(dataset, start=1):
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    return {value: (count / n_rows) * 100 for value, count in counts.items()}


def display_table(dataset, index):
    """Print the percentage frequency table of a column, largest share first.

    Parameters:
        dataset: rows to tabulate, passed on to freq_table().
        index: position of the column to tabulate.

    Each line is printed as ``value : percentage``. Entries with equal
    percentages are ordered by value, descending.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)


# In[17]:


display_table(dataset=fresh_ios, index=-5)  # prime_genre column


# **Frequency Table for Apple App Store prime_genre Column Analysis:**
# The most common genre for the Apple App Store is Games (58.1%).
# The next most common is Entertainment (8%),
# while the least common genre accounts for only 0.1%. Given the table above, I strongly recommend that the company focus on making apps intended for entertainment, and in particular on the Games genre. However, this table only shows the percentage of apps made for each genre, not the amount of user traffic or the number of users. It does not imply that apps of the gaming genre have a large number of users or that other app genres have a low number of users. Further study should be made of the number of reviews/ratings for each genre of apps, which can be used to judge the number of users for each genre.

# In[ ]:


# In[18]:


display_table(fresh_android, 1)  # Category column


# In[19]:


display_table(fresh_android, -4)  # presumably the Genres column -- TODO confirm against android_header


# **Frequency Table for Google Play Store Category and Genres Columns Analysis:**
# For the Google Play Store, the most common app categories in order are Family (%18.9), Games (%9.7), and Tools (%8.5) with Family being slightly more than the next 2 largest combined. The least common app categories are Beauty (%0.6) and Comics (%0.6). If we look at the secondary app genres for apps we will see that the three most common genres are Tools (%8.4), Entertainment (%6.0), Education(%5.3), and the least common genres are Arcade; Pretend Play (%0.01), and Adventure; Education (%0.01). It should be noted however that many of the genres in the genres column (containing secondary app genres for apps that accompany the primary app genres in the category column) overlap unlike those in the categories column.
# On the Google Play Store app categories related to productivity appear to have more apps than on the Apple App Store, however app categories related to entertainment appear to still have a great number of apps. I would still recommend that the company focus on apps in the Games category even if there are fewer games apps than Family apps on the Google Play Store. I base this on the fact that there is an extremely large number of games apps on the Apps Store. However, the frequency tables for the Google Play Store again do not show that the categories have a large number of users, only that those categories have a large number of apps made for them. We can use that to guess whether there is a large or small number of users however at the end of the day that is only a guess until further analysis is made.

# 

# In[20]:


genres_ios = freq_table(fresh_ios, -5)

# Accumulate the rating total and the app count of every genre in a single
# pass over the data, instead of rescanning all rows once per genre.
genre_rating_totals = {genre: 0 for genre in genres_ios}
genre_app_counts = {genre: 0 for genre in genres_ios}

for app in fresh_ios:
    genre_app = app[-5]
    n_ratings = float(app[5])
    genre_rating_totals[genre_app] += n_ratings
    genre_app_counts[genre_app] += 1

# Report in the key order of genres_ios. Every genre listed there has at
# least one row, so the division is always defined.
for genre in genres_ios:
    avg_n_ratings = genre_rating_totals[genre] / genre_app_counts[genre]
    print(genre, ':', avg_n_ratings)


# In[26]:


# Name and number of ratings of every free App Store app in the
# 'Navigation' genre.
for row in fresh_ios:
    if row[-5] != 'Navigation':
        continue
    print(row[1], ':', row[5])


# In[27]:


display_table(dataset=fresh_android, index=5)  # the Installs column


# In[29]:


categories_android = freq_table(fresh_android, 1)

# Accumulate the install total and the app count of every category in a
# single pass over the data, instead of rescanning all rows once per category.
category_install_totals = {category: 0 for category in categories_android}
category_app_counts = {category: 0 for category in categories_android}

for app in fresh_android:
    category_app = app[1]
    # Strip the ',' and '+' characters from the install figure so that
    # float() can parse it.
    n_installs = app[5].replace(',', '').replace('+', '')
    category_install_totals[category_app] += float(n_installs)
    category_app_counts[category_app] += 1

# Report in the key order of categories_android. Every category listed there
# has at least one row, so the division is always defined.
for category in categories_android:
    avg_n_installs = category_install_totals[category] / category_app_counts[category]
    print(category, ':', avg_n_installs)


# #CONCLUSION
# 
# In this project, we analyzed data about the App Store and Google Play mobile apps with the goal of recommending an app profile that can be profitable for both markets.

# In[ ]: