# pandas is the only library this analysis needs.
import pandas as pd

# Load the two raw datasets: Google Play (Android) and the App Store (iOS).
android = pd.read_csv('googleplaystore (2).csv')
ios = pd.read_csv('AppleStore (1).csv')

# NOTE: the bare expressions below produce no output when run as a script;
# they look like notebook cells kept for interactive display.

# (rows, columns) of each dataset -- iOS first, then Android.
ios.shape
android.shape

# Column names, dtypes and non-null counts.
android.info()
ios.info()

# Two randomly chosen rows from each dataset.
android.sample(2)
ios.sample(2)

# Inspect the rows surrounding position 10472 ...
android.iloc[10471:10474]

# ... then remove the row labelled 10472.
android = android.drop(index=10472)

# Number of rows whose app name repeats an earlier row.
android.duplicated(subset='App').sum()
ios.duplicated(subset='track_name').sum()

# Android: how often each app name occurs, and every row for one repeated
# name (the original notes say ROBLOX appears 9 times).
android['App'].value_counts()
android.groupby('App').get_group('ROBLOX')

# iOS: same check, using 'VR Roller Coaster' as the example.
ios['track_name'].value_counts()
ios.groupby('track_name').get_group('VR Roller Coaster')

# Next: sort by review count (descending) and drop repeated app names, so the
# most-reviewed row of each app is the one that is kept.
# De-duplicate Android apps, keeping the row with the most reviews per name.
# NOTE(review): 'Reviews' may be loaded as strings (its dtype is not visible
# here). A plain sort would then be lexicographic ('99' > '1000000') and keep
# the wrong duplicate, so sort on the numeric value instead. The key is a
# no-op when the column is already numeric.
android_clean = (
    android
    .sort_values(
        'Reviews',
        ascending=False,
        key=lambda col: pd.to_numeric(col, errors='coerce'),
    )
    .drop_duplicates('App')
    .sort_index()
)
android_clean

# Same idea for iOS, ranking duplicates by total rating count.
ios_clean = (
    ios
    .sort_values('rating_count_tot', ascending=False)
    .drop_duplicates('track_name')
    .sort_index()
)
ios_clean

# Sanity check: both counts should now be zero.
android_clean.duplicated('App').sum()
ios_clean.duplicated('track_name').sum()

# Conversion to lists (one inner list per app row).
android_list = android_clean.values.tolist()
ios_list = ios_clean.values.tolist()


def is_english(string, max_non_ascii=3):
    """Return True if *string* has at most *max_non_ascii* non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.
    Allowing a few such characters keeps names that contain symbols such
    as a trademark sign or an emoji.

    Args:
        string: The text to check (here, an app name).
        max_non_ascii: Highest number of non-ASCII characters still
            accepted. Defaults to 3, the threshold used by this analysis.

    Returns:
        bool: False when the non-ASCII count exceeds *max_non_ascii*,
        True otherwise.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= max_non_ascii


print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))

# Keep only rows whose app name passes the check.
# The name is the first field of an Android row and the second of an iOS row.
android_english = [app for app in android_list if is_english(app[0])]
print('English Android Apps:', len(android_english))

ios_english = [app for app in ios_list if is_english(app[1])]
# Label fixed: this count is for the iOS list (it used to read
# 'IOS Android Apps:').
print('English iOS Apps:', len(ios_english))

# Rebuild DataFrames from the filtered rows.
android_data = pd.DataFrame(
    android_english,
    columns=[
        'App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
        'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
        'Android Ver',
    ],
)
ios_data = pd.DataFrame(
    ios_english,
    columns=[
        'id', 'track_name', 'size_bytes', 'currency', 'price',
        'rating_count_tot', 'rating_count_ver', 'user_rating',
        'user_rating_ver', 'ver', 'cont_rating', 'prime_genre',
        'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic',
    ],
)

# Free Android apps: 'Price' is compared as the string '0'.
# .copy() makes this an independent frame, so later column assignments do
# not write into a slice of android_data.
android_free = android_data.loc[android_data['Price'] == '0'].copy()
len(android_free)

# Free iOS apps: 'price' is compared as the number 0.0.
ios_free = ios_data.loc[ios_data['price'] == 0.0].copy()
len(ios_free)

# Exploring the "Category" column with a frequency table.
# Share (in percent) of free Android apps per category, largest first.
freq = pd.crosstab(index=android_free['Category'], columns='count')
freq = freq / len(android_free) * 100
freq = freq.sort_values(by=['count'], ascending=False)
freq

import warnings

# NOTE(review): this silences every warning for the rest of the run. It is
# kept so downstream behaviour is unchanged, but the Installs conversion
# below no longer needs it.
warnings.filterwarnings('ignore')

# Work on an explicit copy so the column assignment does not write into a
# filtered slice of another DataFrame.
android_free = android_free.copy()

# Turn install counts such as '1,000,000+' into floats.
# regex=False is required: '+' is a regex quantifier, so with regex=True
# pandas >= 2.0 tries to compile it and raises re.error. Both replacements
# are plain literal substitutions.
android_free['Installs'] = (
    android_free['Installs']
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype(float)
)

# Each section below sums installs per app within one category and sorts
# the result in descending order.

# Exploring the FAMILY category
and_family = (
    android_free.loc[android_free['Category'] == 'FAMILY']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_family[:20]

# Exploring the GAME category
and_game = (
    android_free.loc[android_free['Category'] == 'GAME']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_game[:20]

# Exploring the TOOLS category
and_tools = (
    android_free.loc[android_free['Category'] == 'TOOLS']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_tools[:10]

# Exploring the BUSINESS category
and_biz = (
    android_free.loc[android_free['Category'] == 'BUSINESS']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_biz[:10]

# Exploring the LIFESTYLE category
and_ls = (
    android_free.loc[android_free['Category'] == 'LIFESTYLE']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_ls[:10]

# Average installs per app for each category: total installs divided by the
# number of apps, aligned on the category label.
pop_data = android_free.groupby('Category')['Installs'].sum()
pop_data = pop_data / android_free['Category'].value_counts()
pop_data.sort_values(ascending=False)

# Exploring the COMMUNICATION category
and_comms = (
    android_free.loc[android_free['Category'] == 'COMMUNICATION']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_comms[:20]

# Exploring the VIDEO_PLAYERS category
and_vids = (
    android_free.loc[android_free['Category'] == 'VIDEO_PLAYERS']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_vids[:20]

# Exploring the "SOCIAL" category
# Total installs per app within SOCIAL, largest first. Every Android section
# below follows the same filter -> group by app -> sum -> sort pattern.
and_social = (
    android_free.loc[android_free['Category'] == 'SOCIAL']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_social[:20]

# Exploring the "PHOTOGRAPHY" category
and_photo = (
    android_free.loc[android_free['Category'] == 'PHOTOGRAPHY']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_photo[:20]

# Exploring the "PRODUCTIVITY" category
and_prod = (
    android_free.loc[android_free['Category'] == 'PRODUCTIVITY']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_prod[:20]

# Exploring the "TRAVEL_AND_LOCAL" category
and_trav = (
    android_free.loc[android_free['Category'] == 'TRAVEL_AND_LOCAL']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_trav[:20]

# Exploring the "NEWS_AND_MAGAZINES" category
# This section was a copy of the photography one: it was labelled
# "PHOTOGRAGHY" and overwrote and_photo. It now has its own variable, so
# and_photo keeps the photography ranking.
and_news = (
    android_free.loc[android_free['Category'] == 'NEWS_AND_MAGAZINES']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_news[:20]

# Exploring the "BOOKS_AND_REFERENCE" category
and_br = (
    android_free.loc[android_free['Category'] == 'BOOKS_AND_REFERENCE']
    .groupby('App')['Installs']
    .sum()
    .sort_values(ascending=False)
)
and_br[:30]

# Exploring the "prime_genre" column: share (in percent) of free iOS apps
# per genre, largest first.
freq = pd.crosstab(index=ios_free['prime_genre'], columns='count')
freq = freq / len(ios_free) * 100
freq = freq.sort_values(by=['count'], ascending=False)
freq

# The iOS sections rank apps within a genre by total rating count
# ('rating_count_tot'), summed per app name and sorted in descending order.

# Exploring the "Games" genre
ios_games = (
    ios_free.loc[ios_free['prime_genre'] == 'Games']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_games[:20]

# Exploring the "Entertainment" genre
ios_ent = (
    ios_free.loc[ios_free['prime_genre'] == 'Entertainment']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_ent[:20]

# Exploring the "Photo & Video" genre
ios_pv = (
    ios_free.loc[ios_free['prime_genre'] == 'Photo & Video']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_pv[:20]

# Exploring the "Education" genre
# Every ranking below uses the same recipe: keep the rows of one genre, sum
# 'rating_count_tot' per app name, then order the totals from high to low.
ios_education = (
    ios_free.loc[ios_free['prime_genre'] == 'Education']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_education.head(20)

# Mean number of ratings per app for each genre: the genre's rating total
# divided by its app count (presumably a stand-in for downloads, which this
# code never reads directly).
pop_data = ios_free.groupby('prime_genre')['rating_count_tot'].sum()
pop_data = pop_data / ios_free['prime_genre'].value_counts()
pop_data.sort_values(ascending=False)

# Navigation genre, top 10
ios_nav = (
    ios_free.loc[ios_free['prime_genre'] == 'Navigation']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_nav.head(10)

# Reference genre, top 15
ios_ref = (
    ios_free.loc[ios_free['prime_genre'] == 'Reference']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_ref.head(15)

# Music genre, top 15
ios_mus = (
    ios_free.loc[ios_free['prime_genre'] == 'Music']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_mus.head(15)

# Weather genre, top 15
ios_wea = (
    ios_free.loc[ios_free['prime_genre'] == 'Weather']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_wea.head(15)

# Social Networking genre, top 15
ios_soc = (
    ios_free.loc[ios_free['prime_genre'] == 'Social Networking']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_soc.head(15)

# Travel genre, top 15
ios_trav = (
    ios_free.loc[ios_free['prime_genre'] == 'Travel']
    .groupby('track_name')['rating_count_tot']
    .sum()
    .sort_values(ascending=False)
)
ios_trav.head(15)