This project is about identifying profitable profiles for the App Store and Google Play Markets
### Google Play data set ###
opened_file = open('googleplaystore.csv')
from csv import reader
read_file = reader(opened_file)
android = list(read_file)
android_header = android[0]
android = android[1:]
### App Store data set ###
opened_file = open('AppleStore.csv')
read_file = reader(opened_file)
ios = list(read_file)
ios_header =ios[0]
ios = ios[2:]
def explore_data(dataset, start, end, rows_and_columns=False):
dataset_slice = dataset[start:end]
for row in dataset_slice:
print(row)
print('\n') # adds a new (empty) line after each row
if rows_and_columns:
print('Number of rows:', len(dataset))
print('Number of columns:', len(dataset[0]))
print(android_header)
print('\n')
explore_data(android, 0, 3, True)
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up'] ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] Number of rows: 10841 Number of columns: 13
print(android[10472]) # incorrect row
print('\n')
print(android_header) # header
print('\n')
print(android[0]) # correct row
del(android[10472])
print(android[10472]) # deleted row
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up'] ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']
### Google Play data set has duplicates, so let's get rid of em! ###
for app in android:
name = app[0]
if name == 'Instagram':
print(app)
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'] ['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'] ['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device'] ['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
### I'm not going to remove duplicates randomly.
### Rather, I'm going to delete all except the first one found ###
seen_duplicates = []
unique_apps = []
for app in android:
name == app[0]
if name in unique_apps:
seen_duplicates.append(name)
else:
unique_apps.append(name)
print('Number of duplicates:', len(seen_duplicates))
print('\n')
print('Number of unique', len(unique_apps))
print('Examples of duplicate apps:', seen_duplicates[:15] )
Number of duplicates: 10839 Number of unique 1 Examples of duplicate apps: ['iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology', 'iHoroscope - 2018 Daily Horoscope & Astrology']
### Removing duplicate entries and store
### separate lists for new cleaned data set and
### just app names for detecting duplicates ###
reviews_max = {}
for app in android[1:]:
name = app[0]
n_reviews = float(app[3])
if (name in reviews_max) and (reviews_max[name] < n_reviews):
reviews_max[name] = n_reviews
if (name not in reviews_max):
reviews_max[name] = n_reviews
print('Expected length:', len(android) - 1181)
print('Actual length:', len(reviews_max))
android_clean = []
already_added = []
for app in android[1:]:
name = app[0]
n_reviews = float(app[3])
if n_reviews == reviews_max[name] and name not in already_added:
android_clean.append(app)
already_added.append(name)
Expected length: 9659 Actual length: 9658
### exploring android_clean data set to ensure it displays as expected ###
explore_data(android_clean, 0, 3, True)
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'] ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'] Number of rows: 9658 Number of columns: 13
## adding function that takes a string and determines if there is any
## character that doesn't belong to the set of common English characters
## if there are more than 3 chars that fall outside the ASCII range (0-127)
## it is determined to be non-english
def english_only(language):
count = 0
for char in language:
if(ord(char) > 127):
count += 1;
if count > 3:
return False;
else:
return True;
print(english_only('Instagram'))
print(english_only('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(english_only('Docs To Go™ Free Office Suite'))
print(english_only('Instachat 😜'))
True False True True
## separate Android and iOS apps and find out how many of each we have ###
android_english = []
ios_english = []
for app in android_clean:
name = app[0]
if english_only(name):
android_english.append(app)
for app in ios:
name = app[1];
if english_only(name):
ios_english.append(app)
explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'] ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'] Number of rows: 9613 Number of columns: 13 ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'] ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'] ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1'] Number of rows: 6182 Number of columns: 16
## isolating the free android and iOS apps ###
for app in android_clean:
name = app[0]
if(english_only):
android_english.append(name)
for app in ios:
name = app[1];
if(english_only):
ios_english.append(name)
### We want to find and app profile that fits both the App Store and Google Play because by analyzing apps that are successful on both, we have a way to measure the threshold of entry
### in order to be/remain competitive on those platforms ###
explore_data(android_english, 0, 3, True)
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'] ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'] Number of rows: 19271 Number of columns: 13
### function to generate frequency tables to show percentages ###
def freq_table(dataset, index):
table = {}
total = 0
for row in dataset:
total += 1
value = row[index]
if value in table:
table[value] += 1
else:
table[value] = 1
table_percentages = {}
for key in table:
percentage = (table[key] / total) * 100
table_percentages[key] = percentage
return table_percentages
### function to display the percentages in desc ###
def display_table(dataset, index, label):
table = freq_table(dataset, index)
table_display = []
for key in table:
key_val_as_tuple = (table[key], key)
table_display.append(key_val_as_tuple)
table_sorted = sorted(table_display, reverse = True)
print("Column: " + "***" + label.upper() + "***")
for entry in table_sorted:
print(entry[1], ':', entry[0])
display_table(ios, -5, "ios")
Column: ***IOS*** Games : 53.66870483602001 Entertainment : 7.434685936631462 Education : 6.2951639799888826 Photo & Video : 4.849916620344636 Utilities : 3.446359088382435 Health & Fitness : 2.501389660922735 Productivity : 2.473596442468038 Social Networking : 2.3068371317398557 Lifestyle : 2.001111728738188 Music : 1.9177320733740968 Shopping : 1.6953863257365203 Sports : 1.584213451917732 Book : 1.556420233463035 Finance : 1.4452473596442468 Travel : 1.1256253474152307 News : 1.0422456920511394 Weather : 1.000555864369094 Reference : 0.8893829905503057 Food & Drink : 0.8754863813229572 Business : 0.792106725958866 Navigation : 0.6392440244580322 Medical : 0.3196220122290161 Catalogs : 0.13896609227348528
display_table(android_clean, 1, "Category")
Column: ***CATEGORY*** FAMILY : 19.403603230482503 GAME : 9.79498861047836 TOOLS : 8.583557672395942 BUSINESS : 4.348726444398427 MEDICAL : 4.089873679850901 PERSONALIZATION : 3.8931455787947815 PRODUCTIVITY : 3.8724373576309796 LIFESTYLE : 3.8206668047214745 FINANCE : 3.57216815075585 SPORTS : 3.36508593911783 COMMUNICATION : 3.26154483329882 HEALTH_AND_FITNESS : 2.9819838475874922 PHOTOGRAPHY : 2.909505073514185 NEWS_AND_MAGAZINES : 2.629944087802858 SOCIAL : 2.4746324290743424 BOOKS_AND_REFERENCE : 2.2986125491820255 TRAVEL_AND_LOCAL : 2.2675502174363222 SHOPPING : 2.091530337544005 DATING : 1.7601987989231724 VIDEO_PLAYERS : 1.6980741354317663 MAPS_AND_NAVIGATION : 1.356388486229033 FOOD_AND_DRINK : 1.1596603851729135 EDUCATION : 1.1078898322634085 ENTERTAINMENT : 0.9008076206253882 AUTO_AND_VEHICLES : 0.8800993994615862 LIBRARIES_AND_DEMO : 0.8697452888796852 WEATHER : 0.8179747359701802 HOUSE_AND_HOME : 0.755850072478774 EVENTS : 0.662663077241665 PARENTING : 0.6212466349140608 ART_AND_DESIGN : 0.6212466349140608 COMICS : 0.5798301925864567 BEAUTY : 0.5487678608407538
display_table(android_clean, -4, "Genres")
Column: ***GENRES*** Tools : 8.57320356181404 Entertainment : 5.808656036446469 Education : 5.280596396769518 Business : 4.348726444398427 Medical : 4.089873679850901 Personalization : 3.8931455787947815 Productivity : 3.8724373576309796 Lifestyle : 3.8103126941395735 Finance : 3.57216815075585 Sports : 3.427210602609236 Communication : 3.26154483329882 Action : 3.0958790639884035 Health & Fitness : 2.9819838475874922 Photography : 2.909505073514185 News & Magazines : 2.629944087802858 Social : 2.4746324290743424 Books & Reference : 2.2986125491820255 Travel & Local : 2.2571961068544213 Shopping : 2.091530337544005 Simulation : 1.9983433423068957 Arcade : 1.9051563470697868 Dating : 1.7601987989231724 Casual : 1.7084282460136675 Video Players & Editors : 1.6773659142679642 Maps & Navigation : 1.356388486229033 Puzzle : 1.2321391592462207 Food & Drink : 1.1596603851729135 Role Playing : 1.0871816110996066 Strategy : 0.9836405052805964 Racing : 0.9422240629529923 Auto & Vehicles : 0.8800993994615862 Libraries & Demo : 0.8697452888796852 Weather : 0.8179747359701802 House & Home : 0.755850072478774 Adventure : 0.755850072478774 Events : 0.662663077241665 Art & Design : 0.5798301925864567 Comics : 0.5694760820045558 Beauty : 0.5487678608407538 Card : 0.4866431973493477 Parenting : 0.4762890867674467 Board : 0.4348726444398426 Casino : 0.4038103126941396 Trivia : 0.3934562021122386 Educational;Education : 0.3934562021122386 Educational : 0.38310209153033753 Education;Education : 0.37274798094843653 Casual;Pretend Play : 0.25885276454752537 Word : 0.23814454338372334 Music : 0.1967281010561193 Puzzle;Brain Games : 0.17601987989231727 Education;Pretend Play : 0.17601987989231727 Racing;Action & Adventure : 0.16566576931041624 Entertainment;Music & Video : 0.1553116587285152 Board;Brain Games : 0.14495754814661418 Arcade;Action & Adventure : 0.14495754814661418 Educational;Pretend Play : 0.13460343756471319 Casual;Action & Adventure : 0.13460343756471319 Casual;Brain Games : 0.12424932698281217 Action;Action & Adventure : 0.12424932698281217 Simulation;Action & Adventure : 0.07247877407330709 Parenting;Education : 0.07247877407330709 Entertainment;Brain Games : 0.07247877407330709 Parenting;Music & Video : 0.062124663491406086 Educational;Brain Games : 0.062124663491406086 Education;Creativity : 0.062124663491406086 Casual;Creativity : 0.062124663491406086 Art & Design;Creativity : 0.062124663491406086 Educational;Creativity : 0.051770552909505066 Adventure;Action & Adventure : 0.051770552909505066 Sports;Action & Adventure : 0.04141644232760406 Role Playing;Pretend Play : 0.04141644232760406 Role Playing;Action & Adventure : 0.04141644232760406 Education;Brain Games : 0.04141644232760406 Education;Action & Adventure : 0.04141644232760406 Simulation;Pretend Play : 0.031062331745703043 Simulation;Education : 0.031062331745703043 Puzzle;Action & Adventure : 0.031062331745703043 Music;Music & Video : 0.031062331745703043 Entertainment;Creativity : 0.031062331745703043 Entertainment;Action & Adventure : 0.031062331745703043 Educational;Action & Adventure : 0.031062331745703043 Education;Music & Video : 0.031062331745703043 Casual;Education : 0.031062331745703043 Board;Action & Adventure : 0.031062331745703043 Video Players & Editors;Music & Video : 0.02070822116380203 Strategy;Action & Adventure : 0.02070822116380203 Puzzle;Creativity : 0.02070822116380203 Entertainment;Pretend Play : 0.02070822116380203 Card;Action & Adventure : 0.02070822116380203 Books & Reference;Education : 0.02070822116380203 Video Players & Editors;Creativity : 0.010354110581901015 Trivia;Education : 0.010354110581901015 Travel & Local;Action & Adventure : 0.010354110581901015 Tools;Education : 0.010354110581901015 Strategy;Education : 0.010354110581901015 Strategy;Creativity : 0.010354110581901015 Role Playing;Education : 0.010354110581901015 Role Playing;Brain Games : 0.010354110581901015 Racing;Pretend Play : 0.010354110581901015 Puzzle;Education : 0.010354110581901015 Parenting;Brain Games : 0.010354110581901015 Music & Audio;Music & Video : 0.010354110581901015 Lifestyle;Pretend Play : 0.010354110581901015 Lifestyle;Education : 0.010354110581901015 Health & Fitness;Education : 0.010354110581901015 Health & Fitness;Action & Adventure : 0.010354110581901015 Entertainment;Education : 0.010354110581901015 Communication;Creativity : 0.010354110581901015 Comics;Creativity : 0.010354110581901015 Casual;Music & Video : 0.010354110581901015 Books & Reference;Creativity : 0.010354110581901015 Board;Pretend Play : 0.010354110581901015 Art & Design;Pretend Play : 0.010354110581901015 Art & Design;Action & Adventure : 0.010354110581901015 Arcade;Pretend Play : 0.010354110581901015 Adventure;Education : 0.010354110581901015 Adventure;Brain Games : 0.010354110581901015
freq_table(ios, -5)
{'Book': 1.556420233463035, 'Business': 0.792106725958866, 'Catalogs': 0.13896609227348528, 'Education': 6.2951639799888826, 'Entertainment': 7.434685936631462, 'Finance': 1.4452473596442468, 'Food & Drink': 0.8754863813229572, 'Games': 53.66870483602001, 'Health & Fitness': 2.501389660922735, 'Lifestyle': 2.001111728738188, 'Medical': 0.3196220122290161, 'Music': 1.9177320733740968, 'Navigation': 0.6392440244580322, 'News': 1.0422456920511394, 'Photo & Video': 4.849916620344636, 'Productivity': 2.473596442468038, 'Reference': 0.8893829905503057, 'Shopping': 1.6953863257365203, 'Social Networking': 2.3068371317398557, 'Sports': 1.584213451917732, 'Travel': 1.1256253474152307, 'Utilities': 3.446359088382435, 'Weather': 1.000555864369094}
### calculate most popular apps by genre ###
genres_ios = freq_table(ios, -5)
for genre in genres_ios:
total = 0
len_genre = 0
for genre_app in ios:
genre_app = app[-5]
if genre_app == genre:
user_ratings = float(app[5])
total += user_ratings
len_genre += 1
avg_user_ratings = total / len_genre
print(genre)
print(avg_user_ratings)
ZeroDivisionErrorTraceback (most recent call last) <ipython-input-27-9da2f3792fc3> in <module>() 12 total += user_ratings 13 len_genre += 1 ---> 14 avg_user_ratings = total / len_genre 15 print(genre) 16 print(avg_user_ratings) ZeroDivisionError: division by zero