class Data:
    """Helpers for inspecting and filtering datasets stored as lists of rows.

    Every helper is a static method and is called on the class itself, for
    example ``Data.header(rows)``; no instance state is used.
    """

    @staticmethod
    def header(dataset):
        """Return the first row of ``dataset``."""
        return dataset[0]

    @staticmethod
    def data_without_header(dataset):
        """Return every row of ``dataset`` after the first one."""
        return dataset[1:]

    @staticmethod
    def explore_data(dataset):
        """Print the row count, the column count and the first three rows.

        The column count is the length of the first row. An empty dataset is
        reported as having 0 columns instead of raising ``IndexError``.
        """
        n_columns = len(dataset[0]) if dataset else 0
        print("Number of rows (without header):", len(dataset))
        print("Number of columns(without header):", n_columns)
        print('\n')
        print("First 3 rows:")
        print('\n')
        for row in dataset[:3]:
            print(row)
        # NOTE(review): the original indentation was lost in this export; the
        # trailing blank lines are assumed to follow the loop -- confirm.
        print('\n')

    @staticmethod
    def missing_value(dataset):
        """Print each row whose length differs from the first row's length.

        The first row of ``dataset`` is used as the reference length. For
        every mismatching row the row itself and its position in ``dataset``
        are printed, followed by the total number of mismatching rows.
        An empty dataset reports a total of 0.
        """
        print("Row with missing value:")
        print('\n')
        len_row = 0
        if dataset:
            expected_length = len(Data.header(dataset))
            # enumerate yields the true position of each row, even when the
            # same row content appears more than once in the dataset.
            for position, row in enumerate(dataset):
                if len(row) != expected_length:
                    len_row += 1
                    print(row)
                    print("Row Index Number:", position)
        print("Number of rows with missing value:", len_row)
        print('\n')

    @staticmethod
    def duplicate_row(dataset, integer):
        """Print how many rows repeat a value already seen in one column.

        ``integer`` is the column index to inspect. The first occurrence of a
        value is not counted; every later occurrence is. Column values must
        be hashable because they are tracked in a set.
        """
        seen = set()
        duplicate_entry = []
        for row in dataset:
            value = row[integer]
            if value in seen:
                duplicate_entry.append(value)
            else:
                seen.add(value)
        print("Duplicate Entries:")
        print('\n')
        print("{num} duplicate entries".format(num=len(duplicate_entry)))

    @staticmethod
    def is_english(string):
        """Return True unless ``string`` has more than three non-ASCII characters.

        A character counts as non-ASCII when its code point is above 127.
        """
        non_ascii = sum(1 for character in string if ord(character) > 127)
        return non_ascii <= 3

    @staticmethod
    def free(price):
        """Return True when ``price`` is the string ``'0.0'`` or ``'0'``."""
        return price == '0.0' or price == '0'
from csv import reader

# Read the Google Play file fully into memory; the ``with`` block closes the
# file handle as soon as the rows have been materialised.
with open('googleplaystore.csv', encoding="utf8") as file:
    read = reader(file)
    android = list(read)
android_header = Data.header(android)
android = Data.data_without_header(android)

# Same procedure for the App Store file.
with open('AppleStore.csv', encoding="utf8") as file:
    read = reader(file)
    ios = list(read)
ios_header = Data.header(ios)
ios = Data.data_without_header(ios)

# Show the App Store header row, then run the exploratory reports on its data
# rows (column 0 is the column checked for duplicates).
print("Header:")
print('\n')
print(ios_header)
print('\n')
Data.explore_data(ios)
Data.missing_value(ios)
Data.duplicate_row(ios, 0)
Header: ['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] Number of rows (without header): 7197 Number of columns(without header): 16 First 3 rows: ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'] ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'] ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'] Row with missing value: Number of rows with missing value: 0 Duplicate Entries: 0 duplicate entries
# Show the Google Play header row, then run the same exploratory reports on
# its data rows (column 0 is the column checked for duplicates).
print("Header:", '\n', android_header, '\n', sep='\n')
Data.explore_data(android)
Data.missing_value(android)
Data.duplicate_row(android, 0)
Header: ['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] Number of rows (without header): 10841 Number of columns(without header): 13 First 3 rows: ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up'] ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] Row with missing value: ['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up'] Row Index Number: 10472 Number of rows with missing value: 1 Duplicate Entries: 1181 duplicate entries
# Remove every row whose column count does not match the header, instead of
# deleting a hard-coded position: re-running this step is then harmless and
# it keeps working if the file's row order changes. The slice assignment
# edits the existing list in place, as ``del`` did.
android[:] = [row for row in android if len(row) == len(android_header)]
print("Number of rows after deletion:", len(android))
Number of rows after deletion: 10840
# Pass 1: for each app name (column 0) remember the highest review count
# (column 3) seen across its rows.
reviews_max = {}
for row in android:
    name = row[0]
    n_reviews = float(row[3])
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

# Pass 2: keep exactly one row per app -- the first row whose review count
# equals that maximum. A set makes the "already kept?" check constant-time.
android_clean = []
already_added = set()
for row in android:
    name = row[0]
    n_reviews = float(row[3])
    if reviews_max[name] == n_reviews and name not in already_added:
        android_clean.append(row)
        already_added.add(name)
print("Number of rows after removing duplicate entries:", len(android_clean))
Number of rows after removing duplicate entries: 9659
# Keep only the apps whose name passes Data.is_english. The name is column 1
# in the iOS rows and column 0 in the Android rows.
ios_english = [app for app in ios if Data.is_english(app[1])]
android_english = [app for app in android_clean if Data.is_english(app[0])]
print("English apps in IOS Data:", len(ios_english))
print('\n')
print("English apps in Android Data:", len(android_english))
English apps in IOS Data: 6183 English apps in Android Data: 9614
# Keep only the apps whose price passes Data.free. The price is column 4 in
# the iOS rows and column 7 in the Android rows.
ios_final = [app for app in ios_english if Data.free(app[4])]
android_final = [app for app in android_english if Data.free(app[7])]
print("Free English apps in IOS Data:", len(ios_final))
print('\n')
print("Free English apps in Android Data:", len(android_final))
Free English apps in IOS Data: 3222 Free English apps in Android Data: 8864
def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    ``index`` selects the column of every row in ``dataset``. The result maps
    each value to ``count / number_of_rows * 100``, with keys in order of
    first appearance. An empty dataset yields an empty dict.
    """
    counts = {}
    n_rows = 0
    for record in dataset:
        n_rows += 1
        entry = record[index]
        counts[entry] = counts.get(entry, 0) + 1
    return {
        entry: (count / n_rows) * 100
        for entry, count in counts.items()
    }
def display_table(dataset, index):
    """Print the frequency table of one column, largest share first.

    The shares come from ``freq_table(dataset, index)``. Entries are ordered
    by descending ``(share, value)`` and printed as ``value : share``.
    """
    percentages = freq_table(dataset, index)
    ranked = [(share, label) for label, share in percentages.items()]
    ranked.sort(reverse=True)
    for share, label in ranked:
        print(label, ':', share)
    # NOTE(review): the original indentation was lost in this export; the
    # trailing blank lines are assumed to follow the loop -- confirm.
    print('\n')
# Share of free English iOS apps per genre; the genre is read from the fifth
# column counted from the end of each row.
print("IOS apps in % of total apps:", '\n', sep='\n')
display_table(ios_final, -5)
IOS apps in % of total apps: Games : 58.16263190564867 Entertainment : 7.883302296710118 Photo & Video : 4.9658597144630665 Education : 3.662321539416512 Social Networking : 3.2898820608317814 Shopping : 2.60707635009311 Utilities : 2.5139664804469275 Sports : 2.1415270018621975 Music : 2.0484171322160147 Health & Fitness : 2.0173805090006205 Productivity : 1.7380509000620732 Lifestyle : 1.5828677839851024 News : 1.3345747982619491 Travel : 1.2414649286157666 Finance : 1.1173184357541899 Weather : 0.8690254500310366 Food & Drink : 0.8069522036002483 Reference : 0.5586592178770949 Business : 0.5276225946617008 Book : 0.4345127250155183 Navigation : 0.186219739292365 Medical : 0.186219739292365 Catalogs : 0.12414649286157665
# Share of free English Android apps per category (column 1).
print("Android apps in % of total apps:", '\n', sep='\n')
display_table(android_final, 1)
print('\n')
Android apps in % of total apps: FAMILY : 18.907942238267147 GAME : 9.724729241877256 TOOLS : 8.461191335740072 BUSINESS : 4.591606498194946 LIFESTYLE : 3.9034296028880866 PRODUCTIVITY : 3.892148014440433 FINANCE : 3.7003610108303246 MEDICAL : 3.531137184115524 SPORTS : 3.395758122743682 PERSONALIZATION : 3.3167870036101084 COMMUNICATION : 3.2378158844765346 HEALTH_AND_FITNESS : 3.0798736462093865 PHOTOGRAPHY : 2.944494584837545 NEWS_AND_MAGAZINES : 2.7978339350180503 SOCIAL : 2.6624548736462095 TRAVEL_AND_LOCAL : 2.33528880866426 SHOPPING : 2.2450361010830324 BOOKS_AND_REFERENCE : 2.1435018050541514 DATING : 1.861462093862816 VIDEO_PLAYERS : 1.7937725631768955 MAPS_AND_NAVIGATION : 1.3989169675090252 FOOD_AND_DRINK : 1.2409747292418771 EDUCATION : 1.1620036101083033 ENTERTAINMENT : 0.9589350180505415 LIBRARIES_AND_DEMO : 0.9363718411552346 AUTO_AND_VEHICLES : 0.9250902527075812 HOUSE_AND_HOME : 0.8235559566787004 WEATHER : 0.8009927797833934 EVENTS : 0.7107400722021661 PARENTING : 0.6543321299638989 ART_AND_DESIGN : 0.6430505415162455 COMICS : 0.6204873646209386 BEAUTY : 0.5979241877256317
Android Apps: Family (18.90%) IOS Apps: Games (58.16%)
Android Apps: Games (9.72%) IOS Apps: Entertainment (7.88%)
print("IOS:")
print('\n')
genres_ios = freq_table(ios_final, -5)

# Accumulate the rating-count total (column 5) and the number of apps for
# every genre in a single pass, rather than rescanning the whole dataset once
# per genre.
ios_rating_totals = {}
ios_genre_sizes = {}
for app in ios_final:
    genre_app = app[-5]
    n_ratings = float(app[5])
    ios_rating_totals[genre_app] = ios_rating_totals.get(genre_app, 0) + n_ratings
    ios_genre_sizes[genre_app] = ios_genre_sizes.get(genre_app, 0) + 1

# Report in the key order of the frequency table (first appearance).
for genre in genres_ios:
    avg_n_ratings = ios_rating_totals[genre] / ios_genre_sizes[genre]
    print(genre, ':', avg_n_ratings)
### standalone code ###
### def avg_rating(dataset, index):
### prime_genre = {}
### prime_total = {}
### for row in dataset:
### genre = row[index]
### rating = float(row[5])
### if genre in prime_genre:
### prime_genre[genre] += rating
### prime_total[genre] += 1
### else:
### prime_genre[genre] = rating
### prime_total[genre] = 1
### so_avg = {}
### for genre in prime_genre and prime_total:
### average_rating = prime_genre[genre] / prime_total[genre]
### so_avg[genre] = average_rating
### return so_avg
IOS: Social Networking : 71548.34905660378 Photo & Video : 28441.54375 Games : 22788.6696905016 Music : 57326.530303030304 Reference : 74942.11111111111 Health & Fitness : 23298.015384615384 Weather : 52279.892857142855 Utilities : 18684.456790123455 Travel : 28243.8 Shopping : 26919.690476190477 News : 21248.023255813954 Navigation : 86090.33333333333 Lifestyle : 16485.764705882353 Entertainment : 14029.830708661417 Food & Drink : 33333.92307692308 Sports : 23008.898550724636 Book : 39758.5 Finance : 31467.944444444445 Education : 7003.983050847458 Productivity : 21028.410714285714 Business : 7491.117647058823 Catalogs : 4004.0 Medical : 612.0
On average, navigation apps have the highest number of user reviews, but this figure is heavily influenced by Waze and Google Maps.
The same pattern applies to social networking apps, where the average number is heavily influenced by a few giants like Facebook, Pinterest, Skype, etc.
Same applies to music apps, where a few big players like Pandora, Spotify, and Shazam heavily influence the average number.
The average number of ratings seems to be skewed by a very small number of apps that have hundreds of thousands of user ratings, while the other apps may struggle to get past the 10,000 threshold.
We could get a better picture by removing these extremely popular apps for each genre and then rework the averages.
Reference apps have 74,942 user ratings on average, but it's actually the Bible and Dictionary.com that skew the average upward. However, this niche seems to show some potential. One thing we could do is take another popular book and turn it into an app where we could add different features besides the raw version of the book. This might include daily quotes from the book, an audio version of the book, quizzes about the book, etc. On top of that, we could also embed a dictionary within the app, so users don't need to exit our app to look up words in an external app.
This idea seems to fit well with the fact that the App Store is dominated by for-fun apps. This suggests the market might be a bit saturated with for-fun apps, which means a practical app might have more of a chance to stand out among the huge number of apps on the App Store.
Other genres that seem popular include weather, book, food and drink, or finance. The book genre seems to overlap a bit with the app idea we described above, but the other genres don't seem too interesting to us:
Weather apps — people generally don't spend too much time in-app, and the chances of making a profit from in-app ads are low. Also, getting reliable live weather data may require us to connect our apps to non-free APIs.
Food and drink — examples here include Starbucks, Dunkin' Donuts, McDonald's, etc. So making a popular food and drink app requires actual cooking and a delivery service, which is outside the scope of our company.
Finance apps — these apps involve banking, paying bills, money transfer, etc. Building a finance app requires domain knowledge, and we don't want to hire a finance expert just to build an app.
print("Android:")
print('\n')
categories_android = freq_table(android_final, 1)

# Accumulate the install total (column 5) and the number of apps for every
# category (column 1) in a single pass, rather than rescanning the whole
# dataset once per category.
android_install_totals = {}
android_category_sizes = {}
for app in android_final:
    category_app = app[1]
    # Install counts are strings such as '10,000+': strip the thousands
    # separators and the trailing plus sign before converting.
    n_installs = app[5].replace(',', '').replace('+', '')
    android_install_totals[category_app] = (
        android_install_totals.get(category_app, 0) + float(n_installs)
    )
    android_category_sizes[category_app] = (
        android_category_sizes.get(category_app, 0) + 1
    )

# Report in the key order of the frequency table (first appearance).
for category in categories_android:
    avg_n_installs = (
        android_install_totals[category] / android_category_sizes[category]
    )
    print(category, ':', avg_n_installs)
Android: ART_AND_DESIGN : 1986335.0877192982 AUTO_AND_VEHICLES : 647317.8170731707 BEAUTY : 513151.88679245283 BOOKS_AND_REFERENCE : 8767811.894736841 BUSINESS : 1712290.1474201474 COMICS : 817657.2727272727 COMMUNICATION : 38456119.167247385 DATING : 854028.8303030303 EDUCATION : 1833495.145631068 ENTERTAINMENT : 11640705.88235294 EVENTS : 253542.22222222222 FINANCE : 1387692.475609756 FOOD_AND_DRINK : 1924897.7363636363 HEALTH_AND_FITNESS : 4188821.9853479853 HOUSE_AND_HOME : 1331540.5616438356 LIBRARIES_AND_DEMO : 638503.734939759 LIFESTYLE : 1437816.2687861272 GAME : 15588015.603248259 FAMILY : 3695641.8198090694 MEDICAL : 120550.61980830671 SOCIAL : 23253652.127118643 SHOPPING : 7036877.311557789 PHOTOGRAPHY : 17840110.40229885 SPORTS : 3638640.1428571427 TRAVEL_AND_LOCAL : 13984077.710144928 TOOLS : 10801391.298666667 PERSONALIZATION : 5201482.6122448975 PRODUCTIVITY : 16787331.344927534 PARENTING : 542603.6206896552 WEATHER : 5074486.197183099 VIDEO_PLAYERS : 24727872.452830188 NEWS_AND_MAGAZINES : 9549178.467741935 MAPS_AND_NAVIGATION : 4056941.7741935486
On average, communication apps have the most installs: 38,456,119. This number is heavily skewed up by a few apps that have over one billion installs (WhatsApp, Facebook Messenger, Skype, Google Chrome, Gmail, and Hangouts), and a few others with over 100 and 500 million installs. If we removed all the communication apps that have over 100 million installs, the average would be reduced roughly ten times.
We see the same pattern for the video players category, which is the runner-up with 24,727,872 installs. The market is dominated by apps like YouTube, Google Play Movies & TV, or MX Player. The pattern is repeated for social apps (where we have giants like Facebook, Instagram, Google+, etc.), photography apps (Google Photos and other popular photo editors), or productivity apps (Microsoft Word, Dropbox, Google Calendar, Evernote, etc.).
Again, the main concern is that these app genres might seem more popular than they really are. Moreover, these niches seem to be dominated by a few giants who are hard to compete against.
The game genre seems pretty popular, but previously we found out this part of the market seems a bit saturated, so we'd like to come up with a different app recommendation if possible.
The books and reference genre looks fairly popular as well, with an average number of installs of 8,767,811. It's interesting to explore this in more depth, since we found this genre has some potential to work well on the App Store, and our aim is to recommend an app genre that shows potential for being profitable on both the App Store and Google Play.
The book and reference genre includes a variety of apps: software for processing and reading ebooks, various collections of libraries, dictionaries, tutorials on programming or languages, etc. It seems there's still a small number of extremely popular apps that skew the average:
Google Play Books : 1,000,000,000+ Bible : 100,000,000+ Amazon Kindle : 100,000,000+ Wattpad Free Books : 100,000,000+ Audiobooks from Audible : 100,000,000+
However, it looks like there are only a few very popular apps, so this market still shows potential.
We also notice there are quite a few apps built around the book Quran, which suggests that building an app around a popular book can be profitable. It seems that taking a popular book (perhaps a more recent book) and turning it into an app could be profitable for both the Google Play and the App Store markets.
However, it looks like the market is already full of libraries, so we need to add some special features besides the raw version of the book. This might include daily quotes from the book, an audio version of the book, quizzes on the book, a forum where people can discuss the book, etc.
The above steps would assist:
In this project, we analyzed data about the App Store and Google Play mobile apps with the goal of recommending an app profile that can be profitable for both markets.
We conclude that taking a popular book (perhaps a more recent book) and turning it into an app could be profitable for both the Google Play and the App Store markets. The markets are already full of libraries, so we need to add some special features besides the raw version of the book. This might include daily quotes from the book, an audio version of the book, quizzes on the book, a forum where people can discuss the book, etc.