In this project, I will analyze App Store and Google Play data to help developers understand which kinds of apps are most appealing to users.
from csv import reader
# Load the App Store data set. The context manager closes the file as soon as
# every row has been read into memory; newline='' lets the csv module handle
# line endings inside quoted fields itself.
with open('AppleStore.csv', encoding='utf8', newline='') as Apple_store:
    data_apple = reader(Apple_store)
    list_apple = list(data_apple)
apple_header = list_apple[0]  # first row holds the column names
Apple = list_apple[1:]        # remaining rows are the apps

# Load the Google Play data set the same way.
with open('googleplaystore.csv', encoding='utf8', newline='') as Google_Store:
    data_google = reader(Google_Store)
    list_google = list(data_google)
google_header = list_google[0]  # first row holds the column names
Android = list_google[1:]       # remaining rows are the apps
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set, optionally followed by its dimensions.

    Args:
        dataset: A list of rows, each row itself a list of values.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # blank spacer after each printed row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure; report zero columns
        # instead of raising IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)
# Show the App Store column names, then the first three apps together with
# the size of the table.
print(apple_header)
print('\n')
explore_data(Apple, 0, 3, rows_and_columns=True)
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'] ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'] ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'] Number of rows: 7197 Number of columns: 16
# Show the Google Play column names, then the first three apps together with
# the size of the table.
print(google_header)
print('\n')
explore_data(Android, 0, 3, rows_and_columns=True)
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up'] ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] Number of rows: 10841 Number of columns: 13
# Print every Google Play row whose number of fields differs from the header;
# such rows are malformed. The header length never changes, so it is computed
# once, before the loop.
length_header = len(google_header)
for row in Android:
    if len(row) != length_header:
        print(row)
print(len(Android))
10840
# Drop malformed rows by their shape rather than by a hard-coded position:
# `del Android[10472]` silently removes a valid app if this step is ever run a
# second time. Filtering on the field count is safe to repeat. Slice
# assignment keeps `Android` the same list object.
Android[:] = [row for row in Android if len(row) == len(google_header)]
print(len(Android))
10840
The Google Play data set contains duplicate entries, which we identify here.
# Split app names into first occurrences and repeats.
duplicate_name = []
unique_name = []
# A set gives constant-time membership checks; testing against the
# `unique_name` list made this loop quadratic in the number of apps.
seen_names = set()
for row in Android:
    name = row[0]  # column 0 is the app name
    if name in seen_names:
        duplicate_name.append(name)
    else:
        seen_names.add(name)
        unique_name.append(name)
number_duplicate = len(duplicate_name)
print(number_duplicate)
1181
We will not remove duplicates at random. Instead, for each app we will keep only the entry with the highest number of reviews and remove all the other duplicates.
# Map each app name to the largest review count seen among its entries.
reviews_max = {}
for entry in Android:
    app_name = entry[0]
    review_count = float(entry[3])  # column 3 is 'Reviews'
    # Record the count for a new name, or when it beats the stored maximum.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count
print(len(reviews_max))
9659
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded for that app in `reviews_max`.
android_clean = []
already_added = []
# Set mirror of `already_added` for constant-time lookups; checking the list
# itself made this loop quadratic in the number of apps.
added_names = set()
for row in Android:
    name = row[0]
    n_reviews = float(row[3])  # column 3 is 'Reviews'
    # The second condition handles duplicates that tie on the maximum count.
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(row)
        already_added.append(name)
        added_names.add(name)
print(len(android_clean))
9659
def is_english(string):
    """Return True when every character of ``string`` is in the ASCII range.

    A single character with a code point above 127 makes the result False.
    """
    return all(ord(symbol) <= 127 for symbol in string)
# Try the strict check on a few sample app names.
for sample_name in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(is_english(sample_name))
True False False False
def is_english(string, max_non_ascii=3):
    """Return True when ``string`` has at most ``max_non_ascii`` non-ASCII characters.

    Allowing a few characters outside the ASCII range keeps names that merely
    contain an emoji or a symbol such as the trademark sign.

    Args:
        string: The text to check, e.g. an app name.
        max_non_ascii: Largest number of characters with a code point above
            127 that is still accepted. Defaults to 3.

    Returns:
        True if the number of non-ASCII characters does not exceed
        ``max_non_ascii``, False otherwise.
    """
    # Count the characters that fall OUTSIDE the ASCII range.
    non_ascii_count = sum(1 for letter in string if ord(letter) > 127)
    return non_ascii_count <= max_non_ascii
# Try the tolerant check on the sample app names.
for sample_name in (
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
):
    print(is_english(sample_name))
True True False
# Keep only the apps whose name passes the English check. The name is in
# column 0 of the Google Play rows and column 1 of the App Store rows.
android_english = [app for app in android_clean if is_english(app[0])]
apple_english = [app for app in Apple if is_english(app[1])]
# Preview the first three English-named apps of each store and the size of
# each filtered table.
explore_data(android_english, 0, 3, rows_and_columns=True)
print('\n')
explore_data(apple_english, 0, 3, rows_and_columns=True)
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'] ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'] ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'] Number of rows: 9614 Number of columns: 13 ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'] ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'] ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'] Number of rows: 6183 Number of columns: 16
# Isolate the free apps. Prices are compared as text: column 7 of the Google
# Play rows against '0', column 4 of the App Store rows against '0.0'.
free_android = [app for app in android_english if app[7] == '0']
free_apple = [app for app in apple_english if app[4] == '0.0']
print(len(free_android))
print(len(free_apple))
8864 3222
We are trying to find an app idea. To do that, we build a minimal version for Google Play and, if the response from users is good, we develop it further. If it is profitable after six months, we build an iOS version of it.
To generate frequency tables and find the most common genres, we will use the genre-related columns of both stores: prime_genre for the App Store, and Genres and Category for Google Play.
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a data set.

    Args:
        dataset: An iterable of rows, each row indexable by ``index``.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct value found in the column to the
        percentage (0-100) of rows that hold it.
    """
    counts = {}
    n_rows = 0
    for record in dataset:
        n_rows += 1
        cell = record[index]
        counts[cell] = counts.get(cell, 0) + 1

    # Convert the absolute counts into percentages of all rows.
    return {cell: (count / n_rows) * 100 for cell, count in counts.items()}
def display_table(dataset, index):
    """Print the percentage frequency table of a column, largest share first.

    Args:
        dataset: A list of rows, passed on to ``freq_table``.
        index: Position of the column to tabulate.
    """
    percentages = freq_table(dataset, index)
    # Sort (percentage, value) pairs so the ordering is driven by the
    # percentage, with the value itself breaking ties.
    ranked = sorted(
        ((share, label) for label, share in percentages.items()),
        reverse=True,
    )
    for share, label in ranked:
        print(label, ':', share)
We will display the frequency table for the App Store's prime_genre column.
# Column 11 of the App Store rows is 'prime_genre'.
display_table(dataset=free_apple, index=11)
Games : 58.16263190564867 Entertainment : 7.883302296710118 Photo & Video : 4.9658597144630665 Education : 3.662321539416512 Social Networking : 3.2898820608317814 Shopping : 2.60707635009311 Utilities : 2.5139664804469275 Sports : 2.1415270018621975 Music : 2.0484171322160147 Health & Fitness : 2.0173805090006205 Productivity : 1.7380509000620732 Lifestyle : 1.5828677839851024 News : 1.3345747982619491 Travel : 1.2414649286157666 Finance : 1.1173184357541899 Weather : 0.8690254500310366 Food & Drink : 0.8069522036002483 Reference : 0.5586592178770949 Business : 0.5276225946617008 Book : 0.4345127250155183 Navigation : 0.186219739292365 Medical : 0.186219739292365 Catalogs : 0.12414649286157665
We will use the display_table function to display the frequency table for the Google Play Genres column.
# Column 9 of the Google Play rows is 'Genres'.
display_table(dataset=free_android, index=9)
Tools : 8.449909747292418 Entertainment : 6.069494584837545 Education : 5.347472924187725 Business : 4.591606498194946 Productivity : 3.892148014440433 Lifestyle : 3.892148014440433 Finance : 3.7003610108303246 Medical : 3.531137184115524 Sports : 3.463447653429603 Personalization : 3.3167870036101084 Communication : 3.2378158844765346 Action : 3.1024368231046933 Health & Fitness : 3.0798736462093865 Photography : 2.944494584837545 News & Magazines : 2.7978339350180503 Social : 2.6624548736462095 Travel & Local : 2.3240072202166067 Shopping : 2.2450361010830324 Books & Reference : 2.1435018050541514 Simulation : 2.0419675090252705 Dating : 1.861462093862816 Arcade : 1.8501805054151623 Video Players & Editors : 1.7712093862815883 Casual : 1.7599277978339352 Maps & Navigation : 1.3989169675090252 Food & Drink : 1.2409747292418771 Puzzle : 1.128158844765343 Racing : 0.9927797833935018 Role Playing : 0.9363718411552346 Libraries & Demo : 0.9363718411552346 Auto & Vehicles : 0.9250902527075812 Strategy : 0.9138086642599278 House & Home : 0.8235559566787004 Weather : 0.8009927797833934 Events : 0.7107400722021661 Adventure : 0.6768953068592057 Comics : 0.6092057761732852 Beauty : 0.5979241877256317 Art & Design : 0.5979241877256317 Parenting : 0.4963898916967509 Card : 0.45126353790613716 Casino : 0.42870036101083037 Trivia : 0.41741877256317694 Educational;Education : 0.39485559566787 Board : 0.3835740072202166 Educational : 0.3722924187725632 Education;Education : 0.33844765342960287 Word : 0.2594765342960289 Casual;Pretend Play : 0.236913357400722 Music : 0.2030685920577617 Racing;Action & Adventure : 0.16922382671480143 Puzzle;Brain Games : 0.16922382671480143 Entertainment;Music & Video : 0.16922382671480143 Casual;Brain Games : 0.13537906137184114 Casual;Action & Adventure : 0.13537906137184114 Arcade;Action & Adventure : 0.12409747292418773 Action;Action & Adventure : 0.10153429602888085 Educational;Pretend Play : 0.09025270758122744 Simulation;Action & Adventure 
: 0.078971119133574 Parenting;Education : 0.078971119133574 Entertainment;Brain Games : 0.078971119133574 Board;Brain Games : 0.078971119133574 Parenting;Music & Video : 0.06768953068592057 Educational;Brain Games : 0.06768953068592057 Casual;Creativity : 0.06768953068592057 Art & Design;Creativity : 0.06768953068592057 Education;Pretend Play : 0.056407942238267145 Role Playing;Pretend Play : 0.04512635379061372 Education;Creativity : 0.04512635379061372 Role Playing;Action & Adventure : 0.033844765342960284 Puzzle;Action & Adventure : 0.033844765342960284 Entertainment;Creativity : 0.033844765342960284 Entertainment;Action & Adventure : 0.033844765342960284 Educational;Creativity : 0.033844765342960284 Educational;Action & Adventure : 0.033844765342960284 Education;Music & Video : 0.033844765342960284 Education;Brain Games : 0.033844765342960284 Education;Action & Adventure : 0.033844765342960284 Adventure;Action & Adventure : 0.033844765342960284 Video Players & Editors;Music & Video : 0.02256317689530686 Sports;Action & Adventure : 0.02256317689530686 Simulation;Pretend Play : 0.02256317689530686 Puzzle;Creativity : 0.02256317689530686 Music;Music & Video : 0.02256317689530686 Entertainment;Pretend Play : 0.02256317689530686 Casual;Education : 0.02256317689530686 Board;Action & Adventure : 0.02256317689530686 Video Players & Editors;Creativity : 0.01128158844765343 Trivia;Education : 0.01128158844765343 Travel & Local;Action & Adventure : 0.01128158844765343 Tools;Education : 0.01128158844765343 Strategy;Education : 0.01128158844765343 Strategy;Creativity : 0.01128158844765343 Strategy;Action & Adventure : 0.01128158844765343 Simulation;Education : 0.01128158844765343 Role Playing;Brain Games : 0.01128158844765343 Racing;Pretend Play : 0.01128158844765343 Puzzle;Education : 0.01128158844765343 Parenting;Brain Games : 0.01128158844765343 Music & Audio;Music & Video : 0.01128158844765343 Lifestyle;Pretend Play : 0.01128158844765343 Lifestyle;Education : 
0.01128158844765343 Health & Fitness;Education : 0.01128158844765343 Health & Fitness;Action & Adventure : 0.01128158844765343 Entertainment;Education : 0.01128158844765343 Communication;Creativity : 0.01128158844765343 Comics;Creativity : 0.01128158844765343 Casual;Music & Video : 0.01128158844765343 Card;Action & Adventure : 0.01128158844765343 Books & Reference;Education : 0.01128158844765343 Art & Design;Pretend Play : 0.01128158844765343 Art & Design;Action & Adventure : 0.01128158844765343 Arcade;Pretend Play : 0.01128158844765343 Adventure;Education : 0.01128158844765343
We will then display the frequency table for the Google Play Category column.
# Column 1 of the Google Play rows is 'Category'.
display_table(dataset=free_android, index=1)
FAMILY : 18.907942238267147 GAME : 9.724729241877256 TOOLS : 8.461191335740072 BUSINESS : 4.591606498194946 LIFESTYLE : 3.9034296028880866 PRODUCTIVITY : 3.892148014440433 FINANCE : 3.7003610108303246 MEDICAL : 3.531137184115524 SPORTS : 3.395758122743682 PERSONALIZATION : 3.3167870036101084 COMMUNICATION : 3.2378158844765346 HEALTH_AND_FITNESS : 3.0798736462093865 PHOTOGRAPHY : 2.944494584837545 NEWS_AND_MAGAZINES : 2.7978339350180503 SOCIAL : 2.6624548736462095 TRAVEL_AND_LOCAL : 2.33528880866426 SHOPPING : 2.2450361010830324 BOOKS_AND_REFERENCE : 2.1435018050541514 DATING : 1.861462093862816 VIDEO_PLAYERS : 1.7937725631768955 MAPS_AND_NAVIGATION : 1.3989169675090252 FOOD_AND_DRINK : 1.2409747292418771 EDUCATION : 1.1620036101083033 ENTERTAINMENT : 0.9589350180505415 LIBRARIES_AND_DEMO : 0.9363718411552346 AUTO_AND_VEHICLES : 0.9250902527075812 HOUSE_AND_HOME : 0.8235559566787004 WEATHER : 0.8009927797833934 EVENTS : 0.7107400722021661 PARENTING : 0.6543321299638989 ART_AND_DESIGN : 0.6430505415162455 COMICS : 0.6204873646209386 BEAUTY : 0.5979241877256317
1 - On the App Store, the most common genre among the free English apps is Games. The runner-up is Entertainment.
2 - On Google Play, the most common genres are Tools, Entertainment, Education, Business, and Productivity.
# Average number of user ratings per App Store genre.
# Column 11 is 'prime_genre' (the same index used for the frequency table)
# and column 5 is 'rating_count_tot', i.e. how many ratings an app received,
# not its rating score.
prime_genre = freq_table(free_apple, 11)
for genre in prime_genre:
    rating_counts = [float(app[5]) for app in free_apple if app[11] == genre]
    # Every genre in `prime_genre` came from `free_apple`, so the list is
    # never empty here.
    average_rating_count = sum(rating_counts) / len(rating_counts)
    print(genre, ':', average_rating_count)
Social Networking : 71548.34905660378 Reference : 74942.11111111111 Entertainment : 14029.830708661417 Utilities : 18684.456790123455 Health & Fitness : 23298.015384615384 Sports : 23008.898550724636 Medical : 612.0 Lifestyle : 16485.764705882353 Games : 22788.6696905016 Catalogs : 4004.0 News : 21248.023255813954 Education : 7003.983050847458 Book : 39758.5 Productivity : 21028.410714285714 Music : 57326.530303030304 Weather : 52279.892857142855 Shopping : 26919.690476190477 Photo & Video : 28441.54375 Navigation : 86090.33333333333 Business : 7491.117647058823 Food & Drink : 33333.92307692308 Travel : 28243.8 Finance : 31467.944444444445
Based on the data, I would recommend building a social networking or a navigation app, because these genres appear to be among the most downloaded (they have some of the highest average numbers of user ratings).
# Average number of installs per Google Play category.
# Column 1 is 'Category'; column 5 is 'Installs', stored as text such as
# '10,000+', so the '+' and ',' characters are removed before conversion.
Category = freq_table(free_android, 1)
for category in Category:
    install_counts = [
        float(app[5].replace('+', '').replace(',', ''))
        for app in free_android
        if app[1] == category
    ]
    average_install = sum(install_counts) / len(install_counts)
    print(category, ':', average_install)
SOCIAL : 23253652.127118643 PARENTING : 542603.6206896552 BEAUTY : 513151.88679245283 SPORTS : 3638640.1428571427 SHOPPING : 7036877.311557789 BUSINESS : 1712290.1474201474 AUTO_AND_VEHICLES : 647317.8170731707 EVENTS : 253542.22222222222 TRAVEL_AND_LOCAL : 13984077.710144928 ART_AND_DESIGN : 1986335.0877192982 DATING : 854028.8303030303 FAMILY : 3695641.8198090694 ENTERTAINMENT : 11640705.88235294 HEALTH_AND_FITNESS : 4188821.9853479853 EDUCATION : 1833495.145631068 LIFESTYLE : 1437816.2687861272 MAPS_AND_NAVIGATION : 4056941.7741935486 BOOKS_AND_REFERENCE : 8767811.894736841 PHOTOGRAPHY : 17840110.40229885 NEWS_AND_MAGAZINES : 9549178.467741935 HOUSE_AND_HOME : 1331540.5616438356 MEDICAL : 120550.61980830671 COMICS : 817657.2727272727 FINANCE : 1387692.475609756 PERSONALIZATION : 5201482.6122448975 LIBRARIES_AND_DEMO : 638503.734939759 WEATHER : 5074486.197183099 GAME : 15588015.603248259 TOOLS : 10801391.298666667 VIDEO_PLAYERS : 24727872.452830188 COMMUNICATION : 38456119.167247385 PRODUCTIVITY : 16787331.344927534 FOOD_AND_DRINK : 1924897.7363636363
I would recommend building a social networking, travel, or entertainment app.