def length_check(data_list):
    """Drop every data row whose column count differs from the header's.

    The first row of ``data_list`` is treated as the header; its length is
    the expected number of columns for all following rows.

    Args:
        data_list: list of rows (each row a list of fields), header first.

    Returns:
        The same ``data_list`` object, filtered in place.  An empty list is
        returned unchanged instead of raising ``IndexError``.
    """
    if not data_list:
        return data_list
    ideal_length = len(data_list[0])
    # Slice assignment keeps the header and filters the rest in one pass
    # while still mutating the caller's list, as the original did.
    data_list[1:] = [row for row in data_list[1:] if len(row) == ideal_length]
    return data_list
def find_duplicate(data_list, pos_appname):
    """Return the app names that occur more than once in ``data_list``.

    The first row is skipped as the header.  A name is appended to the
    result once for every repeat occurrence after its first, so a name seen
    three times appears twice in the returned list.

    Args:
        data_list: list of rows, header first.
        pos_appname: column index of the app name within each row.

    Returns:
        List of repeated app names, in the order the repeats were met.
    """
    seen = set()  # set membership is O(1); the original scanned a list
    duplicate_data = []
    for row in data_list[1:]:
        app_name = row[pos_appname]
        if app_name in seen:
            duplicate_data.append(app_name)
        else:
            seen.add(app_name)
    return duplicate_data
def clear_list(data_list, duplicate_data, pos_appname, pos_review):
    """Keep a single row per duplicated app: the one with the most reviews.

    For every app name listed in ``duplicate_data`` the highest review count
    (compared as ``float``) is determined, then only the first row carrying
    that maximum is kept.  Rows of apps that are not listed are untouched.

    The previous implementation removed rows from ``data_list`` while
    iterating over it, which skipped the row following each removal and left
    stale duplicates behind; the kept rows are now collected in a new list.

    Args:
        data_list: list of rows; mutated in place.
        duplicate_data: iterable of app names that occur more than once.
        pos_appname: column index of the app name.
        pos_review: column index of the review count (parsed with ``float``).

    Returns:
        The same ``data_list`` object with the surplus rows removed.
    """
    duplicated = set(duplicate_data)

    # Pass 1: highest review count per duplicated app.
    review_max = {}
    for row in data_list:
        name = row[pos_appname]
        if name in duplicated:
            n_review = float(row[pos_review])
            if name not in review_max or review_max[name] < n_review:
                review_max[name] = n_review

    # Pass 2: keep the first row that reaches the maximum, drop the others.
    already_kept = set()
    cleaned = []
    for row in data_list:
        name = row[pos_appname]
        if name in review_max:
            if name in already_kept or float(row[pos_review]) != review_max[name]:
                continue
            already_kept.add(name)
        cleaned.append(row)

    data_list[:] = cleaned  # preserve the in-place contract
    return data_list
def remove_non_free(data_list, pos_not_free):
    """Remove every app whose price field is neither ``'0'`` nor ``'0.0'``.

    The first row is treated as the header and always kept.  Two summary
    lines are printed, followed by any removed row whose price field is the
    literal string ``'Free'`` (such rows are removed too, because only
    ``'0'`` / ``'0.0'`` count as free here).

    Args:
        data_list: list of rows, header first; mutated in place.
        pos_not_free: column index of the price field.

    Returns:
        The same ``data_list`` object containing the header and free apps.
    """
    free_prices = ('0', '0.0')
    free_rows = []
    non_free_row = []
    for row in data_list[1:]:
        if row[pos_not_free] in free_prices:
            free_rows.append(row)
        else:
            non_free_row.append(row)
    data_list[1:] = free_rows

    print('total non free app : ' + str(len(non_free_row)))
    # The header row is not an app, so it is excluded from the count.
    print('total free app : ' + str(len(free_rows)))
    for app in non_free_row:
        if app[pos_not_free] == 'Free':
            print(app)
    return data_list
def remove_non_eng(string):
    """Tell whether ``string`` has at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.

    Returns:
        ``True`` when three or fewer such characters are present,
        ``False`` as soon as there are more than three.
    """
    outside_ascii = sum(1 for ch in string if ord(ch) > 127)
    return outside_ascii <= 3
def remove_sign(string):
    """Return ``string`` with every ``'+'`` and ``','`` character removed."""
    cleaned = string
    for sign in ('+', ','):
        cleaned = cleaned.replace(sign, '')
    return cleaned
def genre_count(data_list, pos_genre, pos_installs=5):
    """Summarise genres by share of apps and by average popularity.

    The first row is skipped as the header.  The popularity column is read
    as text, stripped of ``'+'`` and ``','`` via ``remove_sign`` and parsed
    with ``float``.

    Args:
        data_list: list of rows, header first.
        pos_genre: column index of the genre.
        pos_installs: column index of the popularity figure to average.
            Defaults to 5, the index that was previously hard-coded.

    Returns:
        A tuple ``(s_genre_percentage, s_genre_freq)``:
        ``s_genre_percentage`` is a list of ``(percentage, genre)`` tuples
        and ``s_genre_freq`` a list of ``(average, genre)`` tuples, both
        sorted in descending order.
    """
    rows = data_list[1:]  # exclude the header
    total_len = len(rows)

    # One pass gathers both the row count and the popularity sum per genre
    # (the original rescanned every row once per genre).
    genre_max = {}
    installs_sum = {}
    for row in rows:
        genre = row[pos_genre]
        genre_max[genre] = genre_max.get(genre, 0) + 1
        installs_sum[genre] = (
            installs_sum.get(genre, 0.0) + float(remove_sign(row[pos_installs]))
        )

    s_genre_percentage = sorted(
        (((count / total_len) * 100, genre) for genre, count in genre_max.items()),
        reverse=True,
    )
    s_genre_freq = sorted(
        ((float(installs_sum[genre] / count), genre) for genre, count in genre_max.items()),
        reverse=True,
    )
    return s_genre_percentage, s_genre_freq
def App_Install_Max(data_list, pos_appname, pos_genre, genre, pos_installs=5):
    """Print ``"<app name> : <popularity>"`` for every row of ``genre``.

    Every row of ``data_list`` is examined, including the first one.

    Args:
        data_list: list of rows.
        pos_appname: column index of the app name.
        pos_genre: column index of the genre.
        genre: genre value a row must match exactly to be printed.
        pos_installs: column index of the figure printed after the name.
            Defaults to 5, the index that was previously hard-coded.
    """
    for row in data_list:
        if row[pos_genre] == genre:
            print(row[pos_appname] + ' : ' + row[pos_installs])
from csv import reader

# Data set to analyse; set to 'googleplaystore.csv' for the Google Play file.
filename = 'AppleStore.csv'

# Read the whole CSV into memory.  The context manager closes the file even
# if parsing fails; newline='' is what the csv module expects, and the
# explicit encoding avoids depending on the platform default for app names
# containing non-ASCII characters.
with open(filename, encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    data = list(read_file)
print(data[0:1])

# Column positions differ between the two data sets.
if filename == 'googleplaystore.csv':
    pos_appname = 0
    pos_review = 3
    pos_not_free = 7
    pos_genre = 1
    pos_Installs = 5  # not passed to any function below
else:
    pos_appname = 1
    pos_review = 5
    pos_not_free = 4
    pos_genre = -5  # negative index: counted from the end of each row
    pos_rating_count = 5  # not passed to any function below

print('length of data : ' + str(len(data)))
correct_list = length_check(data)
print('length of data after removing wrong data: ' + str(len(correct_list)))

duplicate_data = find_duplicate(correct_list, pos_appname)
print('Length of Duplicate data : ' + str(len(duplicate_data)))

unique_data_list = clear_list(correct_list, duplicate_data, pos_appname, pos_review)
print('Length after removing data : ' + str(len(unique_data_list)))

# Keep the header plus the apps whose name passes the ASCII check
# (remove_non_eng returns False for names with too many non-ASCII chars).
unique_data_list[1:] = [
    data_row
    for data_row in unique_data_list[1:]
    if remove_non_eng(data_row[pos_appname])
]
print('Length after non eng app: ' + str(len(unique_data_list)))

free_app_data = remove_non_free(unique_data_list, pos_not_free)

genre_percentage, genre_freq = genre_count(free_app_data, pos_genre)
for row in genre_freq:
    print(str(row[0]) + ' : ' + row[1])

App_Install_Max(free_app_data, pos_appname, pos_genre, 'Navigation')
# Sample output from a run against AppleStore.csv:
#
# [['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']]
# length of data : 7198
# length of data after removing wrong data: 7198
# Length of Duplicate data : 2
# Length after removing data : 7196
# Length after non eng app: 6182
# total non free app : 2961
# total free app : 3221
# 86090.33333333333 : Navigation
# 74942.11111111111 : Reference
# 71548.34905660378 : Social Networking
# 57326.530303030304 : Music
# 52279.892857142855 : Weather
# 39758.5 : Book
# 33333.92307692308 : Food & Drink
# 31467.944444444445 : Finance
# 28441.54375 : Photo & Video
# 28243.8 : Travel
# 26919.690476190477 : Shopping
# 23298.015384615384 : Health & Fitness
# 23008.898550724636 : Sports
# 22812.92467948718 : Games
# 21248.023255813954 : News
# 21028.410714285714 : Productivity
# 18684.456790123455 : Utilities
# 16485.764705882353 : Lifestyle
# 14029.830708661417 : Entertainment
# 7491.117647058823 : Business
# 7003.983050847458 : Education
# 4004.0 : Catalogs
# 612.0 : Medical
# Waze - GPS Navigation, Maps & Real-time Traffic : 345046
# Google Maps - Navigation & Transit : 154911
# Geocaching® : 12811
# CoPilot GPS – Car Navigation & Offline Maps : 3582
# ImmobilienScout24: Real Estate Search in Germany : 187
# Railway Route Search : 5