# The code below opens the Apple App Store and Google Play Store datasets,
# then converts each of them to a list of lists.
from csv import reader

# Load both store datasets fully into memory as lists of rows (each row is a
# list of strings; row 0 is whatever the first line of the CSV file holds).
# The files are opened with newline='' as the csv module requires, and the
# `with` blocks close the handles as soon as the rows have been read.
with open('AppleStore.csv', encoding='UTF8', newline='') as openApple_file:
    readapplefile = reader(openApple_file)
    Apple_Dataset = list(readapplefile)

with open('googleplaystore.csv', encoding='UTF8', newline='') as opengoogle_file:
    readgooglefile = reader(opengoogle_file)
    Google_Dataset = list(readgooglefile)
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of *dataset* and, optionally, its overall dimensions.

    Args:
        dataset: A list of rows (each row itself a list).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows
            and the number of columns (taken from the first row).

    An empty dataset reports 0 columns instead of raising IndexError.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # blank separator after every printed row

    if rows_and_columns:
        print('Number of Rows', len(dataset))
        # The first row defines the column count; guard the empty case.
        print('Number Of Columns', len(dataset[0]) if dataset else 0)
#explore_data(Apple_Dataset,0,2,True)
#explore_data(Google_Dataset,10473,10474,True)
#del Google_Dataset[10473]
#print(Google_Dataset[10473])
# The code below finds duplicate entries and cleans the datasets.
# Split the names in column 0 of Google_Dataset into first sightings (`app`)
# and repeat sightings (`duplicate_apps`).
app = []
duplicate_apps = []
seen_names = set()  # mirrors `app`, but gives O(1) membership tests

# Every row of Google_Dataset is visited, including row 0.
for row in Google_Dataset:
    tempvar = row[0]
    if tempvar in seen_names:
        duplicate_apps.append(tempvar)
    else:
        seen_names.add(tempvar)
        app.append(tempvar)

# Number of repeat sightings, then the row count left once they are removed.
print(len(duplicate_apps))
print(len(Google_Dataset) - len(duplicate_apps))
#print(duplicate_apps[0])
#for app in Google_Dataset:
#if app[0]=='Quick PDF Scanner + OCR FREE' or app[0]=='Instagram':
#print(app)
#print(Google_Dataset[0])
def _review_count(raw):
    """Return the review-count text *raw* as a float, or -1.0 if non-numeric.

    Non-numeric cells therefore never beat a real count.
    """
    try:
        return float(raw)
    except ValueError:
        return -1.0


# Map each app name (column 0) to the review-count cell (column 3) of its
# most-reviewed row. The stored value is the original string from the row so
# it can still be matched against row[3] by equality; only the comparison is
# numeric. Comparing the raw strings would rank '9' above '10000'.
reviews_max = {}
for row in Google_Dataset:
    name = row[0]
    n_reviews = row[3]
    if name not in reviews_max:
        reviews_max[name] = n_reviews
    elif _review_count(reviews_max[name]) < _review_count(n_reviews):
        reviews_max[name] = n_reviews
#print(len(reviews_max))
# Build the de-duplicated Google dataset: for every app name keep the first
# row whose review-count cell equals the value recorded in reviews_max.
Google_DatasetClean = []
app_namelist = []
added_names = set()  # mirrors app_namelist, but gives O(1) membership tests

for row in Google_Dataset:
    appname = row[0]
    no_ofreviews = row[3]
    if appname in added_names:
        # Several rows can share the same maximum; only the first is kept.
        continue
    if appname in reviews_max and no_ofreviews == reviews_max[appname]:
        Google_DatasetClean.append(row)
        app_namelist.append(appname)
        added_names.add(appname)

print(len(Google_DatasetClean))
# Identify non-English apps so that they can be filtered out.
# Accumulators for the rows whose app name passes the English-name check:
# one list for the Google data and a separate one for the Apple data.
english_appsgoogle, englishappsapple = [], []
def english_characters(inputstring, max_non_ascii=3):
    """Return True if *inputstring* has at most *max_non_ascii* non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.
    Allowing a few of them keeps names that contain an emoji or a symbol
    such as a trademark sign.

    Args:
        inputstring: The text to inspect.
        max_non_ascii: Largest number of non-ASCII characters still accepted
            (defaults to 3).

    Returns:
        False as soon as the count exceeds *max_non_ascii*, otherwise True.
    """
    count = 0
    for element in inputstring:
        if ord(element) > 127:
            count += 1
            if count > max_non_ascii:
                # No need to scan the rest once the limit is exceeded.
                return False
    return True
#detect=english_characters('Instagram')
#print(detect)
#detect=english_characters('爱奇艺PPS -《欢乐颂2》电视剧热播')
#print(detect)
#detect=english_characters('Docs To Go™ Free Office Suite')
#print(detect)
#detect=english_characters('Instachat 😜')
#print(detect)
# Keep only rows whose app name passes english_characters(). Row 0 of each
# dataset is skipped, and the existing result lists are extended in place.
english_appsgoogle.extend(
    entry for entry in Google_DatasetClean[1:] if english_characters(entry[0])
)
englishappsapple.extend(
    entry for entry in Apple_Dataset[1:] if english_characters(entry[1])  # track name
)

# Show the first kept row and the kept-row count for each store, followed by
# row 0 of each raw dataset.
print(english_appsgoogle[0])
print(len(english_appsgoogle))
print(englishappsapple[0])
print(len(englishappsapple))
print(Google_Dataset[0])
print(Apple_Dataset[0])
# The code below isolates the free apps.
# Free apps only.
# Google rows are kept when column 6 is the literal 'Free'; Apple rows are
# kept when column 4 parses to 0.0.
googlefreeapps = [entry for entry in english_appsgoogle if entry[6] == 'Free']
applefreeapps = [entry for entry in englishappsapple if float(entry[4]) == 0.0]

print(len(googlefreeapps))
print(len(applefreeapps))
# Create frequency tables for the cleaned datasets.
def freq_table(dataset, index):
    """Return the percentage frequency of each value found in one column.

    Args:
        dataset: A list of rows (each row indexable by *index*).
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct value of ``row[index]`` to the share of
        rows holding it, expressed as a percentage of ``len(dataset)``.
        An empty dataset yields an empty dict.
    """
    counts = {}
    for row in dataset:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    total_rows = len(dataset)
    # Convert the raw counts to percentages of the whole dataset.
    return {value: (count * 100) / total_rows for value, count in counts.items()}
def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    The table comes from ``freq_table(dataset, index)``. Each line is printed
    as ``value : percentage``; entries are ordered by sorting the
    ``(percentage, value)`` pairs in descending order.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)
#table=display_table(googlefreeapps,9)# genre
#table1=display_table(googlefreeapps,1) #catogory
#table2=display_table(applefreeapps,11)#prime_genre
# Percentage frequency tables used by the per-genre / per-category averages:
# table2 tabulates column 11 of the free Apple apps, table3 tabulates
# column 1 of the free Google apps.
table2 = freq_table(dataset=applefreeapps, index=11)
table3 = freq_table(dataset=googlefreeapps, index=1)
#print(table)
#print(table1)
#print(table2)
#print(table3)
# The code below finds the average number of ratings or installs for each
# category or genre, in order to suggest the most popular kind of app.
# For every key of table2, average column 5 (parsed as a float) over the free
# Apple apps whose column 11 equals that key, and print key and average.
for genre in table2:
    genre_values = [
        float(entry[5]) for entry in applefreeapps if entry[11] == genre
    ]
    total = sum(genre_values)
    len_genre = len(genre_values)
    average_no_ratings = total / len_genre
    print(genre, average_no_ratings)
# For every key of table3, average column 5 over the free Google apps whose
# column 1 equals that key, and print key and average.
for genre in table3:
    total = 0
    len_genre = 0
    for row in googlefreeapps:
        if row[1] != genre:
            continue
        # NOTE: the cell is rewritten in place with '+' and ',' removed, so
        # the row itself is modified before the value is parsed.
        row[5] = row[5].replace('+', '').replace(',', '')
        total += float(row[5])
        len_genre += 1
    average_no_ratings = total / len_genre
    print(genre, average_no_ratings)