In [2]:
from csv import reader
#Opening the Apple file

#Open the file
apple_file = open(r"C:\Users\Eriba\Downloads\AppleStore.csv", encoding="utf8")

#Read the file
read_a_file = reader(apple_file)

#Create a list of lists
apple = list(read_a_file)

#Select the header
apple_header = apple[0]

#Select the whole list without the first row
apple = apple[1:]


#Opening Google file
google_file = open(r"C:\Users\Eriba\Downloads\googleplaystore.csv", encoding="utf8")
read_g_file = reader(google_file)
google = list(read_g_file)
google_header = google[0]
google = google[1:]

def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))
        
print("First 10 rows of Apple Store + the sum of all rows and columns")
print("\n")
explore_data(apple, 0, 11, rows_and_columns=True)

print("\nFirst 10 rows of Apple Store + the sum of all rows and columns")
print("\n")
explore_data(google, 0, 11, rows_and_columns=True)
First 10 rows of Apple Store + the sum of all rows and columns


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


['429047995', 'Pinterest', '74778624', 'USD', '0.0', '1061624', '1814', '4.5', '4.0', '6.26', '12+', 'Social Networking', '37', '5', '27', '1']


['282935706', 'Bible', '92774400', 'USD', '0.0', '985920', '5320', '4.5', '5.0', '7.5.1', '4+', 'Reference', '37', '5', '45', '1']


['553834731', 'Candy Crush Saga', '222846976', 'USD', '0.0', '961794', '2453', '4.5', '4.5', '1.101.0', '4+', 'Games', '43', '5', '24', '1']


['324684580', 'Spotify Music', '132510720', 'USD', '0.0', '878563', '8253', '4.5', '4.5', '8.4.3', '12+', 'Music', '37', '5', '18', '1']


['343200656', 'Angry Birds', '175966208', 'USD', '0.0', '824451', '107', '4.5', '3.0', '7.4.0', '4+', 'Games', '38', '0', '10', '1']


['512939461', 'Subway Surfers', '156038144', 'USD', '0.0', '706110', '97', '4.5', '4.0', '1.72.1', '9+', 'Games', '38', '5', '1', '1']


Number of rows: 7197
Number of columns: 16

First 10 rows of Apple Store + the sum of all rows and columns


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']


['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']


['Smoke Effect Photo Maker - Smoke Editor', 'ART_AND_DESIGN', '3.8', '178', '19M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'April 26, 2018', '1.1', '4.0.3 and up']


['Infinite Painter', 'ART_AND_DESIGN', '4.1', '36815', '29M', '1,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'June 14, 2018', '6.1.61.1', '4.2 and up']


['Garden Coloring Book', 'ART_AND_DESIGN', '4.4', '13791', '33M', '1,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'September 20, 2017', '2.9.2', '3.0 and up']


['Kids Paint Free - Drawing Fun', 'ART_AND_DESIGN', '4.7', '121', '3.1M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'July 3, 2018', '2.8', '4.0.3 and up']


['Text on Photo - Fonteee', 'ART_AND_DESIGN', '4.4', '13880', '28M', '1,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'October 27, 2017', '1.0.4', '4.1 and up']


Number of rows: 10841
Number of columns: 13
In [3]:
#Data cleaning
    #the wrong entry at 10472 doesn't have a "Category" column value,
    #so we are just going to delete this row as it is faulty
print(google_header) # the header row
print("\n")
print(google[10472]) # the incorrect row
print("\n")
print(google[0]) # the example correct row
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
In [4]:
def remove_data(dataset, dataset_header):
    for row in dataset:
        if len(row) != len(dataset_header):
            print (row)
            print (dataset.index(row))
            
remove_data(google, google_header)
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10472
In [5]:
del google[10742]

#Checking if the row was deleted
print(google[10742]) 
['FP BW LCD View', 'FAMILY', '3.4', '16', '1.2M', '500+', 'Free', '0', 'Everyone', 'Entertainment', 'March 31, 2016', '1.0', '4.0.3 and up']
In [6]:
def remove_data(dataset, dataset_header):
    for row in dataset:
        if len(row) != len(dataset_header):
            print (row)
            print (dataset.index(row))
            
remove_data(google, google_header)

print(google[10742])
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10472
['FP BW LCD View', 'FAMILY', '3.4', '16', '1.2M', '500+', 'Free', '0', 'Everyone', 'Entertainment', 'March 31, 2016', '1.0', '4.0.3 and up']
In [7]:
no_duplicate = []
duplicate = []

for app in google:
    name = app[0]
    if name in no_duplicate:
        duplicate.append(name)
    else:
        no_duplicate.append(name)
        
print(no_duplicate[:5])
print("\n")
print(duplicate[:5])
print("\n")
print("Total number of apps: ", len(google))
print("Number of unique apps: ", len(no_duplicate))
print("Number of duplicates: ", len(duplicate))
['Photo Editor & Candy Camera & Grid & ScrapBook', 'Coloring book moana', 'U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'Sketch - Draw & Paint', 'Pixel Draw - Number Art Coloring Book']


['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']


Total number of apps:  10840
Number of unique apps:  9659
Number of duplicates:  1181
In [8]:
#Showing all the duplicates of a selected app
#name is a first column of the list
#so if I say to Python that name is in the column then I can sa that if
#certain name appears then please print it
for app in google:
    name = app[0]
    if name == "Quick PDF Scanner + OCR FREE":
        print(app)
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80804', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
In [9]:
reviews_max = {}

for app in google:
    name = app[0]
    n_reviews = float(app[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
        
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
    
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-9d00dde47af8> in <module>
      3 for app in google:
      4     name = app[0]
----> 5     n_reviews = float(app[3])
      6 
      7     if name in reviews_max and reviews_max[name] < n_reviews:

ValueError: could not convert string to float: '3.0M'
In [ ]: