About: This is the completion project for Introduction to Algorithims on the Data Engineer Track Goal: Compare methods for accessing data using in-line processsing and pre-cachching
import csv
header=[]
rows=[]
with open('laptops.csv') as f:
rows = list(csv.reader(f))
header=rows[0]
del rows[0]
print(header)
print(rows[0:5])
['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'] [['6571244', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 2.3GHz', '8GB', '128GB SSD', 'Intel Iris Plus Graphics 640', 'macOS', '1.37kg', '1339'], ['7287764', 'Apple', 'Macbook Air', 'Ultrabook', '13.3', '1440x900', 'Intel Core i5 1.8GHz', '8GB', '128GB Flash Storage', 'Intel HD Graphics 6000', 'macOS', '1.34kg', '898'], ['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', '575'], ['9722156', 'Apple', 'MacBook Pro', 'Ultrabook', '15.4', 'IPS Panel Retina Display 2880x1800', 'Intel Core i7 2.7GHz', '16GB', '512GB SSD', 'AMD Radeon Pro 455', 'macOS', '1.83kg', '2537'], ['8550527', 'Apple', 'MacBook Pro', 'Ultrabook', '13.3', 'IPS Panel Retina Display 2560x1600', 'Intel Core i5 3.1GHz', '8GB', '256GB SSD', 'Intel Iris Plus Graphics 650', 'macOS', '1.37kg', '1803']]
import csv
class Inventory:
##########################################################################################
# Constructor csv_filename should be the name/location to a laptops.csv formatted file
##########################################################################################
def __init__(self, csv_filename):
self.filename=csv_filename
self.__loadfile()
self.header=[]
self.rows=[]
self.__loadfile() #load the file
for row in self.rows:
row[-1] = float(row[-1])
self.rows_by_price = sorted(self.rows, key=lambda r: r[-1])
#indexes
self.id_to_row ={}
self.prices=set()
#pre-cache valuable lookups
self.__create_indexes()
############################################################
# Private method that loads the file passed into the constructor
############################################################
def __loadfile(self):
with open('laptops.csv') as f:
self.rows = list(csv.reader(f)) # read all the rows
self.header=self.rows[0] # grab row zero as the header
del self.rows[0] # remove the header row from the data
############################################################
# Private method that denormalizes the data into indexes
############################################################
def __create_indexes(self):
if len(self.id_to_row) == 0:
for i in range(len(self.rows)):
#build an id index into the data
self.id_to_row[int(self.rows[i][0])] = i
#build a price cache
self.prices.add(self.rows[i][12])
############################################################
# O(N)
# Returns a laptop based on its id or None if no matching id exists
############################################################
def get_laptop_from_id(self, laptop_id): # O(N)
for row in self.rows:
if int(row[0]) == laptop_id:
return row
return None
############################################################
# O(1)
# Returns a laptop based on its id or None if no matching id exists
############################################################
def get_laptop_from_id_fast(self, laptop_id): # O(1)
# send back the matching row or None if no match O(1)
return self.rows[self.id_to_row[laptop_id]] if laptop_id in self.id_to_row else None
############################################################
# O(N^2)
# Find if one or two laptops exist that add up to the
# specified amount. If a match is found then True is returned
# If no match is found then None is return
############################################################
def check_promotion_dollars(self, dollars):
# find if one item will match the dollars
for row in self.rows:
if float(row[12]) == dollars:
return True
# check if the sum of two items will match the dollars
for i in range(len(self.rows)): # O(N)
for j in range(len(self.rows)): # O(N*N)= O(N^2)
celli = float(self.rows[i][12])
cellj = float(self.rows[j][12])
if celli+cellj == dollars:
return True
return None
############################################################
# O(N)
# Find if one or two laptops exist that add up to the
# specified amount. If a match is found then True is returned
# If no match is found then None is return
############################################################
def check_promotion_dollars_fast(self, dollars):
# find if one item will match the dollars
if dollars in self.prices:
return True # O(1)
# check if the sum of two items will match the dollars
# this is the clever solution of backing into a single value
for p in self.prices: # O(N)
value1 = dollars - p
if value1 in self.prices:
return True
return None
############################################################
# O(log(N))
# Find the first laptop that is more expensive than the price passed in
# If no match is found then -1 is returned
############################################################
def find_first_laptop_more_expensive(self, target_price):
range_start = 0
range_end = len(self.rows_by_price) - 1
while range_start < range_end:
range_middle = (range_end + range_start) // 2
price = self.rows_by_price[range_middle][-1]
if price > target_price:
range_end = range_middle
elif price <= target_price:
range_start = range_middle + 1
price = self.rows_by_price[range_start][-1]
# if we are returning the max price and the price is greater than that, return -1
# This means the target_price is higher than any price in the data
if range_start == len(self.rows_by_price)-1 and price < target_price:
return -1
return range_start
c = Inventory('laptops.csv')
print('1')
print(c.header)
print(len(c.rows))
print(c.get_laptop_from_id_fast(3362737))
print(c.get_laptop_from_id_fast(3362736))
1 ['Id', 'Company', 'Product', 'TypeName', 'Inches', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'] 1303 ['3362737', 'HP', '250 G6', 'Notebook', '15.6', 'Full HD 1920x1080', 'Intel Core i5 7200U 2.5GHz', '8GB', '256GB SSD', 'Intel HD Graphics 620', 'No OS', '1.86kg', 575.0] None
# adding a index for promotional dollar calculation
import time
import random
c2 = Inventory('laptops.csv')
ids = [random.randint(1000000,9999999) for _ in range(10000)]
# I Commented this out because it is just tooooooo slow
total_time_no_dict = 0
# for i in ids:
# start = time.time()
# c.check_promotion_dollars(i) # O(N^2)
# end = time.time()
# total_time_no_dict += end-start
total_time_dict = 0
for i in ids:
start = time.time()
c.check_promotion_dollars_fast(i) # O(N)
end = time.time()
total_time_dict += end-start
print(total_time_no_dict, total_time_dict)
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-4-be69c655d148> in <module> 11 for i in ids: 12 start = time.time() ---> 13 c.check_promotion_dollars(i) # O(N^2) 14 end = time.time() 15 total_time_no_dict += end-start <ipython-input-2-cc4e5587cfa0> in check_promotion_dollars(self, dollars) 84 for j in range(len(self.rows)): # O(N*N)= O(N^2) 85 celli = float(self.rows[i][12]) ---> 86 cellj = float(self.rows[j][12]) 87 if celli+cellj == dollars: 88 return True KeyboardInterrupt:
# Promotional Dollar Test Values (1000 - True, 442 - False)
#Slow
v = c.check_promotion_dollars(1000)
assert v==True, f"Expected True, Received {str(v)}"
v = c.check_promotion_dollars(442)
assert v==None, f"Expected None, Received {str(v)}"
#Fast
v=c.check_promotion_dollars_fast(1000)
assert v==True, f"Expected True, Received {str(v)}"
v=c.check_promotion_dollars_fast(442)
assert v==None, f"Expected None, Received {str(v)}"
# Adding an index to lookup a laptop by id
prices = [random.randint(100,5000) for _ in range(100)]
c3 = Inventory('laptops.csv')
#Time comparisons
total_time_no_set = 0
for i in ids:
start = time.time()
c.get_laptop_from_id(i) # O(N)
end = time.time()
total_time_no_set += end-start
total_time_set = 0
for i in ids:
start = time.time()
c.get_laptop_from_id_fast(i) # O(1)
end = time.time()
total_time_set += end-start
print(total_time_no_set, total_time_set, f"Using a set is {int(total_time_no_set/total_time_set)} times faster")
# Binary Search for prices
c3 = Inventory('laptops.csv')
v=c3.find_first_laptop_more_expensive(150) # Boundary Case result should be 0
print(v, c3.rows_by_price[v][12]>150, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==0, f"index should be 0, received {str(v)}"
v=c3.find_first_laptop_more_expensive(174) # Normal Case result should be 1
print(v, c3.rows_by_price[v][12]>174, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==1, f"index should be 1, received {str(v)}"
v=c3.find_first_laptop_more_expensive(1000) # Normal Case result should be 683
print(v, c3.rows_by_price[v][12]>1000, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], c3.rows_by_price[v+1][12])
assert v==683, f"index should be 683, received {str(v)}"
v=c3.find_first_laptop_more_expensive(5500) # Max Case result should be 1302
print(v, c3.rows_by_price[v][12]>5500, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], 'NA')
assert v==1302, f"index should be 1302, received {str(v)}"
v=c3.find_first_laptop_more_expensive(10000) #Error case should be -1
print(v, c3.rows_by_price[v][12]>10000, c3.rows_by_price[v-1][12], c3.rows_by_price[v][12], 'NA')
assert v==-1,f"index should be -1, received {str(v)}"
# x =0
# for row in c3.rows_by_price:
# print(x, row[12])
# x+=1