#!/usr/bin/env python # coding: utf-8 # In[111]: # This dataset is pulled from the used cars from eBay Kleinanzeigen of the German Ebay Website # The data dictionary provided with data is as follows: # dateCrawled - When this ad was first crawled. All field-values are taken from this date. # name - Name of the car. # seller - Whether the seller is private or a dealer. # offerType - The type of listing # price - The price on the ad to sell the car. # abtest - Whether the listing is included in an A/B test. # vehicleType - The vehicle Type. # yearOfRegistration - The year in which the car was first registered. # gearbox - The transmission type. # powerPS - The power of the car in PS. # model - The car model name. # kilometre - How many kilometers the car has driven. # monthOfRegistration - The month in which the car was first registered. # fuelType - What type of fuel the car uses. # brand - The brand of the car. # notRepairedDamage - If the car has a damage which is not yet repaired. # dateCreated - The date on which the eBay listing was created. # nrOfPictures - The number of pictures in the ad. # postalCode - The postal code for the location of the vehicle. # lastSeenOnline - When the crawler saw this ad last online. #Aim of the project is to clean the data # In[112]: import numpy import pandas # In[113]: autos = pandas.read_csv('autos.csv', encoding='Latin-1') # In[114]: autos # In[115]: autos.info() autos.head(5) # In[116]: autos.columns # In[117]: autos_copy = autos.copy() columns = autos_copy.columns print(columns) # In[118]: autos_copy = autos.rename(columns = { 'dateCrawled':'date_crawled', 'name':'name', 'seller':'seller', 'offerType':'offer_type', 'price':'price', 'abtest':'abtest', 'vehicleType':'vehicle_type', 'yearOfRegistration':'registration_year', 'gearbox':'gearbox', 'powerPS':'power_ps', 'model':'model', 'odometer':'odometer_km', 'monthOfRegistration':'registration_month', 'fuelType':'fuel_type', 'brand':'brand', 'notRepairedDamage':'unrepaired_damage', 'dateCreated':'ad_created', 'nrOfPictures':'nr_of_pictures', 'postalCode':'postal_code', 'lastSeen':'last_seen'}) # In[119]: autos_copy.head() # In[120]: #Seller and offer_type are all the same value autos_copy.describe(include='all') # In[121]: #convert odometer to integer autos_copy["odometer_km"] = autos_copy["odometer_km"].str.replace(',|km','') autos_copy["odometer_km"] = autos_copy["odometer_km"].astype(float) # In[122]: #convert price to integer autos_copy["price"] = autos_copy["price"].str.replace('\$|,','') autos_copy["price"] = autos_copy["price"].astype(float) # In[123]: #Find unique prices for vechiles autos_copy["price"].unique().shape # In[124]: #Find unique kms per vehicle autos_copy["price"].describe() # In[125]: #Find unique price per vehicle autos_copy["price"].value_counts().sort_index(ascending=False).head(20) # In[126]: autos_copy["price"].value_counts().sort_index(ascending=False).tail(20) # In[127]: #There is too much variation between the pricing on the vechiles #Pricing above 350,000 appears to be random key entries or errorenous data #removal of vechiles with zero cost autos_copy[autos_copy["price"].between(1,350000)] # In[128]: #Find unique odometer readings for vechiles autos_copy["odometer_km"].unique().shape # In[129]: autos_copy["odometer_km"].describe() # In[130]: autos_copy["odometer_km"].value_counts().sort_index(ascending=False).head(20) # In[131]: #Review of the dataset does not appear to have any outliers, it is possible for vehicles #to have 150000, this would indicate vehicle are generally put up for sale when #they reach this odometer milestone autos_copy["odometer_km"].value_counts().sort_index(ascending=False).tail(20) # In[132]: autos_copy[['date_crawled','ad_created','last_seen']][0:5] # In[133]: #Majority of the data crawled is within the month of March, it would appear that the 5th/6th of the month of March and April #appear to be a period of low activity, but not enough data to make a conclusive statement. autos_copy['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index() # In[134]: #Dataset would indicate that most of the ads where created during the month of March autos_copy['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index() # In[135]: autos_copy['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index() # In[136]: autos_copy['registration_year'].describe() #There are 50000 vehicles listed, there appears to be erroneous data as the min is 1000 # and max is 9999, this data will require clean up. # In[137]: autos_copy['registration_year'].value_counts(normalize=True).sort_index() # In[138]: #Vechile Registration year before 1910 and after 2016 marked as erroneous data (autos_copy['registration_year'] < 1910).value_counts(normalize=True) # In[139]: autos_copy = autos_copy[(autos_copy['registration_year'] >= 1910) & (autos_copy['registration_year'] <= 2016)] # In[140]: autos_copy.head(5) # In[141]: #Significant majority vehicles were registered in the last 30 years. With most of these vehicles being 5 - 15 years #old. From 1995 - 2012. Indicating people tend to sell vehicles which are over 5 years old autos_copy['registration_year'].value_counts().sort_index() # In[142]: autos_copy['brand'].value_counts(normalize=True).sort_values(ascending=False) # In[143]: brand_counts = autos_copy['brand'].value_counts(normalize=True) top_brands = brand_counts[brand_counts > .05].index # In[144]: top_brands #Volkswagen is the market leader with over 21% market share # In[145]: autos_copy.groupby('brand')['price'].mean() # In[146]: autos_copy.groupby(['brand'])['price'].mean().sort_values(ascending=False) # In[147]: brand_mean_price ={} for brand in top_brands: top_brand = autos_copy[autos_copy['brand'] == brand] mean_price = top_brand['price'].mean() brand_mean_price[brand] = int(mean_price) # In[148]: print(mean_data) #The data would indicate that of the top brand ie. have greater than 5% market #Mercedes is the most expensive car on average #Opel is the least expensive car average # In[149]: bmp_series = pandas.Series(brand_mean_price) print(bmp_series) # In[150]: brand_mean_mileage ={} for brand in top_brands: top_brand = autos_copy[autos_copy['brand'] == brand] mean_mileage = top_brand['odometer_km'].mean() brand_mean_mileage[brand] = int(mean_mileage) # In[151]: print(brand_mean_mileage) # In[152]: mean_price = pandas.Series(brand_mean_price).sort_values(ascending=False) mean_mileage = pandas.Series(brand_mean_mileage).sort_values(ascending=False) # In[153]: print(mean_price) print(type(mean_price)) print(mean_mileage) print(type(mean_mileage)) # In[154]: pandas.DataFrame([mean_price]).transpose() # In[155]: pandas.concat([mean_price,mean_mileage],keys=['mean_price','mean_mileage'],axis=1) # The data does not indicate that the more expensive brand have lower odometer readings (mileage) # Mercedes has the highest price and the second highest mileage # Opel has the lowest mean price with second lowest average mileage # My assessment is the current comparison of automaker, mean price and mileage makes it difficult to infer any meaningful insights.