#!/usr/bin/env python
# coding: utf-8

# # Exploring Ebay Car Sales Data
#
# In this project we will work with a modified data set from the classifieds section of the German eBay website. The goal of this project is to clean and analyze the data.

# In[1]:

import pandas as pd
import numpy as np

# In[2]:

autos = pd.read_csv('autos.csv', encoding='Latin-1')

# In[3]:

autos

# In[4]:

autos.info()

# In[5]:

autos.head()

# Initial impressions of the data:
#
# - 'dateCrawled' and 'lastSeen' might not be relevant to our analysis, so we may delete them
# - 'price' - strip the formatting and convert to a numeric type
# - 'model' - move next to the 'brand' column for ease of reading
# - 'odometer' - convert to int
# - 'monthOfRegistration' - maybe convert to the full name of the month
# - 'dateCreated' - convert to a date
#
# Some of the columns have null values that we will have to investigate.
# The column titles can also be cleaned: separate the words and use lowercase.

# First let's convert the column names from camelcase to snakecase and re-word some of them to make them clearer:

# In[6]:

autos.columns

# In[7]:

mapping_dictionary = {'dateCrawled': 'date_crawled',
                      'offerType': 'offer_type',
                      'abtest': 'ab_test',
                      'vehicleType': 'vehicle_type',
                      'yearOfRegistration': 'registration_year',
                      'powerPS': 'power_ps',
                      'monthOfRegistration': 'registration_month',
                      'fuelType': 'fuel_type',
                      'notRepairedDamage': 'unrepaired_damage',
                      'dateCreated': 'ad_created',
                      'nrOfPictures': 'nr_of_pictures',
                      'postalCode': 'postal_code',
                      'lastSeen': 'last_seen'}

# In[8]:

autos.rename(mapper=mapping_dictionary, axis=1, inplace=True)

# In[9]:

autos.head()

# In[10]:

autos.describe(include='all')

# - 'seller' and 'offer_type' can be removed because almost all of their values are the same
# - 'price' and 'odometer' need to be converted to int
# - 'date_crawled', 'ad_created', and 'last_seen' can be converted to a date format

# In[11]:

autos['price'] = autos['price'].str.replace('$', '', regex=False).str.replace(',', '', regex=False).astype(int)

# In[12]:

autos['odometer'] = autos['odometer'].str.replace('km', '', regex=False).str.replace(',', '', regex=False).astype(int)

# In[13]:

autos.rename({'odometer': 'odometer_km'}, axis=1, inplace=True)

# Let's continue exploring the data, specifically looking for values that don't look right. We'll start by analyzing the 'odometer_km' and 'price' columns.

# In[14]:

autos['price'].describe()

# In[15]:

autos['price'].nlargest(20)

# In[16]:

price_bool = autos['price'] < 100
price_bool.value_counts()

# We will remove the entries priced at 999,990 and above, since those are either errors or so extremely expensive that they may skew the results. There are also over 1,700 entries with a price below 100, so we will exclude those as well. That leaves us with listings priced between 100 and 350,000.

# In[17]:

autos = autos.loc[autos['price'].between(100, 350000), :]

# In[18]:

autos['odometer_km'].min()

# We don't need decimals for the 'odometer_km' column, which is why we converted it to int. 150,000 km is reasonable for some used cars, so there is no need to remove entries on that basis. The lowest reading is 5,000 km, which is also reasonable.

# In[19]:

autos['odometer_km'].describe()

# In[20]:

autos['price'].describe()

# Observations about the remaining values for 'price' and 'odometer_km':
#
# The mean mileage is over 125,000 km, so most of our entries are vehicles with high mileage; 75% of the vehicles have 125,000 km or more.
#
# The average price is 5,930.
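# To back up the observation above (a minimal sketch, not part of the original analysis), we can print the quartiles that the mileage and price claims are based on:

# In[ ]:

# 25th/50th/75th percentiles of mileage and price in the cleaned data
print(autos['odometer_km'].quantile([0.25, 0.5, 0.75]))
print(autos['price'].quantile([0.25, 0.5, 0.75]))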
# ## Exploring the Columns Containing Dates

# In[21]:

autos[['date_crawled', 'ad_created', 'last_seen']][0:5]

# In[22]:

autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# All of the entries were crawled in either March or April of 2016, with most of them crawled in March.

# In[23]:

autos['ad_created'].str[:7].value_counts(normalize=True, dropna=False)

# Over 83% of the ads were created in March of 2016. The full range of ad creation dates runs from June 2015 through April 2016.

# In[24]:

autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# The date range for when an ad was last seen runs from March 5th, 2016 to April 7th, 2016.

# ## Dealing With Incorrect Registration Year Data

# In[25]:

autos['registration_year'].describe()

# Some things stand out in the 'registration_year' column:
#
# The minimum value is 1000 and the maximum value is 9999. The registration year can't be after the listing was last seen, so any entry after 2016 is inaccurate. The earliest possible registration date would be in the early 1900s, when cars were invented. Let's take a look at some of the data to determine whether anything unrealistic needs to be fixed.

# In[26]:

autos[autos['registration_year'] < 1940]

# The entries with registration years 1000, 1001, 1111, and 1800 are inaccurate. The entry of 1910 for an Opel Corsa B is inaccurate because the Opel Corsa was not manufactured until 1982, and the entry of 1910 for a Renault Twingo is inaccurate because the Twingo was not manufactured until 1992. The rest of the entries before 1940 are vintage cars that appear to be accurate, so we will leave them in.
#
# We will remove entries with a registration year before 1911 or after 2016.

# In[27]:

autos = autos[autos['registration_year'].between(1911, 2016)]
autos['registration_year'].value_counts(normalize=True)

# ## Exploring Price by Brand
#
# We will aggregate price by brand, using only brands that make up 5% or more of the total entries. Over 20% of all entries are Volkswagens, and the top 4 brands make up over 50% of the data.

# In[28]:

autos['brand'].value_counts(normalize=False).head(10)

# In[29]:

brands = autos['brand'].value_counts(normalize=True)
significant_brands = brands[brands > 0.05].index
print(significant_brands)

avg_price_by_brand = {}
for b in significant_brands:
    selected_brand = autos[autos['brand'] == b]
    price_mean = selected_brand['price'].mean()
    avg_price_by_brand[b] = int(price_mean)
print(avg_price_by_brand)

# The most common brand in our dataset, Volkswagen, has an average list price of 5,436. Audi is the most expensive at 9,380. The least expensive are Ford and Opel.

# For the top 6 brands, let's use aggregation to find the average mileage for those cars and see whether there is any visible link with mean price.

# In[30]:

avg_milage_by_brand = {}
for b in significant_brands:
    selected_brand = autos[autos['brand'] == b]
    milage_mean = selected_brand['odometer_km'].mean()
    avg_milage_by_brand[b] = int(milage_mean)
print(avg_milage_by_brand)

# Convert both dictionaries to Series objects:

# In[31]:

apb_series = pd.Series(avg_price_by_brand)
print(apb_series)
amb_series = pd.Series(avg_milage_by_brand)
print(amb_series)

# In[32]:

price_milage_df = pd.DataFrame(apb_series, columns=['price_mean'])
price_milage_df

# In[33]:

price_milage_df['milage_mean'] = amb_series
print(price_milage_df)

# The two least expensive brands have the lowest mileage readings, while the three most expensive have the highest mileage.
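# The same aggregation can also be done with groupby instead of explicit loops. This is a sketch of an equivalent approach (it assumes the 'significant_brands' index computed above):

# In[ ]:

# mean price and mean mileage per significant brand, rounded to whole numbers
brand_stats = (autos[autos['brand'].isin(significant_brands)]
               .groupby('brand')[['price', 'odometer_km']]
               .mean()
               .round(0))
print(brand_stats)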
# ## Identifying German Words, Translating Them, and Mapping the Values to English

# In[34]:

autos.head(50)

# The following columns have German words:
# 'seller', 'offer_type', 'vehicle_type', 'gearbox', 'fuel_type', 'unrepaired_damage'.
#
# Let's use the value_counts() method to find all of the words contained in each column:

# In[35]:

autos['seller'].value_counts()

# In[36]:

autos['offer_type'].value_counts()

# In[37]:

autos['vehicle_type'].value_counts()

# In[38]:

autos['gearbox'].value_counts()

# In[39]:

autos['fuel_type'].value_counts()

# In[40]:

autos['unrepaired_damage'].value_counts()

# Here are the common English translations for these words in the context of vehicles:
#
# 'seller'
# privat = private,
# gewerblich = trade
#
# 'offer_type'
# Angebot = asking price
#
# 'vehicle_type'
# limousine = sedan,
# kleinwagen = subcompact,
# kombi = station wagon,
# bus = van,
# cabrio = convertible,
# coupe = coupe,
# suv = suv,
# andere = other
#
# 'gearbox'
# manuell = manual,
# automatik = automatic
#
# 'fuel_type'
# benzin = gas,
# diesel = diesel,
# lpg = lpg (liquefied petroleum gas),
# cng = cng (compressed natural gas),
# hybrid = hybrid,
# elektro = electric,
# andere = other
#
# 'unrepaired_damage'
# nein = no,
# ja = yes

# Now that we have our translations, let's replace the German words with the English words. To do this we will use the Series.map() method:

# In[41]:

autos['seller'] = autos['seller'].map({'privat': 'private', 'gewerblich': 'trade'})
autos['offer_type'] = autos['offer_type'].map({'Angebot': 'asking price'})
autos['vehicle_type'] = autos['vehicle_type'].map({'limousine': 'sedan',
                                                   'kleinwagen': 'subcompact',
                                                   'kombi': 'station wagon',
                                                   'bus': 'van',
                                                   'cabrio': 'convertible',
                                                   'coupe': 'coupe',
                                                   'suv': 'suv',
                                                   'andere': 'other'})
autos['gearbox'] = autos['gearbox'].map({'manuell': 'manual', 'automatik': 'automatic'})
autos['fuel_type'] = autos['fuel_type'].map({'benzin': 'gas', 'diesel': 'diesel', 'lpg': 'lpg',
                                             'cng': 'cng', 'hybrid': 'hybrid',
                                             'elektro': 'electric', 'andere': 'other'})
autos['unrepaired_damage'] = autos['unrepaired_damage'].map({'nein': 'no', 'ja': 'yes'})

# In[42]:

autos.head()

# Convert the dates to uniform numeric data, so that '2016-01-01' becomes the integer 20160101:

# In[43]:

# a string like '2016-01-01 12:30:00' becomes '20160101 12:30:00' once the dashes are removed,
# so we keep only the first 8 characters (YYYYMMDD) before converting to int
autos['date_crawled'] = autos['date_crawled'].str.replace('-', '', regex=False).str[:8].astype(int)
autos['ad_created'] = autos['ad_created'].str.replace('-', '', regex=False).str[:8].astype(int)
autos['last_seen'] = autos['last_seen'].str.replace('-', '', regex=False).str[:8].astype(int)

# In[44]:

autos.head()
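# As a quick sanity check on the conversion (a minimal sketch, not part of the original steps), every value should now be an eight-digit integer within the crawl period:

# In[ ]:

for col in ['date_crawled', 'ad_created', 'last_seen']:
    # True means every value falls in a plausible YYYYMMDD range (mid-2015 to end of 2016)
    print(col, autos[col].between(20150101, 20161231).all())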
# # Further Analysis
#
# ## What is the most common model for our most popular brands?
#
# To answer this we will filter by brand and select just the 'model' column. Then we will create a dictionary frequency table, iterate over the column, and add the counts to our dictionary.

# In[45]:

import operator

vw = autos['brand'] == 'volkswagen'
vw_models = autos[vw]['model']

model_freq_vw = {}
for m in vw_models:
    if m not in model_freq_vw:
        model_freq_vw[m] = 1
    else:
        model_freq_vw[m] += 1

# sort the (model, count) pairs by count, descending, and keep the most common model
sorted_vw = sorted(model_freq_vw.items(), key=operator.itemgetter(1), reverse=True)
sorted_vw[0]

# From here we could reuse this same code for each brand, but to speed things up we will write a custom function:

# In[46]:

import operator  # operator.itemgetter lets us sort the dictionary items by value (the counts)


def brand_model_freq(b):
    brand_filter = autos['brand'] == b
    brand_models = autos[brand_filter]['model']
    model_freq = {}
    for m in brand_models:
        if m not in model_freq:
            model_freq[m] = 1
        else:
            model_freq[m] += 1
    sorted_brand = sorted(model_freq.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_brand[0]

# In[47]:

brand_model_freq('volkswagen')

# In[48]:

brand_model_freq('bmw')

# In[49]:

brand_model_freq('opel')

# In[50]:

brand_model_freq('mercedes_benz')

# In[51]:

brand_model_freq('audi')

# In[52]:

brand_model_freq('ford')

# ## How Does Mileage Affect Price?
#
# To answer this we will create mileage ranges that make sense and then assign our entries to those groups.
#
# autos['odometer_km'].value_counts()

# In[53]:

# half-open ranges (upper bound excluded) so a boundary reading is only counted in one group
group_less_than_20 = autos.loc[autos['odometer_km'] < 20000, 'price']
group_between_20_and_50 = autos.loc[autos['odometer_km'].between(20000, 50000, inclusive='left'), 'price']
group_between_50_and_80 = autos.loc[autos['odometer_km'].between(50000, 80000, inclusive='left'), 'price']
group_between_80_and_100 = autos.loc[autos['odometer_km'].between(80000, 100000, inclusive='left'), 'price']
group_between_100_and_125 = autos.loc[autos['odometer_km'].between(100000, 125000, inclusive='left'), 'price']
group_between_125_and_150 = autos.loc[autos['odometer_km'].between(125000, 150000, inclusive='left'), 'price']
group_150_and_over = autos.loc[autos['odometer_km'] >= 150000, 'price']

# In[54]:

group_less_than_20.mean()

# In[55]:

group_between_20_and_50.mean()

# In[56]:

group_between_50_and_80.mean()

# In[57]:

group_between_80_and_100.mean()

# In[58]:

group_between_100_and_125.mean()

# In[59]:

group_between_125_and_150.mean()

# In[60]:

group_150_and_over.mean()

# In general, we see the price decreasing as mileage increases, which is what we would expect.

# ## How Much Cheaper Are Cars with Damage Than Those Without?
#
# To answer this we will use boolean indexing to split our dataframe by damage status, then find the mean price of each group:

# In[61]:

with_damage = autos[autos['unrepaired_damage'] == 'yes']
with_damage['price'].mean()

# In[62]:

no_damage = autos[autos['unrepaired_damage'] == 'no']
no_damage['price'].mean()

# On average, a vehicle with unrepaired damage is about one third the price of a vehicle with no damage.
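# To make the size of that gap explicit, here is a small follow-up sketch using the 'with_damage' and 'no_damage' frames defined above:

# In[ ]:

# ratio of the mean damaged price to the mean undamaged price
ratio = with_damage['price'].mean() / no_damage['price'].mean()
print('Cars with unrepaired damage list for roughly {:.0%} of the price of undamaged cars'.format(ratio))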