#!/usr/bin/env python
# coding: utf-8

# # Exploring Ebay Car Sales Data

# In this project, we'll work on a used cars data set from eBay Kleinanzeigen, a classifieds section of the German eBay website.
#
# The aim of this project is to clean the data and analyze the included used car listings.
#
# The data dictionary provided with the data is as follows:
#
# * **dateCrawled** - When this ad was first crawled. All field values are taken from this date.
# * **name** - Name of the car.
# * **seller** - Whether the seller is private or a dealer.
# * **offerType** - The type of listing.
# * **price** - The price on the ad to sell the car.
# * **abtest** - Whether the listing is included in an A/B test.
# * **vehicleType** - The vehicle type.
# * **yearOfRegistration** - The year in which the car was first registered.
# * **gearbox** - The transmission type.
# * **powerPS** - The power of the car in PS.
# * **model** - The car model name.
# * **kilometer** - How many kilometers the car has driven.
# * **monthOfRegistration** - The month in which the car was first registered.
# * **fuelType** - What type of fuel the car uses.
# * **brand** - The brand of the car.
# * **notRepairedDamage** - Whether the car has damage which is not yet repaired.
# * **dateCreated** - The date on which the eBay listing was created.
# * **nrOfPictures** - The number of pictures in the ad.
# * **postalCode** - The postal code for the location of the vehicle.
# * **lastSeenOnline** - When the crawler last saw this ad online.

# ### Import the NumPy and Pandas libraries and read the data set

# In[431]:

import numpy as np
import pandas as pd

autos = pd.read_csv('autos.csv', encoding="latin-1")


# In[432]:

autos


# ### Print information about the dataframe

# In[433]:

autos.info()
autos.head()


# From the above, we can determine the following:
# * The data set consists of 50,000 rows and 20 columns.
# * 15 columns are of object dtype and 5 are of integer dtype, so most columns hold strings.
# * Several of the object columns contain null values in some fields.
# * Column names use camel case.

# ### Clean the column names

# In[434]:

column_names = autos.columns  # the columns attribute provides the column names
print(column_names)


# We will change the names from camel case to snake case and rename some columns to more meaningful names.

# In[435]:

col_names = {'yearOfRegistration': 'registration_year',
             'monthOfRegistration': 'registration_month',
             'notRepairedDamage': 'unrepaired_damage',
             'dateCreated': 'ad_created'}
autos.rename(col_names, axis=1, inplace=True)  # rename columns using the dictionary
autos.columns = autos.columns.str.lower()
autos.head()


# ### Initial exploration and cleaning

# In[436]:

autos.describe(include='all')  # include non-numeric columns as well


# The 'seller' and 'offertype' columns have only one value each, so they can be dropped.

# In[437]:

autos.drop(['seller', 'offertype'], axis=1, inplace=True)  # remove the seller and offertype columns
autos.columns


# The 'nrofpictures' column doesn't seem to contain any useful values. Let's check the frequency of the values in this column.

# In[438]:

autos.loc[:, 'nrofpictures'].value_counts(dropna=False)  # value_counts() excludes NaN by default


# Let's remove the 'nrofpictures' column as well, since it holds the same value for every row and carries no information.
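# Before dropping it, note that constant or empty columns like this can also be detected programmatically rather than by eye. A minimal sketch (assumes the `autos` dataframe defined above; nunique() counts distinct non-null values per column):

# In[ ]:

# Columns with at most one distinct non-null value carry no information
constant_cols = [col for col in autos.columns if autos[col].nunique(dropna=True) <= 1]
print(constant_cols)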
# In[439]:

autos.drop('nrofpictures', axis=1, inplace=True)  # remove the nrofpictures column
autos.columns


# In[440]:

autos.loc[:, 'price'].value_counts(dropna=False)


# In[441]:

autos.loc[:, 'odometer'].value_counts(dropna=False)


# The 'price' and 'odometer' columns are numeric values stored as text. We'll remove the extra characters (',', 'km', '$'), convert both columns to the int dtype, and rename 'odometer' to 'odometer_km'.

# In[442]:

autos.loc[:, 'price'] = (autos.loc[:, 'price']
                         .str.replace("$", "", regex=False)
                         .str.replace(",", "", regex=False)
                         .astype(int))  # method chaining; regex=False keeps '$' literal
autos.loc[:, 'price'].head()


# In[443]:

autos.loc[:, 'odometer'] = (autos.loc[:, 'odometer']
                            .str.replace("km", "", regex=False)
                            .str.replace(",", "", regex=False)
                            .astype(int))
autos.rename({'odometer': 'odometer_km'}, axis=1, inplace=True)
autos.loc[:, 'odometer_km'].head()


# ### Exploring the odometer and price columns

# In[444]:

print(autos.loc[:, 'price'].unique().shape[0])
print(autos.loc[:, 'price'].describe())
print(autos.loc[:, 'price'].value_counts(ascending=False).head(10))


# - We have 2,357 unique prices for 50,000 entries.
# - The maximum price is 100 million dollars.
# - **1,421** cars are priced at 0 dollars. **These should be removed**.
# - Most prices seem to be rounded to the nearest hundred.
# - The mean is far from the 50th percentile (median), so this column has outliers.

# In[445]:

print(autos.loc[:, 'price'].value_counts(ascending=False).sort_index(ascending=False).head(20))


# Car prices jump suddenly from 350,000 dollars to 999,990 dollars and beyond, so prices above 350,000 look like outliers. Hence, **we need to remove rows with prices of 0 or above 350,000**.
#
# Let's also analyze the 'odometer_km' column further.

# In[446]:

print(autos.loc[:, 'odometer_km'].unique().shape)
print(autos.loc[:, 'odometer_km'].describe())
print(autos.loc[:, 'odometer_km'].value_counts(ascending=False).head(10))


# - There are only 13 unique values (km driven) among the 50,000 cars.
# - The maximum, 150,000 km, applies to 32,424 cars, which is almost 65% of the listings.

# In[447]:

autos = autos[autos.loc[:, "price"].between(1, 351000)]  # remove rows priced at $0 or above $350,000
print(autos.loc[:, "price"].describe())


# ### Exploring the date columns

# Let's analyze the date columns:
# 1. 'datecrawled', 'ad_created', and 'lastseen' are of object dtype, i.e. strings.
# 2. We need to convert the data into a numerical representation so we can understand it quantitatively.

# In[448]:

autos[['datecrawled', 'ad_created', 'lastseen']][0:5]


# Let's find the distribution for each of these columns by extracting the dates.

# In[449]:

autos.loc[:, 'datecrawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()


# In[450]:

autos.loc[:, 'datecrawled'].str[:10].value_counts(normalize=True, dropna=False).sort_values()


# The cars were crawled in March and April of 2016. The crawling rate is almost the same each day, then drops sharply on the last two days of the crawl.

# In[451]:

autos.loc[:, 'ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()


# In[452]:

autos.loc[:, 'ad_created'].str[:10].value_counts(normalize=True).sort_index().tail(40)


# In[453]:

autos.loc[:, 'ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_values().tail(40)


# Most of the ads were created in March and April of 2016, which matches the high crawling rate in those two months. We also have ads created in 2015, but they form only a tiny fraction of the listings, so it seems users are not interested in old ads.
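# As an aside, these string dates can be parsed into proper datetime values, which makes date arithmetic easier than string slicing. A minimal sketch (assumes `autos` as defined above; the '_dt' columns are hypothetical additions, not used elsewhere in this analysis):

# In[ ]:

# Parse the leading 'YYYY-MM-DD' part of each date column into datetime64
for col in ['datecrawled', 'ad_created', 'lastseen']:
    autos[col + '_dt'] = pd.to_datetime(autos[col].str[:10])
autos[['datecrawled_dt', 'ad_created_dt', 'lastseen_dt']].head()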
# In[454]:

autos.loc[:, 'lastseen'].str[:10].value_counts(normalize=True, dropna=False).sort_values()


# In[455]:

autos.loc[:, 'lastseen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()


# The 'lastseen' counts rise abruptly in the last three days, even though the crawling rate was lower on those days. This spike most likely reflects the crawl ending rather than a sudden burst of sales.

# We also have 'registration_year', which indicates the year the car was first registered. Let's check the distribution of this column.

# In[456]:

autos.loc[:, 'registration_year'].describe()


# The minimum year is 1000 and the maximum is 9999. These values are clearly unrealistic.

# ### Dealing with Incorrect Registration Year Data

# A car can't be first registered after its listing was seen, so any vehicle with a registration year above 2016 is definitely inaccurate. Let's count the listings that fall outside the 1900 - 2016 interval and see if it's safe to remove those rows entirely, or if we need more custom logic.

# In[457]:

bool_map = (autos.loc[:, 'registration_year'] < 1900) | (autos.loc[:, 'registration_year'] > 2016)
print(autos.loc[bool_map, 'registration_year'].value_counts())
print(autos.loc[bool_map, 'registration_year'].describe())
print('Percentage :', (autos.loc[bool_map, 'registration_year'].shape[0] / autos.shape[0]) * 100)


# Many cars have 2017 or 2018 as their registration year; the remaining invalid years have barely one or two entries each. All these rows account for up to 4% of the total. Since they are unrealistic and few, we will remove them.

# In[458]:

autos = autos.loc[~bool_map]
print(autos.loc[:, 'registration_year'].value_counts(normalize=True).sort_values(ascending=False).head(22))
print(autos.loc[:, 'registration_year'].value_counts(normalize=True).sort_index(ascending=False).head(22))


# Most of the registrations happened between 1995 and 2016, i.e. within the 20 years before 2016.

# ### Exploring Price by Brand

# In[459]:

# Create a function to retrieve the top brands, i.e. those with more than 5% of listings
def top_brands_calc(dataf):
    brands = dataf.loc[:, 'brand'].value_counts(normalize=True)
    top_brands = brands[brands > 0.05].index
    print(top_brands)
    return top_brands

top_brands = top_brands_calc(autos)


# In[460]:

# Create a function that calculates the mean price per brand and saves it in a dictionary
mean_prices = {}
def mean_prices_calc(top_brands, dataf):
    for brand in top_brands:
        brand_rows = dataf[dataf['brand'] == brand]
        mean_price = brand_rows['price'].mean()
        mean_prices[brand] = int(mean_price)
    print(mean_prices)
    return mean_prices

mean_prices = mean_prices_calc(top_brands, autos)


# Audi, BMW and Mercedes Benz are more expensive; Ford and Opel are less expensive; Volkswagen is in between.
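# As an aside, the same per-brand aggregate can be computed more concisely with groupby. A minimal sketch of an equivalent alternative (assumes `autos` and `top_brands` as defined above; not used by the rest of the analysis):

# In[ ]:

# Mean price per top brand, equivalent to mean_prices_calc without the explicit loop
autos[autos['brand'].isin(top_brands)].groupby('brand')['price'].mean().astype(int)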
# ### Storing Aggregate Data in a DataFrame

# In[461]:

# Create a function that builds a Series from mean_prices, sorted in descending order
def series_sorted(mean_prices):
    mean_price_series = pd.Series(mean_prices)
    mean_price_sorted = mean_price_series.sort_values(ascending=False)
    return mean_price_sorted

mean_price_series_sorted = series_sorted(mean_prices)
print(mean_price_series_sorted)


# In[462]:

# Create a function that calculates the mean mileage per brand and saves it in a dictionary
mean_mileage = {}
def mean_mileage_calc(top_brands, dataf):
    for brand in top_brands:
        brand_rows = dataf[dataf['brand'] == brand]
        mileage = brand_rows['odometer_km'].mean()
        mean_mileage[brand] = int(mileage)
    print(mean_mileage)
    return mean_mileage

mean_mileage = mean_mileage_calc(top_brands, autos)


# In[463]:

# Create a Series object from mean_mileage
mean_mileage_series = pd.Series(mean_mileage)
print(mean_mileage_series)


# In[464]:

# Create a dataframe that combines the mean_price and mean_mileage Series objects
df1 = pd.DataFrame(mean_price_series_sorted, columns=['mean_price'])
df1.loc[:, 'mean_mileage'] = mean_mileage_series
df1


# Car prices vary a lot across brands, but mileage varies only slightly. We also have the 'unrepaired_damage' column, which specifies whether the car has damage that has not been repaired. Let's check its distribution.

# In[465]:

autos.loc[:, 'unrepaired_damage'].value_counts(dropna=False)


# ~**18%** of the cars have no damage info (NaN), ~**10%** are damaged, and the rest are undamaged. Let's remove the cars without damage info and recalculate the mean prices and mileage.

# In[467]:

# Remove all rows where 'unrepaired_damage' is NaN
print(autos.shape)
autos = autos[autos.loc[:, 'unrepaired_damage'].notnull()]
print(autos.loc[:, 'unrepaired_damage'].value_counts(dropna=False))
print(autos.shape)


# In[468]:

# Recalculate the mean price and mileage table after removing the NaN rows
top_brands = top_brands_calc(autos)
mean_prices = mean_prices_calc(top_brands, autos)
mean_mileage = mean_mileage_calc(top_brands, autos)
mean_price_series_sorted = series_sorted(mean_prices)
mean_mileage_series = pd.Series(mean_mileage)
df2 = pd.DataFrame(mean_price_series_sorted, columns=['mean_price'])
df2.loc[:, 'mean_mileage'] = mean_mileage_series
df2


# In[469]:

print('Values before removing NaN rows')
print('Mileage:', 'min:', df1['mean_mileage'].min(), 'max:', df1['mean_mileage'].max(), 'mean:', df1['mean_mileage'].mean())
print('Price  :', 'min:', df1['mean_price'].min(), 'max:', df1['mean_price'].max(), 'mean:', df1['mean_price'].mean())
print('Values after removing NaN rows')
print('Mileage:', 'min:', df2['mean_mileage'].min(), 'max:', df2['mean_mileage'].max(), 'mean:', df2['mean_mileage'].mean())
print('Price  :', 'min:', df2['mean_price'].min(), 'max:', df2['mean_price'].max(), 'mean:', df2['mean_price'].mean())


# The mean prices increased and the mean mileage decreased. The data set still contains the damaged cars, now roughly **12%** of the remaining rows (the earlier 10% share rescaled after dropping the 18% NaN rows).

# In[470]:

# Distribution of brands among damaged cars
autos.loc[autos.loc[:, 'unrepaired_damage'] == 'ja', 'brand'].value_counts(normalize=True).head(10)


# Volkswagen tops the list with **18%** of the damaged cars. Audi, which is highly priced, constitutes **7%** of the damaged cars, while the low-priced Opel has **13%**.
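# As a side note, the per-brand damage split can also be viewed as a single table. A minimal sketch using pd.crosstab (assumes `autos` as defined above; not part of the original analysis):

# In[ ]:

# Share of damaged ('ja') vs undamaged ('nein') listings within each brand
pd.crosstab(autos['brand'], autos['unrepaired_damage'], normalize='index').sort_values('ja', ascending=False).head(10)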
# In[471]:

# Distribution of brands among undamaged cars
autos.loc[autos.loc[:, 'unrepaired_damage'] == 'nein', 'brand'].value_counts(normalize=True).head(10)


# Volkswagen again leads with **20%** of the undamaged cars; Audi constitutes **9%** and Opel **10%**.
#
# Consider Opel: it is low priced, but its share among the damaged cars is 3 percentage points higher than among the undamaged ones, whereas the high-priced Audi's share is 2 percentage points higher among the undamaged cars. To decide which brand is moderately priced, has low mileage, and is not damaged, let's remove all the damaged cars (about 10% of the overall data) and determine the mean price and mileage values again.

# In[472]:

# Keep only the rows where 'unrepaired_damage' is 'nein', i.e. the non-damaged cars
autos_nein = autos.loc[autos.loc[:, 'unrepaired_damage'] == 'nein']


# In[473]:

# Calculate the mean price and mileage table for the non-damaged cars
top_brands = top_brands_calc(autos_nein)
mean_prices = mean_prices_calc(top_brands, autos_nein)
mean_mileage = mean_mileage_calc(top_brands, autos_nein)
mean_price_series_sorted = series_sorted(mean_prices)
mean_mileage_series = pd.Series(mean_mileage)
df3 = pd.DataFrame(mean_price_series_sorted, columns=['mean_price'])
df3.loc[:, 'mean_mileage'] = mean_mileage_series
df3


# In[474]:

print('[1] Values before removing NaN rows')
print('Mileage:', 'max:', df1['mean_mileage'].max(), 'min:', df1['mean_mileage'].min(), 'mean:', df1['mean_mileage'].mean())
print('Price  :', 'max:', df1['mean_price'].max(), 'min:', df1['mean_price'].min(), 'mean:', df1['mean_price'].mean())
print('[2] Values after removing NaN rows')
print('Mileage:', 'max:', df2['mean_mileage'].max(), 'min:', df2['mean_mileage'].min(), 'mean:', df2['mean_mileage'].mean())
print('Price  :', 'max:', df2['mean_price'].max(), 'min:', df2['mean_price'].min(), 'mean:', df2['mean_price'].mean())
print('[3] Values for non-damaged cars')
print('Mileage:', 'max:', df3['mean_mileage'].max(), 'min:', df3['mean_mileage'].min(), 'mean:', df3['mean_mileage'].mean())
print('Price  :', 'max:', df3['mean_price'].max(), 'min:', df3['mean_price'].min(), 'mean:', df3['mean_price'].mean())


# Car prices increased further and mileage decreased after removing the damaged cars.
#
# Let's compare **Audi**, the highest-priced brand in all three cases, with **Opel** and **Volkswagen**:
#
# 1) Audi vs. Opel (the low-priced brand): Opel's mean price is **66.47%** lower than Audi's, while its mean mileage is only **1.21%** higher.
#
# 2) Audi vs. Volkswagen (which is close to the overall mean price and mileage): VW's mean price is **40.73%** lower than Audi's, while its mean mileage is only **0.17%** higher.
#
# In both cases, the price difference dwarfs the mileage difference. Still, it's better to buy a Volkswagen than an Opel, for the reasons below:
#
# 1) Opel is an older brand than Audi and Volkswagen.
#
# 2) Volkswagen is priced 40.73% lower than Audi, and the 0.17% increase in mileage is negligible.
#
# So, **Volkswagen** is the winner :)
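# As a quick check, the quoted percentage differences can be recomputed directly from df3. A minimal sketch (assumes df3 from above, with 'audi', 'opel', and 'volkswagen' among its index labels):

# In[ ]:

for brand in ['opel', 'volkswagen']:
    price_diff = (df3.loc['audi', 'mean_price'] - df3.loc[brand, 'mean_price']) / df3.loc['audi', 'mean_price'] * 100
    mileage_diff = (df3.loc[brand, 'mean_mileage'] - df3.loc['audi', 'mean_mileage']) / df3.loc['audi', 'mean_mileage'] * 100
    print(brand, '- price:', round(price_diff, 2), '% lower than Audi; mileage:', round(mileage_diff, 2), '% higher')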