#!/usr/bin/env python
# coding: utf-8

# # Guided Project: Exploring Ebay Car Sales Data
#
# In this guided project, we'll work with a dataset of used cars from eBay Kleinanzeigen, a [classifieds](https://en.wikipedia.org/wiki/Classified_advertising) section of the German eBay website.
# The dataset was originally scraped and uploaded to [Kaggle](https://www.kaggle.com/orgesleka/used-cars-database/data). A few modifications have been made to the version uploaded to Kaggle:
#
# - 50,000 data points were sampled from the full dataset, to ensure the code runs quickly in our hosted environment.
# - The dataset was dirtied a bit to more closely resemble what you would expect from a scraped dataset (the version uploaded to Kaggle was cleaned to be easier to work with).
#
# The aim of this project is to clean the data and analyze the included used car listings.

# In[300]:

import numpy as np
import pandas as pd

# In[301]:

#Let's read in the dataset
autos = pd.read_csv('autos.csv', encoding='Latin-1')
autos.info()

# In[302]:

autos.head()

# ## Data cleaning
#
# In the cells above we read the autos dataset into a dataframe:
#
# - It consists of 50,000 rows and 20 columns.
# - Some columns have null values, but none have more than ~20% null values.
# - The column names use [camelcase](https://en.wikipedia.org/wiki/Camel_case) instead of Python's preferred snakecase, which means we can't just replace spaces with underscores.
#
# Let's convert the column names from camelcase to snakecase and, based on the data dictionary, reword some of them to be more descriptive.

# In[303]:

#Let's get the names of the columns as they are:
autos.columns

# In[304]:

#We rename the columns to snakecase:
autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'ab_test',
                 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model',
                 'odometer', 'registration_month', 'fuel_type', 'brand',
                 'unrepaired_damage', 'ad_created', 'n_pictures', 'postal_code',
                 'last_seen']

# In[305]:

autos.head()

# In the cells above we converted the column names to snakecase, which is easier to work with.
#
# Now let's do some basic data exploration to determine what other cleaning tasks need to be done. Initially we will look for:
#
# - Text columns where all or almost all values are the same. These can often be dropped as they don't hold useful information for analysis.
# - Numeric data stored as text, which can be cleaned and converted.

# In[306]:

autos.describe(include='all')

# In[307]:

autos['seller'].value_counts()
#We see that only one entry isn't from a private person.

# In[308]:

autos['offer_type'].value_counts()
#Only one entry differs.

# In[309]:

autos['price'].value_counts()
#This column needs additional clean-up.

# In[310]:

autos['odometer'].value_counts()
#This column needs additional clean-up.

# In[311]:

autos['n_pictures'].value_counts()
#We see that all ads are without pictures, so we could drop that column.

# From the above it looks like the 'seller', 'offer_type' and 'n_pictures' columns can be dropped (a sketch follows below). It also seems that the 'odometer' and 'price' columns need additional clean-up - they hold numeric values stored as text.
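# As a sketch, the drop itself is a single call. We work on a copy here so nothing downstream breaks: 'seller' and 'offer_type' are kept in `autos` because their German values get translated later in this notebook.

# In[ ]:

# Dropping the three near-constant columns (on a copy, not applied to autos):
autos_trimmed = autos.drop(['seller', 'offer_type', 'n_pictures'], axis=1)
autos_trimmed.columns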
# Let's clean up the 'price' and 'odometer' columns:

# In[312]:

#First we will clean the price column.
#Note: '$' is a regex metacharacter, so we replace it as a literal string:
autos['price'] = autos['price'].str.replace('$', '', regex=False)
autos['price'] = autos['price'].str.replace(',', '')
autos['price'] = autos['price'].astype(int)
autos['price'].head()

# In[313]:

#Next we will clean the odometer column:
autos['odometer'] = autos['odometer'].str.replace('km', '')
autos['odometer'] = autos['odometer'].str.replace(',', '')
autos['odometer'] = autos['odometer'].astype(float)
autos['odometer'].head()

# In[314]:

#Let's rename the odometer column, so that it's clear that it is in km:
autos = autos.rename({'odometer': 'odometer_km'}, axis=1)

# Now that we've cleaned up the price and odometer columns, let's explore them further.
#
# ## Cleaning the 'price' column

# In[315]:

#Let's see how many unique values there are:
autos['price'].unique().shape

# In[316]:

#Let's view min/max/median/mean etc
autos['price'].describe()

# In[317]:

autos['price'].value_counts().sort_index(ascending=True).head(40)
#We see the 40 lowest prices and how many cars are priced at them.

# In[318]:

autos['price'].value_counts().sort_index(ascending=False).head(20)
#We see the 20 highest prices and how many cars are priced at them.

# Looking at the highest prices, there's a huge jump from 350,000 to 999,990. In fact, all prices above 350,000 look incorrect. Let's drop those rows.
# Additionally, we see that there are 1,421 cars priced at 0 USD. Let's drop these as well.

# In[319]:

autos = autos[autos['price'].between(1, 350000)]
autos['price'].describe()

# ## Cleaning the 'odometer_km' column

# In[320]:

#Let's see how many unique values there are:
autos['odometer_km'].unique().shape

# In[321]:

#Let's view min/max/median/mean etc
autos['odometer_km'].describe()

# In[322]:

autos['odometer_km'].value_counts(ascending=True).head(10)

# In[323]:

autos['odometer_km'].value_counts(ascending=False).head()

# The odometer values seem rounded, which suggests that sellers had to choose from a list rather than enter the exact readings of their cars.
#
# ## Cleaning the date columns
#
# There are 5 columns that should represent date values. Some of these columns were created by the crawler, some came from the website itself.
# Right now, the date_crawled, last_seen, and ad_created columns are all identified as string values by pandas. Because these three columns are represented as strings, we need to convert the data into a numerical representation so we can understand it quantitatively. The other two columns are already numeric.
#
# Let's first understand how the values in the three string columns are formatted. These columns all hold full timestamp values, like so:

# In[324]:

autos[['date_crawled', 'ad_created', 'last_seen']][0:5]

# In[325]:

autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# We see that the ads were crawled between March 5th, 2016 and April 7th, 2016, almost daily.

# In[326]:

autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# We see that the oldest ads are from June 11th, 2015 and the newest ones from April 7th, 2016.

# In[327]:

autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# The crawler records when it last saw a listing, which gives us a rough proxy for when a car was sold or its ad removed.
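# As an aside, a minimal sketch of an alternative to the string slicing above: pandas can parse these timestamps directly into datetime values, which makes date arithmetic straightforward. We build a separate series here so the string columns stay unchanged for the integer conversion later on.

# In[ ]:

# Parse the full 'date_crawled' timestamps into datetime64 values:
crawl_dates = pd.to_datetime(autos['date_crawled'])
# The daily distribution matches the str[:10] approach above:
crawl_dates.dt.date.value_counts(normalize=True).sort_index().head()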
# ## Cleaning the registration_year column

# In[328]:

autos['registration_year'].describe()

# In[329]:

autos['registration_year'].value_counts(normalize=True).sort_index()

# Some values in the registration_year column seem incorrect:
#
# - Given that the automobile was invented in 1885, any listing with a registration year before that has to be wrong.
# - Given that the ads were created between June 11th, 2015 and April 7th, 2016, any car with a registration year after 2016 should also be removed.
#
# Let's count the listings with cars that fall outside the 1900 - 2016 interval and see if it's safe to remove those rows entirely, or if we need more custom logic.

# In[330]:

autos['registration_year'].shape[0] - autos['registration_year'].between(1900, 2016).sum()

# We see that 1,884 of the 48,565 listings fall outside the 1900 - 2016 registration year interval. Since these account for less than 4% of the data, it is safe to remove them.

# In[331]:

autos = autos[autos['registration_year'].between(1900, 2016)]

# In[332]:

autos['registration_year'].value_counts(normalize=True).head()

# It looks like the majority of the cars were registered between 1999 and 2005.
#
# ## Exploring the 'brand' column

# In[333]:

(autos['brand'].value_counts(normalize=True) * 100)

# Unsurprisingly, German brands make up about 60% of all listings. Volkswagen is the top brand, with roughly as many listings as the next two brands (BMW and Opel) combined.
# Many brands are barely represented, so for the sake of the analysis we will focus on those that make up more than 5% of the listings.

# In[334]:

#Let's isolate the top brands
brand_counts = (autos['brand'].value_counts(normalize=True) * 100)
top_brands = brand_counts[brand_counts > 5].index
top_brands

# In[335]:

brand_mean_prices = {}
for brand in top_brands:
    by_brand = autos[autos['brand'] == brand]
    mean_price = by_brand['price'].mean()
    brand_mean_prices[brand] = int(mean_price)
print(brand_mean_prices)

# Based on the average prices we see that:
#
# - Opel and Ford have the lowest average prices.
# - Audi, Mercedes and BMW have the highest average prices.
# - Volkswagen is moderately priced.
#
# ## Mean mileage per brand

# In[336]:

km_per_brand = {}
for brand in top_brands:
    by_brand = autos[autos['brand'] == brand]
    mean_km = by_brand['odometer_km'].mean()
    km_per_brand[brand] = int(mean_km)
km_per_brand

# Above we built a dictionary holding the average mileage per brand. Now let's build a dataframe so that we can easily compare the average price and average mileage per brand and see if there is any correlation.

# In[337]:

mean_prices = pd.Series(brand_mean_prices)
mean_mileage = pd.Series(km_per_brand)

# In[338]:

brand_info = pd.DataFrame(mean_prices, columns=['mean_prices'])
brand_info['mean_mileage'] = mean_mileage
brand_info

# It is interesting to see that even though the average mileage of BMW and Mercedes cars is higher, their prices are still higher than those of the other brands.

# ## Conclusion
#
# In this guided project we explored data collected from the German eBay. The data contained information on used car listings. We cleaned up the data and looked for a correlation between the average price and mileage of the top brands.
#
# ## Next steps
# Data cleaning next steps:
#
# - Identify categorical data that uses German words, translate them and map the values to their English counterparts.
# - Convert the dates to be uniform numeric data, so "2016-03-21" becomes the integer 20160321.
# - See if there are particular keywords in the name column that can be extracted as new columns (see the sketch after these lists).
#
# Analysis next steps:
#
# - Find the most common brand/model combinations.
# - Split odometer_km into groups, and use aggregation to see if average prices follow any patterns based on the mileage.
# - How much cheaper are cars with damage than their non-damaged counterparts?
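# Before continuing, here is a sketch of the name-keyword idea from the first list above. The keyword 'automatik' is purely an illustrative choice, not part of the original project:

# In[ ]:

# Flag listings whose free-text name mentions 'automatik';
# na=False treats missing names as non-matches.
has_keyword = autos['name'].str.lower().str.contains('automatik', na=False)
has_keyword.value_counts()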
# ## Data cleaning continued
#
# Let's identify columns whose values are in German and translate them into English. We already know that the 'seller' and 'offer_type' columns hold German words, so let's start with them.

# In[339]:

autos['seller'] = autos['seller'].str.replace('privat', 'private').str.replace('gewerblich', 'commercial')
autos['seller'].value_counts()

# In[340]:

autos['offer_type'] = autos['offer_type'].str.replace('Angebot', 'offer')
autos['offer_type'].value_counts()

# In[341]:

autos['vehicle_type'].value_counts()

# In[342]:

autos['vehicle_type'] = (autos['vehicle_type'].str.replace('kleinwagen', 'small car')
                                              .str.replace('kombi', 'combi')
                                              .str.replace('cabrio', 'convertible')
                                              .str.replace('andere', 'other'))
autos['vehicle_type'].value_counts()

# In[343]:

autos['gearbox'].value_counts()

# In[344]:

autos['gearbox'] = autos['gearbox'].str.replace('manuell', 'manual').str.replace('automatik', 'automatic')
autos['gearbox'].value_counts()

# In[345]:

autos['fuel_type'].value_counts()

# In[346]:

autos['fuel_type'] = (autos['fuel_type'].str.replace('benzin', 'petrol')
                                        .str.replace('elektro', 'electric')
                                        .str.replace('andere', 'other'))
autos['fuel_type'].value_counts()

# In[347]:

autos['unrepaired_damage'].value_counts()

# In[348]:

autos['unrepaired_damage'] = autos['unrepaired_damage'].str.replace('ja', 'yes').str.replace('nein', 'no')
autos['unrepaired_damage'].value_counts()

# Now let's convert the dates to uniform numeric data, so "2016-03-21" becomes the integer 20160321. The columns containing dates are 'date_crawled', 'ad_created' and 'last_seen'.

# In[349]:

autos['date_crawled'] = autos['date_crawled'].str[:10]
autos['date_crawled'] = autos['date_crawled'].str.replace('-', '')
autos['date_crawled'] = autos['date_crawled'].astype(int)
autos['date_crawled'].value_counts().head()

# In[350]:

autos['ad_created'] = autos['ad_created'].str[:10]
autos['ad_created'] = autos['ad_created'].str.replace('-', '')
autos['ad_created'] = autos['ad_created'].astype(int)
autos['ad_created'].value_counts().head()

# In[351]:

autos['last_seen'] = autos['last_seen'].str[:10]
autos['last_seen'] = autos['last_seen'].str.replace('-', '')
autos['last_seen'] = autos['last_seen'].astype(int)
autos['last_seen'].value_counts().head()

# ## Analysis continued
#
# Let's find the most common brand/model combination for each of the top brands:

# In[357]:

brand_model = {}
for brand in top_brands:
    by_brand = autos[autos['brand'] == brand]
    top_model = by_brand['model'].describe()['top']
    brand_model[brand] = top_model
brand_model
#The output below shows the most common model per brand.
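# A compact cross-check of the result above: concatenating brand and model counts the most common combinations across the whole dataset, not just the top brands.

# In[ ]:

# value_counts skips rows where the model is missing (NaN):
(autos['brand'] + ' ' + autos['model']).value_counts().head(10)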
# Now let's see if the average prices follow any pattern based on mileage.

# In[452]:

ranges = autos['odometer_km'].value_counts(bins=5).index
ranges

# In[460]:

ranges_tuple = ranges.to_tuples()
km_price = {}
for r in ranges_tuple:
    by_km = autos[autos['odometer_km'].between(r[0], r[1])]
    price = by_km['price'].mean()
    km_price[r] = int(price)
km_price

# In the two cells above we did the following:
#
# - we split the 'odometer_km' values into five bins (ranges)
# - we calculated the mean price for each odometer bin
#
# We can conclude that the lower the mileage of the car, the higher the price.

# How much cheaper are cars with damage than their non-damaged counterparts?

# In[429]:

damage = autos['unrepaired_damage'].dropna().unique()  # drop NaN so we only compare 'yes'/'no'
damaged_price = {}
for d in damage:
    by_d = autos[autos['unrepaired_damage'] == d]
    price = by_d['price'].mean()
    damaged_price[d] = int(price)
damaged_price

# On average, cars with unrepaired damage sell for roughly a third of the price of their undamaged counterparts.
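# As a cross-check of that figure, a groupby over the (already translated) 'unrepaired_damage' column gives the same comparison in two lines:

# In[ ]:

# groupby excludes NaN keys by default, so only 'yes'/'no' are compared:
by_damage = autos.groupby('unrepaired_damage')['price'].mean()
by_damage['yes'] / by_damage['no']  # damaged-to-undamaged price ratio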