#!/usr/bin/env python
# coding: utf-8

# **Title: Analyzing Used Cars Listings on eBay Kleinanzeigen**
#
# **Introduction**
#
# The aim of this project is to clean the data and analyze the included used car
# listings from _eBay_ _Kleinanzeigen_, a
# [classifieds](https://en.wikipedia.org/wiki/Classified_advertising) section of
# the German eBay website.
#
# The dataset was originally scraped and uploaded to
# [Kaggle](https://www.kaggle.com/orgesleka/used-cars-database/data).
# Modifications have been made from the original dataset that was uploaded to Kaggle:
#
# - We sampled 50,000 data points from the full dataset, to ensure the code runs
#   quickly in our hosted environment
#
# - We dirtied the dataset a bit to more closely resemble what would be expected
#   from a scraped dataset (the version uploaded to Kaggle was cleaned to be
#   easier to work with)
#
# The data dictionary provided with the data is as follows:
#
# - dateCrawled - When this ad was first crawled. All field-values are taken from this date.
# - name - Name of the car.
# - seller - Whether the seller is private or a dealer.
# - offerType - The type of listing
# - price - The price on the ad to sell the car.
# - abtest - Whether the listing is included in an A/B test.
# - vehicleType - The vehicle Type.
# - yearOfRegistration - The year in which the car was first registered.
# - gearbox - The transmission type.
# - powerPS - The power of the car in PS.
# - model - The car model name.
# - kilometer - How many kilometers the car has driven.
# - monthOfRegistration - The month in which the car was first registered.
# - fuelType - What type of fuel the car uses.
# - brand - The brand of the car.
# - notRepairedDamage - If the car has a damage which is not yet repaired.
# - dateCreated - The date on which the eBay listing was created.
# - nrOfPictures - The number of pictures in the ad.
# - postalCode - The postal code for the location of the vehicle.
# - lastSeenOnline - When the crawler saw this ad last online.
# Let's start by importing the libraries we need and reading the dataset into pandas.

# In[1]:

import pandas as pd
import numpy as np  # fixed alias: the original `import numpy as py` was a typo

# NOTE(review): hard-coded absolute Windows path — update this to wherever
# autos.csv lives on your machine.
autos = pd.read_csv(r'C:\Users\Admin\Desktop\New folder\autos.csv',
                    encoding='Latin-1')
autos.info()
# Bare expressions like `autos.head()` only display in a notebook; in a script we
# must print explicitly.
print(autos.head())

# Referring to the dataframe information, we can deduce that 5 of the columns have
# integer values and the rest of the columns are strings. There are a few columns
# with null values and some columns that contain dates stored as strings. It has
# also been observed that the column names use camelcase instead of Python's
# preferred snakecase, which means we can't just replace spaces with underscores.
#
# Let's convert the column names from camelcase to snakecase and reword some of
# the column names based on the data dictionary to be more descriptive.

# In[2]:

print(autos.columns)

# We'll make a few changes here:
#
# - Change the columns from camelcase to snakecase.
# - Change a few wordings to more accurately describe the columns.

# In[3]:

autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price',
                 'abtest', 'vehicle_type', 'registration_year', 'gearbox',
                 'power_ps', 'model', 'odometer', 'registration_month',
                 'fuel_type', 'brand', 'unrepaired_damage', 'ad_created',
                 'num_photos', 'postal_code', 'last_seen']
print(autos.head())

# We have essentially renamed the columns from camelcase to snakecase. This is
# important for the dataset to be clean and easily understood by users.
#
# **Initial Data Exploration and Cleaning**
#
# We'll start by exploring the data to find obvious areas where we can clean the data.
#
# Now let's do some basic data exploration to determine what other cleaning tasks
# need to be done. Initially we will look for:
# - Text columns where all or almost all values are the same. These can often be
#   dropped as they don't have useful information for analysis.
# - Examples of numeric data stored as text which can be cleaned and converted.
# The following methods are helpful for exploring the data:
# - DataFrame.describe() (with include='all' to get both categorical and numeric columns)
# - Series.value_counts() and Series.head() if any columns need a closer look.

# In[4]:

print(autos.describe(include='all'))

# Our initial observations:
#
# - There are a number of text columns where all (or nearly all) of the values
#   are the same:
#   - seller
#   - offer_type
# - The num_photos column looks odd, we'll need to investigate this further.

# In[5]:

print(autos['num_photos'].value_counts())

# It looks like the num_photos column has 0 for every row. We'll drop this
# column, plus the other two we noted as mostly one value.

# In[6]:

# BUG FIX: DataFrame.drop returns a new frame — the original code discarded the
# result, so the columns were never actually removed. Reassign it.
autos = autos.drop(['num_photos', 'seller', 'offer_type'], axis=1)

# There are two columns, price and odometer, which are numeric values with extra
# characters being stored as text. We'll clean and convert these.

# In[7]:

print(autos['price'].unique())

# In[8]:

# regex=False makes "$" a literal character; without it, pandas 1.x treats the
# pattern as a regular expression, where "$" is the end-of-string anchor and the
# replace is a silent no-op (the later astype(int) would then fail).
autos["price"] = autos["price"].str.replace("$", "", regex=False)
autos["price"] = autos["price"].str.replace(",", "", regex=False)
autos["price"] = autos["price"].astype(int)
print(autos["price"].head())

# In[9]:

print(autos["odometer"].unique())

# In[10]:

autos["odometer"] = autos["odometer"].str.replace("km", "", regex=False)
autos["odometer"] = autos["odometer"].str.replace(",", "", regex=False)
autos["odometer"] = autos["odometer"].astype(int)

# In[11]:

autos.rename(columns={"odometer": "odometer_km"}, inplace=True)
print(autos["odometer_km"].head())

# **Exploring Odometer and Price**
#
# Let's continue exploring the data, specifically looking for data that doesn't
# look right. We'll start by analyzing the odometer_km and price columns. Here
# are the steps we'll take:
#
# - Analyze the columns using minimum and maximum values and look for any values
#   that look unrealistically high or low (outliers) that we might want to remove.
# - We'll use:
#   - Series.unique().shape to see how many unique values
#   - Series.describe() to view min/max/median/mean etc
#   - Series.value_counts(), with some variations:
#     - chained to .head() if there are lots of values.
#     - Because Series.value_counts() returns a series, we can use
#       Series.sort_index() with ascending=True or False to view the highest and
#       lowest values with their counts (can also chain to head() here).
# - When removing outliers, we can do df[(df["col"] > x) & (df["col"] < y)],
#   but it's more readable to use df[df["col"].between(x, y)]

# In[12]:

print(autos["odometer_km"].unique().shape)
print('\n')
print(autos["odometer_km"].describe())
print('\n')
print(autos["odometer_km"].value_counts())

# We can see that the values in this field are rounded, which might indicate that
# sellers had to choose from pre-set options for this field. Additionally, there
# are more high mileage than low mileage vehicles.

# In[13]:

print(autos["price"].unique().shape)
print('\n')
print(autos["price"].describe())
print('\n')
print(autos["price"].value_counts())

# Again, the prices in this column seem rounded; however, given there are 2357
# unique values in the column, that may just be people's tendency to round prices
# on the site.
#
# There are 1,421 cars listed with $0 price - given that this is only 2% of the
# cars, we might consider removing these rows. The maximum price is one hundred
# million dollars, which seems a lot; let's look at the highest prices further.

# In[14]:

# FIX: these were bare expressions, which only display in a notebook — print
# them so the script actually shows the output.
print(autos["price"].value_counts().sort_index(ascending=False).head(20))

# In[15]:

print(autos["price"].value_counts().sort_index(ascending=True).head(20))

# There are a number of listings with prices below 30 dollars, including about
# 1,500 at zero dollars. There are also a small number of listings with very high
# values, including 14 at around or over one million.
#
# Given that eBay is an auction site, there could legitimately be items where the
# opening bid is one dollar.
# FIX: this sentence was raw, uncommented prose in the collapsed export (a
# syntax error); it belongs in a comment.
# We will keep the one dollar items, but remove anything above $350,000, since it
# seems that prices increase steadily to that number and then jump up to less
# realistic numbers.

# In[16]:

autos = autos[autos["price"].between(1, 351000)]
print(autos["price"].describe())

# **Exploring the date columns**
#
# Let's now move on to the date columns and understand the date range the data
# covers. There are a number of columns with date information:
#
# - date_crawled: added by the crawler
# - registration_month: from the website
# - registration_year: from the website
# - ad_created: from the website
# - last_seen: added by the crawler
#
# These are a combination of dates that were crawled, and dates with
# meta-information from the crawler. The non-registration dates are stored as
# strings.
#
# We'll explore each of these columns to learn more about the listings.

# In[17]:

print(autos[['date_crawled', 'ad_created', 'last_seen']][0:5])

# In[18]:

print(autos["date_crawled"]
      .str[:10]
      .value_counts(normalize=True, dropna=False)
      .sort_index())

# In[19]:

print(autos["date_crawled"]
      .str[:10]
      .value_counts(normalize=True, dropna=False)
      .sort_values())

# Looks like the site was crawled daily over roughly a one month period in March
# and April 2016. The distribution of listings crawled on each day is roughly
# uniform.

# In[20]:

print(autos["last_seen"]
      .str[:10]
      .value_counts(normalize=True, dropna=False)
      .sort_index())

# The crawler recorded the date it last saw any listing, which allows us to
# determine on what day a listing was removed, presumably because the car was
# sold.
#
# The last three days contain a disproportionate amount of 'last seen' values.
# Given that these are 6-10x the values from the previous days, it's unlikely
# that there was a massive spike in sales; more likely these values relate to
# the crawling period ending and don't indicate car sales.
# In[21]:

# FIX: these were bare expressions, which only display in a notebook — print
# them so the script actually shows the output.
print(autos["ad_created"]
      .str[:10]
      .value_counts(normalize=True, dropna=False)
      .sort_index())

# There is a large variety of ad created dates. Most fall within 1-2 months of
# the listing date, but a few are quite old, with the oldest at around 9 months.

# In[22]:

print(autos["registration_year"].describe())

# **Dealing with Incorrect Registration Year Data**
#
# One thing that stands out from the exploration we did in the last screen is
# that the registration_year column contains some odd values:
#
# - The minimum value is 1000, before cars were invented
# - The maximum value is 9999, many years into the future
#
# Because a car can't be first registered after the listing was seen, any vehicle
# with a registration year above 2016 is definitely inaccurate. Determining the
# earliest valid year is more difficult. Realistically, it could be somewhere in
# the first few decades of the 1900s.
#
# Let's count the number of listings with cars that fall outside the 1900 - 2016
# interval and see if it's safe to remove those rows entirely, or if we need more
# custom logic.

# In[23]:

# Fraction of rows whose registration year falls INSIDE the valid interval.
print(autos["registration_year"].between(1900, 2016).sum() / autos.shape[0])

# Less than 4% of the data falls outside the 1900 - 2016 period, so we can
# safely remove these rows without affecting the overall data.

# In[24]:

autos = autos[autos["registration_year"].between(1900, 2016)]
print(autos["registration_year"].value_counts(normalize=True).head(10))

# It appears that most of the vehicles were first registered in the past 20 years.
#
# **Exploring Price by Brand**

# In[25]:

print(autos["brand"].value_counts(normalize=True))

# German manufacturers represent four out of the top five brands, almost 50% of
# the overall listings. Volkswagen is by far the most popular brand, with
# approximately double the cars for sale of the next two brands combined.
#
# There are lots of brands that don't have a significant percentage of listings,
# so we will limit our analysis to brands representing more than 5% of total
# listings.
# In[26]:

brand_counts = autos["brand"].value_counts(normalize=True)
# Keep only brands that account for more than 5% of all listings.
common_brands = brand_counts[brand_counts > .05].index
print(common_brands)

# In[27]:

# Mean listing price for each of the common brands (truncated to int).
brand_mean_prices = {
    brand: int(autos.loc[autos["brand"] == brand, "price"].mean())
    for brand in common_brands
}
print(brand_mean_prices)

# Of the top 5 brands, there is a distinct price gap:
#
# - Audi, BMW and Mercedes Benz are more expensive
# - Ford and Opel are less expensive
# - Volkswagen is in between - this may explain its popularity, it may be a
#   'best of both worlds' option.
#
# **Exploring Mileage**

# In[28]:

# Mean mileage for each of the common brands (truncated to int).
brand_mean_mileage = {
    brand: int(autos.loc[autos["brand"] == brand, "odometer_km"].mean())
    for brand in common_brands
}
print(brand_mean_mileage)

# In[29]:

# Combine both per-brand aggregates into a single DataFrame for comparison.
df = pd.DataFrame({"mean_price": pd.Series(brand_mean_prices)})
df["mean_mileage"] = pd.Series(brand_mean_mileage)
print(df)

# The range of car mileages does not vary as much as the prices do by brand,
# instead all falling within 10% for the top brands. There is a slight trend to
# the more expensive vehicles having higher mileage, with the less expensive
# vehicles having lower mileage.