#!/usr/bin/env python
# coding: utf-8

# In this Guided Project we will work with a dataset of used cars from eBay Kleinanzeigen, a classifieds section of the German eBay website.
#
# The aim of this project is to clean the data and analyze the included used car listings.
#
# The data dictionary provided with the data is as follows:
#
# - dateCrawled - When this ad was first crawled. All field values are taken from this date.
# - name - Name of the car.
# - seller - Whether the seller is private or a dealer.
# - offerType - The type of listing.
# - price - The price on the ad to sell the car.
# - abtest - Whether the listing is included in an A/B test.
# - vehicleType - The vehicle type.
# - yearOfRegistration - The year in which the car was first registered.
# - gearbox - The transmission type.
# - powerPS - The power of the car in PS.
# - model - The car model name.
# - kilometer - How many kilometers the car has driven.
# - monthOfRegistration - The month in which the car was first registered.
# - fuelType - What type of fuel the car uses.
# - brand - The brand of the car.
# - notRepairedDamage - If the car has damage which is not yet repaired.
# - dateCreated - The date on which the eBay listing was created.
# - nrOfPictures - The number of pictures in the ad.
# - postalCode - The postal code for the location of the vehicle.
# - lastSeenOnline - When the crawler last saw this ad online.

# In[1]:

import numpy as np
import pandas as pd

autos = pd.read_csv('autos.csv', encoding='Windows-1252')

# In[2]:

autos

# We want to get a first overview of the data, so we will look at the column info and the first five rows.

# In[3]:

autos.info()
autos.head()

# Our first observations:
#
# - The dataset is a pandas.core.frame.DataFrame with 50,000 entries and 20 columns, most of which are strings. The entries shown begin at 2016-03-14 00:42:12 and end at 2016-03-26 17:47:46.
# - Some columns have null values, but none have more than ~20% null values.
# - The data is split between numbers and strings: int64(5), object(15).
# - The column names use camelcase instead of Python's preferred snakecase, which means we can't just replace spaces with underscores.
# - In the name column, the brand is joined to the rest of the car description with underscores (_), even though the brand also appears in a separate column.
# - The price is not stored as a number but as a string with a currency sign in front of it.
#
# Now it is time to convert all the column names from camelcase to snakecase.

# In[4]:

autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'ab_test',
                 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model',
                 'odometer', 'registration_month', 'fuel_type', 'brand',
                 'unrepaired_damage', 'ad_created', 'num_photos', 'postal_code',
                 'last_seen']
autos.head()

# The new autos DataFrame consists of the same data as before; only the column names have changed from camelcase to snakecase.

# # 3. Basic data exploration
#
# Initially we will look for:
#
# - Text columns where all or almost all values are the same. These can often be dropped as they don't hold useful information for analysis.
# - Examples of numeric data stored as text which can be cleaned and converted.
#
# First we will check the DataFrame's descriptive statistics.

# In[5]:

autos.describe(include='all')

# Observations:
#
# 1. The seller and offer_type columns have practically identical values in every row, so we will delete those columns.
# 2. The num_photos column looks a bit odd, so we will investigate it further.
# 3. The odometer column stores numeric values as text.
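# As an optional aside (not part of the project's required steps), we can check for near-constant text columns programmatically instead of eyeballing the describe() output. This is a minimal sketch using the autos DataFrame from above; the 0.95 threshold is an arbitrary choice for illustration.

# In[ ]:

# For each text column, compute the share of rows taken by the single most
# frequent value. Columns with a share near 1.0 carry almost no information
# and are candidates for dropping.
for col in autos.select_dtypes(include='object').columns:
    top_share = autos[col].value_counts(normalize=True).iloc[0]
    if top_share > 0.95:
        print(col, round(top_share, 4))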
# In[6]:

autos["num_photos"].value_counts()

# It looks like the num_photos column has 0 for every row. We'll drop this column, plus the other two we noted as having mostly one value.

# In[7]:

autos = autos.drop(["num_photos", "seller", "offer_type"], axis=1)

# The odometer column stores numeric values as text. Thus, we have to remove any non-numeric characters and convert the column to a numeric dtype, and we will rename the columns to odometer_km and price_€.

# In[8]:

autos["odometer"] = (autos["odometer"]
                     .str.replace("km", "")
                     .str.replace(",", "")
                     .astype(int))
autos["price"] = (autos["price"]
                  .str.replace("$", "", regex=False)
                  .str.replace(",", "")
                  .astype(int))
autos.rename(columns={'odometer': 'odometer_km'}, inplace=True)
autos.rename(columns={'price': 'price_€'}, inplace=True)
autos[["odometer_km", "price_€"]].head()

# # 4. Exploring the Odometer and Price Columns
#
# We will analyze the columns using minimum and maximum values and look for any values that are unrealistically high or low (outliers) that we might want to remove.

# In[9]:

autos["odometer_km"].unique().shape

# The odometer_km Series has 13 unique values.

# In[10]:

autos["odometer_km"].describe()

# There are 50,000 entries in total. The smallest value in the odometer_km Series is 5,000 and the highest is 150,000; after the conversion above, the column dtype is int64.

# In[11]:

odo_unsorted = autos["odometer_km"].value_counts()
odo_sorted = odo_unsorted.sort_index(ascending=True)
print(odo_sorted)

# We can see that the values in this field are rounded, which might indicate that sellers had to choose from pre-set options. Additionally, there are more high-mileage than low-mileage vehicles, but there are no implausible values.

# In[12]:

print(autos["price_€"].unique().shape)
print(autos["price_€"].describe())
autos["price_€"].value_counts().head(20)

# The description shows that many values are too small to be realistic, and some values are far too high, so we will drop those values.
#
# Again, the prices in this column seem rounded; however, given there are 2,357 unique values in the column, that may just be people's tendency to round prices on the site.
#
# There are 1,421 cars listed with a €0 price. Given that this is only about 2% of the cars, we might consider removing these rows. The maximum price is one hundred million euros, which seems like a lot, so let's look at the highest and lowest prices further.

# In[13]:

autos["price_€"].value_counts().sort_index(ascending=False).head(20)

# In[14]:

autos["price_€"].value_counts().sort_index(ascending=True).head(20)

# There are a number of listings with prices below €30, including about 1,500 at €0. There are also a small number of listings with very high values, including 14 at around or over €1 million.
#
# Given that eBay is an auction site, there could legitimately be items where the opening bid is €1. We will keep the €1 items, but remove anything above €350,000, since it seems that prices increase steadily to that number and then jump up to less realistic numbers.

# In[15]:

autos = autos[autos["price_€"].between(1, 351000)]
autos["price_€"].describe()

# We removed around 1,500 cars from the dataset; the maximum price is now €350,000. The remaining very low-priced entries (starting at €1) are most likely open auctions.
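# As an optional cross-check (a sketch on the cleaned data, not part of the original analysis), we can relate the two cleaned columns to each other: the mean asking price per pre-set mileage bracket should fall as mileage rises.

# In[ ]:

# Mean price per mileage bracket, sorted by mileage. If the cleaning worked,
# higher-mileage brackets should generally show lower mean prices.
autos.groupby("odometer_km")["price_€"].mean().sort_index()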
# # 5. Exploring the date columns
#
# Let's now move on to the date columns and understand the date range the data covers.
#
# There are five columns that should represent date values. Some of these columns were created by the crawler, and some came from the website itself. Right now, the date_crawled, last_seen, and ad_created columns are all identified as string values by pandas. Because these three columns are represented as strings, we need to convert the data into a numerical representation so we can understand it quantitatively.
#
# Let's first understand how the values in the three string columns are formatted.

# In[16]:

autos[['date_crawled', 'ad_created', 'last_seen']][0:5]

# We notice that the first 10 characters represent the day (e.g. 2016-03-12). To understand the date range, we can extract just the date values, use Series.value_counts() to generate a distribution, and then sort by the index.

# In[17]:

autos["date_crawled"].describe()
print(autos["date_crawled"].str[:10].head(10))

# In[18]:

(autos["date_crawled"]
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index(ascending=False))

# It looks like the site was crawled daily over roughly a one-month period in March and April 2016. The distribution of listings crawled on each day is roughly uniform.

# In[19]:

(autos["ad_created"]
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index(ascending=False))

# There is a large variety of ad-created dates. Most fall within 1-2 months of the listing date, but a few are quite old, with the oldest at around nine months.

# In[20]:

(autos["last_seen"]
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index(ascending=False))

# The crawler recorded the date it last saw any listing, which allows us to determine on what day a listing was removed, presumably because the car was sold.
#
# The last three days contain a disproportionate amount of 'last seen' values. Given that these are 6-10x the values from the previous days, it's unlikely that there was a massive spike in sales; more likely, these values are related to the crawling period ending and don't indicate car sales.

# # 6. Dealing with Incorrect Registration Year Data

# In[21]:

autos["registration_year"].describe()

# The registration_year column is numeric, but it includes some strange values: the minimum is 1000, long before cars existed, and the maximum is 9999, far in the future. Both are impossible, so we will drop those rows. Because a car can't be first registered after its listing was seen, any year above 2016 is definitely inaccurate; the lowest acceptable value is 1900, so we will use the range 1900-2016.

# In[22]:

autos["registration_year"].value_counts(normalize=True)
autos["registration_year"].describe()

# In[23]:

(~autos["registration_year"].between(1900, 2016)).sum() / autos.shape[0]

# Slightly less than 4% of the 48,565 remaining rows fall outside the 1900-2016 range, so it is safe to remove them.

# In[24]:

autos = autos[autos["registration_year"].between(1900, 2016)]
autos["registration_year"].value_counts(normalize=True).head(10)

# It appears that most of the vehicles were first registered in the past 20 years.
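# To make the "past 20 years" observation concrete, we can bucket the registration years by decade. This is an optional sketch on the cleaned data; the decade bucketing is our own illustration.

# In[ ]:

# Share of listings per registration decade (floor each year to its decade).
decade = (autos["registration_year"] // 10) * 10
decade.value_counts(normalize=True).sort_index(ascending=False)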
# # 7. Exploring Price by Brand

# In[25]:

autos["brand"].value_counts(normalize=True)

# This list shows all brands included in the autos DataFrame. The five brands with the highest share are German manufacturers and together represent around 60% of the total listings. There are lots of brands that don't have a significant percentage of listings, so we will limit our analysis to brands representing more than 5% of total listings.

# In[26]:

brand_counts = autos["brand"].value_counts(normalize=True)
common_brands = brand_counts[brand_counts > .05].index
print(common_brands)

# In[31]:

brand_mean_prices = {}

for brand in common_brands:
    only_brand = autos[autos["brand"] == brand]
    mean_price = only_brand["price_€"].mean()
    brand_mean_prices[brand] = int(mean_price)

brand_mean_prices

# In[32]:

brand_mean_power_ps = {}

for brand in common_brands:
    brand_only = autos[autos["brand"] == brand]
    mean_ps = brand_only["power_ps"].mean()
    brand_mean_power_ps[brand] = int(mean_ps)

brand_mean_power_ps
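# A possible next step (a sketch, assuming the two dictionaries built above): combine the per-brand aggregates into a single DataFrame via pandas Series, so mean price and mean power can be compared side by side. The variable names here are our own.

# In[ ]:

# Turn each dictionary into a Series (keys become the index), then assemble
# a DataFrame with one row per common brand.
bmp_series = pd.Series(brand_mean_prices)
bps_series = pd.Series(brand_mean_power_ps)

brand_info = pd.DataFrame(bmp_series, columns=['mean_price'])
brand_info['mean_power_ps'] = bps_series
brand_info.sort_values('mean_price', ascending=False)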