#!/usr/bin/env python
# coding: utf-8

# # Exploring eBay Car Sales Data
# - This data is taken from [Kaggle Competitions](https://www.kaggle.com/orgesleka/used-cars-database/data)

# ### Data Dictionary
#
# > dateCrawled - When this ad was first crawled. All field values are taken from this date.
# > name - Name of the car.
# > seller - Whether the seller is private or a dealer.
# > offerType - The type of listing.
# > price - The price listed on the ad for the car.
# > abtest - Whether the listing is included in an A/B test.
# > vehicleType - The vehicle type.
# > yearOfRegistration - The year in which the car was first registered.
# > gearbox - The transmission type.
# > powerPS - The power of the car in PS.
# > model - The car model name.
# > kilometer - How many kilometers the car has been driven.
# > monthOfRegistration - The month in which the car was first registered.
# > fuelType - What type of fuel the car uses.
# > brand - The brand of the car.
# > notRepairedDamage - Whether the car has damage which has not yet been repaired.
# > dateCreated - The date on which the eBay listing was created.
# > nrOfPictures - The number of pictures in the ad.
# > postalCode - The postal code for the location of the vehicle.
# > lastSeenOnline - When the crawler last saw this ad online.
# ### Aim
# We aim to clean the data and analyze the included used car listings using `pandas` and `matplotlib`.

# ### Introduction to Data

# In[1]:

import pandas as pd
import numpy as np

autos = pd.read_csv('autos.csv', encoding="Latin-1")
autos.head()

# In[2]:

autos.info()

# ### _Key Observations:_
#
# - The dataset consists of 20 columns, most of which are strings.
#
# - Some columns have null values, but none have more than 20% null values.

# ### Cleaning Column Names
#
# - The column names use camelCase instead of Python's preferred snake_case, which means we can't just replace spaces with underscores.

# In[3]:

column_names = autos.columns
column_names

# In[4]:

import re

def clean_col(col):
    col = col.strip()
    col = col.replace("yearOfRegistration", "registration_year")
    col = col.replace("monthOfRegistration", "registration_month")
    col = col.replace("notRepairedDamage", "unrepaired_damage")
    col = col.replace("dateCreated", "ad_created")
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', col).lower()

autos.columns = [clean_col(c) for c in autos.columns]
print(autos.columns)
autos.head()

# ***Changed the names of all the columns from camelCase to snake_case. For example, 'nrOfPictures' became 'nr_of_pictures'.***

# ### Initial Exploration and Cleaning
#
# Some other cleaning tasks could be:
# - We will look for text columns where all or almost all values are the same. These can often be dropped, as they don't contain useful information for analysis.
# - We will look for numeric data stored as text, which can be cleaned and converted.

# In[5]:

autos.describe(include='all')

# ### _Key Observations:_
# - The **nr_of_pictures**, **seller** and **offer_type** columns hold mostly a single value, so they should be dropped.
# - The **price** and **odometer** columns are stored as text (object) type, whereas they should be numeric (int) type, so they should be converted.

# In[6]:

autos["price"] = autos["price"].replace({r'\$': '', ',': ''}, regex=True).astype(int)
autos["price"].head()

# In[7]:

autos["odometer"] = autos["odometer"].replace({'km': '', ',': ''}, regex=True).astype(int)
autos.rename(columns={'odometer': 'odometer_km'}, inplace=True)
autos['odometer_km'].head()

# In[8]:

autos.drop(columns=['nr_of_pictures', 'seller', 'offer_type'], inplace=True)
autos.describe(include='all')

# ### Exploring the Odometer and Price Columns
#
# - We continue exploring the data, specifically looking for data that doesn't look right. We'll start by analyzing the odometer_km and price columns.
# - We will analyze the columns using minimum and maximum values, and look for any values that look unrealistically high or low (outliers) that we might want to remove.

# In[9]:

autos['price'].unique().shape

# In[10]:

autos['price'].describe()

# In[11]:

print(autos['price'].value_counts().sort_index().head(10))
autos['price'].value_counts().sort_index(ascending=False).head(10)

# *We can observe outliers (below 100 or above 1,000,000) in the `price` column, and we will remove them.*

# In[12]:

# Removing outliers
autos = autos[autos['price'].between(100, 1000000)]
print(autos['price'].unique().shape)
autos['price'].value_counts().sort_index()

# In[13]:

print(autos['odometer_km'].unique().shape)
autos['odometer_km'].describe()

# In[14]:

autos['odometer_km'].value_counts().sort_index()

# ***We removed outliers from the price column and found no outliers in the odometer_km column.***
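# *As a quick sanity check (a minimal sketch, not a required cleaning step), we can confirm that the extreme quantiles of the filtered `price` column now sit at plausible used-car prices:*

# In[ ]:

# Extreme quantiles of the filtered price column; after removing outliers
# these should all be plausible used-car prices.
autos['price'].quantile([0.01, 0.25, 0.5, 0.75, 0.99])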
# ### Exploring the Date Columns
#
# There are 5 columns that should represent date values. Some of these columns were created by the crawler, and some came from the website itself.

# In[15]:

autos[['date_crawled', 'ad_created', 'last_seen', 'registration_month', 'registration_year']][0:5]

# In[16]:

autos[['date_crawled', 'ad_created', 'last_seen', 'registration_month', 'registration_year']].dtypes

# `date_crawled`, `last_seen`, and `ad_created` are all identified as string values by pandas. Because these three columns are represented as strings, we need to convert the data into a numerical representation so we can understand it quantitatively.
#
# `registration_month` and `registration_year` are represented as numeric values, so we can use methods like Series.describe() and draw graphs to understand the distribution without any extra data processing.

# ### Cleaning `date_crawled`, `last_seen`, and `ad_created`
# We will clean and analyze the date columns first. We can notice that the first 10 characters represent the day (e.g. 2016-03-12). To understand the date range, we can extract just the date value by doing some string formatting.

# In[17]:

autos['date_crawled'] = autos['date_crawled'].str[:10]
autos['date_crawled'] = autos['date_crawled'].str.replace('-', '').astype(int)
autos['last_seen'] = autos['last_seen'].str[:10]
autos['last_seen'] = autos['last_seen'].str.replace('-', '').astype(int)
autos['ad_created'] = autos['ad_created'].str[:10]
autos['ad_created'] = autos['ad_created'].str.replace('-', '').astype(int)
autos.head()
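# *A side note (a minimal sketch, not used below): pandas can parse these strings into proper datetime values directly, which avoids the integer-date workaround. We reread the raw file here because the columns in `autos` have already been renamed and converted.*

# In[ ]:

# Parse the raw crawl timestamps into datetime64 values and count ads per day.
raw = pd.read_csv('autos.csv', encoding="Latin-1")
pd.to_datetime(raw['dateCrawled']).dt.date.value_counts().sort_index().head()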
# **Analyzing `date_crawled`**

# In[18]:

import matplotlib.pyplot as plt

print(autos['date_crawled'].unique())
autos['date_crawled'].groupby(autos['date_crawled']).count().plot(kind='bar', figsize=(14, 3))
plt.show()

# ***DATE CRAWLED - The above distribution looks roughly uniform, which means that the ads were crawled on a regular basis, with an average of about 3% of ads crawled per day over 34 days.***
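# *We can back the roughly-3%-per-day reading with a quick computation (a minimal sketch):*

# In[ ]:

# Share of ads crawled on each date; a uniform crawl over 34 days should
# put each day near 1/34, i.e. about 3%.
daily_share = autos['date_crawled'].value_counts(normalize=True).sort_index()
daily_share.describe()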
# **Analyzing `ad_created`**

# In[19]:

print(autos['ad_created'].unique())
autos['ad_created'].groupby(autos['ad_created']).count().plot(kind='bar', figsize=(16, 4))
plt.show()

# ***AD CREATED - The above distribution looks left skewed, which means that most of the ads were created recently. It could be because the cars that were posted earlier (a month or two ago) have already been sold, so their ads were pulled down, leaving only a very small number of ads (of unsold cars) posted a long time ago.***
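# *To quantify how recent the listings are, we can compute the share of ads created on or after 1 March 2016 (a minimal sketch; the cutoff date is an arbitrary choice for illustration):*

# In[ ]:

# Fraction of ads created on or after 2016-03-01; the dates are stored as
# integers like 20160301, so a plain comparison works.
(autos['ad_created'] >= 20160301).mean()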
# **Analyzing `last_seen`**

# In[20]:

print(autos['last_seen'].unique())
autos['last_seen'].groupby(autos['last_seen']).count().plot(kind='bar', figsize=(14, 3))
plt.show()

# ***LAST SEEN - Here the distribution is heavily left skewed, with almost 50% of the ads last seen in the final 3 days of the dataset. We can draw a similar analogy to 'ad_created': older car ads appear less often in last_seen, as those cars might already be sold. As a consequence, most of the last-seen ads belong to recent sellers, who are interested in selling their cars as soon as possible.***
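# *The near-50% figure can be checked directly (a minimal sketch):*

# In[ ]:

# Combined share of the three most recent last_seen dates.
autos['last_seen'].value_counts(normalize=True).sort_index().tail(3).sum()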
# ### Cleaning, Analyzing `registration_month` and `registration_year`
#
# Removing outliers (if any) from both columns.

# In[21]:

print(autos['registration_month'].unique())
autos['registration_month'].value_counts()

# In[22]:

# Observed 0 as an outlier
autos = autos[autos['registration_month'] != 0]
print(autos['registration_month'].unique())
autos['registration_month'].groupby(autos['registration_month']).count().plot(kind='bar', figsize=(8, 3))
plt.show()

# ***The observed outlier for 'registration_month' was 0, so we removed it. And, as expected, the distribution is uniform, since people can buy cars in any month of the year.***

# In[23]:

print(sorted(autos['registration_year'].unique()))
autos['registration_year'].describe()

# In[24]:

# All values for years before 1900 and after 2017 are invalid or outliers
autos = autos[autos['registration_year'].between(1900, 2017)]
print(sorted(autos['registration_year'].unique()))
autos['registration_year'].groupby(autos['registration_year']).count().plot(kind='bar', figsize=(16, 3))
plt.show()

# ***The observed outliers for 'registration_year' were values below 1900 and above 2017, so we removed them. We can observe that the distribution is left skewed: most of the cars were registered between 1998 and 2008. This can be explained by the fact that most cars are used for at most 10-15 years and then sold or discarded.***
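# *The 1998-2008 reading can be quantified as well (a minimal sketch):*

# In[ ]:

# Share of listings whose registration year falls in 1998-2008.
autos['registration_year'].between(1998, 2008).mean()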
# In[25]:

autos.head()

# ### Exploring Price by Brand
#
# ***Selecting Top Brands***

# In[26]:

brands = autos['brand'].value_counts(normalize=True)
results = brands >= 0.05  # Finding the names of brands with more than a 5% share of ads
top_brands_name = brands[results].index
print(top_brands_name)

# ***Mean Price***

# In[27]:

brand_mean_price = {}
for car in top_brands_name:
    mean_price = autos.loc[autos['brand'] == car, 'price'].mean()
    brand_mean_price[car] = round(mean_price, 2)
brand_mean_price

# ***Mean Mileage***

# In[28]:

brand_mean_mileage = {}
for car in top_brands_name:
    mean_mileage = autos.loc[autos['brand'] == car, 'odometer_km'].mean()
    brand_mean_mileage[car] = round(mean_mileage, 2)
brand_mean_mileage

# ***The average prices are as follows:***
#
# |**Brand**|Average Price ($)|
# |-|-|
# |**Volkswagen**|5943.71|
# |**BMW**|8637.41|
# |**Opel**|3173.85|
# |**Mercedes Benz**|8846.68|
# |**Audi**|9718.9|
# |**Ford**|4289.74|
#
# ***Here we may notice that the most expensive vehicles are from Audi, BMW and Mercedes, while Ford and Opel are the most economical options; Volkswagen sits in between.***
#
# ---
#
# ***The average mileage is as follows:***
#
# |**Brand**|Average Mileage (km)|
# |-|-|
# |**Volkswagen**|128365.67|
# |**BMW**|132575.41|
# |**Opel**|128805.64|
# |**Mercedes Benz**|131034.56|
# |**Audi**|128830.76|
# |**Ford**|123839.23|
#
# ***Here, the highest average mileage belongs to BMW and the lowest to Ford.***

# ### Storing Aggregate Data in a DataFrame
# **First we will convert the dictionaries into Series objects.**

# In[29]:

bmp_series = pd.Series(brand_mean_price).sort_values(ascending=False)
bmm_series = pd.Series(brand_mean_mileage).sort_values(ascending=False)
print('bmp_series\n')
print(bmp_series, '\n')
print('bmm_series')
bmm_series

# **Create a dataframe from the first series object using the dataframe constructor.**

# In[30]:

top_brands_df = pd.DataFrame(bmp_series, columns=['mean_price'])
top_brands_df

# **Assign the other series as a new column in this dataframe.**

# In[31]:

top_brands_df['mean_mileage'] = bmm_series
top_brands_df
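# *As a side note, the same aggregates can be computed without explicit loops using `groupby`; a minimal sketch that should reproduce the values above (modulo rounding):*

# In[ ]:

# Loop-free alternative: mean price and mileage per brand in one pass,
# restricted to the top brands selected earlier.
(autos[autos['brand'].isin(top_brands_name)]
 .groupby('brand')[['price', 'odometer_km']]
 .mean()
 .round(2)
 .sort_values('price', ascending=False))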

# ***Here we can observe that mean mileage per brand varies far less than mean price per brand. The mean price difference between the most expensive brand (Audi) and the most economical brand (Opel) is a gigantic 67%, whereas the corresponding difference between the highest-mileage brand (BMW) and the lowest-mileage brand (Ford) is a mere 7%.***
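# *These spreads can be reproduced from the aggregate DataFrame (a minimal sketch, using (max - min) / max as the measure of spread):*

# In[ ]:

# Relative spread (max - min) / max for mean_price and mean_mileage.
spread = (top_brands_df.max() - top_brands_df.min()) / top_brands_df.max()
spread.round(3)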