#!/usr/bin/env python # coding: utf-8 # # Exploring eBay Car Sales Data # # - This project include, dataset of used cars from eBay Kleinanzeigen, a classified section of the German eBay website. # # - The dataset was originally scraped and uploaded to Kaggle by user orgesleka. # # - The original dataset isn't available on [Kaggle](https://www.kaggle.com/orgesleka) anymore, but you can find it [here](https://data.world/data-society/used-cars-data). # # ![](https://www.ebay-kleinanzeigen.de/static/img/common/logo/logo-ebayk-402x80.hsn0x4ev0qi.png) # ## Data Cleaning & Exploration # In[1]: import pandas as pd import numpy as np # creating dataframe autos from csv file autos = pd.read_csv("autos.csv", encoding= "Latin-1") # In[2]: autos # In[3]: autos.info() # In[4]: autos.head() # >- Dataframe has **50,000 rows** and **20 columns**. # > # > # >- All comumn names are in camelcase instead of snakecase # > # > # >- 5 out of 20 columns has missing values which inlude, **`vehicleType`, `gearbox`, `model`, `fuelType` & `notRepairedDamage`**. # > # > # >- Columns which could have differnt data type includes **`price`, `odometer`**. # ### Changing Columns to Snackcase # In[5]: autos.columns # In[6]: # Assign modified columns to the dataframe autos.columns= ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest', 'vehicle_type', 'year_of_registration', 'gearbox', 'powerPS', 'model', 'odometer', 'month_of_registration', 'fuel_type', 'brand', 'not_repaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code', 'last_seen'] # ### Exploring All Columns # In[7]: autos.head() # In[8]: autos.describe(include = 'all') # >- **`seller`, `offer_type`, `nr_of_pictures`** have mostly one value and are candidates to be dropped. # > # > # >- **`vehicle_type`, `gearbox model`, `fuel_type`, `not_repaired_damage`** have missing values. # > # > # >- **`year_of_registration`, `powerPS`, `month_of_registration`** need more investigation. # > # > # >- Columns which could have numeric data type includes **`price`, `odometer`**. # In[9]: autos["year_of_registration"].value_counts() # In[10]: len(autos["powerPS"].value_counts()) # In[11]: autos["month_of_registration"].unique() # In[12]: autos["price"].dtype # In[13]: autos["nr_of_pictures"].value_counts() # ### Dropping Columns # In[14]: drop_columns = ["seller", "nr_of_pictures", "offer_type"] autos.drop(drop_columns, axis = 1, inplace = True) # ### Converting German field names from gearbox, fuel type & not_repaired_damage to English # In[15]: autos['gearbox'].unique() # In[16]: gear_box = {'manuell' : 'manual', 'automatik' : 'automatic', np.nan: 'unknown'} # In[17]: autos['gearbox'] = autos['gearbox'].map(gear_box) # In[18]: autos['fuel_type'].unique() # In[19]: fueltype = { 'lpg':'lpg', 'benzin':'petrol', 'diesel':'diesel', np.nan:'unknown', 'cng': 'cng', 'hybrid':'hybrid', 'elektro' : 'electro', 'andere': 'other'} # In[20]: autos['fuel_type'] = autos['fuel_type'].map(fueltype) # In[21]: autos['not_repaired_damage'].unique() # In[22]: not_rep_dam = {'nein': 'no', 'ja': 'yes', np.nan : 'unknown'} # In[23]: autos['not_repaired_damage'] = autos['not_repaired_damage'].map(not_rep_dam) # In[24]: autos.head() # In[ ]: # ### Cleaning price & odometer column # In[25]: # convet price column to numeric datatype autos["price"] = autos["price"].str.replace("$","").str.replace(",","") autos["price"]= autos["price"].astype(int) # In[26]: autos["price"].unique() # In[27]: # convert odometer column to numeric datatype autos["odometer"] = autos["odometer"].str.replace("km","").str.replace(",","") autos["odometer"] = autos["odometer"].astype(int) # In[28]: # Rename the above price & odometer column autos.rename({'price':'price($)','odometer' : 'odometer_km'}, axis = 1, inplace = True) # ### Exploring odometer_km & price column # In[29]: # EXploring odometer_km Column # In[30]: autos["odometer_km"].describe() # In[31]: autos["odometer_km"].value_counts().sort_index(ascending = False) # In[32]: # EXploring price($) Column # In[33]: autos["price($)"].describe() # In[34]: autos["price($)"].value_counts().sort_index(ascending= False).head(15) # In[35]: autos["price($)"].value_counts().sort_index(ascending= False).tail(250) # In[36]: # Removing outliners from the dataframe. autos = autos[autos["price($)"].between(1000,1000000)] autos # > * Price column has some unrealistic numbers. # > # > * Some prices are as high as 10 cr dollar where as some prices ae as low as 0 dollar. # > # > * So, we remove the outlines and keep minimum price as 1000 dollars and maximum price as 100 k dollars # > # > * After removing unwanted prices we have total 38629 rows # ### Exploring Date Column # In[37]: autos[['date_crawled','ad_created','last_seen']][0:5] # In[38]: (autos["date_crawled"].value_counts(normalize = True,dropna = False) .sort_index() ) # In[39]: (autos["ad_created"].value_counts(normalize = True,dropna = False) .sort_index()) # In[40]: autos["last_seen"].value_counts(normalize = True,dropna = False).sort_index() # In[41]: l = (autos["last_seen"].value_counts(normalize = True,dropna = False) .sort_index() .tail(12000) ) l # In[42]: l.sum() # > * The date crwaled is well distributed over the span of 2 months. # > # > * The ad created is didtributed for 10 months. # > # > * The last seen is distributed for a period of 1 month with approx 50% value in last 3 days. So, there must be surge in sell in last three days. # ### Cleaning & Exploring year_of_registration column # In[43]: autos["year_of_registration"].describe() # In[44]: (autos["year_of_registration"].value_counts(dropna = False) .sort_index(ascending = True)) # > * Since, **`year_of_registration`** column has bogous value. Some are as low as 1000 & some are as high as 9999. # > # > # > * We need to remove the unwanted value from the dataframe. # > # > # > * So, We remove the outliners and keep minimum year as 1900 and Max year as 2016 # In[45]: # Removing otlines from year_of_registration column. autos = autos[autos["year_of_registration"].between(1900,2016)] autos # In[46]: l =autos["year_of_registration"].value_counts(normalize = True).sort_index(ascending = False) l # In[47]: autos["year_of_registration"].describe() # In[48]: l.head(6).sum() # In[49]: l[6:16] # In[50]: l[6:10].sum() # In[51]: l[11:15].sum() # * 75% of total cars are registered between 2001 to 2016 # # # * Only 14.4% of cars are 6 years old from the date of registration. # # # * 21.83% of total cars are b/w 2006 to 2010. # # # * 26.88% of total cars are b/w 2001 to 2005. # > ***This shows that there more chances to sell a car if it is used b/w 10-15 yrs.*** # ### Exploring brand column based on no. of sales, mean_price & mean_mileage # In[52]: autos["brand"].unique() # In[53]: l = autos["brand"].value_counts().sort_values(ascending = False) l # > * We have chosen top 20 brand based on number of sale for mean_price calc. # In[54]: top_20_brand = l.index[:20] mean_price = {} for each in top_20_brand: selected_rows = autos[autos["brand"] == each] each_mean_price = selected_rows["price($)"].mean() mean_price[each] = each_mean_price mean_price # In[55]: import matplotlib.pyplot as plt # In[56]: plt.bar(list(mean_price.keys()),list(mean_price.values()),width = 0.5) plt.xlabel("Brand") plt.ylabel("Mean Price in $") plt.title("Mean Price of Top 20 Brand") plt.show() # > * Renault has least mean price which is 3590.94 dollars . # > # > * sonstige_autos has highest mean price which is 14454.55 dollars . # In[57]: top_6_price = {} top_6_mileage = {} for each in top_20_brand[:6]: selected_rows = autos[autos["brand"] == each] each_mean_price = selected_rows["price($)"].mean() top_6_price[each] = each_mean_price each_mean_mileage = selected_rows["odometer_km"].mean() top_6_mileage[each] = each_mean_mileage # In[58]: # Convert both dictionary to series objects using series constructior price_series = pd.Series(top_6_price) mileage_series = pd.Series(top_6_mileage) # In[59]: df = pd.DataFrame(price_series,columns = ['mean_price']) df['mean_mileage'] = mileage_series # In[60]: df # * All brands have nearly same mileage. # # * Most famous brand is volkswagen because of its affordable price. # ### Exploring Brand/Model Combinations # In[61]: autos_brand_model = autos[["brand","model"]] # In[62]: autos_brand_model["model"].value_counts().sort_values(ascending= False) # In[63]: (autos_brand_model["model"].value_counts() .sort_values(ascending= False) .tail(10)) # In[64]: autos_brand_model[autos_brand_model["model"] == "golf" ] # In[65]: lf = autos_brand_model[(autos_brand_model["model"] == "200" ) |(autos_brand_model["model"] == "samara") | (autos_brand_model["model"] == "145") |(autos_brand_model["model"] == "lanos") |(autos_brand_model["model"] == "b_max") | (autos_brand_model["model"] == "rangerover")] lf # > * Most famous car is **golf** model of **Volkawagen** . # > # >* Least famous car is give **lf DataFrame** icludes 5 cars each having one ad. # ### odometer _km VS average_price # In[66]: l = autos["odometer_km"].unique() l # In[67]: l.sort() # In[68]: l # In[69]: #create a list of booleans bool_mileage = [(autos["odometer_km"] < 25000) ,(autos["odometer_km"] >= 25000) & (autos["odometer_km"] < 50000) ,(autos["odometer_km"] >= 50000)&(autos["odometer_km"] < 75000) ,(autos["odometer_km"] >= 75000)&(autos["odometer_km"] < 100000) ,(autos["odometer_km"] >= 100000)&(autos["odometer_km"] < 125000) ,(autos["odometer_km"] >= 125000) ] #list of values to assign values = ['very_low','low', 'avg','above_avg', 'high', 'very_high'] #Creating new column to assign mileage category autos["mileage_type"] = np.select(bool_mileage,values) #Updated DataFrame autos.head() # In[70]: # Calculating mileage category distribution autos["mileage_type"].value_counts(dropna = False, normalize = True) # > * 71.29% of vehicles have very high mileage in the range of 125 k to 150k km. # In[71]: # Analysing relatiship b/w mileage & mean_price mil_cat = autos["mileage_type"].unique() # creating an empty dict mil_price = {} for each in mil_cat: selected_rows = autos[autos["mileage_type"] == each] mean_price = selected_rows["price($)"].mean() mil_price[each] = mean_price # displaying the dict mil_price # > * There is **positive** correlation betwen **Mileage** & **Price** of Car # ### Price vs Car Condition # In[ ]: # Calculating mean price for damage cars # In[76]: damaged_rows = autos[autos["not_repaired_damage"] == 'yes'] mean_price_1 = damaged_rows["price($)"].mean() mean_price_1 # In[ ]: # Calculating mean price for fit car # In[75]: not_damaged_rows = autos[autos["not_repaired_damage"] == 'no'] mean_price_2 = not_damaged_rows["price($)"].mean() mean_price_2 # In[77]: diff = mean_price_2 - mean_price_1 diff # > * So, damaged cars are **3771.92 dollars** cheaper than non-damage cars # In[ ]: