#!/usr/bin/env python
# coding: utf-8

# # Exploring eBay Car Sales Data
# 
# - This project include, dataset of used cars from eBay Kleinanzeigen, a classified section of the German eBay website.
# 
# - The dataset was originally scraped and uploaded to Kaggle by user orgesleka.
# 
# - The original dataset isn't available on [Kaggle](https://www.kaggle.com/orgesleka) anymore, but you can find it [here](https://data.world/data-society/used-cars-data).
# 
# ![](https://www.ebay-kleinanzeigen.de/static/img/common/logo/logo-ebayk-402x80.hsn0x4ev0qi.png)

# ## Data Cleaning & Exploration

# In[1]:


import pandas as pd
import numpy as np

# creating dataframe autos from csv file
autos = pd.read_csv("autos.csv", encoding= "Latin-1")


# In[2]:


autos


# In[3]:


autos.info()


# In[4]:


autos.head()


# >- Dataframe has **50,000 rows** and **20 columns**.
# >
# >
# >- All comumn names are in camelcase instead of snakecase
# >
# >
# >- 5 out of 20 columns has missing values which inlude, **`vehicleType`, `gearbox`, `model`, `fuelType` & `notRepairedDamage`**.
# >
# >
# >- Columns which could have differnt data type includes **`price`, `odometer`**.

# ### Changing Columns to Snackcase

# In[5]:


autos.columns


# In[6]:


# Assign modified columns to the dataframe
autos.columns= ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
       'vehicle_type', 'year_of_registration', 'gearbox', 'powerPS', 'model',
       'odometer', 'month_of_registration', 'fuel_type', 'brand',
       'not_repaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code',
       'last_seen']


# ### Exploring All Columns

# In[7]:


autos.head()


# In[8]:


autos.describe(include = 'all')


# >- **`seller`, `offer_type`, `nr_of_pictures`** have mostly one value and are candidates to be dropped.
# >
# >
# >- **`vehicle_type`, `gearbox model`, `fuel_type`, `not_repaired_damage`** have  missing values.
# >
# >
# >- **`year_of_registration`,  `powerPS`, `month_of_registration`** need more    investigation.
# >
# >
# >- Columns which could have numeric data type includes **`price`, `odometer`**.

# In[9]:


autos["year_of_registration"].value_counts()


# In[10]:


len(autos["powerPS"].value_counts())


# In[11]:


autos["month_of_registration"].unique()


# In[12]:


autos["price"].dtype


# In[13]:


autos["nr_of_pictures"].value_counts()


# ### Dropping Columns

# In[14]:


drop_columns = ["seller", "nr_of_pictures", "offer_type"]
autos.drop(drop_columns, axis = 1, inplace = True)


# ### Converting German field names from gearbox, fuel type & not_repaired_damage to English

# In[15]:


autos['gearbox'].unique()


# In[16]:


gear_box = {'manuell' : 'manual', 
            'automatik' : 'automatic',
            np.nan: 'unknown'}


# In[17]:


autos['gearbox'] = autos['gearbox'].map(gear_box)


# In[18]:


autos['fuel_type'].unique()


# In[19]:


fueltype = {  'lpg':'lpg',
               'benzin':'petrol',
               'diesel':'diesel',
                np.nan:'unknown',
               'cng': 'cng',
               'hybrid':'hybrid',
               'elektro' : 'electro',
               'andere': 'other'}


# In[20]:


autos['fuel_type'] = autos['fuel_type'].map(fueltype)


# In[21]:


autos['not_repaired_damage'].unique()


# In[22]:


not_rep_dam = {'nein': 'no',
               'ja': 'yes',
               np.nan : 'unknown'}


# In[23]:


autos['not_repaired_damage'] = autos['not_repaired_damage'].map(not_rep_dam)


# In[24]:


autos.head()


# In[ ]:


# ### Cleaning price & odometer column

# In[25]:


# convet price column to numeric datatype

autos["price"] = autos["price"].str.replace("$","").str.replace(",","")
autos["price"]= autos["price"].astype(int)


# In[26]:


autos["price"].unique()


# In[27]:


# convert odometer column to numeric datatype

autos["odometer"] = autos["odometer"].str.replace("km","").str.replace(",","")
autos["odometer"] = autos["odometer"].astype(int)


# In[28]:


# Rename the above price & odometer column

autos.rename({'price':'price($)','odometer' : 'odometer_km'},
             axis = 1,
             inplace = True)


# ### Exploring odometer_km & price column

# In[29]:


# EXploring odometer_km Column


# In[30]:


autos["odometer_km"].describe()


# In[31]:


autos["odometer_km"].value_counts().sort_index(ascending = False)


# In[32]:


# EXploring price($) Column


# In[33]:


autos["price($)"].describe()


# In[34]:


autos["price($)"].value_counts().sort_index(ascending= False).head(15)


# In[35]:


autos["price($)"].value_counts().sort_index(ascending= False).tail(250)


# In[36]:


# Removing outliners from the dataframe.
autos = autos[autos["price($)"].between(1000,1000000)]
autos


# > * Price column has some unrealistic numbers.
# >
# > * Some prices are as high as 10 cr dollar where as some prices ae as low as 0 dollar.
# >
# > * So, we remove the outlines and keep minimum price as 1000 dollars and maximum price as 100 k dollars
# >
# > * After removing unwanted prices we have total 38629 rows

# ### Exploring Date Column

# In[37]:


autos[['date_crawled','ad_created','last_seen']][0:5]


# In[38]:


(autos["date_crawled"].value_counts(normalize = True,dropna = False)
                     .sort_index()
                     )


# In[39]:


(autos["ad_created"].value_counts(normalize = True,dropna = False)
                   .sort_index())


# In[40]:


autos["last_seen"].value_counts(normalize = True,dropna = False).sort_index()


# In[41]:


l = (autos["last_seen"].value_counts(normalize = True,dropna = False)
                 .sort_index()
                 .tail(12000)
                 )
l


# In[42]:


l.sum()


# > * The date crwaled is well distributed over the span of 2 months.
# >
# > * The ad created is didtributed for 10 months.
# >
# > * The last seen is distributed for a period of 1 month with approx 50% value in last 3 days. So, there must be surge in sell in last three days.

# ### Cleaning & Exploring year_of_registration column

# In[43]:


autos["year_of_registration"].describe()


# In[44]:


(autos["year_of_registration"].value_counts(dropna = False)
                              .sort_index(ascending = True))


# > * Since, **`year_of_registration`** column has bogous value. Some are as low as 1000 & some are as high as 9999.
# >
# >
# > * We need to remove the unwanted value from the dataframe.
# >
# >
# > * So, We remove the outliners and keep minimum year as 1900 and Max year as 2016

# In[45]:


# Removing otlines from year_of_registration column.
autos = autos[autos["year_of_registration"].between(1900,2016)]
autos


# In[46]:


l =autos["year_of_registration"].value_counts(normalize = True).sort_index(ascending = False)
l


# In[47]:


autos["year_of_registration"].describe()


# In[48]:


l.head(6).sum()


# In[49]:


l[6:16]


# In[50]:


l[6:10].sum()


# In[51]:


l[11:15].sum()


# * 75% of total cars are registered between 2001 to 2016
# 
# 
# * Only 14.4% of cars are 6 years old from the date of registration.
#  
#  
# * 21.83% of total cars are b/w 2006 to 2010.
# 
# 
# * 26.88% of total cars are b/w 2001 to 2005.

# > ***This shows that there more chances to sell a car if it is used b/w 10-15 yrs.***

# ### Exploring brand column based  on no. of sales, mean_price & mean_mileage

# In[52]:


autos["brand"].unique()


# In[53]:


l = autos["brand"].value_counts().sort_values(ascending = False)
l


# >  * We have chosen top 20 brand based on number of sale for mean_price calc.

# In[54]:


top_20_brand = l.index[:20]

mean_price = {}

for each in top_20_brand:
    selected_rows = autos[autos["brand"] == each]
    each_mean_price = selected_rows["price($)"].mean()
    mean_price[each] = each_mean_price
    
mean_price


# In[55]:


import matplotlib.pyplot as plt


# In[56]:


plt.bar(list(mean_price.keys()),list(mean_price.values()),width = 0.5)
plt.xlabel("Brand")
plt.ylabel("Mean Price in $")
plt.title("Mean Price of Top 20 Brand")
plt.show()


# > * Renault has least mean price which is 3590.94 dollars .
# >
# > * sonstige_autos has highest mean price which is 14454.55 dollars .

# In[57]:


top_6_price = {}
top_6_mileage = {}

for each in top_20_brand[:6]:
    selected_rows = autos[autos["brand"] == each]
   
    each_mean_price = selected_rows["price($)"].mean()
    top_6_price[each] = each_mean_price
    
    each_mean_mileage = selected_rows["odometer_km"].mean()
    top_6_mileage[each] = each_mean_mileage
    

# In[58]:


# Convert both dictionary to series objects using series constructior
price_series = pd.Series(top_6_price)
mileage_series = pd.Series(top_6_mileage)


# In[59]:


df = pd.DataFrame(price_series,columns = ['mean_price'])
df['mean_mileage'] = mileage_series


# In[60]:


df


# * All brands have nearly same mileage.
# 
# * Most famous brand is volkswagen because of its affordable price. 

# ### Exploring Brand/Model Combinations

# In[61]:


autos_brand_model = autos[["brand","model"]]


# In[62]:


autos_brand_model["model"].value_counts().sort_values(ascending= False)


# In[63]:


(autos_brand_model["model"].value_counts()
                           .sort_values(ascending= False)
                           .tail(10))


# In[64]:


autos_brand_model[autos_brand_model["model"] == "golf" ]


# In[65]:


lf = autos_brand_model[(autos_brand_model["model"] == "200" )
                       |(autos_brand_model["model"] == "samara")
                       | (autos_brand_model["model"] == "145")
                       |(autos_brand_model["model"] == "lanos")
                       |(autos_brand_model["model"] == "b_max") 
                       | (autos_brand_model["model"] == "rangerover")]
lf


# > * Most famous car is **golf** model of **Volkawagen** .
# >
# >* Least famous car is give **lf DataFrame** icludes 5 cars each having one ad.

# ### odometer _km VS average_price

# In[66]:


l = autos["odometer_km"].unique()
l


# In[67]:


l.sort()


# In[68]:


l


# In[69]:


#create a list of booleans

bool_mileage = [(autos["odometer_km"] < 25000)
                ,(autos["odometer_km"] >= 25000) & (autos["odometer_km"] < 50000)
                ,(autos["odometer_km"] >= 50000)&(autos["odometer_km"] < 75000)
                ,(autos["odometer_km"] >= 75000)&(autos["odometer_km"] < 100000)
                ,(autos["odometer_km"] >= 100000)&(autos["odometer_km"] < 125000)
                ,(autos["odometer_km"] >= 125000)
               ]

#list of values to assign

values = ['very_low','low', 'avg','above_avg', 'high', 'very_high']

#Creating new column to assign mileage category

autos["mileage_type"] = np.select(bool_mileage,values)

#Updated DataFrame
autos.head()


# In[70]:


# Calculating mileage category distribution

autos["mileage_type"].value_counts(dropna = False, normalize = True)


# > * 71.29% of vehicles have very high mileage in the range of 125 k to 150k km.

# In[71]:


# Analysing relatiship b/w mileage & mean_price

mil_cat = autos["mileage_type"].unique()

# creating an empty dict

mil_price = {}

for each in mil_cat:
    selected_rows = autos[autos["mileage_type"] == each]
    mean_price = selected_rows["price($)"].mean()
    mil_price[each] = mean_price
    
# displaying the dict

mil_price


# > * There is **positive** correlation betwen **Mileage** & **Price** of Car

# ### Price vs Car Condition

# In[ ]:


# Calculating mean price for damage cars


# In[76]:


damaged_rows = autos[autos["not_repaired_damage"] == 'yes']

mean_price_1 = damaged_rows["price($)"].mean()

mean_price_1


# In[ ]:


# Calculating mean price for fit car


# In[75]:


not_damaged_rows = autos[autos["not_repaired_damage"] == 'no']

mean_price_2 = not_damaged_rows["price($)"].mean()

mean_price_2


# In[77]:


diff = mean_price_2  - mean_price_1

diff


# > * So, damaged cars are **3771.92 dollars** cheaper than non-damage cars

# In[ ]: