#!/usr/bin/env python
# coding: utf-8

# Project: clean and analyze ~50,000 used-car listings crawled from the
# German eBay classifieds site.
#
# Goals:
#   1. Clean the data and analyze the included used-car listings.
#   2. Get familiar with the benefits Jupyter notebooks bring when working
#      with the pandas library.

# In[1]:

import numpy as np
import pandas as pd

# Latin-1 is used because the crawl contains non-UTF-8 bytes (German
# umlauts etc.) -- assumption inferred from the explicit encoding choice;
# confirm against the raw file if it is ever re-downloaded.
autos = pd.read_csv('autos.csv', encoding='Latin-1')

# In[2]:

autos

# In[3]:

autos.info()

# In[4]:

autos.head()

# Observations so far:
#   - Some columns contain missing values (NaN).
#   - Column dtypes are a mix of object (string) and int64.
#   - The dataset has 20 columns, most of them object (string).
#
# Next, rename the columns from camelCase to snake_case so the names better
# reflect the data they contain.

# In[5]:

autos.columns

# In[6]:

# DataFrame.rename leaves unmapped columns untouched, so only the columns
# whose names actually change need to appear in the mapping (the original
# identity entries such as 'name': 'name' were redundant and are dropped).
co = {
    'dateCrawled': 'date_crawled',
    'offerType': 'offer_type',
    'vehicleType': 'vehicle_type',
    'yearOfRegistration': 'registration_year',
    'powerPS': 'power_ps',
    'monthOfRegistration': 'registration_month',
    'fuelType': 'fuel_type',
    'notRepairedDamage': 'unrepaired_damage',
    'dateCreated': 'ad_created',
    'nrOfPictures': 'no_of_pictures',
    'postalCode': 'postal_code',
    'lastSeen': 'last_seen',
}

# In[7]:

autos = autos.rename(columns=co)

# In[8]:

autos.columns

# In[9]:

autos.head()

# The column names were changed for easier reading, for myself and anyone
# else interested in working with this data: the original camelCase names
# are now snake_case, and the renamed columns were assigned back to the
# original DataFrame.
#
# NEXT: explore the data to determine what cleaning tasks remain.
# For starters, we look at the text columns and check whether they provide
# good, accountable data for our analysis. Then we clean the numeric data
# and convert types where needed. Exploration methods used:
#   1. describe() (for both categorical and numeric columns)
#   2. value_counts() and head()

# In[10]:

autos.describe(include='all')

# Based on this quick look, some columns have little analytic value
# (offer_type, seller -- nearly constant), while others need real cleaning
# work: odometer and price are stored as strings with currency/unit
# punctuation and must be converted to integers.

# In[11]:

# NOTE(fix): the original read `.unique` without parentheses, which yields
# the bound method object rather than the unique values.
autos['price'].value_counts().unique()

# In[12]:

# Strip '$', thousands separators and dots, then convert to int.
# regex=False is essential here: with the old pandas default (regex=True),
# '.' is a regex wildcard that matches EVERY character, blanking the whole
# string so astype(int) raises; '$' would be a no-op end-of-string anchor.
autos['price'] = (
    autos['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('.', '', regex=False)
    .astype(int)
)

# In[13]:

autos['price']

# In[14]:

# Rename to record the unit in the column name.
autos = autos.rename(columns={'odometer': 'odometer_km'})

# In[15]:

# Strip the thousands separator and the 'km' suffix, then convert to int.
# (These are plain-text replacements, so regex=False is also correct here.)
autos['odometer_km'] = (
    autos['odometer_km']
    .str.replace(',', '', regex=False)
    .str.replace('km', '', regex=False)
    .astype(int)
)

# In[17]:

autos['odometer_km'].unique().shape

# In[18]:

autos['odometer_km'].describe()

# In[19]:

autos['odometer_km'].value_counts().sort_index(ascending=False).head()

# In[20]:

autos['odometer_km'].value_counts().sort_index(ascending=True).head()

# In[22]:

autos['price'].unique().shape

# In[23]:

autos['price'].describe()

# In[24]:

autos['price'].value_counts().sort_index(ascending=True).head(50)

# In[25]:

autos['price'].value_counts().sort_index(ascending=False).head(50)

# OK, so far so good. There are some inconsistencies in the prices that we
# need to remove, as they are not correctly reflected in our data.
# Remove price outliers: drop anything above $350,000 and anything below
# $100.
#
# NOTE(fix): in the original export this filter sat inside a markdown cell
# and therefore never executed, even though the following narrative claims
# the outliers were removed -- it is re-enabled as real code here.
autos = autos[autos['price'].between(100, 350000)]

# In[26]:

autos['price'].value_counts().sort_index().head(20)

# In[27]:

autos['price'].value_counts().sort_index(ascending=False)

# The outliers above/below the chosen bounds have been removed.

# In[28]:

autos.head(100)

# In[29]:

autos['abtest'].value_counts().sort_index().head()

# In[30]:

autos['abtest'].unique()

# In[31]:

autos['abtest'].describe()

# In[32]:

autos['date_crawled'].head()

# In[33]:

autos['last_seen'].head()

# In[34]:

autos['ad_created'].head()

# In[35]:

autos['registration_month'].head()

# In[36]:

autos['registration_year'].head()

# In[37]:

autos['registration_month'].describe()

# In[38]:

autos['registration_month'].value_counts().sort_index()

# In[39]:

# Keep only the YYYY-MM-DD prefix of each crawl timestamp, then show the
# relative frequency (normalize=True -> percentages, dropna=False keeps
# missing values visible) sorted in date order.
crawled = autos['date_crawled'].str[:10]
crawled.value_counts(normalize=True, dropna=False).sort_index()

# The same treatment applies to 'ad_created' and 'last_seen', since both
# are also stored as object (string) columns in pandas.

# In[40]:

autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# In[41]:

autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# In[42]:

autos['last_seen'].describe()

# In[43]:

autos['registration_year'].describe()

# As the result above shows, two particular values make no sense.
# min value = 1000, max value = 9999 -- both impossible registration years.
#
# Limit the domain to values between 1900 and 2016.

# In[44]:

# NOTE(fix): the original code used between(1960, 2017), contradicting the
# stated intent of 1900-2016 (and, being inclusive, letting 2017 through).
# Aligned with the stated intent here.
autos = autos[autos['registration_year'].between(1900, 2016)]

# In[45]:

autos['registration_year'].value_counts().sort_index()

# In[46]:

autos['brand'].unique()

# In[47]:

brands = autos['brand'].value_counts()
brands

# In[48]:

brands.index[0:20]

# Aggregate the mean price for the top brands into a dictionary.

# In[49]:

my_dict = {}
for brand in brands.index[0:6]:
    # Mean asking price over the listings of this brand, truncated to int.
    brand_prices = autos.loc[autos['brand'] == brand, 'price']
    my_dict[brand] = int(brand_prices.mean())
print(my_dict)

# Audi, BMW and Mercedes-Benz are more expensive; Ford and Opel are less
# expensive; Volkswagen sits in between.

# In[50]:

avg_kilometers = {}
for brand in brands.index[0:6]:
    # Mean odometer reading (km) over the listings of this brand.
    brand_km = autos.loc[autos['brand'] == brand, 'odometer_km']
    avg_kilometers[brand] = int(brand_km.mean())
print(avg_kilometers)

# Average mileage for the top 6 brands.

# In[51]:

autos.columns

# Convert the aggregate dictionaries to pandas Series, then combine them
# into a DataFrame.

# In[52]:

avg_price_brand = pd.Series(my_dict)
avg_milage_brand = pd.Series(avg_kilometers)

# In[53]:

Avg_Price_Mile = pd.DataFrame(avg_price_brand)

# In[54]:

Avg_Price_Mile['avg_milage_brand'] = avg_milage_brand

# In[55]:

# pd.DataFrame(Series) names the single column 0; give it a real name.
Avg_Price_Mile = Avg_Price_Mile.rename(columns={0: 'avg_price_brand'})

# In[56]:

Avg_Price_Mile.sort_index(ascending=False)

# The table above shows average price/mileage per brand. The minimum
# average mileage belongs to Ford and the maximum to BMW. The minimum
# average price belongs to Opel, and the maximum average price across all
# top 6 brands belongs to Mercedes-Benz.

# In[ ]: