#!/usr/bin/env python
# coding: utf-8

# Project: clean and analyze ~50,000 used-car listings crawled from the
# German eBay classifieds site.
#
# Goals:
#   1. Clean the data and analyze the included used-car listings.
#   2. Get familiar with the benefits Jupyter notebooks bring when working
#      with the pandas library.

# In[1]:

import numpy as np
import pandas as pd

# Latin-1 is used because the crawl contains non-UTF-8 bytes (German
# umlauts etc.) -- assumption inferred from the explicit encoding choice;
# confirm against the raw file if it is ever re-downloaded.
autos = pd.read_csv('autos.csv', encoding='Latin-1')

# In[2]:

autos

# In[3]:

autos.info()

# In[4]:

autos.head()

# Observations so far:
#   - Some columns contain missing values (NaN).
#   - Column dtypes are a mix of object (string) and int64.
#   - The dataset has 20 columns, most of them object (string).
#
# Next, rename the columns from camelCase to snake_case so the names better
# reflect the data they contain.

# In[5]:

autos.columns

# In[6]:

# DataFrame.rename leaves unmapped columns untouched, so only the columns
# whose names actually change need to appear in the mapping (the original
# identity entries such as 'name': 'name' were redundant and are dropped).
co = {
    'dateCrawled': 'date_crawled',
    'offerType': 'offer_type',
    'vehicleType': 'vehicle_type',
    'yearOfRegistration': 'registration_year',
    'powerPS': 'power_ps',
    'monthOfRegistration': 'registration_month',
    'fuelType': 'fuel_type',
    'notRepairedDamage': 'unrepaired_damage',
    'dateCreated': 'ad_created',
    'nrOfPictures': 'no_of_pictures',
    'postalCode': 'postal_code',
    'lastSeen': 'last_seen',
}

# In[7]:

autos = autos.rename(columns=co)

# In[8]:

autos.columns

# In[9]:

autos.head()

# The column names were changed for easier reading, for myself and anyone
# else interested in working with this data: the original camelCase names
# are now snake_case, and the renamed columns were assigned back to the
# original DataFrame.
#
# NEXT: explore the data to determine what cleaning tasks remain.
# For starters, we look at the text columns and check whether they provide
# good, accountable data for our analysis. Then we clean the numeric data
# and convert types where needed. Exploration methods used:
#   1. describe() (for both categorical and numeric columns)
#   2. value_counts() and head()

# In[10]:

autos.describe(include='all')

# Based on this quick look, some columns have little analytic value
# (offer_type, seller -- nearly constant), while others need real cleaning
# work: odometer and price are stored as strings with currency/unit
# punctuation and must be converted to integers.

# In[11]:

# NOTE(fix): the original read `.unique` without parentheses, which yields
# the bound method object rather than the unique values.
autos['price'].value_counts().unique()

# In[12]:

# Strip '$', thousands separators and dots, then convert to int.
# regex=False is essential here: with the old pandas default (regex=True),
# '.' is a regex wildcard that matches EVERY character, blanking the whole
# string so astype(int) raises; '$' would be a no-op end-of-string anchor.
autos['price'] = (
    autos['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('.', '', regex=False)
    .astype(int)
)

# In[13]:

autos['price']

# In[14]:

# Rename to record the unit in the column name.
autos = autos.rename(columns={'odometer': 'odometer_km'})

# In[15]:

# Strip the thousands separator and the 'km' suffix, then convert to int.
# (These are plain-text replacements, so regex=False is also correct here.)
autos['odometer_km'] = (
    autos['odometer_km']
    .str.replace(',', '', regex=False)
    .str.replace('km', '', regex=False)
    .astype(int)
)

# In[17]:

autos['odometer_km'].unique().shape

# In[18]:

autos['odometer_km'].describe()

# In[19]:

autos['odometer_km'].value_counts().sort_index(ascending=False).head()

# In[20]:

autos['odometer_km'].value_counts().sort_index(ascending=True).head()

# In[22]:

autos['price'].unique().shape

# In[23]:

autos['price'].describe()

# In[24]:

autos['price'].value_counts().sort_index(ascending=True).head(50)

# In[25]:

autos['price'].value_counts().sort_index(ascending=False).head(50)

# OK, so far so good. There are some inconsistencies in the prices that we
# need to remove, as they are not correctly reflected in our data.
# Remove price outliers: drop anything above $350,000 and anything below
# $100.
#
# NOTE(fix): in the original export this filter sat inside a markdown cell
# and therefore never executed, even though the following narrative claims
# the outliers were removed -- it is re-enabled as real code here.
autos = autos[autos['price'].between(100, 350000)]

# In[26]:

autos['price'].value_counts().sort_index().head(20)

# In[27]:

autos['price'].value_counts().sort_index(ascending=False)

# The outliers above/below the chosen bounds have been removed.

# In[28]:

autos.head(100)

# In[29]:

autos['abtest'].value_counts().sort_index().head()

# In[30]:

autos['abtest'].unique()

# In[31]:

autos['abtest'].describe()

# In[32]:

autos['date_crawled'].head()

# In[33]:

autos['last_seen'].head()

# In[34]:

autos['ad_created'].head()

# In[35]:

autos['registration_month'].head()

# In[36]:

autos['registration_year'].head()

# In[37]:

autos['registration_month'].describe()

# In[38]:

autos['registration_month'].value_counts().sort_index()

# In[39]:

# Keep only the YYYY-MM-DD prefix of each crawl timestamp, then show the
# relative frequency (normalize=True -> percentages, dropna=False keeps
# missing values visible) sorted in date order.
crawled = autos['date_crawled'].str[:10]
crawled.value_counts(normalize=True, dropna=False).sort_index()

# The same treatment applies to 'ad_created' and 'last_seen', since both
# are also stored as object (string) columns in pandas.

# In[40]:

autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# In[41]:

autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# In[42]:

autos['last_seen'].describe()

# In[43]:

autos['registration_year'].describe()

# As the result above shows, two particular values make no sense.
# min value = 1000, max value = 9999 -- both impossible registration years.
#
# Limit the domain to values between 1900 and 2016.

# In[44]:

# NOTE(fix): the original code used between(1960, 2017), contradicting the
# stated intent of 1900-2016 (and, being inclusive, letting 2017 through).
# Aligned with the stated intent here.
autos = autos[autos['registration_year'].between(1900, 2016)]

# In[45]:

autos['registration_year'].value_counts().sort_index()

# In[46]:

autos['brand'].unique()

# In[47]:

brands = autos['brand'].value_counts()
brands

# In[48]:

brands.index[0:20]

# Aggregate the mean price for the top brands into a dictionary.

# In[49]:

my_dict = {}
for brand in brands.index[0:6]:
    # Mean asking price over the listings of this brand, truncated to int.
    brand_prices = autos.loc[autos['brand'] == brand, 'price']
    my_dict[brand] = int(brand_prices.mean())
print(my_dict)

# Audi, BMW and Mercedes-Benz are more expensive; Ford and Opel are less
# expensive; Volkswagen sits in between.

# In[50]:

avg_kilometers = {}
for brand in brands.index[0:6]:
    # Mean odometer reading (km) over the listings of this brand.
    brand_km = autos.loc[autos['brand'] == brand, 'odometer_km']
    avg_kilometers[brand] = int(brand_km.mean())
print(avg_kilometers)

# Average mileage for the top 6 brands.

# In[51]:

autos.columns

# Convert the aggregate dictionaries to pandas Series, then combine them
# into a DataFrame.

# In[52]:

avg_price_brand = pd.Series(my_dict)
avg_milage_brand = pd.Series(avg_kilometers)

# In[53]:

Avg_Price_Mile = pd.DataFrame(avg_price_brand)

# In[54]:

Avg_Price_Mile['avg_milage_brand'] = avg_milage_brand

# In[55]:

# pd.DataFrame(Series) names the single column 0; give it a real name.
Avg_Price_Mile = Avg_Price_Mile.rename(columns={0: 'avg_price_brand'})

# In[56]:

Avg_Price_Mile.sort_index(ascending=False)

# The table above shows average price/mileage per brand. The minimum
# average mileage belongs to Ford and the maximum to BMW. The minimum
# average price belongs to Opel, and the maximum average price across all
# top 6 brands belongs to Mercedes-Benz.

# In[ ]: