#!/usr/bin/env python
# coding: utf-8

# # Cleaning and Analyzing the Data
#
# Dataset: ~50K used-car listings scraped from eBay Kleinanzeigen (the
# classifieds section of German eBay). Goal: clean the data, then analyze
# the listings.

import pandas as pd
import numpy as np

# Latin-1 encoding: the file contains German characters that are not UTF-8.
autos = pd.read_csv("autos.csv", encoding='Latin-1')

# First look at the raw data.
autos
autos.info()
autos.head(5)

# 20 columns, 15 of object dtype; null values are present.

# Rename the columns from camelCase to the preferred snake_case, and make a
# few names self-explanatory in the process.
autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price',
                 'abtest', 'vehicle_type', 'registration_year', 'gearbox',
                 'power_ps', 'model', 'odometer', 'registration_month',
                 'fuel_type', 'brand', 'unrepaired_damage', 'ad_created',
                 'nr_of_pictures', 'postal_code', 'last_seen']
autos.head()

# Profile every column to spot what needs cleaning: `price` and `odometer`
# are strings (they contain non-numeric characters), and `nr_of_pictures`
# holds the same value in almost every row, so it can be dropped later.
autos.describe(include='all')
# --- Clean the price and odometer columns ---
# Strip the currency/unit symbols so the columns can be converted to numbers.
# Bug fix: regex=False is required — without it "$" is treated as a regex
# end-of-string anchor (matching nothing), so the dollar signs were never
# removed; the default for str.replace also changed across pandas versions.
autos['price'] = autos['price'].str.replace("$", "", regex=False)
autos['price'] = autos['price'].str.replace(",", "", regex=False)

autos['odometer'] = autos['odometer'].str.replace("km", "", regex=False)
autos['odometer'] = autos['odometer'].str.replace(",", "", regex=False)

# Put the unit into the column name now that the values are bare numbers.
autos.rename(columns={'odometer': 'odometer_km'}, inplace=True)

# Drop the near-constant columns found during profiling.
# Bug fix: the original listed "nr_of_pictures" twice in the drop list.
autos = autos.drop(["nr_of_pictures", "seller", "offer_type"], axis=1)

# Convert the cleaned text columns to numeric dtypes.
autos['odometer_km'] = autos['odometer_km'].astype('int')
autos['price'] = autos['price'].astype('float')

# --- Explore the odometer and price columns ---
autos["odometer_km"].value_counts()

# Most cars have substantial mileage. Now look at the price distribution.
print ('Unique Car Prices:', autos["price"].unique().shape)
print ('Stats about car prices\n',autos["price"].describe())
print ('Max Car Price', autos['price'].max())
print ('Min Car Price', autos['price'].min())

# Two outliers stand out: max 99,999,999 and min 0. Inspect both tails.
# Sorted ascending — lowest 20 prices:
autos['price'].value_counts().sort_index(ascending=True).head(20)
# Sorted descending — highest 20 prices:
autos['price'].value_counts().sort_index(ascending=False).head(20)

# 1,421 cars are priced at 0. For this analysis, rows with price < 3 or
# > 10,000,000 will be removed (done in the next step).
# Keep only realistically priced listings: drop the zero-priced cars and the
# absurd outliers above ten million.
autos = autos[autos["price"].between(3, 10000000)]
autos

# Daily frequency of each date column (share per day, NaNs included).
# date_crawled: the crawl ran for roughly one month.
autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# ad_created: ads were created mostly during 2016.
autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# last_seen: most ads were still live on the site at the final crawl.
autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# Profile registration_year: most cars were registered in or before 2008,
# but 1000 / 9999 show up as impossible min / max values.
autos['registration_year'].describe()

# The first mass-market car dates to 1908, and a car cannot be registered
# before it is built, so restrict the plausible range to 1910–2016. Only
# ~3% of rows fall outside it and can safely be ignored.
autos['registration_year'].between(1910, 2016).value_counts(normalize=True)

# Share of listings per brand; we will focus on brands with > 5% of ads.
brand_composition = autos['brand'].value_counts(normalize=True)
print (brand_composition)
# Keep only the brands that account for more than 5% of all listings.
selected_brands = brand_composition[brand_composition > 0.05].index
print (selected_brands)

# Aggregate the mean price per selected brand into a dictionary.
mean_car_price = {}
for brands in selected_brands:
    print (brands)
    car_brand = autos[autos['brand'] == brands]
    mean_car_p = car_brand['price'].mean()
    mean_car_price[brands] = int(mean_car_p)
mean_car_price

# The above are the mean prices of the selected brands. The large gap
# between luxury brands and the rest shows that luxury cars are expensive.
# (Bug fix: this prose was not commented out in the original, which made
# the whole script a SyntaxError.)

# Same aggregation for mileage: mean odometer reading per selected brand.
mean_car_mileage = {}
for brands in selected_brands:
    print (brands)
    car_brand = autos[autos['brand'] == brands]
    mean_car_m = car_brand['odometer_km'].mean()
    mean_car_mileage[brands] = int(mean_car_m)
mean_car_mileage

# Convert both dictionaries to Series so price and mileage can be compared
# side by side in a single DataFrame.
mileage_series = pd.Series(mean_car_mileage)
price_series = pd.Series(mean_car_price)

# Relationship between mileage and price.
df_price_mileage = pd.DataFrame(mileage_series, columns=['mean_mileage'])
df_price_mileage
df_price_mileage['mean_car_price'] = price_series
df_price_mileage.sort_values(by=['mean_mileage', 'mean_car_price'], ascending=False)

# In general there is an inverse relationship between mileage and price:
# the higher the mileage, the lower the car price.
# (Bug fix: this prose line was not commented out in the original, which
# made the script a SyntaxError.)

# # Extra Cleaning Steps

autos.columns

# Convert each date column to an int in YYYYMMDD form.
# Bug fix: the original iterated over the split date strings and assigned a
# SCALAR to the whole column on every iteration, so every row ended up with
# the LAST row's date. Vectorized string ops convert each row correctly.
autos['date_crawled_1'] = autos['date_crawled'].str[:10].str.replace('-', '', regex=False).astype(int)
autos['last_seen_1'] = autos['last_seen'].str[:10].str.replace('-', '', regex=False).astype(int)
autos['ad_created_1'] = autos['ad_created'].str[:10].str.replace('-', '', regex=False).astype(int)

# Drop the original string-typed date columns.
autos = autos.drop(['last_seen', 'date_crawled', 'ad_created'], axis=1)
autos.columns

# Rename the integer versions back to the original column names.
autos = autos.rename(columns={'last_seen_1': 'last_seen',
                              'date_crawled_1': 'date_crawled',
                              'ad_created_1': 'ad_created'})
autos