#!/usr/bin/env python
# coding: utf-8

# # Exploring eBay Car Sales Data
#
# The dataset contains information on used cars from eBay Kleinanzeigen, a classifieds section of the German eBay website. It was originally scraped and uploaded to Kaggle by the user orgesleka.

# ## Goal
#
# To clean the data and analyze trends in the included car listings.

# Dataframe column description:
#
# - dateCrawled: when this ad was first crawled; all field values are taken from this date
# - name: "name" of the car
# - seller: private or dealer
# - offerType
# - price: the price on the ad to sell the car
# - abtest
# - vehicleType
# - yearOfRegistration: the year in which the car was first registered
# - gearbox
# - powerPS: power of the car in PS
# - model
# - kilometer: how many kilometers the car has driven
# - monthOfRegistration: the month in which the car was first registered
# - fuelType
# - brand
# - notRepairedDamage: whether the car has damage which is not yet repaired
# - dateCreated: the date on which the eBay ad was created
# - nrOfPictures: number of pictures in the ad
# - postalCode
# - lastSeenOnline: when the crawler last saw this ad online

# In[2]:

import pandas as pd
import numpy as np

# Read data
autos = pd.read_csv('autos.csv', encoding='Latin-1')

# In[3]:

# Inspecting the dataframe
autos.info()

# ### Data exploration and cleaning

# In[4]:

autos.head()

# There are 50,000 entries and 20 columns. Some columns like `vehicleType`, `gearbox`, and `model` contain null values and need to be investigated individually. Most columns are stored as integers or objects; the object columns are primarily strings, some of them holding date/time information.

# In[5]:

autos.columns

# Let's convert the column names from camelCase to snake_case and reword some of them, based on the data dictionary, to be more descriptive and in line with Python convention.

# In[6]:

autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
                 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model',
                 'odometer', 'registration_month', 'fuel_type', 'brand',
                 'unrepaired_damage', 'ad_created', 'num_of_pictures', 'postal_code',
                 'last_seen']

# Let's verify that the column names have been changed to snake_case.

# In[7]:

autos.columns

# In[8]:

autos.describe(include='all')  # include='all' summarizes non-numeric columns as well

# Columns like `seller` and `offer_type` have the same value repeated almost everywhere: 49,999 entries share a single value, so we can drop them from the analysis at this point. The `num_of_pictures` column contains only zeros.

# In[9]:

autos['seller'].value_counts()

# In[10]:

autos['offer_type'].value_counts()

# In[11]:

autos['num_of_pictures'].value_counts()

# Let's explore the `price` and `odometer` columns to see whether numeric data has been entered as text.

# In[12]:

autos['price'].head()  # note that prices are listed in dollars

# When we remove the dollar sign, we should keep "dollar" in the column name so that this information is not lost.

# In[13]:

autos['odometer'].head()

# As we can see, both columns hold numeric data stored as the object data type. We need to clean them and convert them to a numeric type.
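# The same cleanup pattern (strip a few substrings, cast to int) applies to both columns. Below is a minimal, illustrative sketch of a reusable helper that captures it; the function name `strip_to_int` is an assumption made here for illustration, and the notebook instead cleans each column inline in the next two cells.

def strip_to_int(series, substrings):
    """Remove the given literal substrings from a string Series and cast to int (sketch only)."""
    cleaned = series
    for sub in substrings:
        cleaned = cleaned.str.replace(sub, '', regex=False)  # literal replacement, not regex
    return cleaned.astype(int)

# Example usage (equivalent to the inline cleaning performed below):
# autos['price'] = strip_to_int(autos['price'], ['$', ','])
# autos['odometer'] = strip_to_int(autos['odometer'], [',', 'km'])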
# In[14]:

autos['price'] = (autos['price']
                  .str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False)
                  .astype(int))

# Let's rename `price` to `price_dollar`
autos.rename({'price': 'price_dollar'}, axis=1, inplace=True)
autos['price_dollar'].head()

# In[15]:

autos['odometer'] = (autos['odometer']
                     .str.replace(',', '', regex=False)
                     .str.replace('km', '', regex=False)
                     .astype(int))

# Let's rename `odometer` to `odometer_km`
autos.rename({'odometer': 'odometer_km'}, axis=1, inplace=True)
autos['odometer_km'].head()

# #### Exploring the odometer_km column

# In[16]:

autos['odometer_km'].unique().shape

# There are 13 unique values in the `odometer_km` column.

# In[17]:

autos['odometer_km'].describe()

# In[18]:

autos['odometer_km'].value_counts()

# 150,000 km is the most common odometer reading, and the 75th percentile is already at 150,000 km. The data does not look unrealistically high or low, since used cars tend to have high odometer readings.

# Let's check the `price_dollar` column.

# In[19]:

autos['price_dollar'].unique().shape

# There are 2,357 unique values in this column.

# In[20]:

autos['price_dollar'].describe()

# The maximum price is 100,000,000 dollars, which is unrealistic, and the minimum is 0. The 75th percentile of prices is around 7,200 dollars, while the mean is about 9,840 dollars.

# In[21]:

price_filter = autos['price_dollar'] > 1000000
weird_values = autos.loc[price_filter, ['name', 'price_dollar']]
print(weird_values)

# Some vehicles that are not even in the luxury category are priced in the millions of dollars. To address this discrepancy, let's keep only the entries priced between 500 and 200,000 dollars.

# In[22]:

autos = autos[autos['price_dollar'].between(500, 200000)]
autos['price_dollar'].describe()

# The mean was previously skewed by unrealistically high and low values; the filter normalizes it to some extent. The mean car price is now about 6,271 dollars and the maximum is 198,000 dollars.

# Let's also remove the columns whose redundant entries don't add any value to our analysis.

# In[23]:

autos = autos.drop(['seller', 'offer_type', 'num_of_pictures'], axis=1)

# In[24]:

autos.info()

# In[25]:

autos.iloc[1]  # just to get an idea of what a single entry looks like

# #### Analyzing date-related columns
#
# Let's analyze the date-related columns.

# In[26]:

dates_first_five = autos[['date_crawled', 'ad_created', 'last_seen']][0:5]

# In[27]:

print(dates_first_five)

# In[28]:

dates_first_five.info()

# The data type of these columns is object (i.e. string), so we can use string methods for further analysis.

# In[29]:

dates = dates_first_five['date_crawled'].str[:10]

# In[30]:

print(dates)

# Let's convert `date_crawled`, `ad_created`, and `last_seen` to integer dates.

# In[31]:

date_crawled = (autos['date_crawled']
                .str[:10]
                .str.replace('-', '')
                .astype(int))
ad_created = (autos['ad_created']
              .str[:10]
              .str.replace('-', '')
              .astype(int))
last_seen = (autos['last_seen']
             .str[:10]
             .str.replace('-', '')
             .astype(int))
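# As a side note, here is a sketch of an alternative approach (not what this notebook does): parsing the same columns with pd.to_datetime keeps them as real datetime values rather than integers like 20160305, which makes later date arithmetic easier. The `_dt` variable names are illustrative.

date_crawled_dt = pd.to_datetime(autos['date_crawled'].str[:10])
ad_created_dt = pd.to_datetime(autos['ad_created'].str[:10])
last_seen_dt = pd.to_datetime(autos['last_seen'].str[:10])

# The daily distribution could then be computed directly from the datetime values, e.g.:
# date_crawled_dt.dt.date.value_counts(normalize=True).sort_index()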
# Now we can sort the dates in ascending order and count relative frequencies (percentages).

# In[32]:

date_crawled_relative_freq = date_crawled.value_counts(normalize=True, dropna=False).sort_index()
ad_created_relative_freq = ad_created.value_counts(normalize=True, dropna=False).sort_index()
last_seen_relative_freq = last_seen.value_counts(normalize=True, dropna=False).sort_index()

# In[33]:

print(date_crawled_relative_freq)

# The data was crawled from March 5, 2016 to April 7, 2016, with fairly consistent relative frequencies across days.

# In[34]:

print(ad_created_relative_freq)

# eBay advertisements were created from June 11, 2015 until April 7, 2016; ads were created slightly earlier than they were crawled.

# In[35]:

print(last_seen_relative_freq)

# The `last_seen` entries range from March 5, 2016 to April 7, 2016. There is a significant increase in relative frequency over the last three days (April 5, 6, and 7, 2016); together they account for close to 0.48, i.e. about 48% of all entries.

# #### Car Registration Data

# Let's analyze `registration_year`, the year in which each car was first registered.

# In[36]:

reg_year = autos['registration_year']
reg_year.describe()

# A crude analysis shows that the mean is 2005, with a minimum of 1000 (long before cars existed) and a maximum of 9999 (far in the future). Both extremes are clearly invalid, so we keep only registration years between 1916 and 2017.

# In[37]:

autos_reg_year_cleaned = autos[reg_year.between(1916, 2017)]
reg_year_cleaned = autos_reg_year_cleaned['registration_year']

# In[38]:

reg_year_cleaned.value_counts(normalize=True).head(25)

# We can see that the majority of values, close to 95%, lie between 1992 and 2005. We can use the sum function to confirm this.

# In[39]:

reg_year_cleaned.value_counts(normalize=True).head(25).sum()

# ### Data Aggregation
#
# Let's first explore the `brand` column.

# In[40]:

autos['brand'].value_counts(normalize=True)

# Let's look at the top 20 results.

# In[41]:

top_20 = autos['brand'].value_counts(normalize=True).head(20)

# In[42]:

top_20.index  # gives us a list of brand names to iterate over

# In[43]:

# Creating an empty dictionary to store values: key - brand name, value - mean price
brand_prices = {}

for brand in top_20.index:
    filter_brand = autos['brand'] == brand  # boolean mask for this brand
    mean_price = autos[filter_brand]['price_dollar'].mean()
    brand_prices[brand] = mean_price  # assigning the result back to our dictionary

for k, v in brand_prices.items():
    print("{}'s average price is {:,.2f} dollars".format(k, v) + '\n')

# Above we have obtained the average prices for the top 20 brands in our dataframe.
#
# Let's see how the top 20 brands compare with respect to odometer readings, i.e. mileage.

# In[44]:

brand_mileage = {}

for brand in top_20.index:
    filter_brand = autos['brand'] == brand
    mean_mileage = autos[filter_brand]['odometer_km'].mean()
    brand_mileage[brand] = mean_mileage

for k, v in brand_mileage.items():
    print("{}'s average mileage is {:,.2f} km".format(k, v) + '\n')

# In[45]:

# We can use the pd.Series constructor to create Series from our dictionaries
brand_mean_prices_series = pd.Series(brand_prices)
brand_mean_mileage = pd.Series(brand_mileage)

# In[46]:

df = pd.DataFrame(brand_mean_prices_series, columns=['mean_price_dollars'])
df['mean_mileage_km'] = brand_mean_mileage  # adding mileage values to our aggregated dataframe
print(df)

# For most brands there is a correlation: the higher the mean price, the higher the mean mileage, e.g. `bmw`, `audi`, and `mercedes_benz`, with a few exceptions like `mini`, where the mean price is high but the mean mileage is quite low.
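# For reference, the same per-brand aggregation can be produced in a single step with groupby. This is just a sketch of an equivalent approach, not what the notebook uses above; the `brand_summary` name is illustrative.

brand_summary = (autos[autos['brand'].isin(top_20.index)]
                 .groupby('brand')[['price_dollar', 'odometer_km']]
                 .mean()
                 .sort_values('price_dollar', ascending=False))
print(brand_summary)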
# Let's standardize the `unrepaired_damage` column, changing "nein" to "no" and "ja" to "yes", and then compare prices of damaged cars per brand.

# In[47]:

autos['unrepaired_damage'] = (autos['unrepaired_damage']
                              .str.replace('nein', 'no')
                              .str.replace('ja', 'yes'))

# In[48]:

damaged_cars = autos['unrepaired_damage'] == 'yes'
damaged = autos[damaged_cars]

price_unrepaired = {}

for brand in top_20.index:
    damaged_cars = damaged['brand'] == brand  # select rows whose brand matches the current brand
    mean_price = damaged[damaged_cars]['price_dollar'].mean()  # filter with the boolean mask and take the mean
    price_unrepaired[brand] = mean_price

unrepaired_mean_price = pd.Series(price_unrepaired)
df['damaged_mean_price'] = unrepaired_mean_price  # adding the damaged_mean_price column to our aggregated dataframe
print(df)

# We can see that the price of cars with unrepaired damage is roughly 50% lower than the brand average.

# #### Analyzing Volkswagen - the top brand

# In[50]:

volk = autos['brand'] == 'volkswagen'  # boolean mask to narrow results to the Volkswagen brand
volks_model = autos[volk]
name = volks_model['name']  # the name of each specific model
name.value_counts(normalize=True).head(20)  # top 20 relative frequencies of model names within the Volkswagen brand

# Volkswagen is the brand with the most entries, and its listings are dominated by different versions of the Volkswagen Golf and Polo.

# # Conclusion
#
# - The data shows a positive correlation between price and mileage: more expensive brands like Mercedes-Benz, Audi, and BMW tend to have higher odometer readings.
#
# - The most common brand is Volkswagen, with models like the Golf and Polo dominating the entries.
#
# - The price of cars with unrepaired damage is almost 50% lower than the average price.
#
# - A large share of the car listings have an odometer reading of 150,000 km, which is understandable considering these are used cars.
#
# - About 48% of the entries were last seen online between April 5 and 7, 2016.
#
# - Close to 95% of the cars were registered between 1992 and 2005.