#!/usr/bin/env python # coding: utf-8 # # Ebay Car Sales Data. Germany # # **The aim of this project is to clean the data and analyze the included used car listings.** # # Our dataset represents used cars from *eBay Kleinanzeigen*, a classifields section of the German eBay website. The dataset was originally scrapped by user orgesleka and modified by DQ for learning purpose. # # The data dictionary provided with data is as follows: # # * **dateCrawled** - When this ad was first crawled. All field-values are taken from this date. # # # * **name** - Name of the car. # # # * **seller** - Whether the seller is private or a dealer. # # # * **offerType** - The type of listing # # # * **price** - The price on the ad to sell the car. # # # * **abtest** - Whether the listing is included in an A/B test. # # # * **vehicleType** - The vehicle Type. # # # * **yearOfRegistration** - The year in which the car was first registered. # # # * **gearbox** - The transmission type. # # # * **powerPS** - The power of the car in PS. # # # * **model** - The car model name. # # # * **kilometer** - How many kilometers the car has driven. # # # * **monthOfRegistration** - The month in which the car was first registered. # # # * **fuelType** - What type of fuel the car uses. # # # * **brand** - The brand of the car. # # # * **notRepairedDamage** - If the car has a damage which is not yet repaired. # # # * **dateCreated** - The date on which the eBay listing was created. # # # * **nrOfPictures** - The number of pictures in the ad. # # # * **postalCode** - The postal code for the location of the vehicle. # # # * **lastSeenOnline** - When the crawler saw this ad last online. # # So, let's move step by step: # # # #### 1). Import libraries and reading "autos.csv" file # In[1]: # import libraries: import numpy as np import pandas as pd # reading csv file: autos = pd.read_csv("autos.csv", encoding = "Latin-1") # getting general idea about the dataset: autos.info() autos.head(7) # **Notes:** # # Our dataset have 50000 entries and 20 columns, five of columns are integer type and 15 are of object type. Not all columns are filled with information , as in "vehicleType 44905 non-null object". Some object columns can be transformed to integer type , for example "price" or "odometr". # #### 2). Formatting column names: # * from **yearOfRegistration** to **registration_year** # # # * from **monthOfRegistration** to **registration_month** # # # * from **notRepairedDamage** to **unrepaired_damage** # # # * from **dateCreated** to **ad_created** # # # * from **camelcase** to **snakecase** # In[2]: # existing column names: autos.columns # In[3]: # column labels copy: autos_copy = autos.columns.copy() # function to format labels: def change(label): label = label.replace ("yearOfRegistration", "registration_year") label = label.replace("monthOfRegistration", "registration_month") label = label.replace("notRepairedDamage", "unrepaired_damage") label = label.replace("dateCreated", "ad_created") label = label.replace("dateCrawled", "date_crawled") label = label.replace("offerType", "offer_type") label = label.replace("vehicleType", "vehicle_type") label = label.replace("powerPS", "power_ps") label = label.replace("fuelType", "fuel_type") label = label.replace("postalCode", "postal_code") label = label.replace("lastSeen", "last_seen") label = label.replace("nrOfPictures", "nr_of_pictures") return label # append function to empty list: new_lab =[] for name in autos_copy: n = change(name) new_lab.append(n) # append list to the copy arrey: autos_copy = new_lab # append our copy arrey to existing names arrey: autos.columns = autos_copy # check current view of column labels: autos.head() # **Notes:** # # Now we've changed the format of column labels from camelcase to snakecase and reworded some names for better visual reading # #### 3). Other cleaning tasks: # * to check if there any same values # * to figure out any text that can be cleaned and converted to numeric # In[4]: # descriptive statistics: autos.describe(include = "all") # **Notes:** # - column "number of pictures" can be possibly dropped # - intersting columns: "seller", "price", "gearbox", "registration year", "odometer", "brand" # - coulmns "price" and "odometer" can be converted to numeric # In[5]: #remove non-numeric characters from columns "price" and "odometer" autos["price"] = autos["price"].str.replace("$","").str.replace(",","") autos["odometer"] = autos["odometer"].str.replace("km", "").str.replace(",","") #convert string to integer: autos["price"] = autos["price"].astype(int) autos["odometer"] = autos["odometer"].astype(int) #check data types: print(autos[["price","odometer"]].dtypes) #rename column name: autos.rename({"odometer": "odometer_km"}, axis = 1, inplace = True) # #### 4). "Price" and "Odometer_km" columns analysis. # In[6]: # unique values: print("Price unique values: ",autos["price"].unique().shape) print("Odometer_km unique values: ",autos["odometer_km"].unique().shape) # In[7]: # columns description: print("Price description:","\n",autos["price"].describe()) print("\n") print("Odometer_km description:","\n", autos["odometer_km"].describe()) # **Notes:** # # as we see above, statistics for price column described in E-notion format. So we will look for min,max,mean with the help of methods # In[8]: # min, max, mean for price column: print("Price min: ",autos["price"].min()) print("Price max: ",autos["price"].max()) print("Price mean: ", autos["price"].mean()) # In[9]: #value counts for price column: autos["price"].value_counts().sort_index(ascending= False).head(15) # In[10]: autos["price"].value_counts().sort_index(ascending= True).head(10) # In[11]: #value counts for odometer_km: autos["odometer_km"].value_counts() # In[12]: # removing entries with no value in price column: autos = autos[autos["price"].between(10,350000)] autos["price"].describe() # **Notes:** # # Column "Odometer_km" have been left as it is, but in column "Price" entries from 0 to 10 and from 350 000 to 99999999 removed, because it is not common for car to have no cost or too high price. # #### 5). Date range analysis. # # There are 5 columns with date information in the dataset: 'data_crewled' ,'last_seen', 'ad_created' are strings and 'registration_month', 'registration_year' are integers. # # For numeric data we will use series.describe() method to understand the distribution, but firstly we need to convert strings into numeric. # In[13]: # view on our string dates: autos[['date_crawled','ad_created','last_seen']][0:5] # In[14]: # value counts for date_crawled: autos['date_crawled'].str[:10].value_counts(normalize = True, dropna = False).sort_index() # In[15]: #value counts for ad_created: autos['ad_created'].str[:10].value_counts(normalize = True, dropna = False).sort_index(ascending = False) # In[16]: #value counts for last_seen: autos['last_seen'].str[:10].value_counts(normalize = True, dropna = False).sort_index(ascending = False) # **Notes:** # # - date_crewled in the range from 2016-03-05 to 2016-04-07 # - ad_created in the range from 2015-06-11 to 2016-04-07 # - last_seen in the range from 2016-03-05 to 2016-04-07 # # This informmation represents that active period is for about a month. Ad_created includes information of later periods but with low percentage, because some sellers are still in process of distribution their cars for half a year. # In[17]: # registration_year description: autos['registration_year'].describe() # **Notes:** # # Very strange data for *min = 1000* and *max =9999*, that are unrealistic. # # The year 1886 is regarded as the birth year of the modern car when German inventor Karl Benz patented his Benz Patent-Motorwagen. Cars became widely available in the early 20th century. One of the first cars accessible to the masses was the 1908 Model T, an American car manufactured by the Ford Motor Company (Wikipedia) # # So the *min year* - 1886 # # *Max year* - 2016 (the year of data gathered) # In[18]: # selecting range between 1886 and 2016: autos = autos[autos["registration_year"].between(1886,2016)] # new description of registration_year column: autos["registration_year"].describe() # **Notes:** # # Now we see , that the oldest car is of 1910 year. # In[19]: # value counts for registration_year: autos['registration_year'].value_counts(normalize = True) # ** Notes:** # From statistics above we can came to counclusion that the most ads on the site with cars in range 10-15 years old (2000 - 2005 +- 1 year) # #### 5). Brand analysis by price # # In[20]: #unique brands: autos['brand'].unique() # In[21]: # value counts: autos['brand'].value_counts(ascending = False) # ** Notes:** # Our next step will be analysis of prices of japan brands in Germany. # Let's group the most 5 popular brands : "toyota", "nissan", "mazda", "mitsubishi", "honda" and see the mean prices for it. # In[22]: # select 5 japan brands: japan_group = autos[(autos['brand'] == "toyota") | (autos['brand'] == "mazda")| (autos['brand'] == 'nissan')|(autos['brand'] == 'mitsubishi')|(autos['brand']== 'honda')] # check our new arrey japan_group.head() # In[23]: # list of unique names: japan_u = japan_group['brand'].unique() print(japan_u) # In[24]: # empty dictionary for final data: japan_b = {} # In[25]: #iterating our list of brands and calculating mean of the price: for c in japan_u: rows = japan_group[japan_group['brand'] == c ] mean = rows["price"].mean() japan_b[c] = mean # In[26]: #iterating dictionary: for k , v in japan_b.items(): print( k,": ", v.round()) # **Notes:** # As a result, the most expensive is "Toyota" and "Nissan". "Mazda" and "Honda" in the middle. # In[ ]: # storing a new dictionary with mileage values: japan_m ={} for c in japan_u: rows = japan_group[japan_group['brand'] == c ] mileage = rows["odometer_km"].mean() japan_m[c] = mileage # In[ ]: # converting dictionaries into series: price_s = pd.Series(japan_b).astype(int) mileage_s = pd.Series(japan_m).astype(int) # In[ ]: # converting series into dataframe: jb = pd.DataFrame(price_s, columns = ['mean_price']) jb # In[ ]: # adding a new column with mileage values: jb['mean_mileage'] = mileage_s jb # ** Conclusion:** # # Avarage prices for japan cars in Gernamy are in the range between $4000 -$5000. The price depends from brand. If the buyer want Toyota, he should be ready to pay about $5000 , if the budget less, he can look for such brand as Mitsubishi. # # Average mileage is in the range 115000 -125000 kilometers and do not have much impact on the price.