#!/usr/bin/env python
# coding: utf-8

# ## Introduction
# In this guided project, we'll work with a dataset of used cars from eBay
# Kleinanzeigen, a section of the German eBay website.
# The aim of this project is to clean the data and analyze the included
# used-car listings.

# In[2]:
import pandas as pd
import numpy as np

# In[3]:
# Latin-1 encoding is required because the CSV is not valid UTF-8.
autos = pd.read_csv("autos.csv", encoding="Latin-1")

# In[4]:
autos.columns

# In[5]:
autos.info()

# In[6]:
autos.head()

# **Observations**
# The column names use camelCase; we'll convert them to snake_case.
# Some numeric columns (e.g. price and odometer) are stored as text.

# **Cleaning column names**

# In[8]:
# NOTE: the original mapping repeated several keys (vehicleType,
# yearOfRegistration, gearbox, powerPS, model); duplicates are removed here —
# a dict literal silently keeps only the last occurrence anyway.
autos.columns = autos.columns.map({
    "dateCrawled": "date_crawled",
    "name": "name",
    "seller": "seller",
    "offerType": "offer_type",
    "price": "price",
    "abtest": "abtest",
    "vehicleType": "vehicle_type",
    "yearOfRegistration": "year_of_registration",
    "gearbox": "gearbox",
    "powerPS": "power_ps",
    "model": "model",
    "odometer": "odometer",
    "monthOfRegistration": "month_of_registration",
    "fuelType": "fuel_type",
    "brand": "brand",
    "notRepairedDamage": "not_repaired_damage",
    "dateCreated": "date_created",
    "nrOfPictures": "nr_of_pictures",
    "postalCode": "postal_code",
    "lastSeen": "last_seen",
})

# In[9]:
autos.columns

# In[10]:
autos.head()

# **Changed the column naming convention to snake_case; one study found that
# readers recognize snake_case values more quickly than camelCase.**

# **Initial Exploration and Cleaning**

# In[11]:
autos.describe(include="all")

# In[12]:
# Strip the currency symbol / unit text so the columns can become numeric.
# regex=False makes "$" a literal character; with the regex default it would
# be the end-of-string anchor and the dollar sign would never be removed.
autos["price"] = (
    autos["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)
autos["odometer"] = (
    autos["odometer"]
    .str.replace("km", "", regex=False)
    .str.replace(",", "", regex=False)
)

# In[13]:
# Rename columns to clearer, consistent names.
autos.rename(
    {
        "date_created": "ad_created",
        "month_of_registration": "registration_month",
        "year_of_registration": "registration_year",
        "not_repaired_damage": "unrepaired_damage",
    },
    axis=1,
    inplace=True,
)

# In[14]:
autos["price"].head()

# In[15]:
autos["odometer"].head()

# In[16]:
# Keep the unit in the column name now that the values are bare numbers.
autos.rename({"odometer": "odometer_km"}, axis=1, inplace=True)

# In[17]:
# Convert both cleaned columns from string to integer.
autos["price"] = autos["price"].astype(int)
autos["odometer_km"] = autos["odometer_km"].astype(int)

# **Exploring the odometer and price columns**

# In[18]:
autos["odometer_km"].unique().shape

# In[19]:
autos["odometer_km"].describe()

# In[20]:
autos["odometer_km"].value_counts().sort_index(ascending=False)

# **All odometer readings are rounded and the majority of the cars sold on
# the site have high mileage. The values are preset choices on the site, so
# there is no reason to treat any of them as outliers.**

# In[21]:
autos["price"].unique().shape

# In[23]:
autos["price"].describe()

# In[24]:
autos["price"].value_counts().sort_index(ascending=True).head(10)

# In[25]:
autos["price"].value_counts().sort_index(ascending=True).tail(10)

# In[26]:
# Keep only realistic bids. The lower bound is 1, matching the stated intent
# of treating "bids lower than 1" as outliers (the original used
# between(0, 350000), which kept the zero-price rows it claimed to remove).
autos = autos[autos["price"].between(1, 350000)]

# **Bidders placed bids as high as 99,999,999, which is unrealistic for a
# bidding site; we consider all bids below $1 or above $350,000 outliers.**

# **Exploring the date columns**

# In[27]:
autos["date_crawled"].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# In[28]:
autos["ad_created"].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# In[29]:
autos["last_seen"].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# The highest level of activity happened on 2016-04-07.
# In[30]:
autos["registration_year"].describe()

# The min and max registration years are 1000 and 9999, which is absurd.

# In[31]:
autos["registration_year"].value_counts().sort_index().head(15)

# **Explore the registration_year frequencies in ascending order and keep
# only plausible years (1934-2016; nothing crawled in 2016 can have a later
# registration year).**

# In[32]:
autos = autos[autos["registration_year"].between(1934, 2016)]

# In[33]:
autos["registration_year"].value_counts(normalize=True).head(10)

# In[34]:
autos["registration_year"].value_counts(normalize=True).head(10).sum()

# The ten most common registration years account for roughly 57% of the
# vehicles, most registered within the past 20 years.

# **Exploring cars by brand**

# In[35]:
autos["brand"].unique()

# **Percentage of each brand**

# In[36]:
autos["brand"].value_counts(normalize=True)

# **Top twenty most common brands**

# In[37]:
# NOTE: the original sliced .iloc[0:19], which yields only 19 brands.
top_brand = autos["brand"].value_counts().head(20).index
top_brand

# **Top twenty most common brands by average price**

# In[38]:
top_brand_by_mean_price = {
    brand: autos.loc[autos["brand"] == brand, "price"].mean()
    for brand in top_brand
}
top_brand_by_mean_price

# **Brands that individually make up more than 5% of all listings**

# In[39]:
brand_shares = autos["brand"].value_counts(normalize=True)
brands_above_5 = brand_shares[brand_shares > 0.05].index
brands_above_5_mean = {
    brand: autos.loc[autos["brand"] == brand, "price"].mean().round()
    for brand in brands_above_5
}
brands_above_5_mean

# **The maximum price for each brand**

# In[40]:
# .max() replaces the original sort-then-take-first-row approach, which
# computed the same value in O(n log n) per brand.
max_price_per_brand = {
    brand: autos.loc[autos["brand"] == brand, "price"].max()
    for brand in autos["brand"].unique()
}
max_price_per_brand

# **Below we convert the brands_above_5_mean dictionary to a Series, then
# create a new dataframe named df and assign the series as "mean_price".**

# In[41]:
bmp_series = pd.Series(brands_above_5_mean)
print(bmp_series)

# In[42]:
df = pd.DataFrame(bmp_series, columns=["mean_price"])
df

# **Average mileage for the brands above that individually hold over 5% of
# the listings in the dataset.**

# In[43]:
top_brand_mileage = {
    brand: autos.loc[autos["brand"] == brand, "odometer_km"].mean()
    for brand in brands_above_5
}
top_brand_mileage

# In[44]:
top_brand_mileage_series = pd.Series(top_brand_mileage).round()
top_brand_mileage_series

# **Then we add it to the new dataframe df**

# In[45]:
df["mileage"] = top_brand_mileage_series

# In[46]:
df

# ## Next step
# **Identify categorical data that uses German words, translate them and map
# the values to their English counterparts**

# In[48]:
autos[["vehicle_type", "gearbox", "fuel_type", "unrepaired_damage"]]

# In[49]:
autos["vehicle_type"].unique()

# **Translate German words in the vehicle_type column to English.**
# The "nan":"nan" entries of the original maps are dropped: real NaN values
# never equal the string "nan", and Series.map leaves unmatched values as
# NaN anyway.

# In[50]:
autos["vehicle_type"] = autos["vehicle_type"].map({
    "bus": "bus",
    "limousine": "limousine",
    "coupe": "coupe",
    "suv": "suv",
    "kleinwagen": "mini car",
    "kombi": "estate car",
    "cabrio": "convertible",
    "andere": "other",
})

# In[51]:
autos["vehicle_type"].unique()

# **Translate German words in the gearbox column to English**

# In[52]:
autos["gearbox"].unique()

# In[53]:
autos["gearbox"] = autos["gearbox"].map({"manuell": "manual", "automatik": "automatic"})

# In[54]:
autos["gearbox"].unique()

# **Translate German words in the fuel_type column to English**

# In[55]:
autos["fuel_type"].unique()

# In[56]:
# Map German fuel-type labels to English. Unmatched values (including NaN)
# become NaN, matching the original map's behavior.
autos["fuel_type"] = autos["fuel_type"].map({
    "lpg": "lpg",
    "benzin": "petrol",
    "diesel": "diesel",
    "cng": "cng",
    "hybrid": "hybrid",
    "elektro": "electric",
    "andere": "other",
})

# In[57]:
autos["fuel_type"].unique()

# **Translate German words in the unrepaired_damage column to English**

# In[58]:
autos["unrepaired_damage"].unique()

# In[59]:
autos["unrepaired_damage"] = autos["unrepaired_damage"].map({"nein": "none", "ja": "yes"})

# In[60]:
autos["unrepaired_damage"].unique()

# In[61]:
# Drop the dashes and keep only the date part (before the time component).
autos["ad_created"] = (
    autos["ad_created"].str.replace("-", "", regex=False).str.split().str[0]
)

# **Convert the dates to uniform numeric data, so "2016-03-21" becomes the
# integer 20160321.**

# In[62]:
autos["ad_created"] = autos["ad_created"].astype(int)
autos["ad_created"]

# ## Analysis next step
# **1. Find the most common brand/model combinations**

# In[63]:
# NOTE: the original took .iloc[0] — the *first listed* model per brand, not
# the most frequent one. value_counts() gives the actual most common model.
most_brand_model = {}
for brand in autos["brand"].value_counts().index:
    model_counts = autos.loc[autos["brand"] == brand, "model"].value_counts()
    most_brand_model[brand] = model_counts.index[0] if not model_counts.empty else None
most_brand_model

# **Volkswagen is the most popular brand and Golf is the most popular model.**

# **2. Split odometer_km into groups and use aggregation to see whether
# average prices follow any pattern based on mileage.**

# In[108]:
unique_km_per_mean_price = {}
for km in sorted(autos["odometer_km"].unique(), reverse=True):
    unique_km_per_mean_price[km] = autos.loc[autos["odometer_km"] == km, "price"].mean().round()
unique_km_per_mean_price

# In[103]:
km_5000 = autos[autos["odometer_km"] == 5000]["price"]
km_5000

# **The average price per mileage bucket is inversely correlated with
# mileage (the higher the km, the lower the average price), except for the
# lowest bucket (5,000 km); probing further, many of those listings have very
# low prices, which drags that bucket's mean below expectation.**

# **3. How much cheaper are cars with damage than their non-damaged
# counterparts?**

# In[112]:
autos["unrepaired_damage"].unique()

# In[124]:
difference = (
    autos.loc[autos["unrepaired_damage"] == "none", "price"].mean()
    - autos.loc[autos["unrepaired_damage"] == "yes", "price"].mean()
)

# In[128]:
difference.round(2)

# On average, damaged cars are cheaper than their undamaged counterparts
# (about $4,951.75 in the original run).