#!/usr/bin/env python
# coding: utf-8

# ### Intro
#
# Okay, here we go with the project. I hope I can get it to work. We'll be importing a dataset of used-car listings from eBay Kleinanzeigen, the German eBay classifieds site, doing some cleanup, and then a bit of analysis.

# In[86]:

import numpy as np
import pandas as pd

autos = pd.read_csv("autos.csv", encoding="Latin-1")

# In[87]:

autos

# In[88]:

autos.info()
autos.head(3)

# ### Some initial observations
#
# We have 20 columns, named in camelCase instead of snake_case.
#
# - dateCrawled: maybe convert to day and drop the time?
# - price: remove the dollar sign, convert to integer, rename the column
# - vehicleType: missing some data
# - gearbox: missing some data
# - model: missing some data
# - odometer: remove "km", convert to integer, rename the column
# - fuelType: missing some data
# - notRepairedDamage: missing data
# - dateCreated: maybe convert to day and drop the time?
# - lastSeen: maybe convert to day and drop the time?

# In[89]:

autos.columns

# In[90]:

# First, let's convert from camelCase to snake_case.
autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price_usd',
                 'abtest', 'vehicle_type', 'registration_year', 'gearbox',
                 'power_ps', 'model', 'odometer', 'registration_month',
                 'fuel_type', 'brand', 'unrepaired_damage', 'ad_created',
                 'number_pictures', 'postal_code', 'last_seen']
autos.head()

# In[91]:

autos.describe(include='all')

# ### Data cleanup
#
# It looks like we can drop number_pictures, seller, and offer_type, since they provide almost no information. The abtest column appears to record whether a listing was part of the site's A/B test, which isn't relevant here. Are the crawl date, ad creation date, and last-seen date relevant?
#
# Maybe: if a car hasn't sold in a while, there might be a reason. We'll focus on cars that have been listed more recently.

# In[92]:

autos = autos.drop(['seller', 'offer_type', 'number_pictures', 'abtest'], axis=1)

# Some more cleanup: odometer needs to be a number and its column needs renaming. Price also needs to be a number -- we've already renamed that column.

# In[93]:

# Cleaning up the price and odometer columns.
# regex=False ensures "$" is treated literally, not as a regex anchor.
autos["price_usd"] = autos["price_usd"].str.replace("$", "", regex=False)
autos["price_usd"] = autos["price_usd"].str.replace(",", "", regex=False).astype(int)
autos["odometer"] = autos["odometer"].str.replace("km", "", regex=False)
autos["odometer"] = autos["odometer"].str.replace(",", "", regex=False).astype(int)
autos.rename(columns={'odometer': 'odometer_km'}, inplace=True)

# ### First exploration of data

# In[94]:

# Let's take a look to find weird data in the odometer readings.
autos['odometer_km'].shape

# In[95]:

autos["odometer_km"].describe()

# Well, that doesn't tell us much. The odometer field is capped at 150,000 km, so a car listed at 150,000 really means "150,000 or above" -- who knows how many kilometers are actually on it.
#
# Let's see how many cars are at 150,000 km or more.

# In[96]:

autos["odometer_km"].value_counts().sort_index(ascending=False).head(10)

# Wow, that's a lot. We'll probably filter out those high-mileage cars later on.
#
# Let's look at the price now.

# In[97]:

autos['price_usd'].shape

# In[98]:

autos["price_usd"].describe()

# In[99]:

autos["price_usd"].value_counts().sort_index(ascending=False).head(50)

# Wow. There are a few cars that are crazy expensive. I'm guessing some of those are typos; let's take a look...

# In[100]:

autos[autos["price_usd"] > 150000].head(50)
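# Before picking price cutoffs by eye, a quick quantile check can show where the bulk of listings sits. This is just a sketch to sanity-check the thresholds chosen below; it wasn't part of the original cleanup.

# In[ ]:

# Sketch: price quantiles as a sanity check for outlier cutoffs.
# Listings far outside the 1st-99th percentile band are candidate
# typos or junk entries.
autos["price_usd"].quantile([0.01, 0.05, 0.50, 0.95, 0.99])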
# ### Remove bad data and cars that aren't priced right
#
# Looks like there are some typos in there. Let's get rid of all cars priced above 200,000 -- there's only one legitimate entry up there (a Ferrari), and we can't afford it anyway...

# In[101]:

autos = autos[autos.price_usd < 200000]

# There's also a bunch of cars that are way too cheap. We aren't in the market for a really bad car, so let's get rid of those too.

# In[102]:

autos = autos[autos.price_usd > 2500]

# Now let's take a quick look at the data we have left. First, make sure we got rid of the high prices (sorting by price rather than by index, so the most expensive cars come first).

# In[103]:

autos[autos["price_usd"] > 150000].sort_values("price_usd", ascending=False).head(50)

# Now we'll check that we got rid of the low values.

# In[104]:

autos[autos["price_usd"] < 5000].sort_values("price_usd", ascending=True).head(50)

# Now let's take a look at where we are...

# In[105]:

autos.info()

# ### Exploring the date info
#
# Let's see if there's any interesting info in the dates. First, let's grab a sample of the data.

# In[106]:

autos[['date_crawled', 'ad_created', 'last_seen']][0:5]

# Hm. The timestamp down to the second isn't very meaningful. Let's strip out the time and see what we're left with.

# In[107]:

autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# Looks like a fairly even distribution, with about 3% of the total on each day. There are some exceptions -- Sundays don't seem to have as many crawl dates. Is this meaningful? Probably not. Moving on.

# In[108]:

autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# Looks like the most recent dates have the most ads created. We should be sure to filter out cars advertised more than a month or so ago...

# In[109]:

autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# Looks like people check in reasonably frequently. Let's take a closer look at how old these cars are...

# In[110]:

autos['registration_year'].describe()

# ### More data cleanup based on registration date
#
# We have some outliers in registration year -- let's remove registrations that are too old or in the future. We're only looking for cars less than six years old or so, so we'll start with that (remembering we only have data up to 2016).

# In[111]:

autos = autos[autos.registration_year >= 2010]

# Okay, now let's see where we are...

# In[112]:

autos['registration_year'].describe()

# We still need to get rid of the high values...

# In[113]:

autos = autos[autos.registration_year <= 2018]

# Another look...

# In[114]:

autos['registration_year'].describe()

# Registration years now run from 2010 to 2018, which is the range we want.

# ### Get rid of any cars with unrepaired damage
#
# Okay, now we can take a closer look at some cars we might want to buy. First, let's get rid of any cars with damage...

# In[115]:

autos['unrepaired_damage'].describe()

# In[116]:

autos['unrepaired_damage'].unique()

# Let's drop any cars with unrepaired damage ('ja' is German for "yes").

# In[117]:

autos = autos[autos.unrepaired_damage != 'ja']

# Let's take a look at what we're left with.

# In[118]:

autos.describe()

# Okay, we're down to about 7,000 possibilities. Let's look at the distribution of brands.

# In[119]:

autos['brand'].value_counts(normalize=True)
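# Before settling on an aggregation threshold, it can help to see how quickly brand share accumulates. This cumulative view is a sketch to motivate the 2% cutoff used below; it wasn't part of the original analysis.

# In[ ]:

# Sketch: cumulative brand share -- shows how few brands account for
# most of the listings.
brand_share = autos['brand'].value_counts(normalize=True)
brand_share.cumsum().head(10)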
# ### Aggregate the long tail of brands
#
# We want the biggest selection of cars, but we don't want to spend too much time sorting through the long tail. Instead, we'll take any brand with less than 2% of the overall share and aggregate it as "other".

# In[120]:

brand_quantity = autos['brand'].value_counts(normalize=True)
brands_to_aggregate = brand_quantity[brand_quantity < 0.02].index
print(brands_to_aggregate)

# In[121]:

# Relabel every long-tail brand as "other" in a single vectorized step.
autos.loc[autos['brand'].isin(brands_to_aggregate), 'brand'] = 'other'

# Now take a look at what we have.

# In[122]:

autos['brand'].value_counts(normalize=True)

# ### Explore remaining data
#
# Now that we have aggregated our brands, let's take a look at price and mileage. First, we'll get an array of the brands so that we can calculate mean prices in a loop.

# In[126]:

brand_explore = autos['brand'].unique()
brand_explore

# Now we calculate the mean price for each brand.

# In[131]:

mean_price = {}
for brand in brand_explore:
    branddf = autos[autos['brand'] == brand]
    mean = branddf['price_usd'].mean()
    mean_price[brand] = int(mean)
mean_price

# Let's do the same thing for mileage.

# In[130]:

mean_odometer = {}
for brand in brand_explore:
    branddf = autos[autos['brand'] == brand]
    odometer = branddf['odometer_km'].mean()
    mean_odometer[brand] = int(odometer)
mean_odometer

# ### Create final table
#
# Here we combine the two summaries into a single table, sorted by mean price.

# In[134]:

mean_odom = pd.Series(mean_odometer).sort_values(ascending=False)
mean_prices = pd.Series(mean_price).sort_values(ascending=False)
final_table = pd.DataFrame(mean_prices, columns=['mean_prices'])
final_table["mean_odometer"] = mean_odom
final_table

# ### Takeaways
#
# The cheapest and lowest-mileage cars seem to be Fiats.
#
# If you have a bit more money, look at Hyundais.
#
# If you have even more money, look at Minis and "other".
#
# Audis, BMWs, and Mercedes are the most expensive and have among the highest mileage. Be cautious.
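# As a closing aside, the two dictionary loops above could be collapsed into a single groupby. This is a sketch of an equivalent approach, not the method used in this notebook.

# In[ ]:

# Sketch: the same per-brand summary via groupby, as an alternative to
# building the mean_price and mean_odometer dictionaries by hand.
summary = (autos.groupby('brand')[['price_usd', 'odometer_km']]
                .mean()
                .round()
                .sort_values('price_usd', ascending=False))
summary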