#!/usr/bin/env python
# coding: utf-8

# ### Intro
#
# Okay, here we go with the project. I hope I can get it to work. We'll be importing a dataset of used-car listings from eBay Kleinanzeigen, the German eBay classifieds site, doing some cleanup, and then a bit of analysis.

# In[86]:

import numpy as np
import pandas as pd

autos = pd.read_csv("autos.csv", encoding="Latin-1")

# In[87]:

autos

# In[88]:

autos.info()
autos.head(3)

# ### Some initial observations
#
# We have 20 columns, named in camelCase instead of snake_case.
#
# - dateCrawled: maybe convert to day and drop the time?
# - price: remove the dollar sign, convert to integer, rename the column
# - vehicleType: missing some data
# - gearbox: missing some data
# - model: missing some data
# - odometer: remove "km", convert to integer, rename the column
# - fuelType: missing some data
# - notRepairedDamage: missing data
# - dateCreated: maybe convert to day and drop the time?
# - lastSeen: maybe convert to day and drop the time?

# In[89]:

autos.columns

# In[90]:

# First, let's convert from camelCase to snake_case.
autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price_usd',
                 'abtest', 'vehicle_type', 'registration_year', 'gearbox',
                 'power_ps', 'model', 'odometer', 'registration_month',
                 'fuel_type', 'brand', 'unrepaired_damage', 'ad_created',
                 'number_pictures', 'postal_code', 'last_seen']
autos.head()

# In[91]:

autos.describe(include='all')

# ### Data cleanup
#
# It looks like we can drop number_pictures, seller, and offer_type, since they provide almost no information. The abtest column appears to record whether a listing was part of the site's A/B test, which isn't relevant here. Are the crawl date, ad creation date, and last-seen date relevant?
#
# Maybe: if a car hasn't sold in a while, there might be a reason. We'll focus on cars that have been listed more recently.

# In[92]:

autos = autos.drop(['seller', 'offer_type', 'number_pictures', 'abtest'], axis=1)

# Some more cleanup: odometer needs to be a number and its column needs renaming. Price also needs to be a number -- we've already renamed that column.

# In[93]:

# Cleaning up the price and odometer columns.
# regex=False ensures "$" is treated literally, not as a regex anchor.
autos["price_usd"] = autos["price_usd"].str.replace("$", "", regex=False)
autos["price_usd"] = autos["price_usd"].str.replace(",", "", regex=False).astype(int)
autos["odometer"] = autos["odometer"].str.replace("km", "", regex=False)
autos["odometer"] = autos["odometer"].str.replace(",", "", regex=False).astype(int)
autos.rename(columns={'odometer': 'odometer_km'}, inplace=True)

# ### First exploration of data

# In[94]:

# Let's take a look to find weird data in the odometer readings.
autos['odometer_km'].shape

# In[95]:

autos["odometer_km"].describe()

# Well, that doesn't tell us much. The odometer field is capped at 150,000 km, so a car listed at 150,000 really means "150,000 or above" -- who knows how many kilometers are actually on it.
#
# Let's see how many cars are at 150,000 km or more.

# In[96]:

autos["odometer_km"].value_counts().sort_index(ascending=False).head(10)

# Wow, that's a lot. We'll probably filter out those high-mileage cars later on.
#
# Let's look at the price now.

# In[97]:

autos['price_usd'].shape

# In[98]:

autos["price_usd"].describe()

# In[99]:

autos["price_usd"].value_counts().sort_index(ascending=False).head(50)

# Wow. There are a few cars that are crazy expensive. I'm guessing some of those are typos; let's take a look...

# In[100]:

autos[autos["price_usd"] > 150000].head(50)
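# Before picking price cutoffs by eye, a quick quantile check can show where the bulk of listings sits. This is just a sketch to sanity-check the thresholds chosen below; it wasn't part of the original cleanup.

# In[ ]:

# Sketch: price quantiles as a sanity check for outlier cutoffs.
# Listings far outside the 1st-99th percentile band are candidate
# typos or junk entries.
autos["price_usd"].quantile([0.01, 0.05, 0.50, 0.95, 0.99])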
# ### Remove bad data and cars that aren't priced right
#
# Looks like there are some typos in there. Let's get rid of all cars priced above 200,000 -- there's only one legitimate entry up there (a Ferrari), and we can't afford it anyway...

# In[101]:

autos = autos[autos.price_usd < 200000]

# There's also a bunch of cars that are way too cheap. We aren't in the market for a really bad car, so let's get rid of those too.

# In[102]:

autos = autos[autos.price_usd > 2500]

# Now let's take a quick look at the data we have left. First, make sure we got rid of the high prices (sorting by price rather than by index, so the most expensive cars come first).

# In[103]:

autos[autos["price_usd"] > 150000].sort_values("price_usd", ascending=False).head(50)

# Now we'll check that we got rid of the low values.

# In[104]:

autos[autos["price_usd"] < 5000].sort_values("price_usd", ascending=True).head(50)

# Now let's take a look at where we are...

# In[105]:

autos.info()

# ### Exploring the date info
#
# Let's see if there's any interesting info in the dates. First, let's grab a sample of the data.

# In[106]:

autos[['date_crawled', 'ad_created', 'last_seen']][0:5]

# Hm. The timestamp down to the second isn't very meaningful. Let's strip out the time and see what we're left with.

# In[107]:

autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# Looks like a fairly even distribution, with about 3% of the total on each day. There are some exceptions -- Sundays don't seem to have as many crawl dates. Is this meaningful? Probably not. Moving on.

# In[108]:

autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# Looks like the most recent dates have the most ads created. We should be sure to filter out cars advertised more than a month or so ago...

# In[109]:

autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index(ascending=True)

# Looks like people check in reasonably frequently. Let's take a closer look at how old these cars are...

# In[110]:

autos['registration_year'].describe()

# ### More data cleanup based on registration date
#
# We have some outliers in registration year -- let's remove registrations that are too old or in the future. We're only looking for cars less than six years old or so, so we'll start with that (remembering we only have data up to 2016).

# In[111]:

autos = autos[autos.registration_year >= 2010]

# Okay, now let's see where we are...

# In[112]:

autos['registration_year'].describe()

# We still need to get rid of the high values...

# In[113]:

autos = autos[autos.registration_year <= 2018]

# Another look...

# In[114]:

autos['registration_year'].describe()

# Registration years now run from 2010 to 2018, which is the range we want.

# ### Get rid of any cars with unrepaired damage
#
# Okay, now we can take a closer look at some cars we might want to buy. First, let's get rid of any cars with damage...

# In[115]:

autos['unrepaired_damage'].describe()

# In[116]:

autos['unrepaired_damage'].unique()

# Let's drop any cars with unrepaired damage ('ja' is German for "yes").

# In[117]:

autos = autos[autos.unrepaired_damage != 'ja']

# Let's take a look at what we're left with.

# In[118]:

autos.describe()

# Okay, we're down to about 7,000 possibilities. Let's look at the distribution of brands.

# In[119]:

autos['brand'].value_counts(normalize=True)
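# Before settling on an aggregation threshold, it can help to see how quickly brand share accumulates. This cumulative view is a sketch to motivate the 2% cutoff used below; it wasn't part of the original analysis.

# In[ ]:

# Sketch: cumulative brand share -- shows how few brands account for
# most of the listings.
brand_share = autos['brand'].value_counts(normalize=True)
brand_share.cumsum().head(10)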
# ### Aggregate the long tail of brands
#
# We want the biggest selection of cars, but we don't want to spend too much time sorting through the long tail. Instead, we'll take any brand with less than 2% of the overall share and aggregate it as "other".

# In[120]:

brand_quantity = autos['brand'].value_counts(normalize=True)
brands_to_aggregate = brand_quantity[brand_quantity < 0.02].index
print(brands_to_aggregate)

# In[121]:

# Relabel every long-tail brand as "other" in a single vectorized step.
autos.loc[autos['brand'].isin(brands_to_aggregate), 'brand'] = 'other'

# Now take a look at what we have.

# In[122]:

autos['brand'].value_counts(normalize=True)

# ### Explore remaining data
#
# Now that we have aggregated our brands, let's take a look at price and mileage. First, we'll get an array of the brands so that we can calculate mean prices in a loop.

# In[126]:

brand_explore = autos['brand'].unique()
brand_explore

# Now we calculate the mean price for each brand.

# In[131]:

mean_price = {}
for brand in brand_explore:
    branddf = autos[autos['brand'] == brand]
    mean = branddf['price_usd'].mean()
    mean_price[brand] = int(mean)
mean_price

# Let's do the same thing for mileage.

# In[130]:

mean_odometer = {}
for brand in brand_explore:
    branddf = autos[autos['brand'] == brand]
    odometer = branddf['odometer_km'].mean()
    mean_odometer[brand] = int(odometer)
mean_odometer

# ### Create final table
#
# Here we combine the two summaries into a single table, sorted by mean price.

# In[134]:

mean_odom = pd.Series(mean_odometer).sort_values(ascending=False)
mean_prices = pd.Series(mean_price).sort_values(ascending=False)
final_table = pd.DataFrame(mean_prices, columns=['mean_prices'])
final_table["mean_odometer"] = mean_odom
final_table

# ### Takeaways
#
# The cheapest and lowest-mileage cars seem to be Fiats.
#
# If you have a bit more money, look at Hyundais.
#
# If you have even more money, look at Minis and "other".
#
# Audis, BMWs, and Mercedes are the most expensive and have among the highest mileage. Be cautious.
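# As a closing aside, the two dictionary loops above could be collapsed into a single groupby. This is a sketch of an equivalent approach, not the method used in this notebook.

# In[ ]:

# Sketch: the same per-brand summary via groupby, as an alternative to
# building the mean_price and mean_odometer dictionaries by hand.
summary = (autos.groupby('brand')[['price_usd', 'odometer_km']]
                .mean()
                .round()
                .sort_values('price_usd', ascending=False))
summary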