#!/usr/bin/env python
# coding: utf-8

# In this Guided Project we will work with a dataset of used cars from eBay Kleinanzeigen, a classifieds section of the German eBay website.
#
# The aim of this project is to clean the data and analyze the included used car listings.
#
# The data dictionary provided with the data is as follows:
#
# - dateCrawled - When this ad was first crawled. All field values are taken from this date.
# - name - Name of the car.
# - seller - Whether the seller is private or a dealer.
# - offerType - The type of listing.
# - price - The price on the ad to sell the car.
# - abtest - Whether the listing is included in an A/B test.
# - vehicleType - The vehicle type.
# - yearOfRegistration - The year in which the car was first registered.
# - gearbox - The transmission type.
# - powerPS - The power of the car in PS.
# - model - The car model name.
# - kilometer - How many kilometers the car has driven.
# - monthOfRegistration - The month in which the car was first registered.
# - fuelType - What type of fuel the car uses.
# - brand - The brand of the car.
# - notRepairedDamage - If the car has damage which is not yet repaired.
# - dateCreated - The date on which the eBay listing was created.
# - nrOfPictures - The number of pictures in the ad.
# - postalCode - The postal code for the location of the vehicle.
# - lastSeenOnline - When the crawler last saw this ad online.

# In[1]:

import numpy as np
import pandas as pd

autos = pd.read_csv('autos.csv', encoding='Windows-1252')

# In[2]:

autos

# We want to get a first overview of the data, so we will look at the column info and the first five rows.

# In[3]:

autos.info()
autos.head()

# Our first observations:
#
# - The dataset is a pandas.core.frame.DataFrame with 50,000 entries and 20 columns, most of which are strings. The entries shown begin at 2016-03-14 00:42:12 and end at 2016-03-26 17:47:46.
# - Some columns have null values, but none have more than ~20% null values.
# - The data is split between numbers and strings: int64(5), object(15).
# - The column names use camelcase instead of Python's preferred snakecase, which means we can't just replace spaces with underscores.
# - In the name column, the brand is joined to the rest of the car description with underscores (_), even though the brand also appears in a separate column.
# - The price is not stored as a number but as a string with a currency sign in front of it.
#
# Now it is time to convert all the column names from camelcase to snakecase.

# In[4]:

autos.columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'ab_test',
                 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model',
                 'odometer', 'registration_month', 'fuel_type', 'brand',
                 'unrepaired_damage', 'ad_created', 'num_photos', 'postal_code',
                 'last_seen']
autos.head()

# The new autos DataFrame consists of the same data as before; only the column names have changed from camelcase to snakecase.

# # 3. Basic data exploration
#
# Initially we will look for:
#
# - Text columns where all or almost all values are the same. These can often be dropped as they don't hold useful information for analysis.
# - Examples of numeric data stored as text which can be cleaned and converted.
#
# First we will check the DataFrame's descriptive statistics.

# In[5]:

autos.describe(include='all')

# Observations:
#
# 1. The seller and offer_type columns have practically identical values in every row, so we will delete those columns.
# 2. The num_photos column looks a bit odd, so we will investigate it further.
# 3. The odometer column stores numeric values as text.
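# As an optional aside (not part of the project's required steps), we can check for near-constant text columns programmatically instead of eyeballing the describe() output. This is a minimal sketch using the autos DataFrame from above; the 0.95 threshold is an arbitrary choice for illustration.

# In[ ]:

# For each text column, compute the share of rows taken by the single most
# frequent value. Columns with a share near 1.0 carry almost no information
# and are candidates for dropping.
for col in autos.select_dtypes(include='object').columns:
    top_share = autos[col].value_counts(normalize=True).iloc[0]
    if top_share > 0.95:
        print(col, round(top_share, 4))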
# In[6]:

autos["num_photos"].value_counts()

# It looks like the num_photos column has 0 for every row. We'll drop this column, plus the other two we noted as having mostly one value.

# In[7]:

autos = autos.drop(["num_photos", "seller", "offer_type"], axis=1)

# The odometer column stores numeric values as text. Thus, we have to remove any non-numeric characters and convert the column to a numeric dtype, and we will rename the columns to odometer_km and price_€.

# In[8]:

autos["odometer"] = (autos["odometer"]
                     .str.replace("km", "")
                     .str.replace(",", "")
                     .astype(int))
autos["price"] = (autos["price"]
                  .str.replace("$", "", regex=False)
                  .str.replace(",", "")
                  .astype(int))
autos.rename(columns={'odometer': 'odometer_km'}, inplace=True)
autos.rename(columns={'price': 'price_€'}, inplace=True)
autos[["odometer_km", "price_€"]].head()

# # 4. Exploring the Odometer and Price Columns
#
# We will analyze the columns using minimum and maximum values and look for any values that are unrealistically high or low (outliers) that we might want to remove.

# In[9]:

autos["odometer_km"].unique().shape

# The odometer_km Series has 13 unique values.

# In[10]:

autos["odometer_km"].describe()

# There are 50,000 entries in total. The smallest value in the odometer_km Series is 5,000 and the highest is 150,000; after the conversion above, the column dtype is int64.

# In[11]:

odo_unsorted = autos["odometer_km"].value_counts()
odo_sorted = odo_unsorted.sort_index(ascending=True)
print(odo_sorted)

# We can see that the values in this field are rounded, which might indicate that sellers had to choose from pre-set options. Additionally, there are more high-mileage than low-mileage vehicles, but there are no implausible values.

# In[12]:

print(autos["price_€"].unique().shape)
print(autos["price_€"].describe())
autos["price_€"].value_counts().head(20)

# The description shows that many values are too small to be realistic, and some values are far too high, so we will drop those values.
#
# Again, the prices in this column seem rounded; however, given there are 2,357 unique values in the column, that may just be people's tendency to round prices on the site.
#
# There are 1,421 cars listed with a €0 price. Given that this is only about 2% of the cars, we might consider removing these rows. The maximum price is one hundred million euros, which seems like a lot, so let's look at the highest and lowest prices further.

# In[13]:

autos["price_€"].value_counts().sort_index(ascending=False).head(20)

# In[14]:

autos["price_€"].value_counts().sort_index(ascending=True).head(20)

# There are a number of listings with prices below €30, including about 1,500 at €0. There are also a small number of listings with very high values, including 14 at around or over €1 million.
#
# Given that eBay is an auction site, there could legitimately be items where the opening bid is €1. We will keep the €1 items, but remove anything above €350,000, since it seems that prices increase steadily to that number and then jump up to less realistic numbers.

# In[15]:

autos = autos[autos["price_€"].between(1, 351000)]
autos["price_€"].describe()

# We removed around 1,500 cars from the dataset; the maximum price is now €350,000. The remaining very low-priced entries (starting at €1) are most likely open auctions.
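# As an optional cross-check (a sketch on the cleaned data, not part of the original analysis), we can relate the two cleaned columns to each other: the mean asking price per pre-set mileage bracket should fall as mileage rises.

# In[ ]:

# Mean price per mileage bracket, sorted by mileage. If the cleaning worked,
# higher-mileage brackets should generally show lower mean prices.
autos.groupby("odometer_km")["price_€"].mean().sort_index()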
# # 5. Exploring the date columns
#
# Let's now move on to the date columns and understand the date range the data covers.
#
# There are five columns that should represent date values. Some of these columns were created by the crawler, and some came from the website itself. Right now, the date_crawled, last_seen, and ad_created columns are all identified as string values by pandas. Because these three columns are represented as strings, we need to convert the data into a numerical representation so we can understand it quantitatively.
#
# Let's first understand how the values in the three string columns are formatted.

# In[16]:

autos[['date_crawled', 'ad_created', 'last_seen']][0:5]

# We notice that the first 10 characters represent the day (e.g. 2016-03-12). To understand the date range, we can extract just the date values, use Series.value_counts() to generate a distribution, and then sort by the index.

# In[17]:

autos["date_crawled"].describe()
print(autos["date_crawled"].str[:10].head(10))

# In[18]:

(autos["date_crawled"]
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index(ascending=False))

# It looks like the site was crawled daily over roughly a one-month period in March and April 2016. The distribution of listings crawled on each day is roughly uniform.

# In[19]:

(autos["ad_created"]
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index(ascending=False))

# There is a large variety of ad-created dates. Most fall within 1-2 months of the listing date, but a few are quite old, with the oldest at around nine months.

# In[20]:

(autos["last_seen"]
 .str[:10]
 .value_counts(normalize=True, dropna=False)
 .sort_index(ascending=False))

# The crawler recorded the date it last saw any listing, which allows us to determine on what day a listing was removed, presumably because the car was sold.
#
# The last three days contain a disproportionate amount of 'last seen' values. Given that these are 6-10x the values from the previous days, it's unlikely that there was a massive spike in sales; more likely, these values are related to the crawling period ending and don't indicate car sales.

# # 6. Dealing with Incorrect Registration Year Data

# In[21]:

autos["registration_year"].describe()

# The registration_year column is numeric, but it includes some strange values: the minimum is 1000, long before cars existed, and the maximum is 9999, far in the future. Both are impossible, so we will drop those rows. Because a car can't be first registered after its listing was seen, any year above 2016 is definitely inaccurate; the lowest acceptable value is 1900, so we will use the range 1900-2016.

# In[22]:

autos["registration_year"].value_counts(normalize=True)
autos["registration_year"].describe()

# In[23]:

(~autos["registration_year"].between(1900, 2016)).sum() / autos.shape[0]

# Slightly less than 4% of the 48,565 remaining rows fall outside the 1900-2016 range, so it is safe to remove them.

# In[24]:

autos = autos[autos["registration_year"].between(1900, 2016)]
autos["registration_year"].value_counts(normalize=True).head(10)

# It appears that most of the vehicles were first registered in the past 20 years.
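# To make the "past 20 years" observation concrete, we can bucket the registration years by decade. This is an optional sketch on the cleaned data; the decade bucketing is our own illustration.

# In[ ]:

# Share of listings per registration decade (floor each year to its decade).
decade = (autos["registration_year"] // 10) * 10
decade.value_counts(normalize=True).sort_index(ascending=False)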
# # 7. Exploring Price by Brand

# In[25]:

autos["brand"].value_counts(normalize=True)

# This list shows all brands included in the autos DataFrame. The five brands with the highest share are German manufacturers and together represent around 60% of the total listings. There are lots of brands that don't have a significant percentage of listings, so we will limit our analysis to brands representing more than 5% of total listings.

# In[26]:

brand_counts = autos["brand"].value_counts(normalize=True)
common_brands = brand_counts[brand_counts > .05].index
print(common_brands)

# In[31]:

brand_mean_prices = {}

for brand in common_brands:
    only_brand = autos[autos["brand"] == brand]
    mean_price = only_brand["price_€"].mean()
    brand_mean_prices[brand] = int(mean_price)

brand_mean_prices

# In[32]:

brand_mean_power_ps = {}

for brand in common_brands:
    brand_only = autos[autos["brand"] == brand]
    mean_ps = brand_only["power_ps"].mean()
    brand_mean_power_ps[brand] = int(mean_ps)

brand_mean_power_ps
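# A possible next step (a sketch, assuming the two dictionaries built above): combine the per-brand aggregates into a single DataFrame via pandas Series, so mean price and mean power can be compared side by side. The variable names here are our own.

# In[ ]:

# Turn each dictionary into a Series (keys become the index), then assemble
# a DataFrame with one row per common brand.
bmp_series = pd.Series(brand_mean_prices)
bps_series = pd.Series(brand_mean_power_ps)

brand_info = pd.DataFrame(bmp_series, columns=['mean_price'])
brand_info['mean_power_ps'] = bps_series
brand_info.sort_values('mean_price', ascending=False)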