#!/usr/bin/env python
# coding: utf-8

# In[111]:


# This dataset is pulled from the used cars from eBay Kleinanzeigen of the German Ebay Website
# The data dictionary provided with data is as follows:

# dateCrawled - When this ad was first crawled. All field-values are taken from this date.
# name - Name of the car.
# seller - Whether the seller is private or a dealer.
# offerType - The type of listing
# price - The price on the ad to sell the car.
# abtest - Whether the listing is included in an A/B test.
# vehicleType - The vehicle Type.
# yearOfRegistration - The year in which the car was first registered.
# gearbox - The transmission type.
# powerPS - The power of the car in PS.
# model - The car model name.
# kilometre - How many kilometers the car has driven.
# monthOfRegistration - The month in which the car was first registered.
# fuelType - What type of fuel the car uses.
# brand - The brand of the car.
# notRepairedDamage - If the car has a damage which is not yet repaired.
# dateCreated - The date on which the eBay listing was created.
# nrOfPictures - The number of pictures in the ad.
# postalCode - The postal code for the location of the vehicle.
# lastSeenOnline - When the crawler saw this ad last online.

#Aim of the project is to clean the data


# In[112]:


import numpy
import pandas


# In[113]:


autos = pandas.read_csv('autos.csv', encoding='Latin-1')


# In[114]:


autos


# In[115]:


autos.info()
autos.head(5)


# In[116]:


autos.columns


# In[117]:


autos_copy = autos.copy()
columns = autos_copy.columns
print(columns)


# In[118]:


autos_copy = autos.rename(columns = {
'dateCrawled':'date_crawled',
'name':'name',
'seller':'seller',
'offerType':'offer_type',
'price':'price',
'abtest':'abtest',
'vehicleType':'vehicle_type',
'yearOfRegistration':'registration_year',
'gearbox':'gearbox',
'powerPS':'power_ps',
'model':'model',
'odometer':'odometer_km',
'monthOfRegistration':'registration_month',
'fuelType':'fuel_type',
'brand':'brand',
'notRepairedDamage':'unrepaired_damage',
'dateCreated':'ad_created',
'nrOfPictures':'nr_of_pictures',
'postalCode':'postal_code',
'lastSeen':'last_seen'})


# In[119]:


autos_copy.head()


# In[120]:


#Seller and offer_type are all the same value
autos_copy.describe(include='all')


# In[121]:


#convert odometer to integer
autos_copy["odometer_km"] = autos_copy["odometer_km"].str.replace(',|km','')
autos_copy["odometer_km"] = autos_copy["odometer_km"].astype(float)


# In[122]:


#convert price to integer
autos_copy["price"] = autos_copy["price"].str.replace('\$|,','')
autos_copy["price"] = autos_copy["price"].astype(float)


# In[123]:


#Find unique prices for vechiles
autos_copy["price"].unique().shape


# In[124]:


#Find unique kms per vehicle
autos_copy["price"].describe()


# In[125]:


#Find unique price per vehicle
autos_copy["price"].value_counts().sort_index(ascending=False).head(20)


# In[126]:


autos_copy["price"].value_counts().sort_index(ascending=False).tail(20)


# In[127]:


#There is too much variation between the pricing on the vechiles
#Pricing above 350,000 appears to be random key entries or errorenous data
#removal of vechiles with zero cost
autos_copy[autos_copy["price"].between(1,350000)]


# In[128]:


#Find unique odometer readings for vechiles
autos_copy["odometer_km"].unique().shape


# In[129]:


autos_copy["odometer_km"].describe()


# In[130]:


autos_copy["odometer_km"].value_counts().sort_index(ascending=False).head(20)


# In[131]:


#Review of the dataset does not appear to have any outliers, it is possible for vehicles
#to have 150000, this would indicate vehicle are generally put up for sale when
#they reach this odometer milestone
autos_copy["odometer_km"].value_counts().sort_index(ascending=False).tail(20)


# In[132]:


autos_copy[['date_crawled','ad_created','last_seen']][0:5]


# In[133]:


#Majority of the data crawled is within the month of March, it would appear that the 5th/6th of the month of March and April
#appear to be a period of low activity, but not enough data to make a conclusive statement.
autos_copy['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()


# In[134]:


#Dataset would indicate that most of the ads where created during the month of March

autos_copy['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index()


# In[135]:


autos_copy['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()


# In[136]:


autos_copy['registration_year'].describe()

#There are 50000 vehicles listed, there appears to be erroneous data as the min is 1000
# and max is 9999, this data will require clean up.


# In[137]:


autos_copy['registration_year'].value_counts(normalize=True).sort_index()


# In[138]:


#Vechile Registration year before 1910 and after 2016 marked as erroneous data
(autos_copy['registration_year'] < 1910).value_counts(normalize=True)


# In[139]:


autos_copy = autos_copy[(autos_copy['registration_year'] >= 1910) & (autos_copy['registration_year'] <= 2016)]


# In[140]:


autos_copy.head(5)


# In[141]:


#Significant majority vehicles were registered in the last 30 years. With most of these vehicles being 5 - 15 years
#old. From 1995 - 2012. Indicating people tend to sell vehicles which are over 5 years old
autos_copy['registration_year'].value_counts().sort_index()


# In[142]:


autos_copy['brand'].value_counts(normalize=True).sort_values(ascending=False)


# In[143]:


brand_counts = autos_copy['brand'].value_counts(normalize=True)
top_brands = brand_counts[brand_counts > .05].index


# In[144]:


top_brands
#Volkswagen is the market leader with over 21% market share


# In[145]:


autos_copy.groupby('brand')['price'].mean()


# In[146]:


autos_copy.groupby(['brand'])['price'].mean().sort_values(ascending=False)


# In[147]:


brand_mean_price ={}
for brand in top_brands:
    top_brand = autos_copy[autos_copy['brand'] == brand]
    mean_price = top_brand['price'].mean()
    brand_mean_price[brand] = int(mean_price)


# In[148]:


print(mean_data)
#The data would indicate that of the top brand ie. have greater than 5% market
#Mercedes is the most expensive car on average
#Opel is the least expensive car average


# In[149]:


bmp_series = pandas.Series(brand_mean_price)
print(bmp_series)


# In[150]:


brand_mean_mileage ={}
for brand in top_brands:
    top_brand = autos_copy[autos_copy['brand'] == brand]
    mean_mileage = top_brand['odometer_km'].mean()
    brand_mean_mileage[brand] = int(mean_mileage)


# In[151]:


print(brand_mean_mileage)


# In[152]:


mean_price = pandas.Series(brand_mean_price).sort_values(ascending=False)
mean_mileage = pandas.Series(brand_mean_mileage).sort_values(ascending=False)


# In[153]:


print(mean_price)
print(type(mean_price))
print(mean_mileage)
print(type(mean_mileage))


# In[154]:


pandas.DataFrame([mean_price]).transpose()


# In[155]:


pandas.concat([mean_price,mean_mileage],keys=['mean_price','mean_mileage'],axis=1)


# The data does not indicate that the more expensive brand have lower odometer readings (mileage)
# Mercedes has the highest price and the second highest mileage
# Opel has the lowest mean price with second lowest average mileage
# My assessment is the current comparison of automaker, mean price and mileage makes it difficult to infer any meaningful insights.