# Mount Google.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)
#drive.flush_and_unmount()

# Find out where the data is.
!ls "gdrive/MyDrive/Colab Notebooks/gp3_autos/"


import pandas as pd
import numpy as np

autos = pd.read_csv('gdrive/MyDrive/Colab Notebooks/gp3_autos/autos.csv', encoding="Latin-1")


autos.info()

 autos.head()

autos["name"].value_counts()

autos["offerType"].value_counts()

autos.dtypes

autos.describe(include='all')

autos["seller"].value_counts()

autos["offerType"].value_counts()

autos["nrOfPictures"].value_counts()

autos["vehicleType"].iloc[0:22]

autos['gearbox'].value_counts()

autos['brand'].value_counts()

# Copy used to ensure we have a copy of the old columns for reference if needed.
columns_old = autos.columns.copy()
print(columns_old)


columns_new = ['date_crawled', # to snake_case
'name', # no change
'seller', # no change
'offer_type', # to snake_case
'price', # no change
'ab_test', # to snake_case
'vehicle_type', # to snake_case
'registration_year', # clarity
'gearbox', # no change
'power_ps', # to snake_case
'model', # no change
'odometer_km', # more accurate
'registration_month', #clarity
'fuel_type', # to snake_case
'brand', # no change
'unrepaired_damage', # clarity
'ad_created', # to snake_case
'num_pictures', # clarity and to snake_case
'postcode', # common usage
'last_seen'] # to snake_case

# Copy back (rather than assignment) to preserve versions
autos.columns = columns_new.copy()

autos.columns

autos = autos.drop(columns=["num_pictures", "seller", "offer_type"], axis=1)

autos.columns

autos["registration_year"].describe()

autos["registration_year"].value_counts().head()

# initial_year_distribution = autos["registration_year"].value_counts().sort_index()
# initial_year_distribution.plot()

range_bool = autos["registration_year"].between(1980,2020)
final_distribution = autos[range_bool]
final_distribution["registration_year"].value_counts().sort_index().plot()

autos["registration_year"] = final_distribution["registration_year"]

autos["price"].describe()

autos["price"].value_counts().sort_index().plot()

price_bool = autos["price"].between(1,45000)
final_price_distribution = autos[price_bool]
final_price_distribution["price"].value_counts().sort_index().plot()

autos["price"] = final_price_distribution["price"]

autos["price"].describe()

print(autos["date_crawled"].dtypes)
print(autos["last_seen"].dtypes)
print(autos["ad_created"].dtypes)
print(autos["registration_month"].dtypes)
print(autos["registration_year"].dtypes)

autos[['date_crawled','ad_created','last_seen']][0:5]

print(autos['date_crawled'].str[:10].value_counts(dropna=False).sort_index())

autos['date_crawled'] = pd.to_datetime(autos['date_crawled'])
print(autos["date_crawled"].dtype)

print(autos['ad_created'].str[:10].value_counts(dropna=False).sort_index())

bool_2015 = autos["ad_created"].str[:4] == "2015"
slice_2015 = autos[bool_2015]
slice_2015.head()

autos['ad_created'] = pd.to_datetime(autos['ad_created'])
print(autos["ad_created"].dtype)

print(autos['last_seen'].str[:10].value_counts(dropna=False).sort_index())

autos['last_seen'] = pd.to_datetime(autos['last_seen'])
print(autos["last_seen"].dtype)

autos["registration_month"].describe()

autos = autos.drop(columns=["registration_month"], axis=1)
autos.columns

autos["brand"].value_counts(dropna=False, normalize=True).head(16)

mean_price_agg = {}

top_16_index = autos["brand"].value_counts(dropna=False, normalize=True).head(16)

for brand in top_16_index.index:
  brand_slice = autos[autos["brand"] == brand]
  agg_mean = brand_slice["price"].mean()
  mean_price_agg[brand] = round(agg_mean)

print(mean_price_agg)

mean_price_series = pd.Series(mean_price_agg)
mean_price_df = pd.DataFrame(mean_price_series, columns=["mean_price"])
mean_price_df = mean_price_df.sort_values(by=["mean_price"], ascending=False)
mean_price_df


mean_price_df.sort_values(by=["mean_price"], ascending=True).plot.barh()

autos["odometer_km"].describe()

autos["odometer_km"].value_counts()

mean_odo_agg = {}

top_16_index = autos["brand"].value_counts(dropna=False, normalize=True).head(16)

for brand in top_16_index.index:
  brand_slice = autos[autos["brand"] == brand]
  mean_odo = brand_slice["odometer_km"].mean()
  mean_odo_agg[brand] = round(mean_odo/10)

print(mean_odo_agg)


mean_odo_series = pd.Series(mean_odo_agg)
mean_odo_df = pd.DataFrame(mean_odo_series, columns=["mean_odometer"])
mean_odo_df.sort_index()

mean_price_df.sort_index()


join_them = [mean_odo_df, mean_price_df]
mean_odo_price_df = pd.concat(join_them, axis=1)
mean_odo_price_df.sort_values(by="mean_price", ascending=False)

import matplotlib.pyplot as plt

#df2 = df.groupby(['Name', 'Abuse/NFF'])['Name'].count().unstack('Abuse/NFF').fillna(0)
mean_odo_price_df.sort_values(by="mean_price", ascending=False).head(6).plot(kind='bar', stacked=False)