# Mount Google. from google.colab import drive drive.mount("/content/gdrive", force_remount=True) #drive.flush_and_unmount() # Find out where the data is. !ls "gdrive/MyDrive/Colab Notebooks/gp3_autos/" import pandas as pd import numpy as np autos = pd.read_csv('gdrive/MyDrive/Colab Notebooks/gp3_autos/autos.csv', encoding="Latin-1") autos.info() autos.head() autos["name"].value_counts() autos["offerType"].value_counts() autos.dtypes autos.describe(include='all') autos["seller"].value_counts() autos["offerType"].value_counts() autos["nrOfPictures"].value_counts() autos["vehicleType"].iloc[0:22] autos['gearbox'].value_counts() autos['brand'].value_counts() # Copy used to ensure we have a copy of the old columns for reference if needed. columns_old = autos.columns.copy() print(columns_old) columns_new = ['date_crawled', # to snake_case 'name', # no change 'seller', # no change 'offer_type', # to snake_case 'price', # no change 'ab_test', # to snake_case 'vehicle_type', # to snake_case 'registration_year', # clarity 'gearbox', # no change 'power_ps', # to snake_case 'model', # no change 'odometer_km', # more accurate 'registration_month', #clarity 'fuel_type', # to snake_case 'brand', # no change 'unrepaired_damage', # clarity 'ad_created', # to snake_case 'num_pictures', # clarity and to snake_case 'postcode', # common usage 'last_seen'] # to snake_case # Copy back (rather than assignment) to preserve versions autos.columns = columns_new.copy() autos.columns autos = autos.drop(columns=["num_pictures", "seller", "offer_type"], axis=1) autos.columns autos["registration_year"].describe() autos["registration_year"].value_counts().head() # initial_year_distribution = autos["registration_year"].value_counts().sort_index() # initial_year_distribution.plot() range_bool = autos["registration_year"].between(1980,2020) final_distribution = autos[range_bool] final_distribution["registration_year"].value_counts().sort_index().plot() autos["registration_year"] = final_distribution["registration_year"] autos["price"].describe() autos["price"].value_counts().sort_index().plot() price_bool = autos["price"].between(1,45000) final_price_distribution = autos[price_bool] final_price_distribution["price"].value_counts().sort_index().plot() autos["price"] = final_price_distribution["price"] autos["price"].describe() print(autos["date_crawled"].dtypes) print(autos["last_seen"].dtypes) print(autos["ad_created"].dtypes) print(autos["registration_month"].dtypes) print(autos["registration_year"].dtypes) autos[['date_crawled','ad_created','last_seen']][0:5] print(autos['date_crawled'].str[:10].value_counts(dropna=False).sort_index()) autos['date_crawled'] = pd.to_datetime(autos['date_crawled']) print(autos["date_crawled"].dtype) print(autos['ad_created'].str[:10].value_counts(dropna=False).sort_index()) bool_2015 = autos["ad_created"].str[:4] == "2015" slice_2015 = autos[bool_2015] slice_2015.head() autos['ad_created'] = pd.to_datetime(autos['ad_created']) print(autos["ad_created"].dtype) print(autos['last_seen'].str[:10].value_counts(dropna=False).sort_index()) autos['last_seen'] = pd.to_datetime(autos['last_seen']) print(autos["last_seen"].dtype) autos["registration_month"].describe() autos = autos.drop(columns=["registration_month"], axis=1) autos.columns autos["brand"].value_counts(dropna=False, normalize=True).head(16) mean_price_agg = {} top_16_index = autos["brand"].value_counts(dropna=False, normalize=True).head(16) for brand in top_16_index.index: brand_slice = autos[autos["brand"] == brand] agg_mean = brand_slice["price"].mean() mean_price_agg[brand] = round(agg_mean) print(mean_price_agg) mean_price_series = pd.Series(mean_price_agg) mean_price_df = pd.DataFrame(mean_price_series, columns=["mean_price"]) mean_price_df = mean_price_df.sort_values(by=["mean_price"], ascending=False) mean_price_df mean_price_df.sort_values(by=["mean_price"], ascending=True).plot.barh() autos["odometer_km"].describe() autos["odometer_km"].value_counts() mean_odo_agg = {} top_16_index = autos["brand"].value_counts(dropna=False, normalize=True).head(16) for brand in top_16_index.index: brand_slice = autos[autos["brand"] == brand] mean_odo = brand_slice["odometer_km"].mean() mean_odo_agg[brand] = round(mean_odo/10) print(mean_odo_agg) mean_odo_series = pd.Series(mean_odo_agg) mean_odo_df = pd.DataFrame(mean_odo_series, columns=["mean_odometer"]) mean_odo_df.sort_index() mean_price_df.sort_index() join_them = [mean_odo_df, mean_price_df] mean_odo_price_df = pd.concat(join_them, axis=1) mean_odo_price_df.sort_values(by="mean_price", ascending=False) import matplotlib.pyplot as plt #df2 = df.groupby(['Name', 'Abuse/NFF'])['Name'].count().unstack('Abuse/NFF').fillna(0) mean_odo_price_df.sort_values(by="mean_price", ascending=False).head(6).plot(kind='bar', stacked=False)