#Import relevant libraries
import numpy as np
import pandas as pd
autos = pd.read_csv('autos.csv', encoding= 'Latin-1')
autos
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 2016-03-27 14:38:19 | Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon | privat | Angebot | $24,900 | control | limousine | 2011 | automatik | 239 | q5 | 100,000km | 1 | diesel | audi | nein | 2016-03-27 00:00:00 | 0 | 82131 | 2016-04-01 13:47:40 |
49996 | 2016-03-28 10:50:25 | Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+... | privat | Angebot | $1,980 | control | cabrio | 1996 | manuell | 75 | astra | 150,000km | 5 | benzin | opel | nein | 2016-03-28 00:00:00 | 0 | 44807 | 2016-04-02 14:18:02 |
49997 | 2016-04-02 14:44:48 | Fiat_500_C_1.2_Dualogic_Lounge | privat | Angebot | $13,200 | test | cabrio | 2014 | automatik | 69 | 500 | 5,000km | 11 | benzin | fiat | nein | 2016-04-02 00:00:00 | 0 | 73430 | 2016-04-04 11:47:27 |
49998 | 2016-03-08 19:25:42 | Audi_A3_2.0_TDI_Sportback_Ambition | privat | Angebot | $22,900 | control | kombi | 2013 | manuell | 150 | a3 | 40,000km | 11 | diesel | audi | nein | 2016-03-08 00:00:00 | 0 | 35683 | 2016-04-05 16:45:07 |
49999 | 2016-03-14 00:42:12 | Opel_Vectra_1.6_16V | privat | Angebot | $1,250 | control | limousine | 1996 | manuell | 101 | vectra | 150,000km | 1 | benzin | opel | nein | 2016-03-13 00:00:00 | 0 | 45897 | 2016-04-06 21:18:48 |
50000 rows × 20 columns
autos.head()
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
autos.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dateCrawled 50000 non-null object 1 name 50000 non-null object 2 seller 50000 non-null object 3 offerType 50000 non-null object 4 price 50000 non-null object 5 abtest 50000 non-null object 6 vehicleType 44905 non-null object 7 yearOfRegistration 50000 non-null int64 8 gearbox 47320 non-null object 9 powerPS 50000 non-null int64 10 model 47242 non-null object 11 odometer 50000 non-null object 12 monthOfRegistration 50000 non-null int64 13 fuelType 45518 non-null object 14 brand 50000 non-null object 15 notRepairedDamage 40171 non-null object 16 dateCreated 50000 non-null object 17 nrOfPictures 50000 non-null int64 18 postalCode 50000 non-null int64 19 lastSeen 50000 non-null object dtypes: int64(5), object(15) memory usage: 7.6+ MB
The columns information shows we have 5 interger types columns and 15 object types columns. Some numeric data are being stored as object, example is the price of the car. Also, the date columns are stored as objects as well. Finally, some columns have missing/null values in them
columns= autos.columns
columns
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'odometer', 'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen'], dtype='object')
new_columns = ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model',
'odometer', 'registration_month', 'fuel_type', 'brand',
'unrepaired_amage', 'ad_created', 'nr_of_pictures', 'postal_code',
'last_seen']
autos.columns = new_columns
autos.head()
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_amage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
In the cells above, I have changed the names of a few columns to something new. At the same time, I changed the rest of the columns from CamelCase to snake_case
autos.describe(include='all')
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer | registration_month | fuel_type | brand | unrepaired_amage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000 | 50000 | 50000 | 50000 | 50000 | 50000 | 44905 | 50000.000000 | 47320 | 50000.000000 | 47242 | 50000 | 50000.000000 | 45518 | 50000 | 40171 | 50000 | 50000.0 | 50000.000000 | 50000 |
unique | 48213 | 38754 | 2 | 2 | 2357 | 2 | 8 | NaN | 2 | NaN | 245 | 13 | NaN | 7 | 40 | 2 | 76 | NaN | NaN | 39481 |
top | 2016-03-10 15:36:24 | Ford_Fiesta | privat | Angebot | $0 | test | limousine | NaN | manuell | NaN | golf | 150,000km | NaN | benzin | volkswagen | nein | 2016-04-03 00:00:00 | NaN | NaN | 2016-04-07 06:17:27 |
freq | 3 | 78 | 49999 | 49999 | 1421 | 25756 | 12859 | NaN | 36993 | NaN | 4024 | 32424 | NaN | 30107 | 10687 | 35232 | 1946 | NaN | NaN | 8 |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2005.073280 | NaN | 116.355920 | NaN | NaN | 5.723360 | NaN | NaN | NaN | NaN | 0.0 | 50813.627300 | NaN |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 105.712813 | NaN | 209.216627 | NaN | NaN | 3.711984 | NaN | NaN | NaN | NaN | 0.0 | 25779.747957 | NaN |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | 1067.000000 | NaN |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1999.000000 | NaN | 70.000000 | NaN | NaN | 3.000000 | NaN | NaN | NaN | NaN | 0.0 | 30451.000000 | NaN |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2003.000000 | NaN | 105.000000 | NaN | NaN | 6.000000 | NaN | NaN | NaN | NaN | 0.0 | 49577.000000 | NaN |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2008.000000 | NaN | 150.000000 | NaN | NaN | 9.000000 | NaN | NaN | NaN | NaN | 0.0 | 71540.000000 | NaN |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9999.000000 | NaN | 17700.000000 | NaN | NaN | 12.000000 | NaN | NaN | NaN | NaN | 0.0 | 99998.000000 | NaN |
autos.price.value_counts()
$0 1421 $500 781 $1,500 734 $2,500 643 $1,000 639 ... $1,169 1 $8,180 1 $193 1 $135,000 1 $29,970 1 Name: price, Length: 2357, dtype: int64
autos.odometer.value_counts()
150,000km 32424 125,000km 5170 100,000km 2169 90,000km 1757 80,000km 1436 70,000km 1230 60,000km 1164 50,000km 1027 5,000km 967 40,000km 819 30,000km 789 20,000km 784 10,000km 264 Name: odometer, dtype: int64
#Lets clean up the price and odometer columns
autos.price = autos.price.str.replace('$','')
autos.price = autos.price.str.replace(',','')
#convert to int
autos.price = autos.price.astype(int)
autos.odometer = autos.odometer.str.replace('km','')
autos.odometer = autos.odometer.str.replace(',','')
#convert to int
autos.odometer = autos.odometer.astype(int)
#rename the column
autos.rename({'odometer':'odometer_km'}, axis=1, inplace=True)
autos.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date_crawled 50000 non-null object 1 name 50000 non-null object 2 seller 50000 non-null object 3 offer_type 50000 non-null object 4 price 50000 non-null int64 5 abtest 50000 non-null object 6 vehicle_type 44905 non-null object 7 registration_year 50000 non-null int64 8 gearbox 47320 non-null object 9 power_ps 50000 non-null int64 10 model 47242 non-null object 11 odometer_km 50000 non-null int64 12 registration_month 50000 non-null int64 13 fuel_type 45518 non-null object 14 brand 50000 non-null object 15 unrepaired_amage 40171 non-null object 16 ad_created 50000 non-null object 17 nr_of_pictures 50000 non-null int64 18 postal_code 50000 non-null int64 19 last_seen 50000 non-null object dtypes: int64(7), object(13) memory usage: 7.6+ MB
autos[['odometer_km','price']].describe()
odometer_km | price | |
---|---|---|
count | 50000.000000 | 5.000000e+04 |
mean | 125732.700000 | 9.840044e+03 |
std | 40042.211706 | 4.811044e+05 |
min | 5000.000000 | 0.000000e+00 |
25% | 125000.000000 | 1.100000e+03 |
50% | 150000.000000 | 2.950000e+03 |
75% | 150000.000000 | 7.200000e+03 |
max | 150000.000000 | 1.000000e+08 |
autos.price.value_counts().sort_index(ascending=False).head()
#or dot tail()
99999999 1 27322222 1 12345678 3 11111111 2 10000000 1 Name: price, dtype: int64
autos.odometer_km.value_counts().sort_index(ascending=True)
5000 967 10000 264 20000 784 30000 789 40000 819 50000 1027 60000 1164 70000 1230 80000 1436 90000 1757 100000 2169 125000 5170 150000 32424 Name: odometer_km, dtype: int64
# Remove outliers from the data
autos = autos[(autos['price'] >= 1000) & (autos['price'] <= 7200)]
autos = autos[autos['odometer_km'].between(50000,150000)].reset_index(drop=True)
autos.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 25338 entries, 0 to 25337 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date_crawled 25338 non-null object 1 name 25338 non-null object 2 seller 25338 non-null object 3 offer_type 25338 non-null object 4 price 25338 non-null int64 5 abtest 25338 non-null object 6 vehicle_type 23183 non-null object 7 registration_year 25338 non-null int64 8 gearbox 24243 non-null object 9 power_ps 25338 non-null int64 10 model 24261 non-null object 11 odometer_km 25338 non-null int64 12 registration_month 25338 non-null int64 13 fuel_type 23488 non-null object 14 brand 25338 non-null object 15 unrepaired_amage 20819 non-null object 16 ad_created 25338 non-null object 17 nr_of_pictures 25338 non-null int64 18 postal_code 25338 non-null int64 19 last_seen 25338 non-null object dtypes: int64(7), object(13) memory usage: 3.9+ MB
autos.shape
(25338, 20)
autos.head()
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer_km | registration_month | fuel_type | brand | unrepaired_amage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | 5000 | control | bus | 2004 | manuell | 158 | andere | 150000 | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | 4350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70000 | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
2 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | 1350 | test | kombi | 2003 | manuell | 0 | focus | 150000 | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
3 | 2016-03-16 18:55:19 | Golf_IV_1.9_TDI_90PS | privat | Angebot | 1990 | control | limousine | 1998 | manuell | 90 | golf | 150000 | 12 | diesel | volkswagen | nein | 2016-03-16 00:00:00 | 0 | 53474 | 2016-04-07 03:17:32 |
4 | 2016-03-31 19:48:22 | Smart_smart_fortwo_coupe_softouch_pure_MHD_Pan... | privat | Angebot | 5299 | control | kleinwagen | 2010 | automatik | 71 | fortwo | 50000 | 9 | benzin | smart | nein | 2016-03-31 00:00:00 | 0 | 34590 | 2016-04-06 14:17:52 |
autos[['date_crawled','ad_created','last_seen']].head()
date_crawled | ad_created | last_seen | |
---|---|---|---|
0 | 2016-03-26 17:47:46 | 2016-03-26 00:00:00 | 2016-04-06 06:45:54 |
1 | 2016-03-12 16:58:10 | 2016-03-12 00:00:00 | 2016-03-15 03:16:28 |
2 | 2016-04-01 14:38:50 | 2016-04-01 00:00:00 | 2016-04-01 14:38:50 |
3 | 2016-03-16 18:55:19 | 2016-03-16 00:00:00 | 2016-04-07 03:17:32 |
4 | 2016-03-31 19:48:22 | 2016-03-31 00:00:00 | 2016-04-06 14:17:52 |
# extract just the date part of the columns above
autos.date_crawled = autos.date_crawled.str[:10]
autos.ad_created = autos.ad_created.str[:10]
autos.last_seen = autos.last_seen.str[:10]
#Show distribution
autos.date_crawled.value_counts(normalize=True, dropna=False).sort_index()
2016-03-05 0.025416 2016-03-06 0.013971 2016-03-07 0.035362 2016-03-08 0.034217 2016-03-09 0.033191 2016-03-10 0.034020 2016-03-11 0.033310 2016-03-12 0.037769 2016-03-13 0.014879 2016-03-14 0.037177 2016-03-15 0.034020 2016-03-16 0.029284 2016-03-17 0.030942 2016-03-18 0.012590 2016-03-19 0.034296 2016-03-20 0.037888 2016-03-21 0.037493 2016-03-22 0.033349 2016-03-23 0.031810 2016-03-24 0.028258 2016-03-25 0.029994 2016-03-26 0.033546 2016-03-27 0.031494 2016-03-28 0.035244 2016-03-29 0.033586 2016-03-30 0.033349 2016-03-31 0.031415 2016-04-01 0.032797 2016-04-02 0.035086 2016-04-03 0.037967 2016-04-04 0.037454 2016-04-05 0.013419 2016-04-06 0.003947 2016-04-07 0.001460 Name: date_crawled, dtype: float64
autos.ad_created.value_counts(normalize=True, dropna=False).sort_index()
2015-08-10 0.000039 2015-11-10 0.000039 2015-12-30 0.000039 2016-01-07 0.000039 2016-01-13 0.000039 2016-01-27 0.000118 2016-01-29 0.000039 2016-02-01 0.000039 2016-02-02 0.000079 2016-02-05 0.000039 2016-02-11 0.000039 2016-02-12 0.000079 2016-02-16 0.000039 2016-02-18 0.000079 2016-02-19 0.000039 2016-02-20 0.000039 2016-02-21 0.000039 2016-02-23 0.000039 2016-02-25 0.000039 2016-02-27 0.000158 2016-02-28 0.000197 2016-02-29 0.000158 2016-03-01 0.000118 2016-03-02 0.000079 2016-03-03 0.000908 2016-03-04 0.001618 2016-03-05 0.023088 2016-03-06 0.015313 2016-03-07 0.034178 2016-03-08 0.033823 2016-03-09 0.033389 2016-03-10 0.033744 2016-03-11 0.033389 2016-03-12 0.037769 2016-03-13 0.016260 2016-03-14 0.035638 2016-03-15 0.034020 2016-03-16 0.029916 2016-03-17 0.030429 2016-03-18 0.013142 2016-03-19 0.033231 2016-03-20 0.038046 2016-03-21 0.037809 2016-03-22 0.033152 2016-03-23 0.031692 2016-03-24 0.027942 2016-03-25 0.030350 2016-03-26 0.033823 2016-03-27 0.031060 2016-03-28 0.035165 2016-03-29 0.033902 2016-03-30 0.033112 2016-03-31 0.031336 2016-04-01 0.032836 2016-04-02 0.034573 2016-04-03 0.038322 2016-04-04 0.038164 2016-04-05 0.011919 2016-04-06 0.003947 2016-04-07 0.001342 Name: ad_created, dtype: float64
autos.last_seen.value_counts(normalize=True, dropna=False).sort_index()
2016-03-05 0.001263 2016-03-06 0.004736 2016-03-07 0.005762 2016-03-08 0.007972 2016-03-09 0.010419 2016-03-10 0.010932 2016-03-11 0.014208 2016-03-12 0.025772 2016-03-13 0.010301 2016-03-14 0.013024 2016-03-15 0.016655 2016-03-16 0.018312 2016-03-17 0.030152 2016-03-18 0.007933 2016-03-19 0.016576 2016-03-20 0.021785 2016-03-21 0.021233 2016-03-22 0.022891 2016-03-23 0.019102 2016-03-24 0.021272 2016-03-25 0.019339 2016-03-26 0.017957 2016-03-27 0.015392 2016-03-28 0.021115 2016-03-29 0.021983 2016-03-30 0.025377 2016-03-31 0.024982 2016-04-01 0.024785 2016-04-02 0.026679 2016-04-03 0.025377 2016-04-04 0.025416 2016-04-05 0.118597 2016-04-06 0.210159 2016-04-07 0.122543 Name: last_seen, dtype: float64
autos.registration_year.describe()
count 25338.000000 mean 2002.890481 std 6.260992 min 1934.000000 25% 2000.000000 50% 2003.000000 75% 2006.000000 max 2019.000000 Name: registration_year, dtype: float64
autos = autos[autos['registration_year'].between(1910,2016)]
autos.registration_year.value_counts(normalize=True).sort_index()
1934 0.000041 1952 0.000041 1953 0.000041 1956 0.000041 1958 0.000083 ... 2012 0.004585 2013 0.000991 2014 0.000248 2015 0.000041 2016 0.020610 Name: registration_year, Length: 61, dtype: float64
autos.columns
Index(['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest', 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model', 'odometer_km', 'registration_month', 'fuel_type', 'brand', 'unrepaired_amage', 'ad_created', 'nr_of_pictures', 'postal_code', 'last_seen'], dtype='object')
autos.brand.unique()
array(['peugeot', 'smart', 'ford', 'volkswagen', 'audi', 'renault', 'mazda', 'mini', 'mercedes_benz', 'bmw', 'toyota', 'opel', 'jeep', 'dacia', 'saab', 'volvo', 'seat', 'nissan', 'skoda', 'subaru', 'sonstige_autos', 'fiat', 'mitsubishi', 'hyundai', 'chevrolet', 'chrysler', 'honda', 'suzuki', 'kia', 'trabant', 'citroen', 'land_rover', 'alfa_romeo', 'rover', 'daihatsu', 'daewoo', 'jaguar', 'lancia', 'lada', 'porsche'], dtype=object)
#What are the percentages of each brand in our data set
brands = {}
for item in autos.brand:
count = 0
if item not in brands:
brands[item] = 1
else:
brands[item] += 1
import operator
brands_sorted = sorted(brands.items(), key=operator.itemgetter(1), reverse=True)
brands_sorted[:7]
[('volkswagen', 5160), ('opel', 2746), ('bmw', 2450), ('mercedes_benz', 2382), ('audi', 1769), ('ford', 1670), ('renault', 1188)]
branded = {}
for item in brands_sorted:
key = item[0]
val = item[1]
branded[key]=val
branded
{'volkswagen': 5160, 'opel': 2746, 'bmw': 2450, 'mercedes_benz': 2382, 'audi': 1769, 'ford': 1670, 'renault': 1188, 'peugeot': 895, 'fiat': 627, 'smart': 517, 'seat': 469, 'skoda': 436, 'toyota': 410, 'mazda': 404, 'citroen': 396, 'nissan': 320, 'hyundai': 260, 'volvo': 235, 'mitsubishi': 202, 'honda': 200, 'kia': 191, 'alfa_romeo': 179, 'suzuki': 159, 'chevrolet': 144, 'mini': 133, 'sonstige_autos': 131, 'chrysler': 91, 'dacia': 70, 'daihatsu': 52, 'subaru': 48, 'jeep': 45, 'saab': 42, 'land_rover': 34, 'daewoo': 33, 'jaguar': 33, 'rover': 25, 'lada': 20, 'trabant': 19, 'lancia': 16, 'porsche': 11}
branded.keys()
dict_keys(['volkswagen', 'opel', 'bmw', 'mercedes_benz', 'audi', 'ford', 'renault', 'peugeot', 'fiat', 'smart', 'seat', 'skoda', 'toyota', 'mazda', 'citroen', 'nissan', 'hyundai', 'volvo', 'mitsubishi', 'honda', 'kia', 'alfa_romeo', 'suzuki', 'chevrolet', 'mini', 'sonstige_autos', 'chrysler', 'dacia', 'daihatsu', 'subaru', 'jeep', 'saab', 'land_rover', 'daewoo', 'jaguar', 'rover', 'lada', 'trabant', 'lancia', 'porsche'])
brands_me = ['volkswagen','bmw','opel','mercedes_benz','audi',
'ford','renault', 'peugeot', 'fiat', 'seat']
brand_price = {}
for item in brands_me:
cars = autos[autos['brand']==item].price
price = cars.mean()
brand_price[item]=price
brand_price
{'volkswagen': 3178.826937984496, 'bmw': 3550.2775510204083, 'opel': 2895.9694100509832, 'mercedes_benz': 3507.788832913518, 'audi': 3486.473713962691, 'ford': 2927.234730538922, 'renault': 2525.401515151515, 'peugeot': 2993.1787709497207, 'fiat': 2902.749601275917, 'seat': 3337.590618336887}
autos.head(2)
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | registration_year | gearbox | power_ps | model | odometer_km | registration_month | fuel_type | brand | unrepaired_amage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | 5000 | control | bus | 2004 | manuell | 158 | andere | 150000 | 3 | lpg | peugeot | nein | 2016-03-26 | 0 | 79588 | 2016-04-06 |
1 | 2016-03-12 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | 4350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70000 | 6 | benzin | smart | nein | 2016-03-12 | 0 | 33729 | 2016-03-15 |
# Mean mileage
top_brands = ['volkswagen','bmw','opel','mercedes_benz','audi',
'ford']
mean_mile = {}
for brand in top_brands:
df = autos[autos['brand'] == brand].odometer_km
mile = df.mean()
mean_mile[brand] = mile
mean_mile
{'volkswagen': 141311.0465116279, 'bmw': 147020.4081632653, 'opel': 135451.56591405682, 'mercedes_benz': 144357.68261964736, 'audi': 147422.2724703222, 'ford': 133023.95209580837}
mean_price = {}
for brand in top_brands:
df = autos.loc[autos['brand'] == brand].price
price = df.mean()
mean_price[brand] = price
mean_price
{'volkswagen': 3178.826937984496, 'bmw': 3550.2775510204083, 'opel': 2895.9694100509832, 'mercedes_benz': 3507.788832913518, 'audi': 3486.473713962691, 'ford': 2927.234730538922}
# COnstruct a Series
mean_p6 = pd.Series(mean_price)
mean_m6 = pd.Series(mean_mile)
mean_p6
volkswagen 3178.826938 bmw 3550.277551 opel 2895.969410 mercedes_benz 3507.788833 audi 3486.473714 ford 2927.234731 dtype: float64
top_6 = pd.DataFrame(mean_p6, columns=['mean_price'])
top_6
mean_price | |
---|---|
volkswagen | 3178.826938 |
bmw | 3550.277551 |
opel | 2895.969410 |
mercedes_benz | 3507.788833 |
audi | 3486.473714 |
ford | 2927.234731 |
top_6['mean_mileage'] = mean_m6
top_6
mean_price | mean_mileage | |
---|---|---|
volkswagen | 3178.826938 | 141311.046512 |
bmw | 3550.277551 | 147020.408163 |
opel | 2895.969410 | 135451.565914 |
mercedes_benz | 3507.788833 | 144357.682620 |
audi | 3486.473714 | 147422.272470 |
ford | 2927.234731 | 133023.952096 |
print(top_6)
mean_price mean_mileage volkswagen 3178.826938 141311.046512 bmw 3550.277551 147020.408163 opel 2895.969410 135451.565914 mercedes_benz 3507.788833 144357.682620 audi 3486.473714 147422.272470 ford 2927.234731 133023.952096