import numpy as np
import pandas as pd
import datetime as dt
%matplotlib inline
autos = pd.read_csv('autos.csv', encoding = 'Latin-1')
autos.shape
(50000, 20)
autos.head()
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
autos.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dateCrawled 50000 non-null object 1 name 50000 non-null object 2 seller 50000 non-null object 3 offerType 50000 non-null object 4 price 50000 non-null object 5 abtest 50000 non-null object 6 vehicleType 44905 non-null object 7 yearOfRegistration 50000 non-null int64 8 gearbox 47320 non-null object 9 powerPS 50000 non-null int64 10 model 47242 non-null object 11 odometer 50000 non-null object 12 monthOfRegistration 50000 non-null int64 13 fuelType 45518 non-null object 14 brand 50000 non-null object 15 notRepairedDamage 40171 non-null object 16 dateCreated 50000 non-null object 17 nrOfPictures 50000 non-null int64 18 postalCode 50000 non-null int64 19 lastSeen 50000 non-null object dtypes: int64(5), object(15) memory usage: 7.6+ MB
autos.describe(include= 'all')
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000 | 50000 | 50000 | 50000 | 50000 | 50000 | 44905 | 50000.000000 | 47320 | 50000.000000 | 47242 | 50000 | 50000.000000 | 45518 | 50000 | 40171 | 50000 | 50000.0 | 50000.000000 | 50000 |
unique | 48213 | 38754 | 2 | 2 | 2357 | 2 | 8 | NaN | 2 | NaN | 245 | 13 | NaN | 7 | 40 | 2 | 76 | NaN | NaN | 39481 |
top | 2016-03-12 16:06:22 | Ford_Fiesta | privat | Angebot | $0 | test | limousine | NaN | manuell | NaN | golf | 150,000km | NaN | benzin | volkswagen | nein | 2016-04-03 00:00:00 | NaN | NaN | 2016-04-07 06:17:27 |
freq | 3 | 78 | 49999 | 49999 | 1421 | 25756 | 12859 | NaN | 36993 | NaN | 4024 | 32424 | NaN | 30107 | 10687 | 35232 | 1946 | NaN | NaN | 8 |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2005.073280 | NaN | 116.355920 | NaN | NaN | 5.723360 | NaN | NaN | NaN | NaN | 0.0 | 50813.627300 | NaN |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 105.712813 | NaN | 209.216627 | NaN | NaN | 3.711984 | NaN | NaN | NaN | NaN | 0.0 | 25779.747957 | NaN |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | 1067.000000 | NaN |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1999.000000 | NaN | 70.000000 | NaN | NaN | 3.000000 | NaN | NaN | NaN | NaN | 0.0 | 30451.000000 | NaN |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2003.000000 | NaN | 105.000000 | NaN | NaN | 6.000000 | NaN | NaN | NaN | NaN | 0.0 | 49577.000000 | NaN |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2008.000000 | NaN | 150.000000 | NaN | NaN | 9.000000 | NaN | NaN | NaN | NaN | 0.0 | 71540.000000 | NaN |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9999.000000 | NaN | 17700.000000 | NaN | NaN | 12.000000 | NaN | NaN | NaN | NaN | 0.0 | 99998.000000 | NaN |
autos = autos.drop(columns=['seller','offerType'])
autos
dateCrawled | name | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 2016-03-27 14:38:19 | Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon | $24,900 | control | limousine | 2011 | automatik | 239 | q5 | 100,000km | 1 | diesel | audi | nein | 2016-03-27 00:00:00 | 0 | 82131 | 2016-04-01 13:47:40 |
49996 | 2016-03-28 10:50:25 | Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+... | $1,980 | control | cabrio | 1996 | manuell | 75 | astra | 150,000km | 5 | benzin | opel | nein | 2016-03-28 00:00:00 | 0 | 44807 | 2016-04-02 14:18:02 |
49997 | 2016-04-02 14:44:48 | Fiat_500_C_1.2_Dualogic_Lounge | $13,200 | test | cabrio | 2014 | automatik | 69 | 500 | 5,000km | 11 | benzin | fiat | nein | 2016-04-02 00:00:00 | 0 | 73430 | 2016-04-04 11:47:27 |
49998 | 2016-03-08 19:25:42 | Audi_A3_2.0_TDI_Sportback_Ambition | $22,900 | control | kombi | 2013 | manuell | 150 | a3 | 40,000km | 11 | diesel | audi | nein | 2016-03-08 00:00:00 | 0 | 35683 | 2016-04-05 16:45:07 |
49999 | 2016-03-14 00:42:12 | Opel_Vectra_1.6_16V | $1,250 | control | limousine | 1996 | manuell | 101 | vectra | 150,000km | 1 | benzin | opel | nein | 2016-03-13 00:00:00 | 0 | 45897 | 2016-04-06 21:18:48 |
50000 rows × 18 columns
# columns before snakecase
autos.columns
Index(['dateCrawled', 'name', 'price', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'odometer', 'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen'], dtype='object')
autos.rename(columns = {'yearOfRegistration':'registration_year', 'abtest':'test',
'monthOfRegistration': 'registration_month',
'notRepairedDamage': 'unrepaired_damage',
'dateCreated':'ad_created' }, inplace = True)
autos.rename(columns = {'dateCrawled':'date_crawled' ,
'offerType': 'offer_type', 'vehicleType':'vehicle_type' ,
'powerPS': 'power_ps','odometer':'odometer_km','fuelType':'fuel_type', 'nrOfPictures':'nr_of_pictures' ,
'postalCode': 'postal_code', 'lastSeen': 'last_seen'}, inplace = True)
# columns after snakecase:
autos.columns
Index(['date_crawled', 'name', 'price', 'test', 'vehicle_type', 'registration_year', 'gearbox', 'power_ps', 'model', 'odometer_km', 'registration_month', 'fuel_type', 'brand', 'unrepaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code', 'last_seen'], dtype='object')
#price after:
autos['price'] = autos['price'].str.replace('$', '')
autos['price'] = autos['price'].str.replace(',', '')
autos['price'] = autos['price'].astype(int)
autos['price'].unique()
array([ 5000, 8500, 8990, ..., 385, 22200, 16995])
#odometer before:
autos['odometer_km'].unique()
array(['150,000km', '70,000km', '50,000km', '80,000km', '10,000km', '30,000km', '125,000km', '90,000km', '20,000km', '60,000km', '5,000km', '100,000km', '40,000km'], dtype=object)
#odometer after
autos['odometer_km'] = autos['odometer_km'].str.replace('km', '')
autos['odometer_km'] = autos['odometer_km'].str.replace(',', '')
autos['odometer_km'] = autos['odometer_km'].astype(int)
autos['odometer_km'].unique()
array([150000, 70000, 50000, 80000, 10000, 30000, 125000, 90000, 20000, 60000, 5000, 100000, 40000])
#before:
autos[['date_crawled','last_seen','ad_created']]
date_crawled | last_seen | ad_created | |
---|---|---|---|
0 | 2016-03-26 17:47:46 | 2016-04-06 06:45:54 | 2016-03-26 00:00:00 |
1 | 2016-04-04 13:38:56 | 2016-04-06 14:45:08 | 2016-04-04 00:00:00 |
2 | 2016-03-26 18:57:24 | 2016-04-06 20:15:37 | 2016-03-26 00:00:00 |
3 | 2016-03-12 16:58:10 | 2016-03-15 03:16:28 | 2016-03-12 00:00:00 |
4 | 2016-04-01 14:38:50 | 2016-04-01 14:38:50 | 2016-04-01 00:00:00 |
... | ... | ... | ... |
49995 | 2016-03-27 14:38:19 | 2016-04-01 13:47:40 | 2016-03-27 00:00:00 |
49996 | 2016-03-28 10:50:25 | 2016-04-02 14:18:02 | 2016-03-28 00:00:00 |
49997 | 2016-04-02 14:44:48 | 2016-04-04 11:47:27 | 2016-04-02 00:00:00 |
49998 | 2016-03-08 19:25:42 | 2016-04-05 16:45:07 | 2016-03-08 00:00:00 |
49999 | 2016-03-14 00:42:12 | 2016-04-06 21:18:48 | 2016-03-13 00:00:00 |
50000 rows × 3 columns
#after:
autos[['date_crawled','last_seen','ad_created']] = autos[['date_crawled','last_seen','ad_created']].apply(lambda x : x.str[:10])
autos[['date_crawled','last_seen','ad_created']]
date_crawled | last_seen | ad_created | |
---|---|---|---|
0 | 2016-03-26 | 2016-04-06 | 2016-03-26 |
1 | 2016-04-04 | 2016-04-06 | 2016-04-04 |
2 | 2016-03-26 | 2016-04-06 | 2016-03-26 |
3 | 2016-03-12 | 2016-03-15 | 2016-03-12 |
4 | 2016-04-01 | 2016-04-01 | 2016-04-01 |
... | ... | ... | ... |
49995 | 2016-03-27 | 2016-04-01 | 2016-03-27 |
49996 | 2016-03-28 | 2016-04-02 | 2016-03-28 |
49997 | 2016-04-02 | 2016-04-04 | 2016-04-02 |
49998 | 2016-03-08 | 2016-04-05 | 2016-03-08 |
49999 | 2016-03-14 | 2016-04-06 | 2016-03-13 |
50000 rows × 3 columns
print(autos['vehicle_type'].unique())
autos['vehicle_type'] = autos['vehicle_type'].map({'kleinwagen': 'small_car', 'kombi': 'station_wagon', 'cabrio': 'convertable', 'andere': 'other'})
print(autos['vehicle_type'].unique())
['bus' 'limousine' 'kleinwagen' 'kombi' nan 'coupe' 'suv' 'cabrio' 'andere'] [nan 'small_car' 'station_wagon' 'convertable' 'other']
print(autos['gearbox'].unique())
autos['gearbox'] = autos['gearbox'].map({'manuell': 'manual', 'automatik':'automatic'})
print(autos['gearbox'].unique())
['manuell' 'automatik' nan] ['manual' 'automatic' nan]
print(autos['fuel_type'].unique())
autos['fuel_type'] = autos['fuel_type'].map({'benzin': 'petrol', 'elektro': 'electric', 'andere':'other'})
print(autos['fuel_type'].unique())
['lpg' 'benzin' 'diesel' nan 'cng' 'hybrid' 'elektro' 'andere'] [nan 'petrol' 'electric' 'other']
print(autos['unrepaired_damage'].unique())
autos['unrepaired_damage'] = autos['unrepaired_damage'].map({'nein':'no', 'ja': 'yes'})
print(autos['unrepaired_damage'].unique())
['nein' nan 'ja'] ['no' nan 'yes']
autos.odometer_km.unique()
array([150000, 70000, 50000, 80000, 10000, 30000, 125000, 90000, 20000, 60000, 5000, 100000, 40000])
autos['odometer_km'].describe()
count 50000.000000 mean 125732.700000 std 40042.211706 min 5000.000000 25% 125000.000000 50% 150000.000000 75% 150000.000000 max 150000.000000 Name: odometer_km, dtype: float64
autos['odometer_km'].value_counts(normalize = True)*100
150000 64.848 125000 10.340 100000 4.338 90000 3.514 80000 2.872 70000 2.460 60000 2.328 50000 2.054 5000 1.934 40000 1.638 30000 1.578 20000 1.568 10000 0.528 Name: odometer_km, dtype: float64
filt = (autos['price'] > 100) & (autos['price'] < 50000)
autos = autos.loc[filt]
autos['price'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x2b0fd546988>
autos['price'].describe()
count 47902.000000 mean 5625.508392 std 6720.551790 min 110.000000 25% 1250.000000 50% 3000.000000 75% 7450.000000 max 49999.000000 Name: price, dtype: float64
autos['price'].median()
3000.0
autos['registration_year'].describe()
count 47902.000000 mean 2004.748152 std 88.185993 min 1000.000000 25% 1999.000000 50% 2004.000000 75% 2008.000000 max 9999.000000 Name: registration_year, dtype: float64
(~autos['registration_year'].between(1920, 2016)).sum()/autos.shape[0]
0.03903803599014655
#It looks like only 3% of the values are outside our range, therefore we can delete them.
filt = autos['registration_year'].between(1920, 2016)
autos = autos.loc[filt]
autos['registration_year'].value_counts(normalize=True).head(10)
2000 0.066845 2005 0.063173 1999 0.062370 2004 0.058568 2003 0.058481 2006 0.057938 2001 0.057047 2002 0.053745 1998 0.050378 2007 0.049205 Name: registration_year, dtype: float64
brand_grp = autos.groupby(['brand','model','registration_year','odometer_km'])
brand_grp['model'].value_counts().loc['mini'].loc['cooper'].head(50)
registration_year odometer_km model 1997 125000 cooper 1 1998 90000 cooper 1 2001 125000 cooper 1 150000 cooper 4 2002 5000 cooper 1 90000 cooper 2 100000 cooper 1 125000 cooper 3 150000 cooper 9 2003 100000 cooper 1 125000 cooper 4 150000 cooper 5 2004 90000 cooper 1 100000 cooper 1 125000 cooper 5 150000 cooper 8 2005 50000 cooper 1 60000 cooper 2 80000 cooper 1 90000 cooper 2 100000 cooper 2 125000 cooper 7 150000 cooper 9 2006 50000 cooper 2 60000 cooper 1 80000 cooper 2 90000 cooper 4 100000 cooper 4 125000 cooper 1 150000 cooper 6 2007 60000 cooper 1 80000 cooper 1 90000 cooper 3 100000 cooper 3 125000 cooper 6 150000 cooper 4 2008 40000 cooper 1 70000 cooper 4 80000 cooper 5 90000 cooper 4 100000 cooper 3 125000 cooper 3 150000 cooper 2 2009 30000 cooper 2 40000 cooper 2 50000 cooper 10 60000 cooper 2 70000 cooper 4 80000 cooper 3 90000 cooper 3 Name: model, dtype: int64
brand_grp['price'].agg(['mean', 'max', 'min']).loc['mini'].loc[['cooper','registration_year','odometer_km']].head(50)
mean | max | min | |||
---|---|---|---|---|---|
model | registration_year | odometer_km | |||
cooper | 1997 | 125000 | 5900.000000 | 5900 | 5900 |
1998 | 90000 | 15500.000000 | 15500 | 15500 | |
2001 | 125000 | 4250.000000 | 4250 | 4250 | |
150000 | 4237.500000 | 5750 | 3350 | ||
2002 | 5000 | 3200.000000 | 3200 | 3200 | |
90000 | 3800.000000 | 3800 | 3800 | ||
100000 | 5200.000000 | 5200 | 5200 | ||
125000 | 3950.000000 | 4950 | 2900 | ||
150000 | 3876.555556 | 6000 | 2500 | ||
2003 | 100000 | 6279.000000 | 6279 | 6279 | |
125000 | 5750.000000 | 6200 | 5400 | ||
150000 | 5025.800000 | 6299 | 3800 | ||
2004 | 90000 | 9000.000000 | 9000 | 9000 | |
100000 | 6950.000000 | 6950 | 6950 | ||
125000 | 5504.400000 | 6500 | 4800 | ||
150000 | 5456.250000 | 7200 | 4250 | ||
2005 | 50000 | 7900.000000 | 7900 | 7900 | |
60000 | 8125.000000 | 9250 | 7000 | ||
80000 | 8888.000000 | 8888 | 8888 | ||
90000 | 8144.500000 | 8799 | 7490 | ||
100000 | 8138.500000 | 8500 | 7777 | ||
125000 | 7897.142857 | 10800 | 5490 | ||
150000 | 5380.777778 | 8888 | 3290 | ||
2006 | 50000 | 8175.000000 | 9450 | 6900 | |
60000 | 10500.000000 | 10500 | 10500 | ||
80000 | 8899.500000 | 10800 | 6999 | ||
90000 | 8837.500000 | 11500 | 6350 | ||
100000 | 7375.000000 | 8400 | 6500 | ||
125000 | 8500.000000 | 8500 | 8500 | ||
150000 | 5506.500000 | 6800 | 2990 | ||
2007 | 60000 | 9900.000000 | 9900 | 9900 | |
80000 | 8500.000000 | 8500 | 8500 | ||
90000 | 8485.000000 | 9500 | 7400 | ||
100000 | 8000.000000 | 10400 | 5500 | ||
125000 | 7333.000000 | 8650 | 5700 | ||
150000 | 7324.500000 | 8499 | 5250 | ||
2008 | 40000 | 12888.000000 | 12888 | 12888 | |
70000 | 9475.000000 | 11000 | 8200 | ||
80000 | 12039.800000 | 15500 | 10400 | ||
90000 | 11199.500000 | 12900 | 9999 | ||
100000 | 10746.333333 | 12250 | 9990 | ||
125000 | 10793.333333 | 11900 | 9980 | ||
150000 | 8900.000000 | 9900 | 7900 | ||
2009 | 30000 | 11994.500000 | 14999 | 8990 | |
40000 | 13720.000000 | 15950 | 11490 | ||
50000 | 10865.700000 | 16800 | 4200 | ||
60000 | 14445.000000 | 17900 | 10990 | ||
70000 | 11925.000000 | 14100 | 10200 | ||
80000 | 12413.333333 | 14250 | 10490 | ||
90000 | 10566.666667 | 11400 | 9800 |
#I selected the 6 most populart brands by count
brand_counts = autos['brand'].value_counts().head(6).index
brand_counts
Index(['volkswagen', 'bmw', 'opel', 'mercedes_benz', 'audi', 'ford'], dtype='object')
# Then created a dictionary to hold the average price for each
brand_mean_prices = {}
for brand in brand_counts:
filt = autos['brand'] == brand
auto_brand = autos[filt]
mean_price = auto_brand['price'].mean()
brand_mean_prices[brand] = int(mean_price)
print(brand_mean_prices)
{'volkswagen': 5440, 'bmw': 8145, 'opel': 3020, 'mercedes_benz': 8226, 'audi': 9076, 'ford': 3673}
# Then sorted the dictionary from highest to lowest average price.
brand_mean_prices_sorted = sorted(brand_mean_prices.items(), key = lambda x: x[1], reverse = True)
print(brand_mean_prices_sorted)
[('audi', 9076), ('mercedes_benz', 8226), ('bmw', 8145), ('volkswagen', 5440), ('ford', 3673), ('opel', 3020)]
# I did the same for the mileage
brand_mean_mileage = {}
for brand in brand_counts:
filt = autos['brand'] == brand
auto_brand = autos[filt]
mean_mileage = auto_brand['odometer_km'].mean()
brand_mean_mileage[brand] = int(mean_mileage)
print (brand_mean_mileage)
{'volkswagen': 128807, 'bmw': 132974, 'opel': 129375, 'mercedes_benz': 131581, 'audi': 129796, 'ford': 124382}
# Finally I combined the data into a new dataframe and pretty printed it
top_6 = pd.DataFrame(data = [brand_mean_prices, brand_mean_mileage]).T
print(top_6)
0 1 volkswagen 5440 128807 bmw 8145 132974 opel 3020 129375 mercedes_benz 8226 131581 audi 9076 129796 ford 3673 124382
top_6 = top_6.reset_index()
top_6.columns = ['brand', 'mean_price', 'mileage']
top_6.columns = [x.upper() for x in top_6.columns]
top_6.set_index('BRAND').sort_values('MEAN_PRICE', ascending = False)
MEAN_PRICE | MILEAGE | |
---|---|---|
BRAND | ||
audi | 9076 | 129796 |
mercedes_benz | 8226 | 131581 |
bmw | 8145 | 132974 |
volkswagen | 5440 | 128807 |
ford | 3673 | 124382 |
opel | 3020 | 129375 |
# I split the string prior to '_' to extract the brand in the 'name' column
autos['name'] = autos['name'].str.split('_', expand = True)
autos['name']
0 Peugeot 1 BMW 2 Volkswagen 3 Smart 4 Ford ... 49995 Audi 49996 Opel 49997 Fiat 49998 Audi 49999 Opel Name: name, Length: 46032, dtype: object
# That left me with a bunch of columns containing 'NAN', so I dropped them
autos.dropna(axis = 'columns', how = 'all', inplace = True)
autos
date_crawled | name | price | test | vehicle_type | registration_year | gearbox | power_ps | model | odometer_km | registration_month | fuel_type | brand | unrepaired_damage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 | Peugeot | 5000 | control | NaN | 2004 | manual | 158 | andere | 150000 | 3 | NaN | peugeot | no | 2016-03-26 | 0 | 79588 | 2016-04-06 |
1 | 2016-04-04 | BMW | 8500 | control | NaN | 1997 | automatic | 286 | 7er | 150000 | 6 | petrol | bmw | no | 2016-04-04 | 0 | 71034 | 2016-04-06 |
2 | 2016-03-26 | Volkswagen | 8990 | test | NaN | 2009 | manual | 102 | golf | 70000 | 7 | petrol | volkswagen | no | 2016-03-26 | 0 | 35394 | 2016-04-06 |
3 | 2016-03-12 | Smart | 4350 | control | small_car | 2007 | automatic | 71 | fortwo | 70000 | 6 | petrol | smart | no | 2016-03-12 | 0 | 33729 | 2016-03-15 |
4 | 2016-04-01 | Ford | 1350 | test | station_wagon | 2003 | manual | 0 | focus | 150000 | 7 | petrol | ford | no | 2016-04-01 | 0 | 39218 | 2016-04-01 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 2016-03-27 | Audi | 24900 | control | NaN | 2011 | automatic | 239 | q5 | 100000 | 1 | NaN | audi | no | 2016-03-27 | 0 | 82131 | 2016-04-01 |
49996 | 2016-03-28 | Opel | 1980 | control | convertable | 1996 | manual | 75 | astra | 150000 | 5 | petrol | opel | no | 2016-03-28 | 0 | 44807 | 2016-04-02 |
49997 | 2016-04-02 | Fiat | 13200 | test | convertable | 2014 | automatic | 69 | 500 | 5000 | 11 | petrol | fiat | no | 2016-04-02 | 0 | 73430 | 2016-04-04 |
49998 | 2016-03-08 | Audi | 22900 | control | station_wagon | 2013 | manual | 150 | a3 | 40000 | 11 | NaN | audi | no | 2016-03-08 | 0 | 35683 | 2016-04-05 |
49999 | 2016-03-14 | Opel | 1250 | control | NaN | 1996 | manual | 101 | vectra | 150000 | 1 | petrol | opel | no | 2016-03-13 | 0 | 45897 | 2016-04-06 |
46032 rows × 18 columns
# create a list of models sorted from most popular to least popular
autos['model'].value_counts()
models = autos['model'].value_counts().index
models
Index(['golf', 'andere', '3er', 'polo', 'corsa', 'passat', 'astra', 'a4', 'c_klasse', '5er', ... 'materia', '145', 'v60', 'charade', 'samara', 'kappa', '200', 'rangerover', 'b_max', 'i3'], dtype='object', length=244)
# loop through the list of models to create a dictionary with a brand/model pair
top_brand_model = {}
for model in models:
filt = autos['model'] == model
select_brand = autos.loc[filt, 'brand'].iloc[0]
if select_brand in top_brand_model:
continue
else:
top_brand_model[select_brand] = model
print(top_brand_model)
{'volkswagen': 'golf', 'peugeot': 'andere', 'bmw': '3er', 'opel': 'corsa', 'audi': 'a4', 'mercedes_benz': 'c_klasse', 'ford': 'focus', 'renault': 'twingo', 'smart': 'fortwo', 'fiat': 'punto', 'seat': 'ibiza', 'skoda': 'octavia', 'mini': 'cooper', 'nissan': 'micra', 'hyundai': 'i_reihe', 'honda': 'civic', 'mazda': '6_reihe', 'toyota': 'yaris', 'mitsubishi': 'colt', 'citroen': 'berlingo', 'volvo': 'v70', 'suzuki': 'grand', 'alfa_romeo': '156', 'porsche': '911', 'daihatsu': 'cuore', 'chevrolet': 'matiz', 'kia': 'sportage', 'trabant': '601', 'dacia': 'sandero', 'chrysler': 'voyager', 'land_rover': 'freelander', 'subaru': 'legacy', 'jeep': 'wrangler', 'lancia': 'ypsilon', 'jaguar': 'x_type', 'daewoo': 'kalos', 'saab': '900', 'lada': 'niva', 'rover': 'rangerover'}
# create a list of odometer bins
autos.odometer_km.value_counts()
odo_bins = autos.odometer_km.value_counts().index
odo_bins
Int64Index([150000, 125000, 100000, 90000, 80000, 70000, 60000, 50000, 40000, 30000, 20000, 5000, 10000], dtype='int64')
# create a dictionary with the average price per bin
mean_price_milage = {}
for odo in odo_bins:
select_row = autos.loc[autos['odometer_km'] == odo]
select_col= select_row['price']
mean_price = select_col.mean()
mean_price_milage[odo] = mean_price
mean_price_milage
{150000: 3762.544193700047, 125000: 6092.827865448505, 100000: 7801.989655172414, 90000: 8241.74684305472, 80000: 9592.133869787856, 70000: 10736.449829931973, 60000: 11418.336642599277, 50000: 13065.026557711952, 40000: 14158.692802056556, 30000: 14608.035422343324, 20000: 15280.181946403385, 5000: 6090.694362017804, 10000: 17575.01818181818}
# The second to last '5000', because it doesn't follow the trend of decreaseing price with increasing miles.
# It's an outlier so I will delete it and rerun
autos.odometer_km.value_counts()
filt = ~(autos['odometer_km'] == 5000)
odo_bins = autos[filt].odometer_km.value_counts().index
odo_bins
Int64Index([150000, 125000, 100000, 90000, 80000, 70000, 60000, 50000, 40000, 30000, 20000, 10000], dtype='int64')
mean_price_milage = {}
for odo in odo_bins:
select_row = autos.loc[autos['odometer_km'] == odo]
select_col= select_row['price']
mean_price = select_col.mean()
mean_price_milage[odo] = mean_price
# There is an obvious trend of decreasing price with increasing miles
mean_price_milage
{150000: 3762.544193700047, 125000: 6092.827865448505, 100000: 7801.989655172414, 90000: 8241.74684305472, 80000: 9592.133869787856, 70000: 10736.449829931973, 60000: 11418.336642599277, 50000: 13065.026557711952, 40000: 14158.692802056556, 30000: 14608.035422343324, 20000: 15280.181946403385, 10000: 17575.01818181818}
autos['unrepaired_damage'].value_counts()
no 33572 yes 4434 Name: unrepaired_damage, dtype: int64
### first I find average price of damaged model and create a dictionary
filt = autos['unrepaired_damage'] == 'yes'
damaged = autos.loc[filt, ['model','price','unrepaired_damage']]
damaged
model | price | unrepaired_damage | |
---|---|---|---|
28 | cooper | 5250 | yes |
51 | s_klasse | 6000 | yes |
81 | micra | 2000 | yes |
86 | colt | 399 | yes |
97 | andere | 800 | yes |
... | ... | ... | ... |
49962 | andere | 2200 | yes |
49966 | c1 | 1490 | yes |
49972 | vito | 3300 | yes |
49978 | e_klasse | 900 | yes |
49989 | polo | 150 | yes |
4434 rows × 3 columns
dam_mods = damaged['model'].unique()
dam_mods
array(['cooper', 's_klasse', 'micra', 'colt', 'andere', '5_reihe', 'ka', 'tigra', 'galaxy', 'a3', 'fiesta', 'polo', 'touareg', 'megane', 'ducato', 'golf', '80', 'a6', 'corsa', 'passat', nan, 'vectra', 'focus', '3er', '3_reihe', 'transporter', 'a_klasse', 'sprinter', 'ibiza', 'x_reihe', 'a4', 'carnival', 'twingo', '5er', 'nubira', 'kangoo', 'scenic', 'civic', 'clk', 'jazz', 'laguna', 'toledo', 'mondeo', 'zafira', 'octavia', 'astra', 'fox', '2_reihe', '601', 'aveo', '100', '7er', 'e_klasse', 'fabia', 'almera', 'espace', 'clio', 'a5', 'c4', 'aygo', 'c2', '1_reihe', 'v_klasse', 'm_klasse', 'slk', 'seicento', 'yaris', 'c_klasse', 'tt', 'caddy', 'combo', 'punto', 'grand', 'lupo', 'touran', 'fortwo', 'meriva', 'fusion', 'forfour', '850', 'carisma', 'leon', 'c5', 'a2', 'sl', 'justy', 'ptcruiser', 'swift', 'signum', 'primera', 'v70', 'bora', 'lancer', '6_reihe', 'stilo', 'insignia', 'wrangler', '1er', 'omega', 'corolla', '147', 'auris', '156', 'superb', 'impreza', 'm_reihe', 'transit', '4_reihe', 'b_klasse', 'cl', 'escort', 'beetle', 'berlingo', 'rio', 'getz', '159', 'one', 'picanto', 'bravo', 'sirion', 'agila', 'sharan', 'santa', 'galant', 'vito', 'arosa', 'voyager', 'navara', 'a8', 'v40', 'kadett', 'c3', 'sandero', 'calibra', 'kalos', 'ceed', 'roomster', 'c_max', 'discovery', 'legacy', 'matiz', 'ypsilon', 'niva', 'q7', 'duster', 'vivaro', 'kuga', 'mx_reihe', '900', 'citigo', 'alhambra', 'cayenne', 'panda', 'avensis', 'mustang', 'viano', 'r19', 'up', 'altea', 'cuore', 'accord', 'range_rover', 'forester', 'modus', 'i_reihe', 'jimny', 'sorento', 'x_trail', 'tucson', 'cordoba', 'defender', 'kaefer', 'sportage', 'qashqai', 'doblo', '500', 'rx_reihe', 'freelander', 'g_klasse', 'logan', 'outlander', 'samara', '911', 'rav', 'cherokee', 's_type', 'antara', 'note', 'lanos', 'cr_reihe', 'verso', 'x_type', 'pajero', '9000', 's60', 'i3', 'eos', 'terios', 'move', 'c_reihe', 'xc_reihe', 'v50', 'lybra', 'c1'], dtype=object)
ave_price_damaged = {}
for mods in dam_mods:
filt = damaged['model'] == mods
select_row = damaged[filt]
select_col = select_row['price'].mean()
ave_price_damaged[mods] = select_col
print(ave_price_damaged)
{'cooper': 4271.111111111111, 's_klasse': 5552.727272727273, 'micra': 884.8064516129032, 'colt': 1173.625, 'andere': 2455.441798941799, '5_reihe': 4366.666666666667, 'ka': 533.3488372093024, 'tigra': 746.6, 'galaxy': 2839.904761904762, 'a3': 2769.4477611940297, 'fiesta': 1099.9166666666667, 'polo': 1070.0843373493976, 'touareg': 20933.333333333332, 'megane': 1282.5555555555557, 'ducato': 3335.285714285714, 'golf': 1879.6258278145694, '80': 641.421052631579, 'a6': 3201.7157894736843, 'corsa': 1262.1858974358975, 'passat': 2183.734513274336, nan: nan, 'vectra': 922.6410256410256, 'focus': 1652.056179775281, '3er': 2336.3286384976527, '3_reihe': 1289.774193548387, 'transporter': 4864.793103448276, 'a_klasse': 1610.0727272727272, 'sprinter': 5260.0, 'ibiza': 1502.5151515151515, 'x_reihe': 11625.333333333334, 'a4': 2971.8, 'carnival': 2410.0, 'twingo': 873.140625, '5er': 4006.030303030303, 'nubira': 574.5, 'kangoo': 1435.25, 'scenic': 906.32, 'civic': 826.4117647058823, 'clk': 2981.5384615384614, 'jazz': 3012.5, 'laguna': 1433.342105263158, 'toledo': 1089.8, 'mondeo': 1071.2073170731708, 'zafira': 2169.943396226415, 'octavia': 4606.5, 'astra': 1369.9044117647059, 'fox': 1818.75, '2_reihe': 1508.753846153846, '601': 649.0, 'aveo': 2957.5, '100': 1166.5, '7er': 5070.846153846154, 'e_klasse': 4427.341772151899, 'fabia': 2299.0, 'almera': 949.8571428571429, 'espace': 723.0625, 'clio': 1088.125, 'a5': 21649.8, 'c4': 4171.0, 'aygo': 3112.5, 'c2': 1547.5714285714287, '1_reihe': 572.952380952381, 'v_klasse': 3512.25, 'm_klasse': 6667.5, 'slk': 3641.6666666666665, 'seicento': 481.9, 'yaris': 3650.0, 'c_klasse': 3739.635135135135, 'tt': 7521.111111111111, 'caddy': 4449.411764705882, 'combo': 2082.1111111111113, 'punto': 901.1224489795918, 'grand': 3583.1111111111113, 'lupo': 781.8536585365854, 'touran': 5496.1875, 'fortwo': 1507.1025641025642, 'meriva': 3848.090909090909, 'fusion': 1923.3333333333333, 'forfour': 1074.75, '850': 580.0, 'carisma': 705.8333333333334, 'leon': 3861.4615384615386, 'c5': 3072.1111111111113, 'a2': 2333.3333333333335, 'sl': 6500.0, 'justy': 322.5, 'ptcruiser': 1699.6666666666667, 'swift': 1181.142857142857, 'signum': 2642.1428571428573, 'primera': 1058.4117647058824, 'v70': 1494.142857142857, 'bora': 1762.4166666666667, 'lancer': 650.0, '6_reihe': 1349.9444444444443, 'stilo': 899.7142857142857, 'insignia': 4766.666666666667, 'wrangler': 3450.0, '1er': 6220.645161290323, 'omega': 758.71875, 'corolla': 1623.3333333333333, '147': 1286.8461538461538, 'auris': 4749.5, '156': 1190.388888888889, 'superb': 10639.6, 'impreza': 4991.666666666667, 'm_reihe': 8900.0, 'transit': 1871.75, '4_reihe': 1190.8181818181818, 'b_klasse': 4798.75, 'cl': 14425.0, 'escort': 427.5, 'beetle': 3094.0, 'berlingo': 919.8, 'rio': 2583.1666666666665, 'getz': 948.625, '159': 7800.0, 'one': 5566.666666666667, 'picanto': 2670.0, 'bravo': 330.0, 'sirion': 490.0, 'agila': 829.875, 'sharan': 5235.214285714285, 'santa': 2285.4285714285716, 'galant': 1250.0, 'vito': 3658.0588235294117, 'arosa': 621.3571428571429, 'voyager': 1477.5555555555557, 'navara': 4000.0, 'a8': 3780.0, 'v40': 756.4615384615385, 'kadett': 1678.3333333333333, 'c3': 3500.0, 'sandero': 4219.25, 'calibra': 1200.0, 'kalos': 850.0, 'ceed': 4250.0, 'roomster': 2799.5, 'c_max': 2333.1666666666665, 'discovery': 1725.0, 'legacy': 1800.0, 'matiz': 524.5, 'ypsilon': 195.0, 'niva': 2500.0, 'q7': 11950.0, 'duster': 8500.0, 'vivaro': 4879.666666666667, 'kuga': 6390.0, 'mx_reihe': 2499.75, '900': 383.3333333333333, 'citigo': 3990.0, 'alhambra': 6233.333333333333, 'cayenne': 17666.666666666668, 'panda': 1374.8333333333333, 'avensis': 2512.5, 'mustang': 9280.0, 'viano': 24333.333333333332, 'r19': 662.25, 'up': 6449.333333333333, 'altea': 3133.3333333333335, 'cuore': 841.6666666666666, 'accord': 2017.8, 'range_rover': 1800.0, 'forester': 1700.0, 'modus': 1950.0, 'i_reihe': 5379.083333333333, 'jimny': 5555.333333333333, 'sorento': 3549.8, 'x_trail': 3400.0, 'tucson': 3690.0, 'cordoba': 500.0, 'defender': 12245.0, 'kaefer': 2960.0, 'sportage': 1200.0, 'qashqai': 11783.333333333334, 'doblo': 1587.5, '500': 5147.5, 'rx_reihe': 2945.0, 'freelander': 4016.6666666666665, 'g_klasse': 6375.0, 'logan': 1200.0, 'outlander': 2850.0, 'samara': 300.0, '911': 20833.333333333332, 'rav': 6829.666666666667, 'cherokee': 800.0, 's_type': 1600.0, 'antara': 7500.0, 'note': 3875.0, 'lanos': 200.0, 'cr_reihe': 2190.0, 'verso': 7347.5, 'x_type': 1883.0, 'pajero': 2724.75, '9000': 200.0, 's60': 3500.0, 'i3': 250.0, 'eos': 7500.0, 'terios': 1500.0, 'move': 400.0, 'c_reihe': 2999.0, 'xc_reihe': 11450.0, 'v50': 3495.0, 'lybra': 799.0, 'c1': 1490.0}
# Next I find the ave price of the undamaged model and create a dictionary
filt = autos['unrepaired_damage'] == 'no'
no_damaged = autos.loc[filt, ['model','price','unrepaired_damage']]
print(no_damaged)
no_dam_mods = no_damaged['model'].unique()
model price unrepaired_damage 0 andere 5000 no 1 7er 8500 no 2 golf 8990 no 3 fortwo 4350 no 4 focus 1350 no ... ... ... ... 49995 q5 24900 no 49996 astra 1980 no 49997 500 13200 no 49998 a3 22900 no 49999 vectra 1250 no [33572 rows x 3 columns]
ave_price_no_damaged = {}
for mods in no_dam_mods:
filt = no_damaged['model'] == mods
select_row = no_damaged[filt]
select_col = select_row['price'].mean()
ave_price_no_damaged[mods] = select_col
print(ave_price_no_damaged)
{'andere': 7431.067929188966, '7er': 10414.252747252747, 'golf': 6110.624437781109, 'fortwo': 4011.6495327102803, 'focus': 4094.413237924866, 'arosa': 1156.622950819672, 'megane': 4170.25327510917, 'a3': 9263.882352941177, nan: nan, 'vectra': 2234.4084084084084, 'scirocco': 12298.285714285714, '911': 34504.875, 'cooper': 12062.406374501992, '5er': 8882.866197183099, 'polo': 3232.245353159851, 'e_klasse': 9173.015514809591, 'c_klasse': 7645.5904550499445, 'mondeo': 3827.102893890675, 'altea': 6376.472222222223, 'a1': 14793.827160493827, 'twingo': 1783.2671755725191, 'a_klasse': 4558.894607843137, '3_reihe': 3614.7846607669617, 'sandero': 5666.333333333333, 'passat': 5826.300821355236, 'primera': 1637.125, 'corsa': 2295.3608949416343, 'a6': 9605.913477537437, 'astra': 3968.1730569948186, '3er': 6961.522898842476, '1er': 12016.36, '6_reihe': 4490.843478260869, 'a4': 8153.984749455338, 'micra': 2576.4938271604938, 'transporter': 11698.121513944223, 'vito': 8369.5125, 'cordoba': 921.1724137931035, '100': 2031.725, '2_reihe': 3505.54, 'octavia': 8339.256, 'm_klasse': 14493.991803278688, 'lupo': 1533.2641509433963, 'superb': 9682.0, 'laguna': 2659.4545454545455, '1_reihe': 1877.377358490566, 'seicento': 1035.4761904761904, 'touran': 8626.362573099415, 'colt': 2834.027027027027, 'clio': 2824.449848024316, 'avensis': 5556.372881355932, 'vivaro': 9291.813953488372, 'x_reihe': 18391.49019607843, 'tigra': 3210.0476190476193, 'boxster': 20915.842105263157, 'signum': 4880.046511627907, 'sharan': 6725.233128834356, 'zafira': 3995.915254237288, 'ka': 1814.39, 'fiesta': 3484.7425742574255, 'rav': 8930.78947368421, 'a5': 22454.473214285714, 'beetle': 7549.046728971963, 'touareg': 16169.650602409638, 'c_reihe': 8339.076923076924, 'phaeton': 13704.222222222223, 'i_reihe': 8412.027397260274, 'sl': 22135.73417721519, 'insignia': 12460.013513513513, 'up': 8371.279069767443, 'civic': 4081.711711711712, 'mx_reihe': 6407.872340425532, 'omega': 1275.2526315789473, 'sorento': 9139.153846153846, '80': 1750.8877551020407, 'z_reihe': 13276.19266055046, 'berlingo': 4103.726027397261, 'clk': 6954.946808510638, 's_max': 11803.816326530612, 'kalos': 1508.25, 'cx_reihe': 22253.714285714286, 'grand': 10052.108108108108, 'swift': 4353.882352941177, 'tiguan': 18021.21818181818, 'sprinter': 6940.366666666667, 'mii': 7316.25, 'viano': 19744.24074074074, 'almera': 2068.7619047619046, 'picanto': 3586.128205128205, 'ibiza': 4275.98431372549, 'galaxy': 5188.3, 'espace': 3501.098039215686, 'scenic': 2910.846153846154, 'one': 7768.292682926829, 'bora': 2441.7638888888887, 'fox': 2891.621212121212, 's_klasse': 14200.975308641975, 'leon': 8894.02564102564, 'transit': 5141.902777777777, 'punto': 1923.464, 'qashqai': 13140.353846153846, 'mustang': 20694.763157894737, 'm_reihe': 25917.594594594593, 'jazz': 4851.5161290322585, 'meriva': 4869.690265486725, 'corolla': 2626.2615384615383, 'duster': 9350.04, 'yaris': 4435.9655172413795, 'fabia': 4584.642857142857, 'voyager': 1957.1538461538462, 'freelander': 7012.68, 'lancer': 6807.9375, 'forester': 3152.222222222222, 's_type': 6848.9, 'bravo': 2898.1724137931033, '500': 8406.696078431372, '156': 1689.5166666666667, 'stilo': 1813.8, 'panda': 3653.7258064516127, 'a8': 11099.818181818182, 'ceed': 9047.777777777777, '5_reihe': 8067.04347826087, 'caddy': 7949.25, 'aygo': 3669.3555555555554, 'citigo': 7746.105263157895, 'exeo': 13860.0, 'c1': 4190.818181818182, 'kangoo': 3194.983870967742, 'v40': 2227.8688524590166, 'doblo': 4083.181818181818, 'toledo': 2084.703703703704, 'c_max': 7189.090909090909, 'impreza': 9572.6, 'justy': 3003.1666666666665, 'tt': 13743.041666666666, '145': 2750.0, 'c3': 4060.810344827586, 'roomster': 6767.193548387097, 'auris': 10198.407407407407, 'eos': 11690.161290322581, 'slk': 12250.666666666666, 'terios': 4590.0, 'agila': 1703.0243902439024, 'tucson': 6134.571428571428, 'fusion': 3935.653846153846, 'c4': 6810.765957446809, 'glk': 26290.206896551725, 'v50': 5545.375, 'combo': 3355.025641025641, 'matiz': 1843.2954545454545, 'yeti': 15351.4, 'v70': 5009.258064516129, '900': 2542.7, 'outlander': 11293.470588235294, 'c2': 3494.8947368421054, 'jimny': 6685.285714285715, 'forfour': 2874.4117647058824, 'a2': 3808.848484848485, '601': 3068.157894736842, 'kadett': 4185.0, 'b_klasse': 11266.61320754717, '850': 2280.3888888888887, 'r19': 759.8, 'cc': 15144.35294117647, 'note': 5588.368421052632, 'logan': 3766.5555555555557, 'xc_reihe': 13378.673913043478, 'verso': 7539.121951219512, 'jetta': 5339.961538461538, 'c5': 5300.964285714285, 'getz': 2154.121212121212, 'navara': 14246.75, 'cr_reihe': 6799.166666666667, 'x_type': 7070.0, 'clubman': 12428.192307692309, 's60': 7379.142857142857, '4_reihe': 4855.509433962264, 'cayenne': 21391.085714285713, '147': 2722.8545454545456, 'ducato': 5219.642857142857, 'carisma': 1219.4411764705883, 'q3': 28346.962962962964, 'spider': 9182.925925925925, 'cherokee': 5966.636363636364, 'wrangler': 17559.75, 'q7': 20889.91891891892, 'escort': 1240.7407407407406, 'roadster': 4754.25, 'captiva': 12409.681818181818, 'cl': 11754.48275862069, 'cuore': 1393.6808510638298, '90': 1699.6, 'kuga': 17622.052631578947, 'x_trail': 7894.387096774193, 'q5': 27314.254545454547, 'ypsilon': 1450.5, 'carnival': 2178.9, 'ptcruiser': 2571.4, 'pajero': 9772.416666666666, 'delta': 11250.0, 'rx_reihe': 8144.277777777777, 'sportage': 12591.702127659575, 'calibra': 2998.8333333333335, 'sirion': 2024.75, 'kaefer': 8916.645161290322, 'gl': 27660.0, 'amarok': 26069.4, 'santa': 7835.272727272727, '300c': 11998.9, 'antara': 9926.833333333334, 'accord': 5425.5, 'crossfire': 14439.8, '159': 6659.653846153846, 'lybra': 659.8, 'nubira': 1362.25, 'niva': 3449.75, 'v_klasse': 7244.7, 'legacy': 3412.5, 'aveo': 4993.266666666666, 'defender': 28724.714285714286, 'range_rover': 15230.0, 'juke': 11573.454545454546, 'spark': 5429.761904761905, 'lanos': 664.1428571428571, 'v60': 21000.0, 'rio': 4923.074074074074, 'modus': 3367.9583333333335, 'galant': 1831.125, 'alhambra': 6607.848484848485, 'discovery': 15154.363636363636, '6er': 18014.0, 'range_rover_sport': 26712.375, 'range_rover_evoque': 28256.8, 'move': 683.1666666666666, 'croma': 3099.5, 'g_klasse': 17954.38888888889, 'lodgy': 11939.8, 'rangerover': 23500.0, 'kappa': 1725.0, '9000': 2912.25, 'charade': 466.6666666666667, '200': 2250.0, 'b_max': 10600.0, 'samara': 1499.0}
###combine the two dictionaries into a separate df
combined = pd.DataFrame.from_dict([ave_price_no_damaged, ave_price_damaged]).T
combined
0 | 1 | |
---|---|---|
andere | 7431.067929 | 2455.441799 |
7er | 10414.252747 | 5070.846154 |
golf | 6110.624438 | 1879.625828 |
fortwo | 4011.649533 | 1507.102564 |
focus | 4094.413238 | 1652.056180 |
... | ... | ... |
charade | 466.666667 | NaN |
200 | 2250.000000 | NaN |
b_max | 10600.000000 | NaN |
samara | 1499.000000 | 300.000000 |
i3 | NaN | 250.000000 |
244 rows × 2 columns
combined = combined.rename(columns = {1: 'ave_price_damaged', 0: 'ave_price_no_damaged'})
combined.dropna(axis = 'index', how = 'any', inplace = True)
#Then I pretty print it
combined['ave_price_difference (damaged vs. undamaged)'] = combined['ave_price_no_damaged'] - combined['ave_price_damaged']
combined = combined.round()
combined
ave_price_no_damaged | ave_price_damaged | ave_price_difference (damaged vs. undamaged) | |
---|---|---|---|
andere | 7431.0 | 2455.0 | 4976.0 |
7er | 10414.0 | 5071.0 | 5343.0 |
golf | 6111.0 | 1880.0 | 4231.0 |
fortwo | 4012.0 | 1507.0 | 2505.0 |
focus | 4094.0 | 1652.0 | 2442.0 |
... | ... | ... | ... |
discovery | 15154.0 | 1725.0 | 13429.0 |
move | 683.0 | 400.0 | 283.0 |
g_klasse | 17954.0 | 6375.0 | 11579.0 |
9000 | 2912.0 | 200.0 | 2712.0 |
samara | 1499.0 | 300.0 | 1199.0 |
202 rows × 3 columns