import pandas as pd
import numpy as np
# creating dataframe autos from csv file
autos = pd.read_csv("autos.csv", encoding= "Latin-1")
autos
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 2016-03-27 14:38:19 | Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon | privat | Angebot | $24,900 | control | limousine | 2011 | automatik | 239 | q5 | 100,000km | 1 | diesel | audi | nein | 2016-03-27 00:00:00 | 0 | 82131 | 2016-04-01 13:47:40 |
49996 | 2016-03-28 10:50:25 | Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+... | privat | Angebot | $1,980 | control | cabrio | 1996 | manuell | 75 | astra | 150,000km | 5 | benzin | opel | nein | 2016-03-28 00:00:00 | 0 | 44807 | 2016-04-02 14:18:02 |
49997 | 2016-04-02 14:44:48 | Fiat_500_C_1.2_Dualogic_Lounge | privat | Angebot | $13,200 | test | cabrio | 2014 | automatik | 69 | 500 | 5,000km | 11 | benzin | fiat | nein | 2016-04-02 00:00:00 | 0 | 73430 | 2016-04-04 11:47:27 |
49998 | 2016-03-08 19:25:42 | Audi_A3_2.0_TDI_Sportback_Ambition | privat | Angebot | $22,900 | control | kombi | 2013 | manuell | 150 | a3 | 40,000km | 11 | diesel | audi | nein | 2016-03-08 00:00:00 | 0 | 35683 | 2016-04-05 16:45:07 |
49999 | 2016-03-14 00:42:12 | Opel_Vectra_1.6_16V | privat | Angebot | $1,250 | control | limousine | 1996 | manuell | 101 | vectra | 150,000km | 1 | benzin | opel | nein | 2016-03-13 00:00:00 | 0 | 45897 | 2016-04-06 21:18:48 |
50000 rows × 20 columns
autos.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50000 entries, 0 to 49999 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 dateCrawled 50000 non-null object 1 name 50000 non-null object 2 seller 50000 non-null object 3 offerType 50000 non-null object 4 price 50000 non-null object 5 abtest 50000 non-null object 6 vehicleType 44905 non-null object 7 yearOfRegistration 50000 non-null int64 8 gearbox 47320 non-null object 9 powerPS 50000 non-null int64 10 model 47242 non-null object 11 odometer 50000 non-null object 12 monthOfRegistration 50000 non-null int64 13 fuelType 45518 non-null object 14 brand 50000 non-null object 15 notRepairedDamage 40171 non-null object 16 dateCreated 50000 non-null object 17 nrOfPictures 50000 non-null int64 18 postalCode 50000 non-null int64 19 lastSeen 50000 non-null object dtypes: int64(5), object(15) memory usage: 7.6+ MB
autos.head()
dateCrawled | name | seller | offerType | price | abtest | vehicleType | yearOfRegistration | gearbox | powerPS | model | odometer | monthOfRegistration | fuelType | brand | notRepairedDamage | dateCreated | nrOfPictures | postalCode | lastSeen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
Dataframe has 50,000 rows and 20 columns.
All comumn names are in camelcase instead of snakecase
5 out of 20 columns has missing values which inlude,
vehicleType
,gearbox
,model
,fuelType
¬RepairedDamage
.Columns which could have differnt data type includes
price
,odometer
.
autos.columns
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest', 'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model', 'odometer', 'monthOfRegistration', 'fuelType', 'brand', 'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode', 'lastSeen'], dtype='object')
# Assign modified columns to the dataframe
autos.columns= ['date_crawled', 'name', 'seller', 'offer_type', 'price', 'abtest',
'vehicle_type', 'year_of_registration', 'gearbox', 'powerPS', 'model',
'odometer', 'month_of_registration', 'fuel_type', 'brand',
'not_repaired_damage', 'ad_created', 'nr_of_pictures', 'postal_code',
'last_seen']
autos.head()
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | year_of_registration | gearbox | powerPS | model | odometer | month_of_registration | fuel_type | brand | not_repaired_damage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | privat | Angebot | $5,000 | control | bus | 2004 | manuell | 158 | andere | 150,000km | 3 | lpg | peugeot | nein | 2016-03-26 00:00:00 | 0 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | privat | Angebot | $8,500 | control | limousine | 1997 | automatik | 286 | 7er | 150,000km | 6 | benzin | bmw | nein | 2016-04-04 00:00:00 | 0 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | privat | Angebot | $8,990 | test | limousine | 2009 | manuell | 102 | golf | 70,000km | 7 | benzin | volkswagen | nein | 2016-03-26 00:00:00 | 0 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | privat | Angebot | $4,350 | control | kleinwagen | 2007 | automatik | 71 | fortwo | 70,000km | 6 | benzin | smart | nein | 2016-03-12 00:00:00 | 0 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | privat | Angebot | $1,350 | test | kombi | 2003 | manuell | 0 | focus | 150,000km | 7 | benzin | ford | nein | 2016-04-01 00:00:00 | 0 | 39218 | 2016-04-01 14:38:50 |
autos.describe(include = 'all')
date_crawled | name | seller | offer_type | price | abtest | vehicle_type | year_of_registration | gearbox | powerPS | model | odometer | month_of_registration | fuel_type | brand | not_repaired_damage | ad_created | nr_of_pictures | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 50000 | 50000 | 50000 | 50000 | 50000 | 50000 | 44905 | 50000.000000 | 47320 | 50000.000000 | 47242 | 50000 | 50000.000000 | 45518 | 50000 | 40171 | 50000 | 50000.0 | 50000.000000 | 50000 |
unique | 48213 | 38754 | 2 | 2 | 2357 | 2 | 8 | NaN | 2 | NaN | 245 | 13 | NaN | 7 | 40 | 2 | 76 | NaN | NaN | 39481 |
top | 2016-03-21 20:37:19 | Ford_Fiesta | privat | Angebot | $0 | test | limousine | NaN | manuell | NaN | golf | 150,000km | NaN | benzin | volkswagen | nein | 2016-04-03 00:00:00 | NaN | NaN | 2016-04-07 06:17:27 |
freq | 3 | 78 | 49999 | 49999 | 1421 | 25756 | 12859 | NaN | 36993 | NaN | 4024 | 32424 | NaN | 30107 | 10687 | 35232 | 1946 | NaN | NaN | 8 |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2005.073280 | NaN | 116.355920 | NaN | NaN | 5.723360 | NaN | NaN | NaN | NaN | 0.0 | 50813.627300 | NaN |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 105.712813 | NaN | 209.216627 | NaN | NaN | 3.711984 | NaN | NaN | NaN | NaN | 0.0 | 25779.747957 | NaN |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1000.000000 | NaN | 0.000000 | NaN | NaN | 0.000000 | NaN | NaN | NaN | NaN | 0.0 | 1067.000000 | NaN |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1999.000000 | NaN | 70.000000 | NaN | NaN | 3.000000 | NaN | NaN | NaN | NaN | 0.0 | 30451.000000 | NaN |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2003.000000 | NaN | 105.000000 | NaN | NaN | 6.000000 | NaN | NaN | NaN | NaN | 0.0 | 49577.000000 | NaN |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2008.000000 | NaN | 150.000000 | NaN | NaN | 9.000000 | NaN | NaN | NaN | NaN | 0.0 | 71540.000000 | NaN |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9999.000000 | NaN | 17700.000000 | NaN | NaN | 12.000000 | NaN | NaN | NaN | NaN | 0.0 | 99998.000000 | NaN |
seller
,offer_type
,nr_of_pictures
have mostly one value and are candidates to be dropped.
vehicle_type
,gearbox model
,fuel_type
,not_repaired_damage
have missing values.
year_of_registration
,powerPS
,month_of_registration
need more investigation.Columns which could have numeric data type includes
price
,odometer
.
autos["year_of_registration"].value_counts()
2000 3354 2005 3015 1999 3000 2004 2737 2003 2727 ... 1931 1 1929 1 1001 1 9996 1 1952 1 Name: year_of_registration, Length: 97, dtype: int64
len(autos["powerPS"].value_counts())
448
autos["month_of_registration"].unique()
array([ 3, 6, 7, 4, 8, 12, 10, 0, 9, 11, 5, 2, 1])
autos["price"].dtype
dtype('O')
autos["nr_of_pictures"].value_counts()
0 50000 Name: nr_of_pictures, dtype: int64
drop_columns = ["seller", "nr_of_pictures", "offer_type"]
autos.drop(drop_columns, axis = 1, inplace = True)
autos['gearbox'].unique()
array(['manuell', 'automatik', nan], dtype=object)
gear_box = {'manuell' : 'manual',
'automatik' : 'automatic',
np.nan: 'unknown'}
autos['gearbox'] = autos['gearbox'].map(gear_box)
autos['fuel_type'].unique()
array(['lpg', 'benzin', 'diesel', nan, 'cng', 'hybrid', 'elektro', 'andere'], dtype=object)
fueltype = { 'lpg':'lpg',
'benzin':'petrol',
'diesel':'diesel',
np.nan:'unknown',
'cng': 'cng',
'hybrid':'hybrid',
'elektro' : 'electro',
'andere': 'other'}
autos['fuel_type'] = autos['fuel_type'].map(fueltype)
autos['not_repaired_damage'].unique()
array(['nein', nan, 'ja'], dtype=object)
not_rep_dam = {'nein': 'no',
'ja': 'yes',
np.nan : 'unknown'}
autos['not_repaired_damage'] = autos['not_repaired_damage'].map(not_rep_dam)
autos.head()
date_crawled | name | price | abtest | vehicle_type | year_of_registration | gearbox | powerPS | model | odometer | month_of_registration | fuel_type | brand | not_repaired_damage | ad_created | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | $5,000 | control | bus | 2004 | manual | 158 | andere | 150,000km | 3 | lpg | peugeot | no | 2016-03-26 00:00:00 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | $8,500 | control | limousine | 1997 | automatic | 286 | 7er | 150,000km | 6 | petrol | bmw | no | 2016-04-04 00:00:00 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | $8,990 | test | limousine | 2009 | manual | 102 | golf | 70,000km | 7 | petrol | volkswagen | no | 2016-03-26 00:00:00 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | $4,350 | control | kleinwagen | 2007 | automatic | 71 | fortwo | 70,000km | 6 | petrol | smart | no | 2016-03-12 00:00:00 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | $1,350 | test | kombi | 2003 | manual | 0 | focus | 150,000km | 7 | petrol | ford | no | 2016-04-01 00:00:00 | 39218 | 2016-04-01 14:38:50 |
# convet price column to numeric datatype
autos["price"] = autos["price"].str.replace("$","").str.replace(",","")
autos["price"]= autos["price"].astype(int)
autos["price"].unique()
array([ 5000, 8500, 8990, ..., 385, 22200, 16995])
# convert odometer column to numeric datatype
autos["odometer"] = autos["odometer"].str.replace("km","").str.replace(",","")
autos["odometer"] = autos["odometer"].astype(int)
# Rename the above price & odometer column
autos.rename({'price':'price($)','odometer' : 'odometer_km'},
axis = 1,
inplace = True)
# EXploring odometer_km Column
autos["odometer_km"].describe()
count 50000.000000 mean 125732.700000 std 40042.211706 min 5000.000000 25% 125000.000000 50% 150000.000000 75% 150000.000000 max 150000.000000 Name: odometer_km, dtype: float64
autos["odometer_km"].value_counts().sort_index(ascending = False)
150000 32424 125000 5170 100000 2169 90000 1757 80000 1436 70000 1230 60000 1164 50000 1027 40000 819 30000 789 20000 784 10000 264 5000 967 Name: odometer_km, dtype: int64
# EXploring price($) Column
autos["price($)"].describe()
count 5.000000e+04 mean 9.840044e+03 std 4.811044e+05 min 0.000000e+00 25% 1.100000e+03 50% 2.950000e+03 75% 7.200000e+03 max 1.000000e+08 Name: price($), dtype: float64
autos["price($)"].value_counts().sort_index(ascending= False).head(15)
99999999 1 27322222 1 12345678 3 11111111 2 10000000 1 3890000 1 1300000 1 1234566 1 999999 2 999990 1 350000 1 345000 1 299000 1 295000 1 265000 1 Name: price($), dtype: int64
autos["price($)"].value_counts().sort_index(ascending= False).tail(250)
975 2 970 7 965 2 960 1 958 1 ... 5 2 3 1 2 3 1 156 0 1421 Name: price($), Length: 250, dtype: int64
# Removing outliners from the dataframe.
autos = autos[autos["price($)"].between(1000,1000000)]
autos
date_crawled | name | price($) | abtest | vehicle_type | year_of_registration | gearbox | powerPS | model | odometer_km | month_of_registration | fuel_type | brand | not_repaired_damage | ad_created | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | 5000 | control | bus | 2004 | manual | 158 | andere | 150000 | 3 | lpg | peugeot | no | 2016-03-26 00:00:00 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | 8500 | control | limousine | 1997 | automatic | 286 | 7er | 150000 | 6 | petrol | bmw | no | 2016-04-04 00:00:00 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | 8990 | test | limousine | 2009 | manual | 102 | golf | 70000 | 7 | petrol | volkswagen | no | 2016-03-26 00:00:00 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | 4350 | control | kleinwagen | 2007 | automatic | 71 | fortwo | 70000 | 6 | petrol | smart | no | 2016-03-12 00:00:00 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | 1350 | test | kombi | 2003 | manual | 0 | focus | 150000 | 7 | petrol | ford | no | 2016-04-01 00:00:00 | 39218 | 2016-04-01 14:38:50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 2016-03-27 14:38:19 | Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon | 24900 | control | limousine | 2011 | automatic | 239 | q5 | 100000 | 1 | diesel | audi | no | 2016-03-27 00:00:00 | 82131 | 2016-04-01 13:47:40 |
49996 | 2016-03-28 10:50:25 | Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+... | 1980 | control | cabrio | 1996 | manual | 75 | astra | 150000 | 5 | petrol | opel | no | 2016-03-28 00:00:00 | 44807 | 2016-04-02 14:18:02 |
49997 | 2016-04-02 14:44:48 | Fiat_500_C_1.2_Dualogic_Lounge | 13200 | test | cabrio | 2014 | automatic | 69 | 500 | 5000 | 11 | petrol | fiat | no | 2016-04-02 00:00:00 | 73430 | 2016-04-04 11:47:27 |
49998 | 2016-03-08 19:25:42 | Audi_A3_2.0_TDI_Sportback_Ambition | 22900 | control | kombi | 2013 | manual | 150 | a3 | 40000 | 11 | diesel | audi | no | 2016-03-08 00:00:00 | 35683 | 2016-04-05 16:45:07 |
49999 | 2016-03-14 00:42:12 | Opel_Vectra_1.6_16V | 1250 | control | limousine | 1996 | manual | 101 | vectra | 150000 | 1 | petrol | opel | no | 2016-03-13 00:00:00 | 45897 | 2016-04-06 21:18:48 |
38629 rows × 17 columns
Price column has some unrealistic numbers.
Some prices are as high as 10 cr dollar where as some prices ae as low as 0 dollar.
So, we remove the outlines and keep minimum price as 1000 dollars and maximum price as 100 k dollars
After removing unwanted prices we have total 38629 rows
autos[['date_crawled','ad_created','last_seen']][0:5]
date_crawled | ad_created | last_seen | |
---|---|---|---|
0 | 2016-03-26 17:47:46 | 2016-03-26 00:00:00 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | 2016-04-04 00:00:00 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | 2016-03-26 00:00:00 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | 2016-03-12 00:00:00 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | 2016-04-01 00:00:00 | 2016-04-01 14:38:50 |
(autos["date_crawled"].value_counts(normalize = True,dropna = False)
.sort_index()
)
2016-03-05 14:06:30 0.000026 2016-03-05 14:06:40 0.000026 2016-03-05 14:07:21 0.000026 2016-03-05 14:07:26 0.000026 2016-03-05 14:07:40 0.000026 ... 2016-04-07 14:30:09 0.000026 2016-04-07 14:30:26 0.000026 2016-04-07 14:36:44 0.000026 2016-04-07 14:36:55 0.000026 2016-04-07 14:36:56 0.000026 Name: date_crawled, Length: 37561, dtype: float64
(autos["ad_created"].value_counts(normalize = True,dropna = False)
.sort_index())
2015-06-11 00:00:00 0.000026 2015-08-10 00:00:00 0.000026 2015-09-09 00:00:00 0.000026 2015-11-10 00:00:00 0.000026 2015-12-30 00:00:00 0.000026 ... 2016-04-03 00:00:00 0.039452 2016-04-04 00:00:00 0.037278 2016-04-05 00:00:00 0.011986 2016-04-06 00:00:00 0.003365 2016-04-07 00:00:00 0.001320 Name: ad_created, Length: 74, dtype: float64
autos["last_seen"].value_counts(normalize = True,dropna = False).sort_index()
2016-03-05 14:46:02 0.000026 2016-03-05 14:49:34 0.000026 2016-03-05 15:16:11 0.000026 2016-03-05 15:16:47 0.000026 2016-03-05 15:28:10 0.000026 ... 2016-04-07 14:58:44 0.000078 2016-04-07 14:58:45 0.000026 2016-04-07 14:58:46 0.000026 2016-04-07 14:58:48 0.000078 2016-04-07 14:58:50 0.000078 Name: last_seen, Length: 31223, dtype: float64
l = (autos["last_seen"].value_counts(normalize = True,dropna = False)
.sort_index()
.tail(12000)
)
l
2016-04-05 11:50:29 0.000026 2016-04-05 11:50:30 0.000026 2016-04-05 11:50:36 0.000026 2016-04-05 11:50:39 0.000026 2016-04-05 11:50:40 0.000026 ... 2016-04-07 14:58:44 0.000078 2016-04-07 14:58:45 0.000026 2016-04-07 14:58:46 0.000026 2016-04-07 14:58:48 0.000078 2016-04-07 14:58:50 0.000078 Name: last_seen, Length: 12000, dtype: float64
l.sum()
0.48932149421419147
The date crwaled is well distributed over the span of 2 months.
The ad created is didtributed for 10 months.
The last seen is distributed for a period of 1 month with approx 50% value in last 3 days. So, there must be surge in sell in last three days.
autos["year_of_registration"].describe()
count 38629.000000 mean 2005.678713 std 86.681928 min 1000.000000 25% 2001.000000 50% 2005.000000 75% 2009.000000 max 9999.000000 Name: year_of_registration, dtype: float64
(autos["year_of_registration"].value_counts(dropna = False)
.sort_index(ascending = True))
1000 1 1001 1 1927 1 1929 1 1931 1 .. 5911 1 6200 1 8888 1 9000 1 9999 2 Name: year_of_registration, Length: 91, dtype: int64
Since,
year_of_registration
column has bogous value. Some are as low as 1000 & some are as high as 9999.We need to remove the unwanted value from the dataframe.
So, We remove the outliners and keep minimum year as 1900 and Max year as 2016
# Removing otlines from year_of_registration column.
autos = autos[autos["year_of_registration"].between(1900,2016)]
autos
date_crawled | name | price($) | abtest | vehicle_type | year_of_registration | gearbox | powerPS | model | odometer_km | month_of_registration | fuel_type | brand | not_repaired_damage | ad_created | postal_code | last_seen | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | 5000 | control | bus | 2004 | manual | 158 | andere | 150000 | 3 | lpg | peugeot | no | 2016-03-26 00:00:00 | 79588 | 2016-04-06 06:45:54 |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | 8500 | control | limousine | 1997 | automatic | 286 | 7er | 150000 | 6 | petrol | bmw | no | 2016-04-04 00:00:00 | 71034 | 2016-04-06 14:45:08 |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | 8990 | test | limousine | 2009 | manual | 102 | golf | 70000 | 7 | petrol | volkswagen | no | 2016-03-26 00:00:00 | 35394 | 2016-04-06 20:15:37 |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | 4350 | control | kleinwagen | 2007 | automatic | 71 | fortwo | 70000 | 6 | petrol | smart | no | 2016-03-12 00:00:00 | 33729 | 2016-03-15 03:16:28 |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | 1350 | test | kombi | 2003 | manual | 0 | focus | 150000 | 7 | petrol | ford | no | 2016-04-01 00:00:00 | 39218 | 2016-04-01 14:38:50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 2016-03-27 14:38:19 | Audi_Q5_3.0_TDI_qu._S_tr.__Navi__Panorama__Xenon | 24900 | control | limousine | 2011 | automatic | 239 | q5 | 100000 | 1 | diesel | audi | no | 2016-03-27 00:00:00 | 82131 | 2016-04-01 13:47:40 |
49996 | 2016-03-28 10:50:25 | Opel_Astra_F_Cabrio_Bertone_Edition___TÜV_neu+... | 1980 | control | cabrio | 1996 | manual | 75 | astra | 150000 | 5 | petrol | opel | no | 2016-03-28 00:00:00 | 44807 | 2016-04-02 14:18:02 |
49997 | 2016-04-02 14:44:48 | Fiat_500_C_1.2_Dualogic_Lounge | 13200 | test | cabrio | 2014 | automatic | 69 | 500 | 5000 | 11 | petrol | fiat | no | 2016-04-02 00:00:00 | 73430 | 2016-04-04 11:47:27 |
49998 | 2016-03-08 19:25:42 | Audi_A3_2.0_TDI_Sportback_Ambition | 22900 | control | kombi | 2013 | manual | 150 | a3 | 40000 | 11 | diesel | audi | no | 2016-03-08 00:00:00 | 35683 | 2016-04-05 16:45:07 |
49999 | 2016-03-14 00:42:12 | Opel_Vectra_1.6_16V | 1250 | control | limousine | 1996 | manual | 101 | vectra | 150000 | 1 | petrol | opel | no | 2016-03-13 00:00:00 | 45897 | 2016-04-06 21:18:48 |
37209 rows × 17 columns
l =autos["year_of_registration"].value_counts(normalize = True).sort_index(ascending = False)
l
2016 0.017496 2015 0.009702 2014 0.017442 2013 0.021366 2012 0.035099 ... 1937 0.000108 1934 0.000054 1931 0.000027 1929 0.000027 1927 0.000027 Name: year_of_registration, Length: 77, dtype: float64
autos["year_of_registration"].describe()
count 37209.000000 mean 2003.968018 std 7.020982 min 1927.000000 25% 2001.000000 50% 2005.000000 75% 2008.000000 max 2016.000000 Name: year_of_registration, dtype: float64
l.head(6).sum()
0.14456179956462145
l[6:16]
2010 0.042570 2009 0.055820 2008 0.059233 2007 0.060711 2006 0.071246 2005 0.074847 2004 0.070091 2003 0.066570 2002 0.057379 2001 0.055497 Name: year_of_registration, dtype: float64
l[6:10].sum()
0.21833427396597596
l[11:15].sum()
0.26888655970329756
75% of total cars are registered between 2001 to 2016
Only 14.4% of cars are 6 years old from the date of registration.
21.83% of total cars are b/w 2006 to 2010.
26.88% of total cars are b/w 2001 to 2005.
*This shows that there more chances to sell a car if it is used b/w 10-15 yrs.*
autos["brand"].unique()
array(['peugeot', 'bmw', 'volkswagen', 'smart', 'ford', 'chrysler', 'audi', 'renault', 'sonstige_autos', 'mazda', 'porsche', 'mini', 'mercedes_benz', 'seat', 'toyota', 'opel', 'dacia', 'jeep', 'saab', 'volvo', 'nissan', 'jaguar', 'skoda', 'subaru', 'fiat', 'mitsubishi', 'chevrolet', 'hyundai', 'honda', 'kia', 'citroen', 'suzuki', 'trabant', 'land_rover', 'alfa_romeo', 'rover', 'daihatsu', 'daewoo', 'lancia', 'lada'], dtype=object)
l = autos["brand"].value_counts().sort_values(ascending = False)
l
volkswagen 7845 bmw 4664 mercedes_benz 4152 audi 3631 opel 3314 ford 2185 renault 1387 peugeot 1038 fiat 784 skoda 709 seat 643 smart 618 toyota 544 mazda 530 citroen 517 nissan 507 mini 405 hyundai 400 sonstige_autos 389 volvo 334 kia 286 porsche 278 honda 273 mitsubishi 256 chevrolet 246 alfa_romeo 232 suzuki 213 dacia 122 chrysler 118 jeep 103 land_rover 98 jaguar 69 subaru 64 daihatsu 63 saab 51 daewoo 34 trabant 32 rover 27 lancia 25 lada 23 Name: brand, dtype: int64
- We have chosen top 20 brand based on number of sale for mean_price calc.
top_20_brand = l.index[:20]
mean_price = {}
for each in top_20_brand:
selected_rows = autos[autos["brand"] == each]
each_mean_price = selected_rows["price($)"].mean()
mean_price[each] = each_mean_price
mean_price
{'volkswagen': 6898.376545570427, 'bmw': 9119.20218696398, 'mercedes_benz': 9302.614402697494, 'audi': 10322.269347287249, 'opel': 4219.954737477368, 'ford': 5786.703432494279, 'renault': 3590.942321557318, 'peugeot': 3955.169556840077, 'fiat': 4008.174744897959, 'skoda': 6836.696755994359, 'seat': 5638.640746500778, 'smart': 3780.4692556634304, 'toyota': 5573.57169117647, 'mazda': 5309.526415094339, 'citroen': 4614.970986460348, 'nissan': 6428.428007889546, 'mini': 10715.237037037037, 'hyundai': 6181.8725, 'sonstige_autos': 14454.552699228792, 'volvo': 6151.583832335329}
import matplotlib.pyplot as plt
plt.bar(list(mean_price.keys()),list(mean_price.values()),width = 0.5)
plt.xlabel("Brand")
plt.ylabel("Mean Price in $")
plt.title("Mean Price of Top 20 Brand")
plt.show()
<Figure size 640x480 with 1 Axes>
/dataquest/system/env/python3/lib/python3.8/site-packages/plotly/matplotlylib/mpltools.py:368: MatplotlibDeprecationWarning: The is_frame_like function was deprecated in Matplotlib 3.1 and will be removed in 3.3.
Renault has least mean price which is 3590.94 dollars .
sonstige_autos has highest mean price which is 14454.55 dollars .
top_6_price = {}
top_6_mileage = {}
for each in top_20_brand[:6]:
selected_rows = autos[autos["brand"] == each]
each_mean_price = selected_rows["price($)"].mean()
top_6_price[each] = each_mean_price
each_mean_mileage = selected_rows["odometer_km"].mean()
top_6_mileage[each] = each_mean_mileage
# Convert both dictionary to series objects using series constructior
price_series = pd.Series(top_6_price)
mileage_series = pd.Series(top_6_mileage)
df = pd.DataFrame(price_series,columns = ['mean_price'])
df['mean_mileage'] = mileage_series
df
mean_price | mean_mileage | |
---|---|---|
volkswagen | 6898.376546 | 125771.829191 |
bmw | 9119.202187 | 132001.500858 |
mercedes_benz | 9302.614403 | 130062.620424 |
audi | 10322.269347 | 127491.049298 |
opel | 4219.954737 | 123952.926976 |
ford | 5786.703432 | 119622.425629 |
All brands have nearly same mileage.
Most famous brand is volkswagen because of its affordable price.
autos_brand_model = autos[["brand","model"]]
autos_brand_model["model"].value_counts().sort_values(ascending= False)
golf 2918 andere 2668 3er 2285 passat 1152 a4 1092 ... 200 1 lanos 1 samara 1 rangerover 1 b_max 1 Name: model, Length: 241, dtype: int64
(autos_brand_model["model"].value_counts()
.sort_values(ascending= False)
.tail(10))
v60 3 delta 3 kappa 2 r19 2 145 1 200 1 lanos 1 samara 1 rangerover 1 b_max 1 Name: model, dtype: int64
autos_brand_model[autos_brand_model["model"] == "golf" ]
brand | model | |
---|---|---|
2 | volkswagen | golf |
7 | volkswagen | golf |
38 | volkswagen | golf |
40 | volkswagen | golf |
58 | volkswagen | golf |
... | ... | ... |
49875 | volkswagen | golf |
49881 | volkswagen | golf |
49955 | volkswagen | golf |
49958 | volkswagen | golf |
49961 | volkswagen | golf |
2918 rows × 2 columns
lf = autos_brand_model[(autos_brand_model["model"] == "200" )
|(autos_brand_model["model"] == "samara")
| (autos_brand_model["model"] == "145")
|(autos_brand_model["model"] == "lanos")
|(autos_brand_model["model"] == "b_max")
| (autos_brand_model["model"] == "rangerover")]
lf
brand | model | |
---|---|---|
1083 | alfa_romeo | 145 |
13376 | rover | rangerover |
44551 | audi | 200 |
47852 | ford | b_max |
48066 | lada | samara |
48575 | daewoo | lanos |
Most famous car is golf model of Volkawagen .
Least famous car is give lf DataFrame icludes 5 cars each having one ad.
l = autos["odometer_km"].unique()
l
array([150000, 70000, 50000, 80000, 10000, 30000, 90000, 125000, 20000, 60000, 5000, 40000, 100000])
l.sort()
l
array([ 5000, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 125000, 150000])
#create a list of booleans
bool_mileage = [(autos["odometer_km"] < 25000)
,(autos["odometer_km"] >= 25000) & (autos["odometer_km"] < 50000)
,(autos["odometer_km"] >= 50000)&(autos["odometer_km"] < 75000)
,(autos["odometer_km"] >= 75000)&(autos["odometer_km"] < 100000)
,(autos["odometer_km"] >= 100000)&(autos["odometer_km"] < 125000)
,(autos["odometer_km"] >= 125000)
]
#list of values to assign
values = ['very_low','low', 'avg','above_avg', 'high', 'very_high']
#Creating new column to assign mileage category
autos["mileage_type"] = np.select(bool_mileage,values)
#Updated DataFrame
autos.head()
<ipython-input-69-e6487ebf3e30>:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
date_crawled | name | price($) | abtest | vehicle_type | year_of_registration | gearbox | powerPS | model | odometer_km | month_of_registration | fuel_type | brand | not_repaired_damage | ad_created | postal_code | last_seen | mileage_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016-03-26 17:47:46 | Peugeot_807_160_NAVTECH_ON_BOARD | 5000 | control | bus | 2004 | manual | 158 | andere | 150000 | 3 | lpg | peugeot | no | 2016-03-26 00:00:00 | 79588 | 2016-04-06 06:45:54 | very_high |
1 | 2016-04-04 13:38:56 | BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik | 8500 | control | limousine | 1997 | automatic | 286 | 7er | 150000 | 6 | petrol | bmw | no | 2016-04-04 00:00:00 | 71034 | 2016-04-06 14:45:08 | very_high |
2 | 2016-03-26 18:57:24 | Volkswagen_Golf_1.6_United | 8990 | test | limousine | 2009 | manual | 102 | golf | 70000 | 7 | petrol | volkswagen | no | 2016-03-26 00:00:00 | 35394 | 2016-04-06 20:15:37 | avg |
3 | 2016-03-12 16:58:10 | Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... | 4350 | control | kleinwagen | 2007 | automatic | 71 | fortwo | 70000 | 6 | petrol | smart | no | 2016-03-12 00:00:00 | 33729 | 2016-03-15 03:16:28 | avg |
4 | 2016-04-01 14:38:50 | Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... | 1350 | test | kombi | 2003 | manual | 0 | focus | 150000 | 7 | petrol | ford | no | 2016-04-01 00:00:00 | 39218 | 2016-04-01 14:38:50 | very_high |
# Calculating mileage category distribution
autos["mileage_type"].value_counts(dropna = False, normalize = True)
very_high 0.712973 avg 0.085141 above_avg 0.075627 high 0.048832 low 0.040528 very_low 0.036900 Name: mileage_type, dtype: float64
- 71.29% of vehicles have very high mileage in the range of 125 k to 150k km.
# Analysing relatiship b/w mileage & mean_price
mil_cat = autos["mileage_type"].unique()
# creating an empty dict
mil_price = {}
for each in mil_cat:
selected_rows = autos[autos["mileage_type"] == each]
mean_price = selected_rows["price($)"].mean()
mil_price[each] = mean_price
# displaying the dict
mil_price
{'very_high': 5338.356063176147, 'avg': 12810.441287878788, 'above_avg': 9733.069651741294, 'very_low': 18542.76183539694, 'low': 16545.97480106101, 'high': 9134.291689598238}
- There is positive correlation betwen Mileage & Price of Car
# Calculating mean price for damage cars
damaged_rows = autos[autos["not_repaired_damage"] == 'yes']
mean_price_1 = damaged_rows["price($)"].mean()
mean_price_1
4328.579922447221
# Calculating mean price for fit car
not_damaged_rows = autos[autos["not_repaired_damage"] == 'no']
mean_price_2 = not_damaged_rows["price($)"].mean()
mean_price_2
8100.509005217977
diff = mean_price_2 - mean_price_1
diff
3771.9290827707555
- So, damaged cars are 3771.92 dollars cheaper than non-damage cars