#!/usr/bin/env python
# coding: utf-8

# ## Used car dataset from eBay Kleinanzeigen
# The purpose of this project is importing, cleaning and exploratory analysis
# of used car listings from the German ebay website. Results are visualized to
# help easily understand and compare the data. Observations from the analysis
# are summarized at the end.

# In[280]:

# Import necessary packages
import numpy as np
import pandas as pd
import matplotlib

# Read the input CSV data file into a dataframe.
# Latin-1 encoding is required: the file is not valid UTF-8.
input_file = 'autos.csv'
autos = pd.read_csv(input_file, encoding='Latin-1')

# In[281]:

# Setup the NB to display plots inline
get_ipython().run_line_magic('matplotlib', 'inline')

# Some options to look at the dataframe and get familiar with the dataset
# - Using a helper function that reads in the CSV file and prints out a
#   specific number of rows in a table format
# - Running the variable 'autos'. This utilizes the benefits of jupyter
#   notebook for pandas and renders the first few and last few rows as a table.
# - Using df.head() or .tail() to print the first and last few rows

# In[282]:

def print_some_rows(csv_file, num_rows, enc='Latin-1'):
    """Return a dataframe holding the first `num_rows` rows of `csv_file`."""
    return pd.read_csv(csv_file, nrows=num_rows, encoding=enc)

# In[283]:

print_some_rows(input_file, 8)

# In[284]:

#autos

# In[285]:

autos.head()

# In[286]:

# Details about the dataframe - number of rows and columns, names of columns
# and types of data they contain
autos.info()
autos.dtypes

# ## Observations on the input dataset
# There are 50,000 rows with 20 columns of datapoints which are mostly strings.
#
# There are 5 numeric columns.
#
# Some columns have missing values.
#
# More observations on patterns and the need to clean:
#
# - Some column labels are not as readable or too long. The following edits
#   will make them better
#
# Replace
#  1. `yearOfRegistration` with `registration_year`
#  2. `monthOfRegistration` with `registration_month`
#  3. `notRepairedDamage` with `unrepaired_damage`
#  4.
#     `dateCreated` with `ad_created`
#
# - Some column labels are in camelCase. It will be more user friendly to
#   comply with Python's `lower_case_with_underscores` snake format
#
# Define functions to make the two changes to them.

# In[287]:

# Get column names using .columns attribute
print (autos.columns)
print (list(autos.columns))

# While this is useful for looping over in cleaning the column names,
# df.head() is also useful for a quick view of the names in a table format
autos.head(0)

# ## Make column labels readable

# In[288]:

def edit_cols(col):
    """Swap the four verbose camelCase labels for readable snake_case ones.

    Names other than the four listed pass through unchanged.
    """
    renames = (
        ('yearOfRegistration', 'registration_year'),
        ('monthOfRegistration', 'registration_month'),
        ('notRepairedDamage', 'unrepaired_damage'),
        ('dateCreated', 'ad_created'),
    )
    for old, new in renames:
        col = col.replace(old, new)
    return col

# In[289]:

# Apply the readability edits and assign the result back to .columns
autos.columns = [edit_cols(name) for name in autos.columns]
print (autos.columns)
autos.head(0)

# In[290]:

def to_snake(col):
    """Convert a camelCase column name to python snake_case."""
    pieces = [col[0].lower()]
    for ch in col[1:]:
        # Each upper-case letter starts a new underscore-separated word
        pieces.append('_' + ch.lower() if ch.isupper() else ch)
    return ''.join(pieces)

# In[291]:

# Apply the snake-case conversion and assign the result back to .columns
autos.columns = [to_snake(name) for name in autos.columns]

# In[292]:

# Column names after the modifications
autos.head(0)

# ## Data exploration and cleaning

# In[293]:

# Summary stats of all columns - numeric and non-numeric
autos.describe(include='all')

# In[294]:

# Summary stats of just numeric columns to better explore data quality
autos.describe()

# In[295]:

# Summary stats of non-numeric columns
autos.describe(include=['O'])

# ## Observations on numeric columns:
# - Earliest and latest
#   `registration_year` values 1000 and 9999 need further digging into
# - `registration_month` is 0 for over 5K data points. It needs to be looked into.
# - `power_in_ps` for some listings is 0. That means car is a dud or for scrap only
# - `nr_of_pictures` has 0 for all 50K values, which is not useful for analysis
#   and can be dropped
#
# ## Observations on non-numeric columns:
# - Some of the columns have very few unique values and need translation from
#   German to figure usefulness for analysis
# - `price` has non-digit characters ($ and punctuation - commas). Removing
#   those characters can help converting it to numeric data type
# - `price` has values of 0 which is strange and needs further study
# - `odometer` has non-digit characters (km and punctuation - commas), which
#   can be removed and column data type converted to numeric as well
#
# Get a closer look at the counts on numeric columns

# In[296]:

autos['registration_month'].value_counts()

# In[297]:

autos['power_p_s'].value_counts().head(10)

# In[298]:

autos['nr_of_pictures'].value_counts()

# In[299]:

# Convert string columns to numeric data type.
# regex=False makes '$' and ',' literal characters: with the regex=True
# default of older pandas, '$' is an end-of-string anchor so the dollar
# sign would never be stripped and the astype(int) below would raise.
autos['price'] = autos['price'].str.replace('$', '', regex=False)
autos['price'] = autos['price'].str.replace(',', '', regex=False)
autos['odometer'] = autos['odometer'].str.replace('km', '', regex=False)
autos['odometer'] = autos['odometer'].str.replace(',', '', regex=False)
autos[['price', 'odometer']] = autos[['price', 'odometer']].astype(int)

# In[300]:

# Rename odometer column to odometer_km, since km is a useful unit of information
autos.rename({"odometer": "odometer_km"}, axis=1, inplace=True)

# In[301]:

# dataframe.dtypes will help verify the changed data types and column names
autos.dtypes

# In[302]:

autos.isnull().sum()

# As observed earlier, some columns have null values, however, not more than
# about 20%

# ## Analysis of numeric columns `price` and `odometer_km`

# In[303]:

print (autos['price'].unique().shape)
print (autos['odometer_km'].unique().shape)

# In[304]:

autos['odometer_km'].unique()

# In[352]:
autos.odometer_km.value_counts()

# In[358]:

autos.odometer_km.plot.box(vert=False, grid=True)

# In[359]:

autos.odometer_km.plot.hist()

# In[305]:

autos['odometer_km'].describe()

# Listings range from low-mileage cars (5000 km) to high-mileage ones
# (150000 km).
#
# Most cars' mileage is on the higher side - over 120000KM. The box plot above
# shows the maximum, median and 3rd quartile all coinciding at 150K. It also
# shows a wide range of mileage values (more spread out data) in the under-100K
# area, with about 50% of the listings packed together in the higher range.
# When `odometer_km` is split into bins later in the analysis, it will be
# clearer which bins most mileages fall into.

# In[306]:

autos['price'].describe()

# As found earlier, `price` has some outliers - minimum of 0 and maximum of
# ~100M dollars. Let's study the price list further

# In[307]:

pd.DataFrame(autos['price'].value_counts(dropna=False).head(10))

# The value_counts index above (which is actually the price) is in ascending
# order, so, by reversing the order of the index, let's see what the price
# distribution looks like on the higher end.

# In[308]:

pd.DataFrame(autos['price'].value_counts().sort_index(ascending=False).head(20))

# - There are no cars in range 350000 < `price` < 999990
#
# - A used car has nearly 100M price tag - find what that is (i.e the row with
#   the MAX value of `price`)

# In[309]:

pd.DataFrame(autos.loc[autos['price'].idxmax()])

# It's an intriguing listing name (when translated to english) - *Exchange for
# something of the same value*.
#
# It's also unrealistic a limo with 150K miles would be worth a 100M dollars.
# Let's see how many and of what type used cars are in millions of dollars

# In[310]:

millions = autos['price'] > 999999
car_is = autos[millions]
pd.DataFrame(car_is[['vehicle_type', 'model', 'odometer_km', 'brand', 'price']])

# Even the millions price range does not seem to contain any of the expensive
# brands like Bugatti or Lamborghini, which could sell for that much in used
# condition.
#
# Many are missing vehicle type or model. A buyer would need that information
# before bidding.
#
# Therefore, it's reasonable to drop these and explore data points upto 1M.

# In[311]:

# Look at the distribution when the price is between 500 and 1M.
upto_1mil = autos['price'].between(500, 999999)
autos_1mil = autos[upto_1mil]
autos_1mil['price'].describe()

# Checking the distribution again using an upper price limit of 350K, not many
# data points are lost compared to 1M, because there are no listings between
# 350K and 1M.
#
# Also, $6K is more realistic for the average price of a used car.

# In[312]:

# Clean the data so that prices are between 500 and 350K.
# Look at the distribution
upto_350k = autos['price'].between(500, 350000)
autos_350k = autos[upto_350k]
print (autos_350k['price'].describe())
autos = autos_350k

# In[313]:

# Lines left after cleaning
autos.shape

# ## Explore columns containing dates
# These columns are string values right now
# - `date_crawled`
# - `ad_created`
# - `last_seen`
#
# They can be better understood and useful for analysis when converted to
# numeric type

# In[314]:

autos[['date_crawled', 'ad_created', 'last_seen']].head()

# - Select first 10 characters which is sufficient for the date in yyyy-mm-dd
#   format
# - Get relative frequencies (or percentages) instead of counts of the unique
#   values, including missing(null) values, sorted in ascending order of dates
#
# These columns have non-numeric values so Series.describe() gets us different
# stats like so:

# In[315]:

autos['date_crawled'].str[:10].describe()

# In[316]:

# Look at the distribution of values in percentages
autos['date_crawled'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# In[317]:

autos['ad_created'].str[:10].describe()

# Ads are created starting 2015-06-11. Most listings occurred on 2016-04-03

# In[318]:

# Look at the distribution of values in percentages
autos['ad_created'].str[:10].value_counts(normalize=True, dropna=False).sort_index().head(20)

# `date_crawled` and `last_seen` are the same dates

# In[319]:

autos['last_seen'].str[:10].describe()

# In[320]:

# Look at the distribution of values in percentages
autos['last_seen'].str[:10].value_counts(normalize=True, dropna=False).sort_index()

# ## Analysis of numeric columns `registration_year` and `registration_month`

# In[321]:

autos['registration_year'].describe()

# Registration years of 1000 or 9999 do not make sense and are irrelevant to
# analysis.
#
# The ad was first crawled starting in 2016 so, the latest valid registration
# year could not be after 2016.
# For a lower limit on the registration year, I would utilize publicly
# available knowledge about when the first cars came out.
#
# Cars became widely available in the early 20th century, even though there
# were some first ones in the late 1800s.
# - 1886 was the birth year of the modern car when German inventor Karl Benz
#   patented his Benz Patent-Motorwagen and
# - 1896 was when the first successful American gasoline automobile (designed
#   by bicycle mechanics J. Frank and Charles Duryea of Springfield,
#   Massachusetts) was first sold.
#
# Hence, selecting 1920 for the lower limit.

# In[322]:

year_bool = autos['registration_year'].between(1000, 1920)

# In[323]:

autos.loc[year_bool, ['registration_year', 'brand', 'price']]

# Based on 1920 for lower cutoff, we won't lose much valid data

# In[324]:

include_reg_year = autos[autos['registration_year'].between(1920, 2016)]

# In[325]:

include_reg_year.shape

# In[326]:

include_reg_year['registration_year'].describe()

# Looking at the percentages (instead of counts) of the unique registration
# year values, the distribution prior to 1987 needs some looking into whether
# we can drop them.

# In[327]:

include_reg_year['registration_year'].value_counts(normalize=True, dropna=False).sort_index()

# In[328]:

include_reg_year2 = include_reg_year[include_reg_year['registration_year'].between(1969, 2016)]

# In[329]:

include_reg_year2.shape

# In[330]:

include_reg_year2['registration_year'].describe()

# In[331]:

include_reg_year2['registration_year'].value_counts(normalize=True, dropna=False).sort_index(ascending=False)

# In[332]:

include_reg_year2['registration_year'].value_counts(normalize=True, dropna=False).sort_index(ascending=False).head(23).sum()

# Distribution of the data for years of registration 1994-2016, with 94% of
# listings in that range, indicates it's reasonable to keep that data for
# analysis.
# In[333]:

include_1994_2016 = include_reg_year2[include_reg_year2['registration_year'].between(1994, 2016)]
include_1994_2016.shape

# In[334]:

include_1994_2016['registration_year'].describe()

# In[335]:

include_1994_2016['registration_year'].value_counts(normalize=True, dropna=False).sort_index()

# In[336]:

autos = include_1994_2016

# In[337]:

# Taking stock of cleaned data
print (autos.shape)
autos.head()

# ## Exploring And analyzing `brand` column
# Understand the mean prices across brands

# In[338]:

# List and number of all the unique brands
brand_list = autos["brand"].unique()
print (len(brand_list))
print (brand_list)

# In[339]:

counts = pd.DataFrame(autos['brand'].value_counts())
counts

# I am selecting brands that have atleast 100 listings; including **Porsche**
# in the mix makes it interesting to analyze mean prices

# In[344]:

#counts[counts.brand > 100]
# Note: counts.brand is same as counts['brand']
brands_gt_100 = counts[counts.brand > 100].index
print (brands_gt_100)

# In[345]:

# Aggregate data by `brand` column: map each qualifying brand to the mean
# price of its listings
brand_price = {
    b: autos[autos["brand"] == b]["price"].mean()
    for b in brands_gt_100
}
print (len(brand_price))
print (brand_price)

# In[346]:

# Construct a dataframe from this dictionary
brand_price_df = pd.DataFrame(list(brand_price.items()), columns=['brand', 'mean_price'])
print (type(brand_price_df))
# Sort the mean_price column from highest to lowest, to find its corresponding
# brand
brand_price_df = brand_price_df.sort_values('mean_price', ascending=False)
brand_price_df

# In[347]:

brand_price_df.describe()

# In[348]:

# Top 6 brands by listings
counts.head(6)

# In[349]:

# Brands by mean price
brand_price_df.head(6)

# In[350]:

counts.head(6).plot.barh(title='Top 6 popular brands', legend=False)

# In[351]:
brand_price_df.head(6).plot.barh(x='brand', y='mean_price', title='Top 6 brands by mean price', legend=False)

# - Top brands **Audi, Mercedes Benz** and **BMW**, all German, make the list
#   of used cars with a high mean price
# - **Ford** and **Opel** are top brands but much cheaper and don't make the
#   list of top brands by price
# - **Porsche**, also a German car, is the priciest used car at an average of
#   49661. It's understandable because it is the world's #1 luxury brand,
#   clearly reflected in its price tag
# - **Sonstige autos** is a distant second priciest at 14265, followed by
# - **Mini**, possibly, because it's owned by BMW
# - **Renault** is the least expensive used car at 2762.
# - The average used car price is 7194.
# - **Volkswagen** is the top most brand in terms of listings, however, it's
#   not among the priciest. Its price falls below the average used car.
#
# We may infer that there are so many **Volkswagen** listings because they can
# sell easily due to their lower average price.
#
# However, it would be interesting to see what the average mileage on it is.
# That tells us if it's not a well liked car and people want to dump it
# quickly, or it's so popular that despite high mileage, people expect it to
# be an easy sale.
# ## Analysis of mileage of top 6 brands
# With mean mileage and mean price for each of the top brands, understand if
# there's any visible link between them

# In[250]:

# Aggregate data by `odometer_km` column
# Assign brands and their mean mileages as key-value pairs to a dictionary
brand_miles = {}
for brand_name in list(counts.head(6).index):
    # Select only rows that correspond to a specific brand
    rows_for_brand = autos[autos["brand"] == brand_name]
    # Calculate the mean mileage for those rows, rounded to 2 decimals
    brand_miles[brand_name] = round(rows_for_brand["odometer_km"].mean(), 2)
print (len(brand_miles))
print (brand_miles)

# In[251]:

# Convert `brand_miles` dictionary to a series object; don't sort values
brand_miles_series = pd.Series(brand_miles)
print (brand_miles_series)

# In[252]:

# Create a dataframe from the series brand_miles_series
brand_miles_df = pd.DataFrame(brand_miles_series, columns=['avg_miles'])
brand_miles_df

# In[253]:

# Similarly, calculate the average price for the top 6 brands, using
# aggregation
brand_avg_pr = {}
for brand_name in list(counts.head(6).index):
    # Select only rows that correspond to a specific brand
    rows_for_brand = autos[autos["brand"] == brand_name]
    # Mean price for those rows, keyed by the brand name
    brand_avg_pr[brand_name] = round(rows_for_brand["price"].mean(), 2)
print (len(brand_avg_pr))
print (brand_avg_pr)

# In[254]:

# Convert `brand_avg_pr` dictionary to a Series object
brand_avgpr_series = pd.Series(brand_avg_pr)
print (type(brand_avgpr_series))
brand_avgpr_series

# In[255]:

# Add the Series object as a new column named `mean_price` to the dataframe
# `brand_miles_df`
brand_miles_df['mean_price'] = brand_avgpr_series
brand_miles_df

# In[256]:

# Sort mileage values for seeing any connection to mean price values
brand_miles_df.sort_values('avg_miles', ascending=False)

# In[257]:

brand_miles_df.plot.scatter(x='avg_miles', y='mean_price', title='Impact of mileage on avg.price of car')

# For
# the high-end brands **Audi, BMW and Mercedes** (in the top right of the
# scatter plot), as mileage increases, the average price clearly decreases.
# Mileage is indeed a factor when it comes to price, even among high-end
# brands.
#
# **Ford** and **Opel** (in the bottom left) are much less expensive used cars
# unlike German brands. However, as mileage goes up, their price goes down,
# consistent with the trend seen with the expensive brands.
#
# **Volkswagen** with average mileage similar to **Opel** and higher than
# **Ford** still costs more than both. That could be because it's just a
# popular brand. It's just not in the league of expensive brands, nor
# comparable to the inexpensive ones.

# ## Drop worthy columns
#
# There are text columns where all or almost all values are the same or unique
# values are 1 or very few
# - column `nr_of_pictures` as identified earlier contains all values = `0`
# - columns `seller` and `offer_type` seem to have very few unique values
# Let's see

# In[207]:

print (autos[['seller','offer_type',]].dtypes)
autos.shape

# In[208]:

num_uniq_seller = len(autos['seller'].unique())
num_uniq_offer = len(autos['offer_type'].unique())
print (num_uniq_seller)
print (num_uniq_offer)

# In[209]:

print (autos['seller'].value_counts())
print (autos['offer_type'].value_counts())

# In[210]:

# Drop these 3 columns
autos = autos.drop(['seller','offer_type','nr_of_pictures'], axis=1)

# In[211]:

autos.shape

# ## Further data cleaning
# - **German to English translation of Categorical data**
# These 3 columns have categorical data that needs translation from German
# `gearbox`
# `unrepaired_damage`
# `fuel_type`

# In[212]:

autos['gearbox'].value_counts()

# In[213]:

autos['unrepaired_damage'].value_counts()

# In[214]:

autos['fuel_type'].value_counts()

# In[215]:

# Translate German category values to English.
# Series.replace with a mapping swaps whole cell values in one pass and
# leaves NaN untouched.  (Chained .str.replace substring edits produce the
# same result for these particular value sets, but whole-value mapping is
# safer: it cannot corrupt a value that merely contains one of the keys.)
autos['gearbox'] = autos['gearbox'].replace({'manuell': 'manual', 'automatik': 'automatic'})
autos['unrepaired_damage'] = autos['unrepaired_damage'].replace({'nein': 'no', 'ja': 'yes'})
autos['fuel_type'] = autos['fuel_type'].replace({'benzin': 'gasoline', 'elektro': 'electric', 'andere': 'other'})

# In[216]:

print(autos['gearbox'].value_counts())
print ('\n')
print(autos['unrepaired_damage'].value_counts())
print ('\n')
print(autos['fuel_type'].value_counts())

# - **Investigation of key words from `name` column worthy of extracting as
#   new columns**

# In[217]:

autos[['name', 'brand', 'model']].tail(50)

# - In some names, the first two words separated by `_` indicate the `brand`
#   and `model` of the car respectively. However, those columns already exist
#   so, it's not worth extracting them as new columns
# - Some of the names are obscure to be of value to extract into new columns
#   (for example, `Omas_Liebling` or `W.Lupo_1.0`)
# - Some names have information reg. fuel type or gearbox which is redundant
#   even if extracted, as those columns already exist.
# - In some cases, name is a single word or random number. There's just too
#   much text under the name column, and not even consistent across all rows,
#   to extract into new columns

# In[218]:

single_names = []
for row in list(autos['name']):
    if '_' not in row:
        single_names.append(row)
pd.DataFrame(single_names)

# There are 101 names that don't convey anything meaningful or they need
# translation to English. Overall, there are no consistent key words in the
# `name` column that we can extract as is into separate columns that will
# provide value to analysis. In fact, it's a good candidate to drop from the
# dataframe.
# In[219]:

autos.drop(['name'], axis=1, inplace=True)

# In[220]:

autos.shape

# - **Conversion of all dates to uniform numeric data**

# In[221]:

autos.shape

# In[222]:

autos[['date_crawled', 'ad_created', 'last_seen']].dtypes

# In[223]:

autos[['date_crawled', 'ad_created', 'last_seen']].head()

# In[224]:

# Keep only the yyyy-mm-dd prefix of each timestamp
autos['date_crawled'] = autos['date_crawled'].str[:10]
autos['ad_created'] = autos['ad_created'].str[:10]
autos['last_seen'] = autos['last_seen'].str[:10]

# In[225]:

# Drop the dashes so the dates become plain digit strings (yyyymmdd)
autos['date_crawled'] = autos['date_crawled'].str.replace('-', '')
autos['ad_created'] = autos['ad_created'].str.replace('-', '')
autos['last_seen'] = autos['last_seen'].str.replace('-', '')

# In[226]:

# Convert to data type integer
autos[['date_crawled', 'ad_created', 'last_seen']] = autos[['date_crawled', 'ad_created', 'last_seen']].astype(int)

# In[227]:

autos[['date_crawled', 'ad_created', 'last_seen']].dtypes

# In[228]:

autos[['date_crawled', 'ad_created', 'last_seen']].head()

# ## Some more questions for further Analysis
#
# **What are some of the most common brand/model combinations?**

# In[229]:

autos[['brand', 'model']].head()

# In[230]:

# Concatenate `brand`, `model` strings with a `_`
autos['brand_model'] = autos['brand'] + '_' + autos['model']

# In[231]:

brand_model_df = pd.DataFrame(autos['brand_model'].value_counts())

# In[232]:

# The brand/model combinations that make the top 10 most common listings
brand_model_df.head(10)

# The most common cars by far are **Volkswagen** models.
# **How much cheaper are cars with damage than their non-damaged
# counterparts?**

# In[233]:

autos.head(0)

# In[234]:

autos.unrepaired_damage.value_counts()

# In[235]:

autos['unrepaired_damage'].isnull().sum()

# In[236]:

cars_with_damage = autos[autos['unrepaired_damage'] == 'yes']
cars_no_damage = autos[autos['unrepaired_damage'] == 'no']
print (cars_with_damage.shape)
print (cars_no_damage.shape)

# In[237]:

# Calculate the average price for just this set of rows
damage_mean = cars_with_damage['price'].mean()
no_damage_mean = cars_no_damage['price'].mean()
print (damage_mean)
print (no_damage_mean)

# **Do average prices follow any patterns based on the mileage?**

# In[238]:

mileages = pd.DataFrame(autos['odometer_km'].unique())
print (mileages.max() - mileages.min())
mileages

# Split the odometer_km into bins like this:
# (Note that the left bin edge is exclusive and the right bin edge is
# inclusive)
#
# [1000, 31000, 61000, 91000, 121000, 151000]

# In[239]:

# Create a new column `odometer_km_bins` that sets the x argument to the
# `odometer_km` column and sets the bins argument to the list of bin edge
# values. `odometer_km` values will get assigned to the proper bin.
autos['odometer_km_bins'] = pd.cut(x=autos['odometer_km'], bins=[1000, 31000, 61000, 91000, 121000, 151000])

# In[240]:

# Print out some rows with the new column and check out the bins
autos[['price', 'odometer_km', 'odometer_km_bins']].head()

# In[241]:

autos['odometer_km_bins'].value_counts()

# In[242]:

# Unique mileage bins to loop over, using index labels
autos['odometer_km_bins'].value_counts().index

# In[243]:

# Produce a dictionary of average price for each mileage bin
avg_price_by_odo_bin = {}
for mileage_bin in autos['odometer_km_bins'].value_counts().index:
    # Select rows that fall into this mileage bin and average their prices,
    # keyed by the bin
    rows_in_bin = autos[autos['odometer_km_bins'] == mileage_bin]
    avg_price_by_odo_bin[mileage_bin] = rows_in_bin["price"].mean()
print (avg_price_by_odo_bin)

# In[244]:

# Create a dataframe from the dictionary
odo_price_df = pd.DataFrame(list(avg_price_by_odo_bin.items()), columns=['odometer_km_groups', 'avg_price'])

# In[245]:

# Sort column in descending order
odo_price_df.sort_values('odometer_km_groups', ascending=False)

# In[ ]:

autos.odometer_km.plot.hist()

# ## Summarizing the Analyzed results from Used Car Listings
#
# - These are the top 6 brands by listings:
#   Volkswagen
#   BMW
#   Opel
#   Mercedes-Benz
#   Audi
#   Ford
#
# - Some listings are priced at millions of dollars for coupes and Limousines.
#   That aspect gives the site an impression that not every listing is
#   realistic and buyers could be in for a surprise or a long bidding process
#
# - Top brands **Audi, Mercedes Benz** and **BMW**, all German, on average,
#   are also among the highest priced cars
#
# - **Ford** and **Opel** are top brands but much cheaper and don't make the
#   list of top brands by price
#
# - **Porsche**, also a German car, is the priciest used car at an average of
#   49661. It's understandable because it is the world's #1 luxury brand and
#   has over 200 listings on this site
#
# - **Sonstige autos** is a distant second priciest at 14265, followed by
# - **Mini**, possibly, because it's owned by BMW
# - **Renault** is the least expensive used car at 2762
# - The average used car price is 7194
#
# - **Volkswagen** is by far the most popular brand and 3 of its models
#   **golf, polo and passat** are the most commonly listed. However, it's not
#   among the priciest. Its price falls below that of the average used car
#
# - Almost 75% of the listings are high-mileage cars, having mileages in the
#   121000 - 151000 KM range
#
# - It's also worth noting that damaged cars do not dominate the listings.
#   8% of the listings are with (unrepaired) damages while 76% are with no
#   (unrepaired) damages
#
# - On average, cars with unrepaired damages cost over 4500 less than their
#   non-damaged counterparts. Knowing average repair costs for a brand would
#   help a buyer decide if it's worth paying a higher price for a car without
#   damages, or if the price difference covers the damages for that brand so
#   he or she can get it for cheaper
#
# - Visualizing the data for `Impact of mileage on avg.price of car`
#   (scatter plot) corroborated what splitting mileage into separate bins
#   showed - that the average prices increased steadily for lower mileage
#   bins:
#
#   For the high-end brands Audi, BMW and Mercedes, as mileage increased, the
#   average price clearly decreased. Mileage is indeed a factor when it comes
#   to price, even among high-end brands
#
#   Ford and Opel are much less expensive used cars unlike German brands.
#   However, as mileage went up, their price went down, consistent with the
#   trend seen with the expensive brands.
#
#   Volkswagen, with average mileage similar to Opel and higher than Ford,
#   still costs more than both. That could be because it's just a popular
#   brand, even though it's not in the league of expensive brands or
#   comparable to the inexpensive ones
#
#   We may also infer that there are so many Volkswagen listings because they
#   can sell easily due to their lower average price

# In[ ]: