Initial:

In [3]:
# One-off environment setup (commented out so Restart-&-Run-All doesn't hit the network).
# NOTE(review): the PyPI package is 'scikit-learn' — 'sklearn' is a deprecated dummy
# package; also prefer '%pip install' over '!pip install' so it targets the kernel's env.
#!pip -q install --upgrade --ignore-installed numpy pandas scipy sklearn seaborn
#!pip install SWMat
In [2]:
# Record the seaborn version: this notebook relies on 0.9.x APIs (e.g. sns.distplot,
# which was deprecated in 0.11).
import seaborn as sns
sns.__version__
Out[2]:
'0.9.0'

Import:

In [3]:
import sys

# SWMat is a local (non-PyPI-installed) package living one directory up;
# extend sys.path so it can be imported from this notebook.
sys.path.append('../SWMat/')
from SWMat import SWMat
In [4]:
# Plotting libraries. NOTE: seaborn was already imported above; re-importing is
# harmless but redundant.
import matplotlib.pyplot as plt
import seaborn as sns
In [5]:
# Data-handling libraries.
import pandas as pd
import numpy as np
In [6]:
import warnings
# Silence ALL warnings to keep the rendered notebook clean.
# NOTE(review): this also hides deprecation warnings (e.g. from sns.distplot);
# consider filtering specific categories instead of a blanket "ignore".
warnings.filterwarnings("ignore")

Dataset:

In [7]:
# Load the California Housing dataset (downloads & caches on first call).
# FIX: importing the `california_housing` submodule is deprecated and removed in
# newer scikit-learn; `fetch_california_housing` has been a top-level export of
# sklearn.datasets since 0.20, so this form works on both old and new versions.
from sklearn.datasets import fetch_california_housing

data = fetch_california_housing()  # returns a Bunch: data, target, feature_names, DESCR
In [8]:
data.keys()  # inspect what the sklearn Bunch provides (data, target, feature_names, DESCR)
Out[8]:
dict_keys(['data', 'target', 'feature_names', 'DESCR'])
In [9]:
print(data['DESCR'])  # dataset provenance and attribute documentation, straight from sklearn
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bureau publishes sample data (a block group typically has a population
of 600 to 3,000 people).

It can be downloaded/loaded using the
:func:`sklearn.datasets.fetch_california_housing` function.

.. topic:: References

    - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,
      Statistics and Probability Letters, 33 (1997) 291-297

In [10]:
# Unpack the sklearn Bunch into the design matrix, target vector and column names.
X, y, columns = data['data'], data['target'], data['feature_names']
In [11]:
# Build a single DataFrame holding the 8 predictors plus the target column,
# then preview the first rows.
train_df = pd.DataFrame(data=X, columns=columns, index=np.arange(len(X)))
train_df = train_df.assign(target=y)
train_df.head()
Out[11]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude target
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 -122.23 4.526
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 -122.22 3.585
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 -122.24 3.521
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 -122.25 3.413
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 -122.25 3.422

2) Distribution Plot

a) Histograms / KDE:

The method for joining the midpoints of the histogram bars is taken from here.

In [12]:
# Histogram of the target plus a line through the bar midpoints.
# FIX: the original unpacked the bin edges into `X`, silently clobbering the
# feature matrix `X` defined earlier in the notebook — use dedicated names.
counts, bin_edges, _ = plt.hist(train_df['target'], bins=5, ec='w') # 'ec' (edgecolors) for outline
bin_mids = 0.5*(bin_edges[1:] + bin_edges[:-1])  # midpoint of each of the 5 bins
_ = plt.plot(bin_mids, counts, '-*', color='orange')
In [15]:
from matplotlib.pyplot import figure
figure(figsize=(8, 6))

# KDE curve first, then a density-normalized histogram on the same axes
# (order matters with the pyplot state machine: the KDE sets up the y-scale as density).
train_df['target'].plot.kde(label="Density Plot", color='b') # Or you can use gaussian_kde from scipy.stats as given here: https://realpython.com/python-histograms/
_ = plt.hist(train_df['target'], bins=10, color='lightblue', label='target', density=True, ec='black')
plt.legend()
plt.title("Target Histogram")
plt.xlabel("Target Bins")
plt.ylabel("Probability");  # trailing ';' suppresses the Text repr in the notebook
In [16]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

# seaborn one-liner: histogram + KDE + rug in a single call.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 (use histplot/displot);
# it is fine for the 0.9.0 version pinned at the top of this notebook.
_ = sns.distplot(train_df['target'], rug=True, hist_kws={'ec':'lightblue', 'color':'blue', 'label':'hist'}, 
                 kde_kws={'color':'b', 'label':'density'}, rug_kws={'color':'orange', 'height':0.02, 'label': 'rug plot'})
plt.legend();

Making data talk:

In [17]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

# KDE + histogram with an annotation bubble highlighting the two modes.
# FIX: the original stored the unused kde return in `temp` and the hist return in
# `_`, then indexed `_[2]` — `_` is conventionally a throwaway name (and is
# overwritten by IPython's last-output mechanism), so give the hist return a real name.
train_df['target'].plot.kde(label="Density Plot", color='b') # Or you can use gaussian_kde from scipy.stats as given here: https://realpython.com/python-histograms/
hist_ret = plt.hist(train_df['target'], bins=10, color='lightblue', label='target', density=True, ec='white')
plt.legend()
plt.title("Making Data Talk", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)
plt.xlabel("Target Bins")
plt.ylabel("Probability")
plt.text(2.5, 0.3, "We have a Bi-modal Distribution\nfor Target variable, with\nmost Blocks having Target\nvalue around 1.6 and 5.", fontsize=14,
            bbox={'facecolor': 'orange', 'edgecolor': 'orange', 'alpha': 0.5, 'pad': 4})
plt.scatter([1.6, 5], [0.435, 0.134], s=1500, c='orange', alpha=0.5)

# hist_ret[2] is the list of bar patches: push them behind the highlight dots.
for patch in hist_ret[2]:
    patch.set_zorder(0)
In [58]:
# Same story told with the SWMat wrapper: highlighted bins + styled title/text.
swm = SWMat(plt)
swm.hist(train_df['target'], bins=10, highlight=[2, 9])
swm.title("Carefully looking at the dependent variable revealed some problems that might occur!")
# BUG FIX: the closing tag was '<\prop>' — '\p' is an invalid string escape
# (DeprecationWarning today, SyntaxError in future Python) and not the '</prop>'
# tag used everywhere else in this notebook.
swm.text("Target is a bi-modal dependent feature.\nIt can be <prop fontsize='18' color='blue'> hard to predict.</prop>",btw_text_dist=.5);

And likewise you can check distribution of multiple variables:

In [ ]:
from matplotlib.pyplot import figure
figure(figsize=(15, 10))

# Overlay three feature distributions; alpha makes the overlaps readable.
plt.hist(train_df['MedInc'], bins=50, density=True, alpha=0.5, label="MedInc")
plt.hist(train_df['HouseAge'], bins=50, density=True, alpha=0.5, label="HouseAge")
plt.hist(train_df['AveRooms'], bins=90, density=True, alpha=0.5, label="AveRooms")
plt.axis([0, 53, 0, 0.305])  # [xmin, xmax, ymin, ymax]: clip the long AveRooms tail
plt.legend(frameon=False, loc='upper center', ncol=3, fontsize=14);
In [61]:
# Side-by-side comparison of the four approaches above (matplotlib, seaborn,
# annotated matplotlib, SWMat) on one row of subplots.
fig, axs = plt.subplots(1, 4, figsize=(50, 10))

N, X, _ = axs[0].hist(train_df['target'], bins=5, ec='w')
X = 0.5*(X[1:]+ X[:-1])  # bin midpoints (local to this cell)
axs[0].plot(X, N, '-*', color='orange')
axs[0].set_title("Normal", fontdict={'fontsize': 19}, pad=15)

sns.distplot(train_df['target'], rug=True, hist_kws={'ec':'lightblue', 'color':'blue', 'label':'hist'}, 
                 kde_kws={'color':'b', 'label':'density'}, ax=axs[1])
axs[1].legend()
axs[1].set_title("Seaborn", fontdict={'fontsize': 19}, pad=15)

train_df['target'].plot.kde(label="Density Plot", color='b', ax=axs[2]) # Or you can use gaussian_kde from scipy.stats as given here: https://realpython.com/python-histograms/
ht = axs[2].hist(train_df['target'], bins=10, color='lightblue', label='target', density=True, ec='white')
axs[2].legend()
axs[2].set_title("Matplotlib Power", fontdict={'fontsize': 19}, pad=15)
axs[2].set_xlabel("Target Bins")
axs[2].set_ylabel("Probability")
axs[2].text(2.5, 0.3, "We have a Bi-modal Distribution\nfor Target variable, with\nmost Blocks having Target\nvalue around 1.6 and 5.", fontsize=14,
            bbox={'facecolor': 'orange', 'edgecolor': 'orange', 'alpha': 0.5, 'pad': 4})
axs[2].scatter([1.6, 5], [0.435, 0.134], s=1500, c='orange', alpha=0.5)
# ht[2] is the list of bar patches: push them behind the highlight dots.
for p in ht[2]:
    p.set_zorder(0)
    
swm = SWMat(plt, ax=axs[3])
swm.hist(train_df['target'], bins=10, highlight=[2, 9])
swm.title("Carefully looking at the dependent variable revealed some problems that might occur!")
# BUG FIX: closing tag was '<\prop>' — invalid '\p' escape and wrong tag; SWMat
# markup uses '</prop>' elsewhere in this notebook.
swm.text("Target is a bi-modal dependent feature.\nIt can be <prop fontsize='18' color='blue'> hard to predict.</prop>",btw_text_dist=.5, btw_line_dist=.7);
In [20]:
from matplotlib.pyplot import figure
figure(figsize=(14, 10))

# Fine-grained (50-bin) density histogram with mean/median/mode markers,
# bars colored by frequency, and a matching colorbar.
N, bins, patches = plt.hist(train_df['target'], bins=50, density=True, label="Target")
# For Density plot:
train_df['target'].plot.kde(label="Density Plot", color="b") # As we increase number of bins, our plot will look more and more like density plot

# For more on histogram with density plots look here: https://towardsdatascience.com/histograms-and-density-plots-in-python-f6bda88f5ac0
# For Grid (for better mapping of heights)
plt.grid(axis='y', alpha=0.75)
# Add lines for mean, median and mode:
# BUG FIX: Series.mode() returns a Series (possibly multi-valued). The original
# passed it straight into vlines/text/arithmetic, which expects scalars — take
# the first (smallest) mode explicitly.
target_mode = train_df['target'].mode()[0]
plt.vlines(x=train_df['target'].mean(), ymin=0, ymax=0.7, colors='green', linestyle='dashdot', label='mean')
plt.vlines(x=train_df['target'].median(), ymin=0, ymax=0.7, colors='red', linestyle='dashed', label='median')
plt.vlines(x=target_mode, ymin=0, ymax=0.7, colors='blue', linestyle='dotted', label='mode')
# Add text for lines above: (https://predictablynoisy.com/matplotlib/tutorials/text/text_intro.html)
plt.text(x=train_df['target'].mean()+0.03, y=0.6, s='mean', color='green', bbox={'alpha': 0.1, 'pad': 2})
plt.text(train_df['target'].median()-0.68, 0.6, 'median', color='red', bbox={'alpha': 0.1, 'pad': 2})
plt.text(target_mode-0.5, 0.6, 'mode', color='blue', bbox={'alpha': 0.1, 'pad': 2})

################################## For colored bins #######################################
# You can ignore this, but using this way you can map your data to a color palette.
from matplotlib import colors
norm = colors.Normalize(N.min(), N.max()) # For mapping whole range values to a colorbar.

for freq, thispatch in zip(N, patches):
    color = plt.cm.inferno_r(norm(freq), alpha=0.85) # Pick a color from a palette (here, inferno_r) based on a value between [0, 1]
    thispatch.set_facecolor(color) # set color of current patch
    thispatch.set_alpha(0.5)
# From here: https://matplotlib.org/gallery/statistics/hist.html
##################################     End (1)     ########################################

plt.legend()
plt.xlabel("Target  ->", fontdict={'fontsize': 12,'fontweight': 5})
plt.ylabel("Density  ->", fontdict={'fontsize': 12,'fontweight': 5})
plt.title("Target Distribution", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)

## Adding colorbar: (you can ignore this too)
# Form here: https://stackoverflow.com/questions/43805821/matplotlib-add-colorbar-to-non-mappable-object
import matplotlib as mpl
cmap = plt.get_cmap('inferno_r', 20)
norm = mpl.colors.Normalize(vmin=0,vmax=0.6)
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm) # colorbar needs a Mappable. Contour, Scatter and hist2d gives them by default. There you can simply call plt.colorbar()
sm.set_array([])
# NOTE(review): matplotlib >= 3.6 requires an explicit ax= for colorbar on a
# standalone Mappable; fine on the versions this notebook targets.
cb = plt.colorbar(sm, ticks=np.linspace(0,0.6,20))
cb.set_label("Normal density  ->");

3) Relational Plots

a) Line Plot (+ Scatter):

Line Plot:

There are only two variables whose relationship is nearly linear. And they are "AveRooms" and "AveBedrms". And it is obvious.

In [131]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

# Line plot needs x-sorted data, otherwise the line zig-zags back and forth.
plt.plot('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'), label="Average Bedrooms")

plt.legend()
plt.title("Average Rooms vs Average Bedrooms")
plt.xlabel("Avg Rooms  ->")
plt.ylabel("Avg BedRooms  ->");
In [114]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

# Scatter for the raw points, line over the sorted points — same data, two views.
plt.scatter('AveRooms', 'AveBedrms', data=train_df)
plt.plot('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'))

plt.xlabel("Avg Rooms  ->")
plt.ylabel("Avg BedRooms  ->");
In [138]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

sorted_df = train_df.sort_values('AveRooms')

# Tier the points/line by AveRooms: the >20 and >50 layers are drawn ON TOP of
# the full green layer, so the colors effectively split the range into 3 bands.
plt.scatter('AveRooms', 'AveBedrms', data=train_df, c='lightgreen')
plt.scatter('AveRooms', 'AveBedrms', data=train_df[(train_df['AveRooms']>20)], c='y', alpha=0.7)
plt.scatter('AveRooms', 'AveBedrms', data=train_df[(train_df['AveRooms']>50)], c='r', alpha=0.7)
plt.plot('AveRooms', 'AveBedrms', data=sorted_df, c='lightgreen')
plt.plot('AveRooms', 'AveBedrms', data=sorted_df[(sorted_df['AveRooms']>20)], c='yellow', alpha=0.7)
plt.plot('AveRooms', 'AveBedrms', data=sorted_df[(sorted_df['AveRooms']>50)], c='red', alpha=0.7)


plt.title("Average Rooms vs Average Bedrooms")
plt.xlabel("Avg Rooms  ->")
plt.ylabel("Avg BedRooms  ->");
In [144]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

sorted_df = train_df.sort_values('AveRooms')

# Same tiered-color idea as above, line only, plus a takeaway annotation.
plt.plot('AveRooms', 'AveBedrms', data=sorted_df, c='lightgreen')
plt.plot('AveRooms', 'AveBedrms', data=sorted_df[(sorted_df['AveRooms']>20)], c='yellow', alpha=0.7)
plt.plot('AveRooms', 'AveBedrms', data=sorted_df[(sorted_df['AveRooms']>50)], c='red', alpha=0.7)

# Adding text:
plt.text(40, 2.5, "Most Blocks have average less than\n5 bed rooms and 20 rooms.", fontsize=14,
        bbox={'facecolor': 'lightgreen', 'edgecolor': 'lightgreen', 'pad': 4})

plt.title("Average Rooms vs Average Bedrooms")
plt.xlabel("Avg Rooms  ->")
plt.ylabel("Avg BedRooms  ->");

Making Data Talk:

In [12]:
# Sort for SWMat's line_plot; reset_index is necessary, otherwise the Series keeps the
# original (shuffled) index and downstream positional logic would see the unsorted order.
sorted_df = train_df.sort_values('AveRooms', na_position='first').reset_index(drop=True)
In [16]:
# SWMat line plot with the tail region (after the 60th point) highlighted/labeled.
swm = SWMat(plt)

swm.line_plot(sorted_df['AveRooms'], sorted_df['AveBedrms'], line_labels=["Average Bedrooms"], highlight=0, 
              label_points_after=60, xlabel="Average Rooms", highlight_label_region_only=True, point_label_dist=0.9)
swm.title("There are some possible outliers in 'AveRooms' and 'AveBedrms'!", ttype="title+")
# BUG FIX: closing tag was '<\prop>' — invalid '\p' escape and wrong tag; the
# notebook's other SWMat cells use '</prop>'.
swm.text("This may affect our results. We should\ncarefully look into these and <prop color='blue'>find a\n possible resolution.</prop>", 
         position="out-mid-right", fontsize=20, btw_line_dist=2.5, btw_text_dist=2);
In [17]:
# Four-panel comparison of line-plot styles: plain matplotlib, seaborn,
# annotated matplotlib, and SWMat.
fig, axs = plt.subplots(1, 4, figsize=(40, 8))
fig.suptitle("Line Plots", fontsize=28)

axs[0].plot('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'), label="Average Bedrooms")
axs[0].legend()
axs[0].set_title("Normal", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)
axs[0].set_xlabel("Average Rooms  ->")
axs[0].set_ylabel("Average BedRooms  ->")

# seaborn aggregates duplicate x values (mean + CI band) by default.
sns.lineplot(x='AveRooms', y='AveBedrms', data=train_df, ax=axs[1])
axs[1].set_title("Seaborn", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)

a = axs[2].plot('AveRooms', 'AveBedrms', data=sorted_df, c='lightgreen', label = "Normal")
b = axs[2].plot('AveRooms', 'AveBedrms', data=sorted_df[(sorted_df['AveRooms']>20)], c='yellow', alpha=0.7, label="High")
c = axs[2].plot('AveRooms', 'AveBedrms', data=sorted_df[(sorted_df['AveRooms']>50)], c='red', alpha=0.7, label="Very High")
axs[2].set_title("Matplotlib Power", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)
########################### Adding manual legends ########################################
# plot() returns a list of Line2D objects; pass the first of each as a legend handle.
axs[2].legend(handles=[a[0], b[0], c[0]], ncol=1, frameon=False, fontsize='large')
##########################################################################################
axs[2].text(40, 2.5, "Most Blocks have on average less than\n5 bed rooms and 20 rooms.", fontsize=14,
        bbox={'facecolor': 'lightgreen', 'edgecolor': 'lightgreen', 'pad': 4})
axs[2].set_xlabel("Average Rooms  ->")
axs[2].set_ylabel("Average BedRooms  ->")

swm = SWMat(plt, ax=axs[3])

swm.line_plot(sorted_df['AveRooms'], sorted_df['AveBedrms'], line_labels=["Average Bedrooms"], highlight=0, label_points_after=60,
            xlabel="Average Rooms", highlight_label_region_only=True, point_label_dist=0.9, hide_y=True)
swm.title("There are some possible outliers in 'AveRooms' and 'AveBedrms'!", ttype="title+")
# BUG FIX: closing tag was '<\prop>' — invalid '\p' escape and wrong tag ('</prop>').
swm.text("This may affect our results. We should\ncarefully look into these and, <prop color='blue'>find a\n possible resolution.</prop>", 
         position="out-mid-right", fontsize=20, btw_line_dist=2.5, btw_text_dist=2);

But this doesn't look that good. What can we do?

Scatter Plot (1):

In [12]:
# For fitting a linear line:
from numpy.polynomial.polynomial import polyfit

const, slope = polyfit(train_df['AveRooms'], train_df['AveBedrms'], deg=1)
Y = train_df['AveRooms']*slope + const

For more info look here.

In [ ]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

# FIX 1: the scatter had no label, so plt.legend() warned "No handles with labels"
# and drew an empty legend — give the artist a label.
plt.scatter('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'), edgecolors='w', linewidths=0.1,
            label="Blocks")

plt.legend()
plt.title("Scatter Plot of Average Rooms and Average Bedrooms")
# FIX 2: the axis labels were swapped — x is AveRooms, y is AveBedrms.
plt.xlabel("Average Rooms  ->")
plt.ylabel("Average Bedrooms  ->");
In [ ]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

# Scatter + fitted regression line (Y computed in the polyfit cell above).
plt.scatter('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'), edgecolors='w', linewidth=0.2)
plt.plot(train_df['AveRooms'], Y, linewidth=1, color='red', linestyle='-', alpha=0.8)

plt.xlabel("Avg Rooms  ->")
plt.ylabel("Avg BedRooms  ->")

# Adding annotations:
# '-[,widthB=4.0' draws a bracket-style arrow spanning the outlier cluster.
plt.annotate("Possible outliers", xy=(144, 31), xytext=(160, 34),
             arrowprops={'arrowstyle':'-[,widthB=4.0', 'color': 'black'},
             bbox={'pad':4, 'edgecolor':'orange', 'facecolor':'orange', 'alpha':0.4})

# 'arc3,rad=-0.2' curves the arrow slightly toward the regression line.
plt.annotate("Regression Line", xy=(80, 12), xytext=(120, 3),
             arrowprops={'arrowstyle':'->', 'color': 'black', "connectionstyle":"arc3,rad=-0.2"},
             bbox={'pad':4, 'edgecolor':'orange', 'facecolor':'orange', 'alpha':0.4})
plt.show()

We can also add a confidence-interval region for our regression line. We can get a confidence interval for our regression line using sklearn's GaussianProcess method. (look here)

In [ ]:
# Subsample to keep the Gaussian Process fit (below) tractable.
# NOTE(review): no random_state is set, so this sample — and the GP results
# downstream — are not reproducible across runs; consider sample(frac=0.5, random_state=42).
sample = train_df.sample(frac=0.5) # Gaussian Process taking too much memory...
In [23]:
from sklearn.gaussian_process import GaussianProcessRegressor

# Default kernel (RBF) and settings; used only to get a confidence band around the fit.
gp = GaussianProcessRegressor()
In [25]:
%%time

print("Fitting...")
# GP needs 2-D X: reshape the single feature to (n_samples, 1).
gp.fit(sample['AveRooms'].values.reshape(-1, 1), sample['AveBedrms'].values)
print("Fitting Complete.")

print("Predicting...")
# Predict on an evenly spaced grid covering the observed AveRooms range (0..145).
x = np.linspace(0, 145, 146)
preds, std = gp.predict(x.reshape(-1, 1), return_std=True)
print("Predicted.")

# For 95% confidence interval:
delta = 1.96*std  # 1.96 = z-score for a two-sided 95% interval under normality
Fitting...
Fitting Complete.
Predicting...
Predicted.
Wall time: 8min 44s
In [26]:
from matplotlib.pyplot import figure
figure(figsize=(10, 7))

# Scatter + regression line, now with the GP-derived 95% confidence band.
plt.scatter('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'))#, edgecolors='w')
plt.plot(train_df['AveRooms'], Y, linewidth=1, color='red', linestyle='-', alpha=0.8)

plt.xlabel("Avg Rooms  ->")
plt.ylabel("Avg BedRooms  ->")

# Adding annotations:
plt.annotate("Possible outliers", xy=(144, 31), xytext=(160, 34),
             arrowprops={'arrowstyle':'-[,widthB=4.0', 'color': 'black'},
             bbox={'pad':4, 'edgecolor':'orange', 'facecolor':'orange', 'alpha':0.4})

plt.annotate("Regression Line", xy=(80, 12), xytext=(120, 3),
             arrowprops={'arrowstyle':'->', 'color': 'black', "connectionstyle":"arc3,rad=-0.2"},
             bbox={'pad':4, 'edgecolor':'orange', 'facecolor':'orange', 'alpha':0.4})

# For confidence interval: shade preds +/- 1.96*std (x, preds, delta from the GP cell).
plt.fill_between(x, preds-delta, preds+delta, color='gray', alpha=0.4)
plt.ylim(0, 35);  # clip the extreme outliers so the band is visible
In [18]:
# SWMat version: scatter + highlighted regression line with a narrative takeaway.
swm = SWMat(plt)
plt.scatter('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'), edgecolors='w', linewidths=0.3)
swm.line_plot(train_df['AveRooms'], Y, highlight=0, alpha=0.7, line_labels=["Regression Line"])
swm.title("'AveBedrms' and 'AveRooms' are highly correlated!", ttype="title+")
# TYPO FIX in displayed text: "regressioin" -> "regression".
swm.text("Taking both of them in regression process\nmight not be necessary. We can either\n<prop color='blue'>take one of them</prop> or <prop color='blue'>take average.</prop>",
         position='out-mid-right', btw_line_dist=5)
swm.axis(labels=["Average Rooms", "Average Bedrooms"])
In [19]:
# Four-panel comparison of scatter-plot styles: plain matplotlib, seaborn regplot,
# residual-colored matplotlib, and SWMat.
fig, axs = plt.subplots(1, 4, figsize=(40, 8))
fig.suptitle("Scatter Plots", fontsize=28)

axs[0].scatter('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'), edgecolors='w', linewidths=0.3)
axs[0].set_title("Normal", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)
# FIX: axis labels were swapped — x is AveRooms, y is AveBedrms.
axs[0].set_xlabel("Average Rooms  ->")
axs[0].set_ylabel("Average Bedrooms  ->")

sns.regplot(x='AveRooms', y='AveBedrms', data=train_df, label="Average Bedrooms", ax=axs[1])
axs[1].set_title("Seaborn (regplot)", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)

##################### For colors in scatter plot #################################
# Color each point by its absolute residual from the fitted line Y (green/yellow/red).
diff = (train_df['AveBedrms'] - Y).abs()
colors = []
for i in range(len(diff)):
  if diff[i] <= 0.17: colors.append('g')
  elif diff[i] > 0.17 and diff[i] <= 2: colors.append('y')
  else: colors.append('r')
train_df['colors'] = colors
sorted_df = train_df.sort_values('AveRooms')
##################################################################################
sct = axs[2].scatter('AveRooms', 'AveBedrms', data=sorted_df, c=sorted_df['colors'])
del train_df['colors']  # drop the temporary column so later cells see the original frame
axs[2].plot(train_df['AveRooms'], Y, linewidth=1, color='red', linestyle='-', alpha=0.8)
axs[2].set_title("Matplotlib Power", fontdict={'fontsize': 19, 'fontweight':0.5 }, pad=15)
axs[2].set_xlabel("Avg Rooms  ->")
axs[2].set_ylabel("Avg BedRooms  ->")
###################### Setting legend manually ###################################
# Empty scatters exist only to supply colored legend handles.
one = axs[2].scatter([], [], c='g', label='Low Residual')
two = axs[2].scatter([], [], c='y', label='Med. Residual')
three = axs[2].scatter([], [], c='r', label='High Residual')
axs[2].legend(handles=[one, two, three],)
##################################################################################
# Adding annotations:
axs[2].annotate("Possible outliers", xy=(144, 31), xytext=(150, 34),
             arrowprops={'arrowstyle':'-[,widthB=4.0', 'color': 'black'},
             bbox={'pad':4, 'edgecolor':'orange', 'facecolor':'orange', 'alpha':0.4})
axs[2].annotate("Regression Line", xy=(80, 12), xytext=(120, 3),
             arrowprops={'arrowstyle':'->', 'color': 'black', "connectionstyle":"arc3,rad=-0.2"},
             bbox={'pad':4, 'edgecolor':'orange', 'facecolor':'orange', 'alpha':0.4})
axs[2].text(26, 25, "We have a nearly linear\nrelationship between\nAverage rooms and Average bed\nRooms.", fontsize=16,
           bbox={'facecolor': 'orange', 'edgecolor': 'orange', 'pad': 4, 'alpha': 0.4});


swm = SWMat(plt, ax=axs[3])
axs[3].scatter('AveRooms', 'AveBedrms', data=train_df.sort_values('AveRooms'), edgecolors='w', linewidths=0.3)
swm.line_plot(train_df['AveRooms'], Y, highlight=0, alpha=0.7, line_labels=["Regression Line"])
swm.title("'AveBedrms' and 'AveRooms' are highly correlated!", ttype="title+")
# TYPO FIX in displayed text: "regressioin" -> "regression".
swm.text("Taking both of them in regression process\nmight not be necessary. We can either\n<prop color='blue'>take one of them</prop> or <prop color='blue'>take average.</prop>",
         position='out-mid-right', btw_line_dist=5)
swm.axis(labels=["Average Rooms", "Average Bedrooms"])