Get some listings from leboncoin.fr
import hashlib
import urllib
import unidecode
import numpy as np
from string import atof,atoi
from pprint import pprint
from bs4 import BeautifulSoup as BS
from datetime import datetime, timedelta
# ---------------------------------------
# get the 1st 100
base_url='https://www.leboncoin.fr/locations/offres/ile_de_france/paris/'
# ----------------------------------------
# a shortcut
urlget = urllib.urlopen
The default search criteria are flats with size > 30 sqm, in Paris.
The function returns the BeautifulSoup-parsed HTML of a given results page; each page contains a list of flats.
def get_page_from_url(pgnr,base_url):
    CL_paris_apa_html = "{0:s}?o={1:d}&sqs=2&ret=1&ret=2&ret=5".format(base_url,pgnr)
    print CL_paris_apa_html
    try:
        xpage = urlget(CL_paris_apa_html)
        CL_html = xpage.read()
        encoding = xpage.info()['content-type'].split('=')[-1]
    except IOError: # in case of network error
        print 'IOError'
        CL_html = ''
    try:
        return BS(CL_html.decode(encoding,'replace'),'lxml')
    except UnboundLocalError: # encoding was never set (network error)
        return BS(CL_html.decode('latin-1','replace'),'lxml')
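The query string can also be built with urllib.urlencode; a minimal sketch, under my reading of the search form (o is the page number, sqs seems to encode the minimum-surface criterion, and the repeated ret values the property types — assumptions, not a documented API):

from urllib import urlencode
# a list of tuples, so the repeated 'ret' key survives encoding
params = [('o', 2), ('sqs', 2), ('ret', 1), ('ret', 2), ('ret', 5)]
print base_url + '?' + urlencode(params)
# https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=2&sqs=2&ret=1&ret=2&ret=5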
human_page=get_page_from_url(2,base_url)
https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=2&sqs=2&ret=1&ret=2&ret=5
From each page given by the previous function, we record the list of flats, with their publication dates and pids.
def get_listings_from_html(human_page,base_url):
    today = datetime.now()
    yesterday = today - timedelta(days=1)
    yesterday = yesterday.strftime('%d/%m/%y')
    today = today.strftime('%d/%m/%y')
    apartments = human_page.findAll('a',attrs={'class':'list_item'})
    results = []
    for item in apartments:
        try:
            title = unidecode.unidecode(''.join(item.select('h2.item_title')[0].contents).strip())
        except IndexError:
            continue
        insert_time = item.select('aside.item_absolute > p.item_supp')
        if not len(insert_time):
            continue
        insert_time = insert_time[0].contents[-1].strip().replace(u"Aujourd'hui",today)
        insert_time = insert_time.replace(u"Hier",yesterday)
        url = 'http:'+item.attrs['href']
        pid = url.split('/')[-1].split('.')[0]
        results.append({'insert_time':insert_time,'pid':pid,'url':url,'title':title})
        md5_input = title
        results[-1].update({'md5sum':hashlib.md5(md5_input).hexdigest()})
    return results
apas=get_listings_from_html(human_page,base_url)
# apas[1]
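Each record is a small dict; a hypothetical example of its shape (all values invented for illustration):

pprint(apas[0])
# hypothetical output, values invented:
# {'insert_time': u'24/06/17, 14:36',
#  'md5sum': '<hex md5 of the title>',
#  'pid': '1234567890',
#  'title': 'T2 buttes chaumont',
#  'url': 'http://www.leboncoin.fr/locations/1234567890.htm?ca=12_s'}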
We have the urls of all flats listed on CL (leboncoin).. well, we have the functions to get the urls; the actual work will be done at the end. Finally, we can get the information out of each flat's individual page. This is really boring, but needed. The result is a list of features for each flat. These features will need further refinement, which I will do in future notebooks.
def select2text(din):
    basic_string = [' '.join(list(j.stripped_strings)) for j in din]
    # basic_string = [unidecode.unidecode(j) for j in basic_string]
    return basic_string
def make_col_name(nm):
    nm = unidecode.unidecode(nm)
    nm = nm.title()
    for sub in " ,./;'[]\<>?:{}|=+!@$%^&*()_-#":
        nm = nm.replace(sub,'')
    if nm[:2]=='Pi' and nm[-2:]=='es':
        nm = 'Pieces'
    if 'Meubl' in nm:
        nm = 'Meuble'
    return nm.strip()
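A quick sanity check of the label normalization, using the raw labels the site displays:

print make_col_name(u'Loyer mensuel')          # LoyerMensuel
print make_col_name(u'Pièces')                 # Pieces
print make_col_name(u'Meublé / Non meublé')    # Meuble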
# functions to process the raw features record
from collections import defaultdict as DDict
def default():
    return lambda out:out
def loyer(out):
    # process LoyerMensuel
    lm = out['LoyerMensuel']
    lm = make_col_name(lm)
    eur = lm.split('Eur')[0].replace(' ','')
    out['LoyerMensuel'] = atof(eur)
    # add a field
    out['ChargesComprises'] = False
    if 'ChargesComprises' in lm:
        out['ChargesComprises'] = True
    return out
def surface(out):
    # process Surface
    sr = out['Surface']
    sr = atof(sr.strip().split('m')[0])
    out['Surface'] = sr
    return out
def meublenonmeuble(out):
    # process Meuble
    mnm = out['Meuble']
    if 'Non' in mnm:
        out['Meuble'] = False
    else:
        out['Meuble'] = True
    return out
def ville(out):
    # process Ville
    vl = out['Ville']
    zipcode = atoi(vl.split()[-1])
    # NB: 75116 yields 116; folded back to 16 later by sistema_arrondissement
    arrondissement = zipcode - 75000
    out['Arrondissement'] = arrondissement
    return out
unrawify_dict=DDict(default)
unrawify_dict['Ville']=ville
unrawify_dict['Meuble']=meublenonmeuble
unrawify_dict['Surface']=surface
unrawify_dict['LoyerMensuel']=loyer
def unrawify_apas(indict):
    outdict = indict.copy()
    for j in indict.keys():
        outdict.update(unrawify_dict[j](indict))
    return outdict
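A minimal round trip, with a hand-made raw record (values invented, but shaped like the site's):

raw = {'LoyerMensuel': u'1 200 € Charges comprises',
       'Surface': u'45 m2',
       'Meuble': u'Non meublé',
       'Ville': u'Paris 75011'}
pprint(unrawify_apas(raw))
# {'Arrondissement': 11,
#  'ChargesComprises': True,
#  'LoyerMensuel': 1200.0,
#  'Meuble': False,
#  'Surface': 45.0,
#  'Ville': u'Paris 75011'}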
def apafeatures(apainfo):
    url = apainfo['url']
    apa = BS(urlget(url).read(),'lxml')
    col = select2text(apa.select('section.properties .property'))
    col = map(make_col_name,col)
    val = select2text(apa.select('section.properties .value'))
    lines = min(len(col),len(val))
    out = {}
    for i in xrange(lines):
        out[col[i]] = val[i]
    out = unrawify_apas(out)
    if len(val)!=len(col):
        out['Problematic'] = True
    else:
        out['Problematic'] = False
    out.update(apainfo)
    return out
As the docstring says: a sort of memoization for apafeatures, specialized for the flat records. This class saves its cache as a pickle file.
import cPickle as pkl
import os
import time
class get_features_cache(object):
    """
    sort of memoizing for apafeatures, specialized for the flat record
    """
    def __init__(self,fname,maxdt=-1):
        self.fname = fname
        try:
            self.db = pkl.load(open(fname,'rb'))
        except IOError:
            self.db = []
        self.clids = [j['md5sum'] for j in self.db]
        self.dirty = 0
        self.upd_time = time.strftime('%H:%M %d/%m/%y')
    def __call__(self,apainfo):
        url = apainfo['url']
        clid = apainfo['md5sum']
        ins_time = apainfo['insert_time']
        # check if app was already retrieved
        if clid in self.clids:
            #TODO check if retrieved version is too old/invalid
            return {'data':self.db[self.clids.index(clid)],'from_cache':True}
        else:
            #retrieve data, store, update self.clids, and finally return
            out = apafeatures(apainfo)
            self.clids.append(clid)
            self.db.append(out)
            self.dirty = 1
            self.upd_time = time.strftime('%H:%M %d/%m/%y')
            return {'data':out,'from_cache':False}
    def __len__(self):
        return len(self.db)
    def __del__(self):
        self._save()
    def _save(self):
        if self.dirty:
            print "saving apas db"
            print "db rows: {0:d}".format(len(self.db))
            pkl.dump(self.db,open(self.fname,'wb'))
            self.dirty = 0
    def upd_date(self):
        return self.upd_time
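A sketch of the intended usage (assuming an empty cache and the apas list from above; the second call is served from the cache without touching the network):

apagetter = get_features_cache(fname='data/LBClocations.pkl')
first = apagetter(apas[0])    # downloads the flat page: first['from_cache'] is False
again = apagetter(apas[0])    # same md5sum, served from self.db: again['from_cache'] is True
apagetter._save()             # persist explicitly (also triggered by __del__)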
This is a class that behaves like a function (see its __call__ method): it waits a random time before retrieving the requested URL. The waiting time is exponentially distributed, i.e. it is the waiting time between events of a Poisson process.
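Concretely, poisson() below uses inverse-CDF sampling: if $U \sim \mathrm{Uniform}(0,1)$, then $T = -\ln(1-U)/\lambda$ satisfies $P(T \le t) = 1 - e^{-\lambda t}$, i.e. $T$ is exponentially distributed with rate $\lambda$ and mean $1/\lambda$.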
# ----------------------------------------------------------
# Poissonian waiting time in the urlget function
class urlgetter(object):
    def __init__(self,waiting_time):
        # NB: used below as the rate lambda, so the mean wait is 1/waiting_time
        self.mean = waiting_time
    def __call__(self,url):
        import time
        # waiting time [s]
        wt = self.poisson()
        time.sleep(wt)
        return urllib.urlopen(url)
    def poisson(self):
        # inverse-CDF sampling of an exponential with rate self.mean
        from math import log
        from random import random
        return -log(1.0 - random()) / self.mean
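A quick check that the constructor argument is the rate (events per second), not the mean wait:

getter = urlgetter(2.0)   # rate 2 events/s, so the mean wait should be ~0.5 s
waits = [getter.poisson() for _ in xrange(10000)]
print sum(waits)/len(waits)   # ~0.5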
The point of all these efforts: the loop that rules them all, where the work actually gets done.
import random
pages = range(1,7)
apagetter = get_features_cache(fname='data/LBClocations.pkl',)
print 'The {2:s} the db {0:s} contains {1:d} locations'.format(apagetter.fname,len(apagetter),apagetter.upd_date())
urlget = urlgetter(1/.5)  # rebinds urlget, so every download below waits ~0.5 s on average
failed = []
flats = []
class Found(Exception): pass  # raise inside the loop to bail out early
try:
    for page in pages:
        human_page = get_page_from_url(page,base_url)
        apas = get_listings_from_html(human_page,base_url)
        print 'page', page
        if not len(apas): break
        for count,apa in enumerate(apas):
            try:
                last = apagetter(apa)
            except Exception,msg:
                failed.append(apa)
                print 'failed #{1:d} {0:s}'.format(apa,count)
                print msg
except Found:
    pass
apagetter._save()
print 'last downloaded record:'
print 'title:',last['data']['title']
print 'insert time:',last['data']['insert_time']
The 09:44 26/06/17 the db data/LBClocations.pkl contains 2227 locations
https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=1&sqs=2&ret=1&ret=2&ret=5
page 1
https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=2&sqs=2&ret=1&ret=2&ret=5
page 2
https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=3&sqs=2&ret=1&ret=2&ret=5
page 3
https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=4&sqs=2&ret=1&ret=2&ret=5
page 4
https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=5&sqs=2&ret=1&ret=2&ret=5
page 5
https://www.leboncoin.fr/locations/offres/ile_de_france/paris/?o=6&sqs=2&ret=1&ret=2&ret=5
page 6
last downloaded record:
title: T2 buttes haumont
insert time: 24 juin, 14:36
flats = apagetter.db
len(flats)
2227
%pylab inline
import pandas as pd
import seaborn as sns
from datetime import datetime,date
df = pd.DataFrame(flats)
df.columns
Populating the interactive namespace from numpy and matplotlib
Index([u'Arrondissement', u'ChargesComprises', u'ClasseEnergie', u'ClasseInergie', u'ClasseShchnergie', u'ClasseYnergie', u'Description', u'Ges', u'LoyerMensuel', u'Meuble', u'Pieces', u'Problematic', u'Reference', u'Surface', u'TypeDeBien', u'Ville', u'insert_time', u'md5sum', u'pid', u'title', u'url'], dtype='object')
# https://stackoverflow.com/questions/26294333/parse-french-date-in-python
import dateparser
def correct_instime_format(j):
    if not isinstance(j,date):
        j = unidecode.unidecode(j)
        return dateparser.parse(j)
    return j
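A quick check on the two date shapes we expect; the exact datetimes depend on dateparser's language detection and settings, so take the outputs as indicative:

print correct_instime_format(u'24 juin, 14:36')    # -> datetime(2017, 6, 24, 14, 36) or similar
print correct_instime_format(u'26/06/17, 09:15')   # -> datetime(2017, 6, 26, 9, 15) or similar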
df['insert_time']=df['insert_time'].apply(correct_instime_format)
#######################################################################
def safe_conv_pos_int(x):
    try:
        return int(x)
    except ValueError:
        return -1
df['Pieces']=df['Pieces'].apply(safe_conv_pos_int)
df_all=df.copy() #backup
#######################################################################
# we only care about 2-pieces flats
df=df[df['Pieces']==2]
#######################################################################
# drop listings cheaper than 500 eu (junk) or larger than 70 sqm
df=df[(df.LoyerMensuel>500) & (df.Surface<70)]
print "*"*100
print "{0:d} flats recorded with 2 pieces, of size < 70sqm, and price >500eu".format(df.shape[0])
print "*"*100
****************************************************************************************************
935 flats recorded with 2 pieces, of size < 70sqm, and price >500eu
****************************************************************************************************
df['aux'] = pd.to_datetime(df['insert_time'])
df['weekday']=df['aux'].apply(lambda x:x.weekday())
df['ins_hour']=df['aux'].apply(lambda x:x.hour)
fig,(ax,bx)=sns.plt.subplots(1,2,figsize=(15,5))
sns.countplot(x='weekday',data=df,ax=ax)
ax.xaxis.set_ticklabels(['Mon','Tue','Wed','Thu','Fri','Sat','Sun'])
sns.countplot(x='ins_hour',data=df,ax=bx,order=np.roll(range(0,24),-6))
<matplotlib.axes._subplots.AxesSubplot at 0x7f5b0a6f1bd0>
print 'Max surface:',df.Surface.max()
print 'Min surface:',df.Surface.min()
print 'Min price:',df.LoyerMensuel.min()
sns.plt.hist(df.Surface.tolist(),bins=np.arange(0-2.5,int(df.Surface.max())+2.5,5))
sns.plt.xlabel('Surface');
Max surface: 69.0
Min surface: 25.0
Min price: 543.0
print ', '.join(df.columns)
Arrondissement, ChargesComprises, ClasseEnergie, ClasseInergie, ClasseShchnergie, ClasseYnergie, Description, Ges, LoyerMensuel, Meuble, Pieces, Problematic, Reference, Surface, TypeDeBien, Ville, insert_time, md5sum, pid, title, url, aux, weekday, ins_hour
i = [10,15,20,25,30,35,40]
o = sns.np.digitize(i,bins=sns.np.arange(0,130,5))
print 'Surface binning legend'
for p,q in zip(i,o):
    print p,'sqm -> #',q
Surface binning legend
10 sqm -> # 3
15 sqm -> # 4
20 sqm -> # 5
25 sqm -> # 6
30 sqm -> # 7
35 sqm -> # 8
40 sqm -> # 9
#######################################################################
df['price_bin']=sns.np.digitize(df.LoyerMensuel,bins=sns.np.arange(0,21*250,250))
df['sqm_bin'] = sns.np.digitize(df.Surface,bins=sns.np.arange(0,130,5))
df['size_sqm']= 5*(df['sqm_bin']-1)
df['price_sqm'] = df.LoyerMensuel/df.Surface
df = df[(df['price_sqm'] < 200)]
#######################################################################
mapges2int={'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7,'H':8,'I':9,'N':10,'V':11,'X':12}
def safe_ges(x):
    try:
        return x.strip()[0]
    except AttributeError:  # NaN or missing value
        return 'X'
df['Ges_lit']=df['Ges'].apply(safe_ges)
df['Ges_int']=df['Ges_lit'].apply(lambda x:mapges2int[x])
########################################################################
df['ClasseEnergie_lit']=df['ClasseEnergie'].apply(safe_ges)
df['ClasseEnergie_int']=df['ClasseEnergie_lit'].apply(lambda x:mapges2int[x])
########################################################################
dfCC=df[df['ChargesComprises']==1]
dfNC=df[df['ChargesComprises']==0]
df=dfCC  # from here on, work with charges-included listings only
########################################################################
def sistema_arrondissement(x):
    # fold remainders above 100 (e.g. zipcode 75116 -> 116) back to the arrondissement number
    if x>100:
        return int(x-100)
    else:
        return int(x)
df['Arrondissement']=df['Arrondissement'].apply(sistema_arrondissement)
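The ville() step above left 75116 (the 16e's second postal code) as 116; a check that both shapes land on the same arrondissement:

print sistema_arrondissement(116)   # 16, from zipcode 75116
print sistema_arrondissement(16)    # 16, from zipcode 75016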
########################################################################
# sorting by average price per arrondissement
def mean_outliers(x):
    # mean of the values within one standard deviation of the raw mean
    _m = x.mean()
    _v = x.std()
    return x[(x>_m-_v) & (x<_m+_v)].mean()
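A tiny worked example of this one-sigma trimmed mean:

x = pd.Series([10., 11., 12., 100.])
print x.mean()            # 33.25, dragged up by the outlier
print mean_outliers(x)    # 11.0: the 100 falls outside mean +/- std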
sorted_district_by_aveprice = []
__tmp = []
for c in df.groupby('Arrondissement'):
    # __tmp.append((c[0],c[1].price_sqm.mean()))
    __tmp.append((c[0],mean_outliers(c[1].price_sqm)))
__tmp = np.asarray(__tmp)
__sidx = np.argsort(__tmp[:,1])
sorted_district_by_aveprice = __tmp[__sidx][:,0]
sorted_district_by_aveprice_labels = ['%d'%j for j in sorted_district_by_aveprice]
sorted_ges_labels = sorted(list(df['Ges_lit'].unique()))
#######################################################################
df['Meuble_int']=df['Meuble'].apply(safe_conv_pos_int)
#######################################################################
df.to_pickle("data/lbc_pandas.pkl")
fig,(ax,bx)=sns.plt.subplots(1,2,figsize=(15,5))
sns.countplot(x='ClasseEnergie_lit',data=df,order=sorted_ges_labels,hue='Meuble',ax=ax)
sns.countplot(x='Arrondissement',data=df,ax=bx)
fig,(ax,bx)=sns.plt.subplots(1,2,figsize=(15,5))
sns.countplot(x='size_sqm',data=df,hue='Meuble',ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x7f5ad5ea79d0>
Remember that the surface and price bins are:
sqm_bin_palette=sns.cubehelix_palette(9, start=1, rot=.1)
# sns.palplot(sqm_bin_palette)
sns.set_palette('muted')
fig,(ax,bx)=sns.plt.subplots(1,2,figsize=(15,5))
sns.swarmplot(x='size_sqm',y='price_sqm',data=dfCC,ax=ax,palette='muted',hue='Meuble')
sns.boxplot(x='size_sqm',y='price_sqm',data=dfCC,hue='Meuble',ax=bx);
#####################################################################
fig,ax=sns.plt.subplots(1,1,figsize=(15,5))
sns.swarmplot(y=u'price_sqm',x=u'Arrondissement',data=df[df['Meuble']==True],order=sorted_district_by_aveprice,\
ax=ax,size=7,palette=sqm_bin_palette,hue='size_sqm')
sns.swarmplot(y=u'price_sqm',x=u'Arrondissement',data=df[df['Meuble']==False],order=sorted_district_by_aveprice,ax=ax,\
size=2.5,color='#ff0000')
_=ax.xaxis.set_ticklabels(sorted_district_by_aveprice_labels)
#####################################################################
fig,(ax,bx)=sns.plt.subplots(1,2,figsize=(15,5))
sns.swarmplot(y=u'size_sqm',x=u'ClasseEnergie_lit',data=dfCC,order=sorted_ges_labels,ax=bx)
sns.swarmplot(y=u'price_sqm',x=u'ClasseEnergie_lit',data=dfCC[dfCC['Meuble']==False],order=sorted_ges_labels,ax=ax,\
size=2.5,color='#ff0000')
sns.swarmplot(y=u'price_sqm',x=u'ClasseEnergie_lit',data=dfCC[dfCC['Meuble']==True],order=sorted_ges_labels,ax=ax,\
palette=sqm_bin_palette,hue='sqm_bin');
#####################################################################
line="#"*90
print line
print '* color code by sqm_bin too should give the trend of the 1st plot'
print '* # of records',df.shape[0]
print ' /20',df.shape[0]/20.
print ' /15',df.shape[0]/15.
print line
##########################################################################################
* color code by sqm_bin too should give the trend of the 1st plot
* # of records 907
 /20 45.35
 /15 60.4666666667
##########################################################################################
fig,(ax,bx)=sns.plt.subplots(1,2,figsize=(15,5))
range = sns.plt.np.arange  # NB: shadows the builtin range from here on
ax.hist2d(df['Arrondissement'],df['LoyerMensuel'],cmap='Blues',bins=[range(1,21)-.5,range(500,2500,100)-50]);
bx.hist2d(df['Arrondissement'],df['size_sqm'],cmap='Blues',bins=[range(1,21)-.5,sns.plt.np.arange(5*5,15*5,5)-2.5]);
ax.set_xlabel('Arrondissement')
ax.set_ylabel('LoyerMensuel')
bx.set_xlabel('Arrondissement')
bx.set_ylabel('size_sqm')
<matplotlib.text.Text at 0x7f5ad4b52690>
careof = ['Surface','price_sqm','Arrondissement','Meuble_int','weekday','ins_hour']
ylbl = ['LoyerMensuel',]
z = df[careof+ylbl].dropna()
print z.shape
(907, 7)
import qgrid
# qgrid.nbinstall(overwrite=True)
What's the probability that a flat has a given price, given its features?
There is still too little data to compute $\int \mathrm{price\_sqm}\; P(\mathrm{price\_sqm} \mid Arr, Meuble, sqm)\; \mathrm{d}\,\mathrm{price\_sqm}$, so I use the simpler conditionings on $(Arr, Meuble)$ or $(Arr, sqm)$, which need fewer classes.
A similar analysis could be tried with a naive Bayes or a logit model (TODO).
df['Arr_Meu_cls']=0
df['Arr_Sqm_cls']=0
def average_flat(to):
    def f(group):
        median_price = group.price_sqm.median()
        # label the flats pricier (per sqm) than their group's median
        group.loc[group.price_sqm > median_price, to] = 1
        return group
    return f
df = df.groupby(('Arrondissement','Meuble')).apply(average_flat('Arr_Meu_cls'))
df = df.groupby(('Arrondissement','size_sqm')).apply(average_flat('Arr_Sqm_cls'))
careof = ['LoyerMensuel','Surface','price_sqm','Arrondissement','Meuble','ClasseEnergie_lit','url']
qgrid.show_grid(df[df.Arr_Meu_cls>0][careof])
print df[df.Arr_Meu_cls>0].shape[0]*1./df.shape[0]
0.469680264609
careof = ['LoyerMensuel','Surface','price_sqm','Arrondissement','Meuble_int','weekday','ins_hour']
ylbl = ['Arr_Sqm_cls',]
z = df[careof+ylbl].dropna()
print z.shape
(885, 8)
Since the rent of a flat must be smaller than 1/3 of what the tenants earn (as a couple), we also get the distribution of salaries expected for people living in Paris:
$ price < (salary_1 + salary_2)/3 $
Assuming that $salary_1 = salary_2$ (because of highly qualified jobs), we get $salary_1 > \frac{3}{2}\,price$.
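For example, a hypothetical rent of 1200 €/month would imply $salary_1 > \frac{3}{2}\cdot 1200 = 1800$ € net per month for each partner.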
def plot_salaries(df,Meuble=0):
    fig,(ax,bx)=sns.plt.subplots(1,2,figsize=(15,5))
    ifs = (df.Meuble==Meuble)
    salary1 = df[ifs].LoyerMensuel.values*3/2.
    bins = range(0,6000,100)  # NB: range is np.arange here (shadowed above)
    ax.hist(salary1,bins=bins);
    ax.axvline(salary1.mean(),color='w')
    ax.hist(salary1*2,bins=bins);
    ax.axvline(salary1.mean()*2,color='w')
    ax.set_xlabel('blue mean salary 1p, green 1+1p')
    mean_price_per_month = df[ifs].LoyerMensuel.values.mean()
    price_per_month = df[ifs].LoyerMensuel.values
    min_surf = 7
    max_surf = 10+1
    ifs = ifs & (df['sqm_bin']<max_surf) & (df['sqm_bin']>min_surf)  # bins 8-10, i.e. 35-50 sqm
    price_per_month_select = df[ifs].LoyerMensuel.values
    bx.hist(price_per_month,bins=bins);
    bx.hist(price_per_month_select,bins=bins);
    bx.axvline(mean_price_per_month,color='w')
    bx.set_xlabel('blue: mean price per month (2pieces), green: 35<sqm<50' )
    bx.text(mean_price_per_month,-2,u'{0:.0f}€'.format(mean_price_per_month),)
    fig.suptitle('Meuble:{0:d}'.format(Meuble))
plot_salaries(df,Meuble=1)
plot_salaries(df,Meuble=0)
# Download shapes of paris arrondissements
!wget -c https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements/75/communes.geojson -O data/75.geojson
--2017-06-26 09:48:44--  https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements/75/communes.geojson
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.120.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.120.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2017-06-26 09:48:45 ERROR 404: Not Found.
from ipyleaflet import (
Map,
Marker,
TileLayer, ImageOverlay,
Polyline, Polygon, Rectangle, Circle, CircleMarker,
GeoJSON,
DrawControl
)
import json
import matplotlib as mpl
with open('data/75.geojson') as f:
    data = json.load(f)
def Gnormalize(x):
    """
    normalizes the 1D array/list x to be between 0. and 1.
    """
    x = sns.plt.np.asarray(x)
    x = x.copy()
    x -= x.min()
    x /= x.max()
    return x
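A quick check (the input should be floats: the in-place /= would truncate an integer array):

print Gnormalize([10., 20., 30.])   # [ 0.   0.5  1. ]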
We will assign a color code to each arrondissement, based on the average price.
from collections import defaultdict
def ddict(x,default=lambda : np.nan):
    """
    returns a default dict with content dict(x)
    """
    out = defaultdict(default)
    out.update(x)
    return out
tmp = []
for c in df[(df.Surface<50) & (df.Meuble_int==1)].groupby('Arrondissement'):
    tmp.append((c[0],mean_outliers(c[1].price_sqm)))
tmp = np.asarray(tmp)
aveprice_prenorm = ddict(tmp)
# normalize prices to compute colors per arrondissmement
tmp=tmp.T
tmp[1]=Gnormalize(tmp[1])
map_palette=sns.diverging_palette(220, 20, n=len(tmp),as_cmap=1)
_ = [(i,mpl.colors.rgb2hex(map_palette(j))) for i,j in tmp.T]
aveprice_color = ddict(_)
# flats_per_arrondissement = []
# for j in df[df['Surface']<50].groupby('Arrondissement'):
# flats_per_arrondissement.append([ float(j[0]),len(j[1])])
# flats_per_arrondissement = np.asarray(flats_per_arrondissement)
# flats_per_arrondissement[:,1] = Gnormalize(flats_per_arrondissement[:,1])
# flats_per_arrondissement = dict(flats_per_arrondissement)
for j,feature in enumerate(data['features']):
    where = atof(feature['properties']['code'])-75100
    price = aveprice_prenorm[where]
    if np.isnan(price):  # `not price` would miss NaN, since bool(nan) is True
        continue
    # price = flats_per_arrondissement[where]
    color = aveprice_color[where]
    feature['properties']['style'] = {'color':color, 'weight': .5, 'fillColor':color, 'fillOpacity':0.75}
    feature['properties']['district'] = 'Arrondissement: {0:.0f}, price: {1:.0f}eu/sqm'.format(where,price)
layer = GeoJSON(data=data, hover_style={'fillOpacity': 1})
import sys
def hover_handler(event=None, id=None, properties=None):
    sys.stdout.write("\r" + properties['district'])
    sys.stdout.flush()
center = [48.85906816414895, 2.343006134033203]
zoom = 13
m = Map(center=center, zoom=zoom)
layer.on_hover(hover_handler)
m.add_layer(layer)
m.height='500px'
print 'price per sqm (<50sqm, meuble\') per district'
print 'You need to run the notebook to see the map :/'
m
from IPython.display import HTML
import urllib2
style=open('style.css','r').read()
HTML("""
<style>{0}</style>
""".format(style))