In [1]:

import pandas as pd
import numpy as np
import pylab as pl
import matplotlib as mpl
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib

In [7]:

# загрузка данных
# data = pd.read_csv('D:\\Competitions\\Laura\\arenda.csv', sep=';', decimal=',')
data = pd.read_excel('D:\\Competitions\\Laura\\arenda.xlsx')
data[:3]

Out[7]:

	id	Space Size	Price / SF / Month	Space Type:	Lease Type:	Number of transport spots	Population	Population change 2013-2010	Land area	Density of people living in area	...	Household size	Average HH income 2013	Income change 2013-2010	Change in % of bachelor degrees 2013-2010	Average salary of employees ($ 000s)	Average salary of employees in new businesses	% of employees in new companies vs all	Number of new retail places 2013-2010	list id	list
0	1	11324	2.649241	NaN	NaN	14	6417.318254	0.104498	780786.644395	0.008219	...	2.811673	64039	0.008973	0.034373	43000	42220.615385	12	5	239	ln
1	2	2275	1.583333	NaN	NaN	22	3902.848712	0.041712	646901.202888	0.006033	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	388	ln
2	3	2275	1.583333	NaN	NaN	22	3902.848712	0.041712	646901.202888	0.006033	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	389	ln

3 rows × 23 columns

In [5]:

data.columns

Out[5]:

Index([u'id', u'Space Size', u'Price / SF / Month', u'Space Type: ', u'Lease Type: ', u'Number of transport spots', u'Population', u'Population change 2013-2010', u'Land area', u'Density of people living in area', u'Density of people working in area (based on lat/lon)', u'Total density (living + working)', u'Social chat score', u'Household size', u'Average HH income 2013', u'Income change 2013-2010', u'Change in % of bachelor degrees 2013-2010', u'Average salary of employees ($ 000s)', u'Average salary of employees in new businesses', u'% of employees in new companies vs all', u'Number of new retail places 2013-2010', u'list id', u'list'], dtype='object')

In [8]:

# удалить id
del data['id']

# переименовать названия признаков
data = data.rename(columns={
u'Space Size':'spacesize',
u'Price / SF / Month':'price',
u'Space Type: ':'spacetype',
u'Lease Type: ':'leasetype',
u'Number of transport spots':'nspots',
u'Population':'pop', 
u'Population change 2013-2010':'popchange',
u'Land area':'land',
u'Density of people living in area':'densliv', 
u'Density of people working in area (based on lat/lon)':'denswork',
u'Total density (living + working)':'denstotal',
u'Social chat score':'chat', 
u'Household size':'hhsize',
u'Average HH income 2013':'aincome',
u'Income change 2013-2010':'incomechange',
u'Change in % of bachelor degrees 2013-2010':'degrees',
u'Average salary of employees ($ 000s)':'asalary',
u'Average salary of employees in new businesses':'anewsalary', 
u'% of employees in new companies vs all':'employees',
u'Number of new retail places 2013-2010':'retails',
u'list id':'listid',
u'list':'list'
})
data[:3]

Out[8]:

	spacesize	price	spacetype	leasetype	nspots	pop	popchange	land	densliv	denswork	...	hhsize	aincome	incomechange	degrees	asalary	anewsalary	employees	retails	listid	list
0	11324	2.649241	NaN	NaN	14	6417.318254	0.104498	780786.644395	0.008219	0.001896	...	2.811673	64039	0.008973	0.034373	43000	42220.615385	12	5	239	ln
1	2275	1.583333	NaN	NaN	22	3902.848712	0.041712	646901.202888	0.006033	0.002220	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	388	ln
2	2275	1.583333	NaN	NaN	22	3902.848712	0.041712	646901.202888	0.006033	0.002220	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	389	ln

3 rows × 22 columns

In [10]:

# вывод статистики
data.describe()

Out[10]:

	spacesize	price	nspots	pop	popchange	land	densliv	denswork	denstotal	chat	hhsize	aincome	incomechange	degrees	asalary	anewsalary	employees	retails	listid
count	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.00000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000	2227.000000
mean	3884.932645	6.224143	29.325101	15833.326788	0.010618	744267.272294	0.021292	0.025608	0.046900	36.495285	2.513140	63835.55366	0.097281	0.011870	46687.471935	39536.089718	24.637180	34.376291	612.130669
std	16311.985235	6.582278	14.658322	8681.890404	0.055430	74177.247168	0.011673	0.051450	0.053700	68.069988	0.532757	31651.81757	0.103458	0.023377	15010.874771	12852.903269	8.480855	36.063391	337.293657
min	1.000000	0.000192	0.000000	114.945126	-0.141048	198188.986853	0.000199	0.000000	0.000249	0.000000	1.510175	17479.00000	-0.253104	-0.087200	0.000000	0.000000	0.000000	0.000000	1.000000
25%	1000.000000	2.666663	19.000000	9712.109647	-0.028305	741991.143918	0.013105	0.004082	0.019105	5.000000	2.086396	40051.00000	0.031163	-0.002626	36000.000000	30269.422362	20.000000	10.000000	319.500000
50%	1700.000000	4.166666	29.000000	15645.946460	0.008172	780549.167952	0.020760	0.006939	0.028993	13.000000	2.534359	53145.00000	0.093626	0.011536	41000.000000	36652.400004	24.000000	21.000000	615.000000
75%	3400.000000	7.343137	38.000000	21584.540555	0.044095	780832.018612	0.028699	0.020631	0.051848	42.000000	2.894399	88869.50000	0.163718	0.027190	53000.000000	46223.310708	29.000000	42.000000	902.000000
max	600000.000000	83.414634	78.000000	46664.838451	0.313075	780856.610853	0.062810	0.350893	0.369600	988.000000	4.559108	202295.00000	0.429361	0.102018	106000.000000	115360.037590	84.000000	184.000000	1217.000000

In [11]:

# убрать НаНы
# тут они все в строчках!!!
data = data.fillna('net')
data[:3]

Out[11]:

	spacesize	price	spacetype	leasetype	nspots	pop	popchange	land	densliv	denswork	...	hhsize	aincome	incomechange	degrees	asalary	anewsalary	employees	retails	listid	list
0	11324	2.649241	net	net	14	6417.318254	0.104498	780786.644395	0.008219	0.001896	...	2.811673	64039	0.008973	0.034373	43000	42220.615385	12	5	239	ln
1	2275	1.583333	net	net	22	3902.848712	0.041712	646901.202888	0.006033	0.002220	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	388	ln
2	2275	1.583333	net	net	22	3902.848712	0.041712	646901.202888	0.006033	0.002220	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	389	ln

3 rows × 22 columns

In [12]:

# число уникальных значений по столбцам
# правда... проблемы с НаНами
lengths = []
for i in xrange(data.shape[1]):
    lengths.append(data[data.columns[i]].unique().__len__())
pyplot.bar(np.arange(data.shape[1]), lengths)
# размеры
data.shape

Out[12]:

(2227, 22)

In [13]:

# анализ признаков с малым числом значений
for i in xrange(data.shape[1]):
    u = data[data.columns[i]].unique()
    if u.__len__()<10:
        print data.columns[i], u

leasetype ['net' u'Full Service' u'Industrial Gross' u'Modified Gross'
 u'Modified Net' u'NNN' u'Other']
list [u'ln' u'cf']

In [14]:

# факторы заменить мощностями вхождений
factorfeatures = ['spacetype', 'leasetype', 'list']
for f in factorfeatures:
    set = data.groupby(f).size()
    data[f] = data[f].apply(lambda x: set[x])
data[:3]

Out[14]:

	spacesize	price	spacetype	leasetype	nspots	pop	popchange	land	densliv	denswork	...	hhsize	aincome	incomechange	degrees	asalary	anewsalary	employees	retails	listid	list
0	11324	2.649241	4	427	14	6417.318254	0.104498	780786.644395	0.008219	0.001896	...	2.811673	64039	0.008973	0.034373	43000	42220.615385	12	5	239	1196
1	2275	1.583333	4	427	22	3902.848712	0.041712	646901.202888	0.006033	0.002220	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	388	1196
2	2275	1.583333	4	427	22	3902.848712	0.041712	646901.202888	0.006033	0.002220	...	2.786308	77802	-0.069537	-0.001562	30000	36709.000000	12	2	389	1196

3 rows × 22 columns

In [19]:

# ЭТО НАДО ЛИШЬ В CSV-ФОРМАТЕ
# перевести проценты в числа
# def per2num(x):
#     x = x.replace(',','.')
#     i = x.find('%')
#     if (i>0):
#         x = x[:i]
#         x = str(float(x)/100)
#     return (x)

# percentfeatures = ['degrees', 'popchange']
# for f in percentfeatures:
#     data[f] = data[f].apply(per2num)
#     data[f] = data[f].astype(float)

# data[:3]

In [20]:

# вывод значений по признакам
for i in xrange(data.shape[1]):
    print data.columns[i], data[data.columns[i]].values[:3]
#a = data[data.columns[0]][:3]
#a.values

spacesize [11324  2275  2275]
price [ 2.64924055  1.58333333  1.58333333]
spacetype [4 4 4]
leasetype [427 427 427]
nspots [14 22 22]
pop [ 6417.31825428  3902.84871166  3902.84871166]
popchange [ 0.10449835  0.04171197  0.04171197]
land [ 780786.64439507  646901.20288793  646901.20288793]
densliv [ 0.00821904  0.00603314  0.00603314]
denswork [ 0.00189552  0.00221981  0.00221981]
denstotal [ 0.01011457  0.00825296  0.00825296]
chat [7 7 7]
hhsize [ 2.81167288  2.78630753  2.78630753]
aincome [64039 77802 77802]
incomechange [ 0.00897258 -0.06953688 -0.06953688]
degrees [ 0.03437302 -0.0015618  -0.0015618 ]
asalary [43000 30000 30000]
anewsalary [ 42220.61538462  36709.          36709.        ]
employees [12 12 12]
retails [5 2 2]
listid [239 388 389]
list [1196 1196 1196]

In [68]:

# подготовить обучение и контроль
itest = data.index % 3 == 0
test = data[itest]
ytest = test['price']
test = test[test.columns - ['price']]
train = data[~itest]
ytrain = train['price']
train = train[train.columns - ['price']]

In [70]:

# константный бенчмарк
y = data['price'].values
# наши функции ошибки
def ma(a,y):
    return np.abs(a-y).mean()

def rmse(a,y):
    return np.sqrt((((a-y)**2).mean()))
    
print 'Среднее отклонение:' + str(ma(y.mean(),y))
print 'СК-отклонение:' + str(rmse(y.mean(),y))

Среднее отклонение:4.03197129723
СК-отклонение:6.58079959213

In [73]:

# RF
y = ytest.values

import sklearn
from sklearn import ensemble
clf = sklearn.ensemble.RandomForestRegressor(n_estimators = 100, max_features = 10)

asum = 0
e1s = []
e2s = []
for i in range(10):
    clf.fit(train, ytrain)
    a = clf.predict(test)
    asum = asum + a
    aa = asum/(i+1)
    e1 = ma(aa,y)
    e2 = rmse(aa,y)
    e1s.append(e1)
    e2s.append(e2)
    print 'it=' + str(i) + ' ma=' + str(e1) + ' rmse=' + str(e2)
plt1, = plot(e1s, label='ma')
plt2, = plot(e2s, label='rmse')
#print 'Среднее отклонение:' + str(np.abs((a-y)).mean())
#print 'СК-отклонение:' + str(np.sqrt((((a-y))**2).mean()))
legend((plt1, plt2), ['ma', 'rmse'])

it=0 ma=2.13302605262 rmse=4.87093633505
it=1 ma=2.12375079444 rmse=4.89879348639
it=2 ma=2.1206627149 rmse=4.88795847537
it=3 ma=2.12332802429 rmse=4.90191429436
it=4 ma=2.12098464857 rmse=4.90193331669
it=5 ma=2.11688809906 rmse=4.90190626714
it=6 ma=2.11774837786 rmse=4.90049089551
it=7 ma=2.116069004 rmse=4.88800749006
it=8 ma=2.11885322327 rmse=4.8968311695
it=9 ma=2.12141815394 rmse=4.90396149632

Out[73]:

<matplotlib.legend.Legend at 0x1a207550>

In [69]:

print train.shape
print test.shape

(1484, 21)
(743, 21)

In [82]:

# перебор всех фолдов
clf = sklearn.ensemble.RandomForestRegressor(n_estimators = 100, \
                                             max_features = 5)

y = data['price'].values
a = y*0

for jfold in range(3):
    # подготовить обучение и контроль
    itest = data.index % 3 == jfold
    test = data[itest]
    ytest = test['price']
    test = test[test.columns - ['price']]
    train = data[~itest]
    ytrain = train['price']
    train = train[train.columns - ['price']]
    clf.fit(train, ytrain)
    a[itest] = clf.predict(test)
    
print 'Среднее отклонение:' + str(np.abs((a-y)).mean())
print 'СК-отклонение:' + str(np.sqrt((((a-y))**2).mean()))

Среднее отклонение:2.25390001005
СК-отклонение:5.05519437016

In [ ]: