import pandas as pd
import numpy as np
import pylab as pl
import matplotlib as mpl
%matplotlib inline
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# загрузка данных
# data = pd.read_csv('D:\\Competitions\\Laura\\arenda.csv', sep=';', decimal=',')
data = pd.read_excel('D:\\Competitions\\Laura\\arenda.xlsx')
data[:3]
id | Space Size | Price / SF / Month | Space Type: | Lease Type: | Number of transport spots | Population | Population change 2013-2010 | Land area | Density of people living in area | ... | Household size | Average HH income 2013 | Income change 2013-2010 | Change in % of bachelor degrees 2013-2010 | Average salary of employees ($ 000s) | Average salary of employees in new businesses | % of employees in new companies vs all | Number of new retail places 2013-2010 | list id | list | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 11324 | 2.649241 | NaN | NaN | 14 | 6417.318254 | 0.104498 | 780786.644395 | 0.008219 | ... | 2.811673 | 64039 | 0.008973 | 0.034373 | 43000 | 42220.615385 | 12 | 5 | 239 | ln |
1 | 2 | 2275 | 1.583333 | NaN | NaN | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 388 | ln |
2 | 3 | 2275 | 1.583333 | NaN | NaN | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 389 | ln |
3 rows × 23 columns
data.columns
Index([u'id', u'Space Size', u'Price / SF / Month', u'Space Type: ', u'Lease Type: ', u'Number of transport spots', u'Population', u'Population change 2013-2010', u'Land area', u'Density of people living in area', u'Density of people working in area (based on lat/lon)', u'Total density (living + working)', u'Social chat score', u'Household size', u'Average HH income 2013', u'Income change 2013-2010', u'Change in % of bachelor degrees 2013-2010', u'Average salary of employees ($ 000s)', u'Average salary of employees in new businesses', u'% of employees in new companies vs all', u'Number of new retail places 2013-2010', u'list id', u'list'], dtype='object')
# удалить id
del data['id']
# переименовать названия признаков
data = data.rename(columns={
u'Space Size':'spacesize',
u'Price / SF / Month':'price',
u'Space Type: ':'spacetype',
u'Lease Type: ':'leasetype',
u'Number of transport spots':'nspots',
u'Population':'pop',
u'Population change 2013-2010':'popchange',
u'Land area':'land',
u'Density of people living in area':'densliv',
u'Density of people working in area (based on lat/lon)':'denswork',
u'Total density (living + working)':'denstotal',
u'Social chat score':'chat',
u'Household size':'hhsize',
u'Average HH income 2013':'aincome',
u'Income change 2013-2010':'incomechange',
u'Change in % of bachelor degrees 2013-2010':'degrees',
u'Average salary of employees ($ 000s)':'asalary',
u'Average salary of employees in new businesses':'anewsalary',
u'% of employees in new companies vs all':'employees',
u'Number of new retail places 2013-2010':'retails',
u'list id':'listid',
u'list':'list'
})
data[:3]
spacesize | price | spacetype | leasetype | nspots | pop | popchange | land | densliv | denswork | ... | hhsize | aincome | incomechange | degrees | asalary | anewsalary | employees | retails | listid | list | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11324 | 2.649241 | NaN | NaN | 14 | 6417.318254 | 0.104498 | 780786.644395 | 0.008219 | 0.001896 | ... | 2.811673 | 64039 | 0.008973 | 0.034373 | 43000 | 42220.615385 | 12 | 5 | 239 | ln |
1 | 2275 | 1.583333 | NaN | NaN | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | 0.002220 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 388 | ln |
2 | 2275 | 1.583333 | NaN | NaN | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | 0.002220 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 389 | ln |
3 rows × 22 columns
# вывод статистики
data.describe()
spacesize | price | nspots | pop | popchange | land | densliv | denswork | denstotal | chat | hhsize | aincome | incomechange | degrees | asalary | anewsalary | employees | retails | listid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.00000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 | 2227.000000 |
mean | 3884.932645 | 6.224143 | 29.325101 | 15833.326788 | 0.010618 | 744267.272294 | 0.021292 | 0.025608 | 0.046900 | 36.495285 | 2.513140 | 63835.55366 | 0.097281 | 0.011870 | 46687.471935 | 39536.089718 | 24.637180 | 34.376291 | 612.130669 |
std | 16311.985235 | 6.582278 | 14.658322 | 8681.890404 | 0.055430 | 74177.247168 | 0.011673 | 0.051450 | 0.053700 | 68.069988 | 0.532757 | 31651.81757 | 0.103458 | 0.023377 | 15010.874771 | 12852.903269 | 8.480855 | 36.063391 | 337.293657 |
min | 1.000000 | 0.000192 | 0.000000 | 114.945126 | -0.141048 | 198188.986853 | 0.000199 | 0.000000 | 0.000249 | 0.000000 | 1.510175 | 17479.00000 | -0.253104 | -0.087200 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 1000.000000 | 2.666663 | 19.000000 | 9712.109647 | -0.028305 | 741991.143918 | 0.013105 | 0.004082 | 0.019105 | 5.000000 | 2.086396 | 40051.00000 | 0.031163 | -0.002626 | 36000.000000 | 30269.422362 | 20.000000 | 10.000000 | 319.500000 |
50% | 1700.000000 | 4.166666 | 29.000000 | 15645.946460 | 0.008172 | 780549.167952 | 0.020760 | 0.006939 | 0.028993 | 13.000000 | 2.534359 | 53145.00000 | 0.093626 | 0.011536 | 41000.000000 | 36652.400004 | 24.000000 | 21.000000 | 615.000000 |
75% | 3400.000000 | 7.343137 | 38.000000 | 21584.540555 | 0.044095 | 780832.018612 | 0.028699 | 0.020631 | 0.051848 | 42.000000 | 2.894399 | 88869.50000 | 0.163718 | 0.027190 | 53000.000000 | 46223.310708 | 29.000000 | 42.000000 | 902.000000 |
max | 600000.000000 | 83.414634 | 78.000000 | 46664.838451 | 0.313075 | 780856.610853 | 0.062810 | 0.350893 | 0.369600 | 988.000000 | 4.559108 | 202295.00000 | 0.429361 | 0.102018 | 106000.000000 | 115360.037590 | 84.000000 | 184.000000 | 1217.000000 |
# убрать НаНы
# тут они все в строчках!!!
data = data.fillna('net')
data[:3]
spacesize | price | spacetype | leasetype | nspots | pop | popchange | land | densliv | denswork | ... | hhsize | aincome | incomechange | degrees | asalary | anewsalary | employees | retails | listid | list | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11324 | 2.649241 | net | net | 14 | 6417.318254 | 0.104498 | 780786.644395 | 0.008219 | 0.001896 | ... | 2.811673 | 64039 | 0.008973 | 0.034373 | 43000 | 42220.615385 | 12 | 5 | 239 | ln |
1 | 2275 | 1.583333 | net | net | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | 0.002220 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 388 | ln |
2 | 2275 | 1.583333 | net | net | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | 0.002220 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 389 | ln |
3 rows × 22 columns
# число уникальных значений по столбцам
# правда... проблемы с НаНами
lengths = []
for i in xrange(data.shape[1]):
lengths.append(data[data.columns[i]].unique().__len__())
pyplot.bar(np.arange(data.shape[1]), lengths)
# размеры
data.shape
(2227, 22)
# анализ признаков с малым числом значений
for i in xrange(data.shape[1]):
u = data[data.columns[i]].unique()
if u.__len__()<10:
print data.columns[i], u
leasetype ['net' u'Full Service' u'Industrial Gross' u'Modified Gross' u'Modified Net' u'NNN' u'Other'] list [u'ln' u'cf']
# факторы заменить мощностями вхождений
factorfeatures = ['spacetype', 'leasetype', 'list']
for f in factorfeatures:
set = data.groupby(f).size()
data[f] = data[f].apply(lambda x: set[x])
data[:3]
spacesize | price | spacetype | leasetype | nspots | pop | popchange | land | densliv | denswork | ... | hhsize | aincome | incomechange | degrees | asalary | anewsalary | employees | retails | listid | list | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 11324 | 2.649241 | 4 | 427 | 14 | 6417.318254 | 0.104498 | 780786.644395 | 0.008219 | 0.001896 | ... | 2.811673 | 64039 | 0.008973 | 0.034373 | 43000 | 42220.615385 | 12 | 5 | 239 | 1196 |
1 | 2275 | 1.583333 | 4 | 427 | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | 0.002220 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 388 | 1196 |
2 | 2275 | 1.583333 | 4 | 427 | 22 | 3902.848712 | 0.041712 | 646901.202888 | 0.006033 | 0.002220 | ... | 2.786308 | 77802 | -0.069537 | -0.001562 | 30000 | 36709.000000 | 12 | 2 | 389 | 1196 |
3 rows × 22 columns
# ЭТО НАДО ЛИШЬ В CSV-ФОРМАТЕ
# перевести проценты в числа
# def per2num(x):
# x = x.replace(',','.')
# i = x.find('%')
# if (i>0):
# x = x[:i]
# x = str(float(x)/100)
# return (x)
# percentfeatures = ['degrees', 'popchange']
# for f in percentfeatures:
# data[f] = data[f].apply(per2num)
# data[f] = data[f].astype(float)
# data[:3]
# вывод значений по признакам
for i in xrange(data.shape[1]):
print data.columns[i], data[data.columns[i]].values[:3]
#a = data[data.columns[0]][:3]
#a.values
spacesize [11324 2275 2275] price [ 2.64924055 1.58333333 1.58333333] spacetype [4 4 4] leasetype [427 427 427] nspots [14 22 22] pop [ 6417.31825428 3902.84871166 3902.84871166] popchange [ 0.10449835 0.04171197 0.04171197] land [ 780786.64439507 646901.20288793 646901.20288793] densliv [ 0.00821904 0.00603314 0.00603314] denswork [ 0.00189552 0.00221981 0.00221981] denstotal [ 0.01011457 0.00825296 0.00825296] chat [7 7 7] hhsize [ 2.81167288 2.78630753 2.78630753] aincome [64039 77802 77802] incomechange [ 0.00897258 -0.06953688 -0.06953688] degrees [ 0.03437302 -0.0015618 -0.0015618 ] asalary [43000 30000 30000] anewsalary [ 42220.61538462 36709. 36709. ] employees [12 12 12] retails [5 2 2] listid [239 388 389] list [1196 1196 1196]
# подготовить обучение и контроль
itest = data.index % 3 == 0
test = data[itest]
ytest = test['price']
test = test[test.columns - ['price']]
train = data[~itest]
ytrain = train['price']
train = train[train.columns - ['price']]
# константный бенчмарк
y = data['price'].values
# наши функции ошибки
def ma(a,y):
return np.abs(a-y).mean()
def rmse(a,y):
return np.sqrt((((a-y)**2).mean()))
print 'Среднее отклонение:' + str(ma(y.mean(),y))
print 'СК-отклонение:' + str(rmse(y.mean(),y))
Среднее отклонение:4.03197129723 СК-отклонение:6.58079959213
# RF
y = ytest.values
import sklearn
from sklearn import ensemble
clf = sklearn.ensemble.RandomForestRegressor(n_estimators = 100, max_features = 10)
asum = 0
e1s = []
e2s = []
for i in range(10):
clf.fit(train, ytrain)
a = clf.predict(test)
asum = asum + a
aa = asum/(i+1)
e1 = ma(aa,y)
e2 = rmse(aa,y)
e1s.append(e1)
e2s.append(e2)
print 'it=' + str(i) + ' ma=' + str(e1) + ' rmse=' + str(e2)
plt1, = plot(e1s, label='ma')
plt2, = plot(e2s, label='rmse')
#print 'Среднее отклонение:' + str(np.abs((a-y)).mean())
#print 'СК-отклонение:' + str(np.sqrt((((a-y))**2).mean()))
legend((plt1, plt2), ['ma', 'rmse'])
it=0 ma=2.13302605262 rmse=4.87093633505 it=1 ma=2.12375079444 rmse=4.89879348639 it=2 ma=2.1206627149 rmse=4.88795847537 it=3 ma=2.12332802429 rmse=4.90191429436 it=4 ma=2.12098464857 rmse=4.90193331669 it=5 ma=2.11688809906 rmse=4.90190626714 it=6 ma=2.11774837786 rmse=4.90049089551 it=7 ma=2.116069004 rmse=4.88800749006 it=8 ma=2.11885322327 rmse=4.8968311695 it=9 ma=2.12141815394 rmse=4.90396149632
<matplotlib.legend.Legend at 0x1a207550>
print train.shape
print test.shape
(1484, 21) (743, 21)
# перебор всех фолдов
clf = sklearn.ensemble.RandomForestRegressor(n_estimators = 100, \
max_features = 5)
y = data['price'].values
a = y*0
for jfold in range(3):
# подготовить обучение и контроль
itest = data.index % 3 == jfold
test = data[itest]
ytest = test['price']
test = test[test.columns - ['price']]
train = data[~itest]
ytrain = train['price']
train = train[train.columns - ['price']]
clf.fit(train, ytrain)
a[itest] = clf.predict(test)
print 'Среднее отклонение:' + str(np.abs((a-y)).mean())
print 'СК-отклонение:' + str(np.sqrt((((a-y))**2).mean()))
Среднее отклонение:2.25390001005 СК-отклонение:5.05519437016