import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
pd.options.mode.chained_assignment = None
加载数据集和查看前面几行数据
train_df = pd.read_json("train.json")
train_df.head()
bathrooms | bedrooms | building_id | created | description | display_address | features | interest_level | latitude | listing_id | longitude | manager_id | photos | price | street_address | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10 | 1.5 | 3 | 53a5b119ba8f7b61d4e010512e0dfc85 | 2016-06-24 07:54:24 | A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ... | Metropolitan Avenue | [] | medium | 40.7145 | 7211212 | -73.9425 | 5ba989232d0489da1b5f2c45f6688adc | [https://photos.renthop.com/2/7211212_1ed4542e... | 3000 | 792 Metropolitan Avenue |
10000 | 1.0 | 2 | c5c8a357cba207596b04d1afd1e4f130 | 2016-06-12 12:19:27 | Columbus Avenue | [Doorman, Elevator, Fitness Center, Cats Allow... | low | 40.7947 | 7150865 | -73.9667 | 7533621a882f71e25173b27e3139d83d | [https://photos.renthop.com/2/7150865_be3306c5... | 5465 | 808 Columbus Avenue | |
100004 | 1.0 | 1 | c3ba40552e2120b0acfc3cb5730bb2aa | 2016-04-17 03:26:41 | Top Top West Village location, beautiful Pre-w... | W 13 Street | [Laundry In Building, Dishwasher, Hardwood Flo... | high | 40.7388 | 6887163 | -74.0018 | d9039c43983f6e564b1482b273bd7b01 | [https://photos.renthop.com/2/6887163_de85c427... | 2850 | 241 W 13 Street |
100007 | 1.0 | 1 | 28d9ad350afeaab8027513a3e52ac8d5 | 2016-04-18 02:22:02 | Building Amenities - Garage - Garden - fitness... | East 49th Street | [Hardwood Floors, No Fee] | low | 40.7539 | 6888711 | -73.9677 | 1067e078446a7897d2da493d2f741316 | [https://photos.renthop.com/2/6888711_6e660cee... | 3275 | 333 East 49th Street |
100013 | 1.0 | 4 | 0 | 2016-04-28 01:32:41 | Beautifully renovated 3 bedroom flex 4 bedroom... | West 143rd Street | [Pre-War] | low | 40.8241 | 6934781 | -73.9493 | 98e13ad4b495b9613cef886d79a6291f | [https://photos.renthop.com/2/6934781_1fa4b41a... | 3350 | 500 West 143rd Street |
这个数据集含有数值型特征、类别性特征、日期特征、文本串特征、图片特征。把测试集也加载进来,查看数据集和测试集的数量。
test_df = pd.read_json("test.json")
print("训练集样本数量:", train_df.shape[0])
print("测试集样本数量:", test_df.shape[0])
训练集样本数量: 49352 测试集样本数量: 74659
train_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 49352 entries, 10 to 99994 Data columns (total 15 columns): bathrooms 49352 non-null float64 bedrooms 49352 non-null int64 building_id 49352 non-null object created 49352 non-null object description 49352 non-null object display_address 49352 non-null object features 49352 non-null object interest_level 49352 non-null object latitude 49352 non-null float64 listing_id 49352 non-null int64 longitude 49352 non-null float64 manager_id 49352 non-null object photos 49352 non-null object price 49352 non-null int64 street_address 49352 non-null object dtypes: float64(3), int64(3), object(9) memory usage: 6.0+ MB
train_df.describe()
bathrooms | bedrooms | latitude | listing_id | longitude | price | |
---|---|---|---|---|---|---|
count | 49352.00000 | 49352.000000 | 49352.000000 | 4.935200e+04 | 49352.000000 | 4.935200e+04 |
mean | 1.21218 | 1.541640 | 40.741545 | 7.024055e+06 | -73.955716 | 3.830174e+03 |
std | 0.50142 | 1.115018 | 0.638535 | 1.262746e+05 | 1.177912 | 2.206687e+04 |
min | 0.00000 | 0.000000 | 0.000000 | 6.811957e+06 | -118.271000 | 4.300000e+01 |
25% | 1.00000 | 1.000000 | 40.728300 | 6.915888e+06 | -73.991700 | 2.500000e+03 |
50% | 1.00000 | 1.000000 | 40.751800 | 7.021070e+06 | -73.977900 | 3.150000e+03 |
75% | 1.00000 | 2.000000 | 40.774300 | 7.128733e+06 | -73.954800 | 4.100000e+03 |
max | 10.00000 | 8.000000 | 44.883500 | 7.753784e+06 | 0.000000 | 4.490000e+06 |
查看是否有"miss values"
train_df.isnull().sum()
bathrooms 0 bedrooms 0 building_id 0 created 0 description 0 display_address 0 features 0 interest_level 0 latitude 0 listing_id 0 longitude 0 manager_id 0 photos 0 price 0 street_address 0 dtype: int64
在深入探究数据特征之前,我们先看一下标签数据:“interest level”(感兴趣程度)
#int_level = train_df['interest_level'].value_counts()
plt.figure(figsize=(8,4))
#sns.barplot(int_level.index, int_level.values, alpha=0.8, color=color[1])
order = ['low', 'medium', 'high']
sns.countplot(train_df.interest_level, order=order)
plt.ylabel('Number of Occurrences', fontsize=12) #出现的频次
plt.xlabel('Interest level', fontsize=12)
plt.show()
#cnt_btr = train_df['bathrooms'].value_counts()
plt.figure(figsize=(8,4))
# sns.barplot(cnt_btr.index, cnt_btr.values, alpha=0.8, color=color[0])
sns.countplot(train_df.bathrooms);
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('bathrooms', fontsize=12)
plt.show()
查看bathroom与标签之间的关系,不同的interest_level下的bathrooms散点图:stripplot散点图有时会重叠,所以打点时有某种随机的jitter
order = ['low', 'medium', 'high']
sns.stripplot(train_df.interest_level, train_df.bathrooms, jitter=True, order=order)
<matplotlib.axes._subplots.AxesSubplot at 0x2899f91aeb8>
这里有一个浴室数量为10的house,我们视为离群点(outlier),去掉这个离群点(outlier),然后重新画图。
ulimit = np.percentile(train_df.bathrooms.values, 99.5)
ulimit = np.ceil(ulimit)
train_df.bathrooms.loc[train_df.bathrooms > ulimit] = ulimit
#cnt_btr = train_df['bathrooms'].value_counts()
plt.figure(figsize=(8,4))
# sns.barplot(cnt_btr.index, cnt_btr.values, alpha=0.8, color=color[0])
sns.countplot(train_df.bathrooms);
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('bathrooms', fontsize=12)
plt.show()
order = ['low', 'medium', 'high']
sns.stripplot(train_df.interest_level, train_df.bathrooms, jitter=True, order=order)
<matplotlib.axes._subplots.AxesSubplot at 0x2899f94e400>
#cnt_btr = train_df['bathrooms'].value_counts()
plt.figure(figsize=(8,4))
# sns.barplot(cnt_btr.index, cnt_btr.values, alpha=0.8, color=color[0])
sns.countplot(train_df.bathrooms, hue="interest_level", data = train_df)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('bathrooms', fontsize=12)
plt.show()
没有bathroom的房子只有low interest
#cnt_btr = train_df['bathrooms'].value_counts()
plt.figure(figsize=(8,4))
# sns.barplot(cnt_btr.index, cnt_btr.values, alpha=0.8, color=color[0])
sns.countplot(train_df.bedrooms)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('bedrooms', fontsize=12)
plt.show()
查看bedroom这个特征与标签数据的联系,stipplot是分布散点图
order = ['low', 'medium', 'high']
sns.stripplot(train_df.interest_level, train_df.bedrooms, jitter=True, order=order)
<matplotlib.axes._subplots.AxesSubplot at 0x2899fbf3358>
order = ['low', 'medium', 'high']
sns.countplot(x="bedrooms", hue="interest_level", data = train_df)
<matplotlib.axes._subplots.AxesSubplot at 0x2899fcd39e8>
plt.figure(figsize=(8,6))
#plt.scatter(range(train_df.shape[0]), np.sort(train_df.price.values))
plt.scatter(range(train_df.shape[0]), train_df.price.values, color = color[6])
plt.xlabel('the number of train data', fontsize=12)
plt.ylabel('price', fontsize=12)
plt.show()
由散点图看出,在这个特征上,有一些离群点,移除掉它们,重新画图。
ulimit = np.percentile(train_df.price.values, 99.5)
#print(ulimit)
train_df['price'].loc[train_df['price'] > ulimit] = ulimit
plt.figure(figsize=(8,6))
sns.distplot(train_df.price.values, bins=50, kde=True)
plt.xlabel('price', fontsize=12)
plt.show()
C:\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6521: MatplotlibDeprecationWarning: The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead. alternative="'density'", removal="3.1")
这个分布向右倾斜的厉害,我们可以使用 numpy.log
函数使其变的近似正态分布。
train_df['price'].loc[train_df['price'] > ulimit] = ulimit
plt.figure(figsize=(8,6))
sns.distplot(np.log(train_df.price.values), bins=50, kde=True)
plt.xlabel('price', fontsize=12)
plt.show()
C:\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6521: MatplotlibDeprecationWarning: The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead. alternative="'density'", removal="3.1")
order = ['low', 'medium', 'high']
sns.stripplot(train_df.interest_level, train_df.price.values, jitter=True, order=order)
plt.title("Price VS Interest Level")
plt.show()
low interest的price看起来分布比较均匀,中度(medium)感兴趣的价格分布更窄,high interest level的 price分布最窄,基本分布在 1500~ 8000 之间
violinplot 提供不同类别条件下特征更多的分部信息 核密度估计(KDE) 三个4分位数(quartile):1/4, 1/2, 3/4 1.5倍四分数间距(nterquartile range, IQR) IQR :第三四分位数和第一分位数的区别(即Q1~Q3的差距),表示变量的分散情况,播放差更稳健的统计量
order = ['low', 'medium', 'high']
sns.violinplot(x="interest_level", y = 'price', data = train_df, order = order)
plt.xlabel("interest level", fontsize = 12)
plt.ylabel('price', fontsize = 12)
plt.show()
经度和维度是虽是数值型变量,但其物理含义是房屋的地理位置。
llimit = np.percentile(train_df.latitude.values, 1)
ulimit = np.percentile(train_df.latitude.values, 99)
train_df['latitude'].loc[train_df['latitude'] < llimit] = llimit
train_df['latitude'].loc[train_df['latitude'] > ulimit] = ulimit
plt.figure(figsize=(8,6))
sns.distplot(train_df.latitude.values, bins=50, kde=True)
plt.xlabel('latitude', fontsize=12)
plt.show()
C:\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6521: MatplotlibDeprecationWarning: The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead. alternative="'density'", removal="3.1")
大部分地方纬度都在40.60~40.90之间
llimit = np.percentile(train_df.longitude.values, 1)
ulimit = np.percentile(train_df.longitude.values, 99)
train_df['longitude'].loc[train_df['longitude'] < llimit] = llimit
train_df['longitude'].loc[train_df['longitude'] > ulimit] = ulimit
plt.figure(figsize=(8, 6))
sns.distplot(train_df.longitude.values, bins=50, kde=True)
plt.xlabel('longitude', fontsize=12)
plt.show()
C:\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6521: MatplotlibDeprecationWarning: The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead. alternative="'density'", removal="3.1")
地方经度都分布在-73.850~74.025之间,因此这个数据是跟纽约城相关的
from mpl_toolkits.basemap import Basemap
from matplotlib import cm
west, south, east, north = -74.025, 40.60, -73.850, 40.86
fig = plt.figure(figsize=(18,15))
ax = fig.add_subplot(111)
m = Basemap(projection='merc', llcrnrlat=south, urcrnrlat=north,
llcrnrlon=west, urcrnrlon=east, lat_ts=south, resolution='i')
x, y = m(train_df['longitude'].values, train_df['latitude'].values)
m.hexbin(x, y, gridsize=400,
bins='log', cmap=cm.YlOrRd_r);
sns.lmplot(x = "longitude" , y = "latitude" , fit_reg = False , hue = 'interest_level',
hue_order = ['low', 'medium', 'high'] , size = 9, scatter_kws = {'alpha':0.4,'s':30},
data = train_df[(train_df.longitude > train_df.longitude.quantile(0.005))
&(train_df.longitude < train_df.longitude.quantile(0.995))
&(train_df.latitude > train_df.latitude.quantile(0.005))
&(train_df.latitude < train_df.latitude.quantile(0.995))]
)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
Text(27.965277777777786, 0.5, 'Latitude')
上述显示去掉了经度和纬度偏大或偏小的数据点。可以看出higt interet的房屋在一小段很集中。
还有一种作图,我就不列出来了,需要安装工具包:
import gpxpy as gpx import gpxpy.gpx
gpx = gpxpy.gpx.GPX()
for index, row in train.iterrows():
#print (row['latitude'], row['longitude'])
if row['interest_level'] == 'high': #opting for all nominals results in poor performance of Google Earth gps_waypoint = gpxpy.gpx.GPXWaypoint(row['latitude'],row['longitude'],elevation=10) gpx.waypoints.append(gps_waypoint)
filename = "GoogleEarth.gpx" FILE = open(filename,"w") FILE.writelines(gpx.to_xml()) FILE.close()
cnt_srs = train_df.groupby('display_address')['display_address'].count()
for i in [2, 10, 50, 100, 500]:
print('Display_address that appear less than {} times: {}%'.format(i, round((cnt_srs < i).mean() * 100, 2)))
plt.figure(figsize=(12, 6))
plt.hist(cnt_srs.values, bins=100, log=True, alpha=0.9)
plt.xlabel('Number of times display_address appeared', fontsize=12)
plt.ylabel('log(Count)', fontsize=12)
plt.show()
Display_address that appear less than 2 times: 63.22% Display_address that appear less than 10 times: 89.6% Display_address that appear less than 50 times: 97.73% Display_address that appear less than 100 times: 99.26% Display_address that appear less than 500 times: 100.0%
大部分display_address出现次数都少于100次,没有display_address出现次数超过500次的
让我们看看前10个display_address:
top10_addr = train_df.display_address.value_counts().nlargest(10).index.tolist()
fig = plt.figure(figsize=(8, 6))
ax = sns.countplot(x="display_address", hue="interest_level", data=train_df[train_df.display_address.isin(top10_addr)])
plt.xlabel('Display_address')
plt.ylabel('Number of advert occurences')
plt.tick_params(
axis='x', #变化应用于x轴
which='both', # major ticket和minor tickets都会受到影响
bottom='on', # 打开沿着底端边缘的tickets
top='off', # 关闭沿着顶端边缘的tickets
labelbottom='on') # 打开底端的label
plt.xticks(rotation='vertical')
### Adding percentitles over bars
height = [0 if np.isnan(p.get_height()) else p.get_height() for p in ax.patches]
ncol= int(len(height) / 3)
total = [height[i] + height[i + ncol] + height[i + 2 * ncol] for i in range(ncol)]
for i, p in enumerate(ax.patches):
ax.text(p.get_x() + p.get_width() / 2, height[i] + 5, '{:0.1%}'.format(height[i] /total[i % ncol]),
ha = "center")
C:\Anaconda3\lib\site-packages\matplotlib\cbook\__init__.py:424: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead. warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
train_df["created"] = pd.to_datetime(train_df["created"])
train_df["date_created"] = train_df["created"].dt.date
train_df["year_created"] = train_df["created"].dt.year
train_df["month_created"] = train_df["created"].dt.month
train_df['hour_created'] = train_df['created'].dt.hour
train_df['weekday_created'] = train_df['created'].dt.weekday
train_df['quarter_created'] = train_df['created'].dt.quarter
train_df['weekend_created'] = ((train_df['weekday_created'] == 5) & (train_df['weekday_created'] == 6))
cnt_srs = train_df['date_created'].value_counts().sort_index()
plt.figure(figsize=(20,4))
ax = sns.barplot(cnt_srs.index, cnt_srs.values, alpha=0.8)
ax.xaxis_date()
plt.xticks(rotation='vertical')
plt.show()
cnt_srs = train_df['date_created'].value_counts()
plt.figure(figsize=(12,4))
ax = plt.subplot(111)
ax.bar(cnt_srs.index, cnt_srs.values, alpha=0.8)
ax.xaxis_date()
plt.xticks(rotation='vertical')
plt.show()
注意:让我们看看测试集是否与训练集在同一个时间段
test_df["created"] = pd.to_datetime(test_df["created"])
test_df["date_created"] = test_df["created"].dt.date
cnt_srs = test_df['date_created'].value_counts()
plt.figure(figsize=(12,4))
ax = plt.subplot(111)
ax.bar(cnt_srs.index, cnt_srs.values, alpha=0.8)
ax.xaxis_date()
plt.xticks(rotation='vertical')
plt.show()
更细致地来看看是数据在以小时为单位的范围分布情况
train_df["hour_created"] = train_df["created"].dt.hour
cnt_srs = train_df['hour_created'].value_counts()
plt.figure(figsize=(12,6))
sns.barplot(cnt_srs.index, cnt_srs.values, alpha=0.8, color=color[2])
plt.xticks(rotation='vertical')
plt.show()
数据主要分布在一天中的头几个小时,或许是这时候网络流量比较少,数据更新正在进行。
hourDF = train_df.groupby(['hour_created', 'interest_level'])['hour_created'].count().unstack()
hourDF[['low', 'medium', 'high']].plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x289a02fd0b8>
order = ['low', 'medium', 'high']
sns.countplot(x="hour_created", hue="interest_level", data = train_df)
<matplotlib.axes._subplots.AxesSubplot at 0x289a04d9a58>
hourDF = train_df.groupby(['month_created', 'interest_level'])['month_created'].count().unstack()
hourDF[['low', 'medium', 'high']].plot(kind='bar', stacked=True)
<matplotlib.axes._subplots.AxesSubplot at 0x289a027de48>
top10_building_id = train_df.building_id.value_counts().nlargest(10).index.tolist()
fig = plt.figure(figsize=(8, 6))
ax = sns.countplot(x="building_id", hue="interest_level", data=train_df[train_df.building_id.isin(top10_building_id)])
plt.xlabel('building_id')
plt.ylabel('Number of advert occurences')
plt.tick_params(
axis='x', #变化应用于x轴
which='both', # major ticket和minor tickets都会受到影响
bottom='on', # 打开沿着底端边缘的tickets
top='off', # 关闭沿着顶端边缘的tickets
labelbottom='on') # 打开底端的label
plt.xticks(rotation='vertical')
### Adding percentitles over bars
height = [0 if np.isnan(p.get_height()) else p.get_height() for p in ax.patches]
ncol= int(len(height) / 3)
total = [height[i] + height[i + ncol] + height[i + 2 * ncol] for i in range(ncol)]
for i, p in enumerate(ax.patches):
ax.text(p.get_x() + p.get_width() / 2, height[i] + 5, '{:0.1%}'.format(height[i] /total[i % ncol]),
ha = "center")
C:\Anaconda3\lib\site-packages\matplotlib\cbook\__init__.py:424: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead. warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
top10_managers = train_df.manager_id.value_counts().nlargest(10).index.tolist()
fig = plt.figure(figsize=(8, 6))
ax = sns.countplot(x="manager_id", hue="interest_level", data=train_df[train_df.manager_id.isin(top10_managers)])
plt.xlabel('manager_id')
plt.ylabel('Number of advert occurences')
plt.tick_params(
axis='x', #变化应用于x轴
which='both', # major ticket和minor tickets都会受到影响
bottom='on', # 打开沿着底端边缘的tickets
top='off', # 关闭沿着顶端边缘的tickets
labelbottom='on') # 打开底端的label
plt.xticks(rotation='vertical')
### Adding percentitles over bars
height = [0 if np.isnan(p.get_height()) else p.get_height() for p in ax.patches]
ncol= int(len(height) / 3)
total = [height[i] + height[i + ncol] + height[i + 2 * ncol] for i in range(ncol)]
for i, p in enumerate(ax.patches):
ax.text(p.get_x() + p.get_width() / 2, height[i] + 5, '{:0.1%}'.format(height[i] /total[i % ncol]),
ha = "center")
C:\Anaconda3\lib\site-packages\matplotlib\cbook\__init__.py:424: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead. warn_deprecated("2.2", "Passing one of 'on', 'true', 'off', 'false' as a "
sns.distplot(train_df.listing_id.values, bins = 50, kde = True)
plt.xlabel('listing_id')
plt.show()
C:\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py:6521: MatplotlibDeprecationWarning: The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead. alternative="'density'", removal="3.1")
order = ['low', 'medium', 'high']
sns.stripplot(train_df.interest_level, train_df.listing_id, jitter=True, order=order)
plt.title("Listing_ID VS Interest Level")
plt.show()
order = ['low', 'medium', 'high']
sns.violinplot(x="interest_level", y = 'listing_id', data = train_df, order = order)
plt.xlabel("Interest Level", fontsize = 12)
plt.ylabel('Listing_ID', fontsize = 12)
plt.show()
图片数据非常大,我们首先来看一下数量特征
train_df["num_photos"] = train_df["photos"].apply(len)
cnt_srs = train_df['num_photos'].value_counts()
plt.figure(figsize=(12,6))
sns.barplot(cnt_srs.index, cnt_srs.values, alpha=0.8)
plt.xlabel('Number of Photos', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()
ulimit = np.percentile(train_df['num_photos'], 99)
train_df['num_photos'].loc[train_df['num_photos'] > ulimit] = ulimit
plt.figure(figsize=(12,6))
sns.violinplot(x="num_photos", y="interest_level", data=train_df, order =['low','medium','high'])
plt.xlabel('Number of Photos', fontsize=12)
plt.ylabel('Interest Level', fontsize=12)
plt.show()
看看特征的数量和它的分布
train_df["num_features"] = train_df["features"].apply(len)
cnt_srs = train_df['num_features'].value_counts()
plt.figure(figsize=(12,6))
sns.barplot(cnt_srs.index, cnt_srs.values, alpha=0.8)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Number of features', fontsize=12)
plt.show()
ulimit = np.percentile(train_df['num_features'], 99)
train_df['num_features'].loc[train_df['num_features'] > ulimit] = ulimit
plt.figure(figsize=(12,10))
sns.violinplot(y="num_features", x="interest_level", data=train_df, order =['low','medium','high'])
plt.xlabel('Interest Level', fontsize=12)
plt.ylabel('Number of features', fontsize=12)
plt.show()
train_df['num_description_words'] = train_df['description'].apply(lambda x: len(x.strip().split(" ")))
train_df['len_description'] = train_df['description'].apply(len)
#print(train_df['len_description'].head(10))
#print(train_df['num_description_words'].head(10))
#print(train_df['description'].iloc[0])
#print(len(train_df['description'].iloc[0]))
fig = plt.figure()
order = ['low', 'medium', 'high']
sns.stripplot(train_df['interest_level'], train_df['len_description'], jitter = True, order = order)
plt.title('Length of description VS Interest_level')
plt.show()
plt.figure()
sns.violinplot(x="len_description", y="interest_level", data = train_df, order = order)
plt.xlabel('Length of description')
plt.ylabel('Interest Level')
plt.show()
plt.figure(figsize=(400,10))
ax = sns.countplot(train_df.len_description)
plt.xticks(rotation='vertical')
plt.xlabel('Length of description')
plt.ylabel('Number of occurrences')
plt.show()
plt.figure(figsize=(8,5))
order = ['low', 'medium', 'high']
sns.stripplot(train_df['interest_level'], train_df['num_description_words'], jitter = True, order = order)
plt.title('Num description words VS Interest_level')
#plt.xticks(rotation='vertical')
plt.xlabel('Number of words of description')
plt.ylabel('Number of occurrences')
plt.show()
plt.figure()
sns.violinplot(x="num_description_words", y="interest_level", data = train_df, order = order)
plt.xlabel('Number of words of description')
plt.ylabel('Interest Level')
plt.show()
plt.figure(figsize=(50,10))
ulimit = np.percentile(train_df.num_description_words.values, 99)
llimit = np.percentile(train_df.num_description_words.values, 1)
train_df.num_description_words.loc[train_df.num_description_words > ulimit] = ulimit
train_df.num_description_words.loc[train_df.num_description_words < llimit] = llimit
ax = sns.countplot(train_df.num_description_words)
plt.xticks(rotation='vertical')
plt.xlabel('Number of words of description')
plt.ylabel('Number of occurrences')
plt.show()
from wordcloud import WordCloud
text = ''
text_da = ''
text_street = ''
#i = 0;
for ind, row in train_df.iterrows():
#if(0 == i % 1000):
# print(i)
for feature in row['features']:
text = " ".join([text, "_".join(feature.strip().split(" "))])
text_da = " ".join([text_da, "_".join(row['display_address'].strip().split(" "))])
text_street = " ".join([text_street, "_".join(row['street_address'].strip().split(" "))])
i = i + 1;
text = text.strip()
text_da = text_da.strip()
text_street = text_street.strip()
plt.figure(figsize=(12, 6))
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40)
wordcloud.generate(text)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title('WordCloud for fatures', fontsize=30)
plt.axis('off')
plt.show()
允许养猫和允许养狗,其实可以合并成允许养宠物!
plt.figure()
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40)
wordcloud.generate(text_da)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("WordCloud for Display Address", fontsize=30)
plt.axis("off")
plt.show()
都是纽约比较繁华的街道!
# wordcloud for street address
plt.figure()
wordcloud = WordCloud(background_color='white', width=600, height=300, max_font_size=50, max_words=40)
wordcloud.generate(text_street)
wordcloud.recolor(random_state=0)
plt.imshow(wordcloud)
plt.title("Wordcloud for street address", fontsize=30)
plt.axis("off")
plt.show()
contFeaturelist = []
contFeaturelist.append('bathrooms')
contFeaturelist.append('bedrooms')
contFeaturelist.append('price')
print(contFeaturelist)
['bathrooms', 'bedrooms', 'price']
correlationMatrix = train_df[contFeaturelist].corr().abs()
plt.subplots()
sns.heatmap(correlationMatrix, annot=True)
#Mask unimportant features
sns.heatmap(correlationMatrix, mask=correlationMatrix < 1, cbar = False)
plt.show()
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from nltk.metrics import distance as distance
from sklearn.model_selection import StratifiedKFold
from MeanEncoder import MeanEncoder
dpath = './'
train = pd.read_json(dpath + 'train.json')
test = pd.read_json(dpath + 'test.json')
train.head().T
10 | 10000 | 100004 | 100007 | 100013 | |
---|---|---|---|---|---|
bathrooms | 1.5 | 1 | 1 | 1 | 1 |
bedrooms | 3 | 2 | 1 | 1 | 4 |
building_id | 53a5b119ba8f7b61d4e010512e0dfc85 | c5c8a357cba207596b04d1afd1e4f130 | c3ba40552e2120b0acfc3cb5730bb2aa | 28d9ad350afeaab8027513a3e52ac8d5 | 0 |
created | 2016-06-24 07:54:24 | 2016-06-12 12:19:27 | 2016-04-17 03:26:41 | 2016-04-18 02:22:02 | 2016-04-28 01:32:41 |
description | A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ... | Top Top West Village location, beautiful Pre-w... | Building Amenities - Garage - Garden - fitness... | Beautifully renovated 3 bedroom flex 4 bedroom... | |
display_address | Metropolitan Avenue | Columbus Avenue | W 13 Street | East 49th Street | West 143rd Street |
features | [] | [Doorman, Elevator, Fitness Center, Cats Allow... | [Laundry In Building, Dishwasher, Hardwood Flo... | [Hardwood Floors, No Fee] | [Pre-War] |
interest_level | medium | low | high | low | low |
latitude | 40.7145 | 40.7947 | 40.7388 | 40.7539 | 40.8241 |
listing_id | 7211212 | 7150865 | 6887163 | 6888711 | 6934781 |
longitude | -73.9425 | -73.9667 | -74.0018 | -73.9677 | -73.9493 |
manager_id | 5ba989232d0489da1b5f2c45f6688adc | 7533621a882f71e25173b27e3139d83d | d9039c43983f6e564b1482b273bd7b01 | 1067e078446a7897d2da493d2f741316 | 98e13ad4b495b9613cef886d79a6291f |
photos | [https://photos.renthop.com/2/7211212_1ed4542e... | [https://photos.renthop.com/2/7150865_be3306c5... | [https://photos.renthop.com/2/6887163_de85c427... | [https://photos.renthop.com/2/6888711_6e660cee... | [https://photos.renthop.com/2/6934781_1fa4b41a... |
price | 3000 | 5465 | 2850 | 3275 | 3350 |
street_address | 792 Metropolitan Avenue | 808 Columbus Avenue | 241 W 13 Street | 333 East 49th Street | 500 West 143rd Street |
从类别型的标签interest_level编码为数字
从前面的分析和常识来看,listing_id对确定interest_level没有用,去掉特征编码对训练集和测试集都要做,所以干脆将二者连起来一起处理
y_map = {'low' : 2 , 'medium' : 1 , 'high' : 0}
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])
#y_train = train.interest_level.values
y_train = train.interest_level
train = train.drop(['listing_id' , 'interest_level'] , axis = 1)
listing_id = test.listing_id.values
test = test.drop('listing_id' , axis = 1)
ntrain = train.shape[0]
# concat函数是在pandas底下的方法,可以将数据根据不同的轴作简单的融合
train_test = pd.concat((train , test) , axis = 0).reset_index(drop = True)
print(y_train.head())
10 1 10000 2 100004 0 100007 2 100013 2 Name: interest_level, dtype: int64
数值型特征,+ / - / * / / ,特征的单调变换对XGBoost不必要
# remove some noise
ulimit = np.percentile(train_test.price.values , 99.99)
print(ulimit)
125420.51099969965
train_test['price'].loc[train_test['price'] > ulimit] = ulimit
# remove some noise
ulimit = np.percentile(train_test.bathrooms.values , 99.99)
print(ulimit)
outliers = train_test["bathrooms"].loc[train_test["bathrooms"] > ulimit]
print(outliers)
6.0 1990 10.0 17047 6.5 42573 7.0 69023 112.0 72329 20.0 97123 7.5 113071 20.0 121848 6.5 123666 6.5 Name: bathrooms, dtype: float64
#pandas.loc 选取指定列进行操作
#df.loc[行标签,列标签]df.loc['a':'b']#选取ab两行数据df.loc[:,'one']#选取one列的数据
#下面的代码是分别将bathrooms列中,值为112,10,20的值分别置换成1.5,1,2
train_test.loc[train_test["bathrooms"] == 112 , "bathrooms"] = 1.5
train_test.loc[train_test["bathrooms"] == 10 , "bathrooms"] = 1
train_test.loc[train_test["bathrooms"] == 20 , "bathrooms"] = 2
ulimit = np.percentile(train_test.bathrooms.values , 99.99)
print(ulimit)
outliers = train_test["bathrooms"].loc[train_test["bathrooms"] > ulimit]
print(outliers)
6.0 17047 6.5 42573 7.0 97123 7.5 121848 6.5 123666 6.5 Name: bathrooms, dtype: float64
构造新特征
train_test['price_bathrooms'] = (train_test["price"]) / (train_test["bathrooms"] + 1.0)
train_test['price_bedrooms'] = (train_test["price"] / (train_test["bedrooms"]) + 1.0)
构造新特征
train_test["room_diff"] = train_test["bathrooms"] - train_test["bedrooms"]
train_test["room_num"] = train_test["bedrooms"] + train_test["bathrooms"]
plt.figure(figsize=(8,4))
sns.countplot(train_test.bathrooms);
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('bathrooms', fontsize=12)
plt.show()
train_test['Date'] = pd.to_datetime(train_test['created'])
train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour
train_test = train_test.drop(['Date', 'created'], axis=1)
# count of words present in description colum
train_test["num_description_words"] = train_test["description"].apply(lambda x: len(x.split(" ")))
train_test = train_test.drop(['description'] , axis = 1)
将manager分为几个等级 top 1%, 2%, 5, 10, 15, 20, 25, 30, 50
def getTopXLimit(X, counted_feature_data):
'''
X: the top number
counted_feature_data: counted single feature data
functionality: return the top Xth limit of relevant feature
'''
top_limit = np.percentile(counted_feature_data.values, 100 - X)
return top_limit
def getTopX(X, counted_feature_data):
'''
X: top number
feature_data : counted feature data
'''
return counted_feature_data[counted_feature_data.values >= getTopXLimit(X, counted_feature_data)]
def storeTopX(X : int, source : dict, feature_name : str, to_data : dict, to_feature : str, isReturn: bool = False):
'''
X: top number
source: source of data
feature_name: name of feature
to_data: <class 'dict'>,
to_feature: name of stored feature
return: to_data['top_X_' + to_feature]
'''
feature_counts = source[feature_name].value_counts();
to_data[('top_{}_' + to_feature).format(X)] = source[feature_name].apply(lambda x: 1 if x in feature_counts.index.values[feature_counts.values >= getTopXLimit(X, feature_counts)] else 0)
if isReturn:
return to_data[('top_{}_' + to_feature).format(X)]
else:
return None
print(getTopX(1, train_test['manager_id'].value_counts()))
tops = [1, 2, 5, 10, 15, 20, 30, 50]
for i in tops:
storeTopX(i,train_test, 'manager_id', train_test, 'manager_id', False)
print("Stored top {} of manager id relevant to the number of rental inqueries".format(i))
print(train_test['top_1_manager_id'].head())
e6472c7237327dd3903b3d6f6a94515a 6387 6e5c10246156ae5bdcd9b487ca99d96a 1683 8f5a9c893f6d602f4953fcc0b8e6e9b4 1011 cb87dadbca78fad02b388dc9e8f25a5b 971 62b685cc0d876c3a1a51d63a0d6a8082 956 9df32cb8dda19d3222d66e69e258616b 862 2aa9bfa5f67ed9997ea341dee8a3a271 853 b7de4cb395920136663132057fa89d84 849 ad3d8ddc52c7e0859b5c6c7f7949c3bd 732 c9c33695ee2a2f818e9f1d8f7d1c4b39 731 1fb46c4a72bcf764ac35fc23f394760d 719 5599e962719af3ccc2976855c2d5893c 713 d2bce61e0e0079ebdc8c281e415e045b 677 aa9e353a6b43b125cbc89cb751090a9e 643 62826f3ae01f2ddc93b9cd28c659ab2b 585 dbbb6b990661b1e507a387f019bcb1a0 552 b209e2c4384a64cc307c26759ee0c651 538 612a00076aefe8c98d1df4835640c74b 523 8262449f40e9117f7a9ea49b4a333993 522 d1737922fe92ccb0dc37ba85589e6415 521 fc81b75568d3655a922523cb0d77ea5b 505 8b53ccf4338806ab1be3dd0267711649 482 1067e078446a7897d2da493d2f741316 470 eb5a6c4c2eb6f3e2a085c477afd5a815 441 d399821d8583b8421a55370c6eb15f6f 430 381040570cc863b743a9fce1a6ae9724 426 501b52559fd21b32808030a0f4fb1a26 425 699c325b818541f314b691b76f3238d7 404 c71cf1f472cf9b4b4517ea23fb6f2c91 391 781c05e464ce94eaa436401d07f2a6cc 383 02e17b21a1814fb10336b2ee8ceb3e79 382 30a7951cfc21240e8c46b2d079d292e5 378 9f39caedae295bf81e67463e6fd0af40 362 85860e99ade732afc385111568f8e602 362 537e06890f6a86dbb70c187db5be4d55 361 9d32b720e26a351b951c8f78f72f2fec 357 a9171b0aec2df7c31f96da798343d7ea 353 4bdc3d8c1aaa90d997ce2cb77680679b 351 2a4e0d2668fde3afa267d312b3a2a9d8 346 4f9b376a972b863bbfbdcdc8d717fa4e 344 2b6d9435a0e6c4019e5ffcb53389c3dd 336 efd6fb6d537509ab0fc98661e76d19b2 327 2ec6c71d78dd9df9046ce6899ec42de8 320 d9039c43983f6e564b1482b273bd7b01 314 Name: manager_id, dtype: int64 Stored top 1 of manager id relevant to the number of rental inqueries Stored top 2 of manager id relevant to the number of rental inqueries Stored top 5 of manager id relevant to the number of rental inqueries Stored top 10 of manager id relevant to the number of rental inqueries Stored top 15 of manager id relevant to the number of rental inqueries Stored top 20 of manager id relevant to the number of rental inqueries Stored top 30 of manager id relevant to the number of rental inqueries Stored top 50 of manager id relevant to the number of rental inqueries 0 0 1 0 2 1 3 1 4 0 Name: top_1_manager_id, dtype: int64
类似manager_id处理
print(getTopX(1, train_test['building_id'].value_counts()))
tops = [1, 2, 5, 10, 15, 20, 30, 50]
for i in tops:
storeTopX(i,train_test, 'building_id', train_test, 'building_id', False)
print("Stored top {} of building id relevant to the number of rental inqueries".format(i))
print(train_test.head(10))
0 20664 96274288c84ddd7d5c5d8e425ee75027 705 11e1dec9d14b1a9e528386a2504b3afc 546 bb8658a3e432fb62a440615333376345 522 80a120d6bc3aba97f40fee8c2204524b 510 ce6d18bf3238e668b2bf23f4110b7b67 459 f68bf347f99df026f4faad43cc604048 457 c94301249b8c09429d329864d58e5b82 410 ea9045106c4e1fe52853b6af941f1c69 397 128d4af0683efc5e1eded8dc8044d5e3 385 57ef86c28a8ae482dc3a3c3af28e8e48 373 8e3b8c607c3edcf3de131c24f0390179 367 9c18bf871b97492b96d8ddb800591f1b 362 cb14c4f807f23ecee1f7469b5159d2de 356 d0234abbc01a982d54e8d446acc03405 349 dd7b281c0dbafc8b53575ab78cca75d4 319 aed67db466854ec2aa5d249a032bc401 318 5565db9b7cba3603834c4aa6f2950960 314 18f6eb16d2f3e9885cb4a5d0a40791c6 313 093f64f52a6e43ba5e8f12bec8200554 308 1be9c496f7bdff4a552e3d4e91f915f8 308 7967a1280bf3f7644500fc79d2696b0e 306 ec12b4db154a9c75983f5e32edd93843 303 a01c99eb2cfdde327e1691e17d6696ba 295 6ce872b483cfcbb32ea805604d44ef5f 293 300d27d8ba2adbcbc8c6f2bcbc1c6f9d 291 80911ce8a425daf4989ea8a4bccc41a7 286 93cd24891f8423d45ac587ab1fdb1225 283 c40e3e74475a91aae1928541be8df678 283 a433bb20f3f94e3f3e6700286e351638 276 ... 8dbdafdf2f5f3ac31ecd0c41c6858669 137 20623ec2c6e1962d917680bfd1535d8a 137 96835a48bbe776c37aec3dc0f3df8887 136 62772db889c5fd564f47eb5d46e64e4e 132 c3ba40552e2120b0acfc3cb5730bb2aa 131 f45ba6d22add6d2c30bb2256dd715901 131 0d01cabe55fa5192cdbcabd5c585c1ea 130 e3f53dfe045875f250622856c0923deb 130 ea11299b288bdb7e740fc2dccfc3b140 130 297160caf8a88c10718b492d4672c6b4 129 a07d25f014240f198ace788470c972ce 129 83ec3cf6c56a458b9f0cc455fee2d000 129 205f95d4a78f1f3befda48b89edc9669 128 2a21319016fe50100e0b8ebb5a4f9cf0 128 d6c345de3f342d611ea0dd21eabe68e2 128 a1bd0631f4febfb498e34b7cf68bcfb1 128 6e01d34a60782126c24526cf72878117 127 665f4f6eb9a097fb213f99950b835b9a 126 c0ae658e2ffa1d5bfbc2c2625b6d766d 126 2094beb4097c7f56670afba5b01b31a4 125 d19f4e0bc32610797a7ff5f1737be546 124 8994778bc2a2ca65aa25030c2c0ec9de 124 35784800ca930a77ea64d2e67f001b33 123 af79d0d2ed3d31362185fed210e97cb3 123 67da189c736c7a3869ba9c98425ea254 122 67c9b420da4a365bc26a6cd0ef4a5320 119 57efa26ca6f2a3a6120e0f2e3972c39b 119 14ad2afd9e6a05d708e2982bfd88993b 119 bd93993052370b66755cb4af017950fc 118 d89f514c3ed0abaae52cba7017ac0701 118 Name: building_id, Length: 117, dtype: int64 Stored top 1 of building id relevant to the number of rental inqueries Stored top 2 of building id relevant to the number of rental inqueries Stored top 5 of building id relevant to the number of rental inqueries Stored top 10 of building id relevant to the number of rental inqueries Stored top 15 of building id relevant to the number of rental inqueries Stored top 20 of building id relevant to the number of rental inqueries Stored top 30 of building id relevant to the number of rental inqueries Stored top 50 of building id relevant to the number of rental inqueries bathrooms bedrooms building_id display_address \ 0 1.5 3 53a5b119ba8f7b61d4e010512e0dfc85 Metropolitan Avenue 1 1.0 2 c5c8a357cba207596b04d1afd1e4f130 Columbus Avenue 2 1.0 1 c3ba40552e2120b0acfc3cb5730bb2aa W 13 Street 3 1.0 1 28d9ad350afeaab8027513a3e52ac8d5 East 49th Street 4 1.0 4 0 West 143rd Street 5 2.0 4 38a913e46c94a7f46ddf19b756a9640c West 18th Street 6 1.0 2 3ba49a93260ca5df92fde024cb4ca61f West 107th Street 7 2.0 1 0372927bcb6a0949613ef5bf893bbac7 West 21st Street 8 1.0 1 a7efbeb58190aa267b4a9121cd0c88c0 Hamilton Terrace 9 2.0 4 0 522 E 11th features latitude longitude \ 0 [] 40.7145 -73.9425 1 [Doorman, Elevator, Fitness Center, Cats Allow... 40.7947 -73.9667 2 [Laundry In Building, Dishwasher, Hardwood Flo... 40.7388 -74.0018 3 [Hardwood Floors, No Fee] 40.7539 -73.9677 4 [Pre-War] 40.8241 -73.9493 5 [] 40.7429 -74.0028 6 [prewar, elevator, Dogs Allowed, Cats Allowed,... 40.8012 -73.9660 7 [Doorman, Elevator, Pre-War, Terrace, Laundry ... 40.7427 -73.9957 8 [Cats Allowed, Dogs Allowed, Elevator, Laundry... 40.8234 -73.9457 9 [Dishwasher, Hardwood Floors] 40.7278 -73.9808 manager_id \ 0 5ba989232d0489da1b5f2c45f6688adc 1 7533621a882f71e25173b27e3139d83d 2 d9039c43983f6e564b1482b273bd7b01 3 1067e078446a7897d2da493d2f741316 4 98e13ad4b495b9613cef886d79a6291f 5 b209e2c4384a64cc307c26759ee0c651 6 01287194f20de51872e81f660def4784 7 e6472c7237327dd3903b3d6f6a94515a 8 c1a6598437b7db560cde66e5a297a53f 9 23a01ea7717b38875f5b070282d1b9d2 photos price \ 0 [https://photos.renthop.com/2/7211212_1ed4542e... 3000.0 1 [https://photos.renthop.com/2/7150865_be3306c5... 5465.0 2 [https://photos.renthop.com/2/6887163_de85c427... 2850.0 3 [https://photos.renthop.com/2/6888711_6e660cee... 3275.0 4 [https://photos.renthop.com/2/6934781_1fa4b41a... 3350.0 5 [https://photos.renthop.com/2/6894514_9abb8592... 7995.0 6 [https://photos.renthop.com/2/6930771_7e3622b6... 3600.0 7 [https://photos.renthop.com/2/6867392_b18283f6... 5645.0 8 [https://photos.renthop.com/2/6898799_3759be4c... 1725.0 9 [https://photos.renthop.com/2/6814332_e19a8552... 5800.0 ... top_30_manager_id top_50_manager_id top_1_building_id \ 0 ... 1 1 0 1 ... 1 1 0 2 ... 1 1 1 3 ... 1 1 1 4 ... 1 1 1 5 ... 1 1 0 6 ... 1 1 0 7 ... 1 1 0 8 ... 1 1 0 9 ... 1 1 1 top_2_building_id top_5_building_id top_10_building_id \ 0 0 0 0 1 0 1 1 2 1 1 1 3 1 1 1 4 1 1 1 5 0 0 1 6 0 1 1 7 1 1 1 8 0 1 1 9 1 1 1 top_15_building_id top_20_building_id top_30_building_id \ 0 0 0 1 1 1 1 1 2 1 1 1 3 1 1 1 4 1 1 1 5 1 1 1 6 1 1 1 7 1 1 1 8 1 1 1 9 1 1 1 top_50_building_id 0 1 1 1 2 1 3 1 4 1 5 1 6 1 7 1 8 1 9 1 [10 rows x 38 columns]
train_test['photos_count'] = train_test['photos'].apply(lambda x: len(x))
train_test.drop(['photos'] , axis = 1 , inplace = True)
print(train_test['photos_count'].head(10))
0 5 1 11 2 8 3 3 4 3 5 5 6 10 7 5 8 5 9 9 Name: photos_count, dtype: int64
聚类降维编码(#用训练数据训练,对训练数据和测试数据都做变换)到中心的距离(论坛上讨论到曼哈顿中心的距离更好)
# Clustering
ntrain = train.shape[0]
train_location = train_test.loc[:ntrain - 1, ['latitude', 'longitude']]
test_location = train_test.loc[ntrain:, ['latitude', 'longitude']]
kmeans_cluster = KMeans(n_clusters=20)
res = kmeans_cluster.fit(train_location)
res = kmeans_cluster.predict( pd.concat((train_location, test_location), axis=0).reset_index(drop=True))
train_test['cenroid'] = res
# L1 distance
center = [ train_location['latitude'].mean(), train_location['longitude'].mean()]
train_test['distance'] = abs(train_test['latitude'] - center[0]) + abs(train_test['longitude'] - center[1])
print(train_test['distance'].head())
print(train_test['cenroid'].head())
0 0.040260 1 0.064140 2 0.048829 3 0.024340 4 0.088971 Name: distance, dtype: float64 0 15 1 13 2 19 3 4 4 11 Name: cenroid, dtype: int32
train_test['display_address'] = train_test['display_address'].apply(lambda x: x.lower().strip())
print(train_test['display_address'].head())
0 metropolitan avenue 1 columbus avenue 2 w 13 street 3 east 49th street 4 west 143rd street Name: display_address, dtype: object
train_test['street_address'] = train_test['street_address'].apply(lambda x: x.lower().strip())
print(train_test['street_address'].head())
0 792 metropolitan avenue 1 808 columbus avenue 2 241 w 13 street 3 333 east 49th street 4 500 west 143rd street Name: street_address, dtype: object
LableEncode
#categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
categoricals = ['building_id', 'manager_id', 'display_address', 'street_address']
print(train_test.loc[:5, categoricals])
for feat in categoricals:
lbl = LabelEncoder()
lbl.fit(list(train_test[feat].values))
train_test[feat] = lbl.transform(list(train_test[feat].values))
building_id manager_id \ 0 53a5b119ba8f7b61d4e010512e0dfc85 5ba989232d0489da1b5f2c45f6688adc 1 c5c8a357cba207596b04d1afd1e4f130 7533621a882f71e25173b27e3139d83d 2 c3ba40552e2120b0acfc3cb5730bb2aa d9039c43983f6e564b1482b273bd7b01 3 28d9ad350afeaab8027513a3e52ac8d5 1067e078446a7897d2da493d2f741316 4 0 98e13ad4b495b9613cef886d79a6291f 5 38a913e46c94a7f46ddf19b756a9640c b209e2c4384a64cc307c26759ee0c651 display_address street_address 0 metropolitan avenue 792 metropolitan avenue 1 columbus avenue 808 columbus avenue 2 w 13 street 241 w 13 street 3 east 49th street 333 east 49th street 4 west 143rd street 500 west 143rd street 5 west 18th street 350 west 18th street
train_test.loc[:5, categoricals]
building_id | manager_id | display_address | street_address | |
---|---|---|---|---|
0 | 3797 | 1568 | 10955 | 21630 |
1 | 8986 | 1988 | 7967 | 21801 |
2 | 8889 | 3733 | 12368 | 8941 |
3 | 1848 | 282 | 9596 | 13018 |
4 | 0 | 2618 | 13640 | 17671 |
5 | 2544 | 3081 | 13763 | 13968 |
定义高基数类别型特征编码函数 (manager_id, building_id, display_address,street_address ) 对这些特征进行均值编码(该特征值在每个类别的概率,即原来的一维特征变成了C-1维特征,C为标签类别数目)
from MeanEncoder import MeanEncoder
me = MeanEncoder(categoricals)
#trian
#import pdb
#pdb.set_trace()
train_new = train_test.iloc[:ntrain, :]
train_new_cat = me.fit_transform(train_new, y_train)
#test
test_new = train_test.iloc[ntrain:, :]
test_new_cat = me.transform(test_new)
C:\Users\ruito\Desktop\jupter\kaggle\Two Sigma Connect Rental Listing Inquiries\MeanEncoder.py:53: FutureWarning: using a dict on a Series for aggregation is deprecated and will be removed in a future version col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'}) C:\Users\ruito\Desktop\jupter\kaggle\Two Sigma Connect Rental Listing Inquiries\MeanEncoder.py:38: RuntimeWarning: overflow encountered in exp self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
print(train_new_cat.head(1).T)
0 bathrooms 1.5 bedrooms 3 building_id 3797 display_address 10955 features [] latitude 40.7145 longitude -73.9425 manager_id 1568 price 3000 street_address 21630 price_bathrooms 1200 price_bedrooms 1001 room_diff -1.5 room_num 4.5 Year 2016 Month 6 Day 24 Wday 4 Yday 176 hour 7 num_description_words 95 top_1_manager_id 0 top_2_manager_id 1 top_5_manager_id 1 top_10_manager_id 1 top_15_manager_id 1 top_20_manager_id 1 top_30_manager_id 1 top_50_manager_id 1 top_1_building_id 0 top_2_building_id 0 top_5_building_id 0 top_10_building_id 0 top_15_building_id 0 top_20_building_id 0 top_30_building_id 1 top_50_building_id 1 photos_count 5 cenroid 15 distance 0.0402603 building_id_pred_0 0.0770683 building_id_pred_1 0.222439 building_id_pred_2 0.700492 manager_id_pred_0 0.0833333 manager_id_pred_1 0.208333 manager_id_pred_2 0.708333 display_address_pred_0 1.74199e-07 display_address_pred_1 5.02785e-07 display_address_pred_2 0.999999 street_address_pred_0 0.0770683 street_address_pred_1 0.222439 street_address_pred_2 0.700492
描述特征文字长度 特征中单词的词频,相当于以数据集features中出现的词语为字典的one-hot编码(虽然是词频,但在这个任务中每个单词)
train_test['features']
0 [] 1 [Doorman, Elevator, Fitness Center, Cats Allow... 2 [Laundry In Building, Dishwasher, Hardwood Flo... 3 [Hardwood Floors, No Fee] 4 [Pre-War] 5 [] 6 [prewar, elevator, Dogs Allowed, Cats Allowed,... 7 [Doorman, Elevator, Pre-War, Terrace, Laundry ... 8 [Cats Allowed, Dogs Allowed, Elevator, Laundry... 9 [Dishwasher, Hardwood Floors] 10 [] 11 [prewar, dishwasher, HIGHRISE, ROOFDECK, EAT I... 12 [Doorman, Elevator, Laundry in Building, No Fee] 13 [Swimming Pool, Doorman, Fitness Center, No Fe... 14 [Elevator, Multi-Level, Laundry in Building, D... 15 [Doorman, Elevator, Fitness Center, Laundry in... 16 [Hardwood Floors] 17 [prewar, LOWRISE, EAT IN KITCHEN, SIMPLEX, HAR... 18 [No Fee, Cats Allowed, Dogs Allowed] 19 [Cats Allowed, Dogs Allowed] 20 [Doorman, Elevator, Dishwasher, Hardwood Floors] 21 [Elevator, Central A/C, Walk in Closet(s), Par... 22 [Cats Allowed, Dogs Allowed] 23 [Doorman, Fitness Center, Pool, Elevator, Publ... 24 [Balcony, Doorman, Elevator, Fitness Center, T... 25 [Roof Deck, Dining Room, Balcony, Doorman, Ele... 26 [Dishwasher, Hardwood Floors, No Fee] 27 [Roof Deck, Doorman, Elevator, Laundry in Buil... 28 [] 29 [Doorman, Elevator, Fitness Center, Pre-War, L... ... 123981 [Swimming Pool, Roof Deck, Doorman, Elevator, ... 123982 [Dining Room, Doorman, Elevator, Laundry in Bu... 123983 [] 123984 [Elevator, Pre-War, Dogs Allowed, Cats Allowed] 123985 [Swimming Pool, Dining Room, Doorman, Elevator... 123986 [Doorman, Elevator, Fitness Center, Common gar... 123987 [Fireplace, Dining Room, Elevator, Garden/Pati... 123988 [Swimming Pool, Roof Deck, Doorman, Elevator] 123989 [Hardwood Floors, No Fee] 123990 [Cats Allowed, Dogs Allowed, Laundry In Building] 123991 [Dining Room, Balcony, Doorman, Elevator, Laun... 123992 [Fitness Center, Cats Allowed, Dogs Allowed] 123993 [Dining Room, Doorman, Elevator, Laundry in Bu... 123994 [Elevator, No Fee, Cats Allowed, Dogs Allowed] 123995 [Dishwasher, Hardwood Floors, Dogs Allowed, Ca... 123996 [Swimming Pool, Doorman, Elevator, Fitness Cen... 123997 [Dishwasher, Hardwood Floors] 123998 [Doorman, Prewar, Elevator, Hardwood Floors, H... 123999 [Laundry in Building] 124000 [Roof Deck, Elevator, Fitness Center, Laundry ... 124001 [Loft, Hardwood Floors, No Fee] 124002 [Doorman, Elevator, Laundry in Building, Dishw... 124003 [Dining Room, Doorman, Elevator, Fitness Cente... 124004 [Pre-War] 124005 [Doorman, Pre-War] 124006 [Pre-War, Dogs Allowed, Cats Allowed] 124007 [] 124008 [Doorman, Elevator, Pre-War, Laundry in Buildi... 124009 [Cats Allowed, Dogs Allowed, Reduced Fee] 124010 [Elevator, Dogs Allowed, Cats Allowed] Name: features, Length: 124011, dtype: object
train_test['features_count'] = train_test['features'].apply(lambda x: len(x))
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join(x))
c_vect = CountVectorizer(stop_words='english', max_features=300, ngram_range=(1, 1))
c_vect_sparse = c_vect.fit_transform(train_test['features2'])
c_vect_sparse_cols = c_vect.get_feature_names()
train_test.drop(['features', 'features2'], axis=1, inplace=True)
#hstack作为特征处理的最后一部,先将其他所有特征都转换成数值型特征才能处理
train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr()
train_test['features_count']
0 0 1 5 2 4 3 2 4 1 5 0 6 8 7 8 8 4 9 2 10 0 11 7 12 4 13 6 14 5 15 5 16 1 17 5 18 3 19 2 20 4 21 9 22 2 23 10 24 12 25 12 26 3 27 8 28 0 29 8 .. 123981 15 123982 7 123983 0 123984 4 123985 10 123986 10 123987 10 123988 4 123989 2 123990 3 123991 9 123992 3 123993 7 123994 4 123995 4 123996 5 123997 2 123998 6 123999 1 124000 9 124001 3 124002 6 124003 10 124004 1 124005 2 124006 3 124007 0 124008 6 124009 3 124010 3 Name: features_count, Length: 124011, dtype: int64
train_test_sparse
<124011x340 sparse matrix of type '<class 'numpy.float64'>' with 5216054 stored elements in Compressed Sparse Row format>
c_vect_sparse_cols
['1br', '24', '2br', '3br', '4br', 'ac', 'accepted', 'access', 'actual', 'air', 'allowed', 'appliances', 'approval', 'approved', 'apt', 'area', 'areas', 'art', 'assigned', 'atm', 'attended', 'available', 'ave', 'baby', 'backyard', 'balconies', 'balcony', 'bar', 'basement', 'basketball', 'bath', 'bathroom', 'baths', 'bed', 'bedford', 'bedroom', 'bedrooms', 'bike', 'billiards', 'bldg', 'blk', 'blks', 'brand', 'breakfast', 'brick', 'bright', 'brownstone', 'brunch', 'building', 'built', 'business', 'cable', 'case', 'cats', 'ceiling', 'ceilings', 'center', 'central', 'chef', 'chefs', 'children', 'childrens', 'city', 'clean', 'cleaning', 'close', 'closet', 'closets', 'club', 'common', 'community', 'complimentary', 'concierge', 'conditioning', 'condo', 'construction', 'cook', 'countertops', 'court', 'courtyard', 'deal', 'deck', 'deco', 'decorative', 'details', 'dining', 'dishwasher', 'dogs', 'doorman', 'dry', 'dryer', 'duplex', 'east', 'eat', 'elev', 'elevator', 'entertainment', 'equipped', 'exclusive', 'expansive', 'exposed', 'exposure', 'extra', 'facilities', 'fee', 'finishes', 'fireplace', 'fireplaces', 'fitness', 'flex', 'floor', 'floors', 'free', 'french', 'friendly', 'ft', 'fully', 'furnished', 'galore', 'game', 'garage', 'garden', 'gourmet', 'granite', 'great', 'green', 'guarantors', 'gut', 'gym', 'hardwood', 'health', 'heart', 'heat', 'hi', 'high', 'highrise', 'home', 'hot', 'hour', 'housekeeping', 'huge', 'included', 'including', 'indoor', 'intercom', 'internet', 'king', 'kitchen', 'large', 'laundry', 'level', 'light', 'live', 'living', 'lndry', 'lobby', 'location', 'loft', 'lot', 'lounge', 'low', 'lowrise', 'luxury', 'magnificent', 'mail', 'mansion', 'marble', 'massive', 'media', 'microwave', 'midrise', 'midtown', 'modern', 'month', 'mr', 'multi', 'natural', 'near', 'new', 'newly', 'nursery', 'ok', 'open', 'ornate', 'outdoor', 'oversized', 'package', 'park', 'parking', 'party', 'patio', 'penthouse', 'perfect', 'pet', 'pets', 'photos', 'playroom', 'pool', 'post', 'pre', 'prewar', 'private', 'publicoutdoor', 'queen', 'ramp', 'real', 'recreation', 'reduced', 'renovated', 'renovation', 'renovations', 'rent', 'resident', 'residents', 'rise', 'river', 'roof', 'roofdeck', 'rooftop', 'room', 'rooms', 'roomy', 'satellite', 'sauna', 'scenic', 'screening', 'security', 'separate', 'service', 'services', 'share', 'shared', 'shares', 'short', 'simplex', 'site', 'size', 'sized', 'skylight', 'southern', 'spa', 'space', 'spacious', 'speed', 'sprawling', 'sq', 'ss', 'stainless', 'state', 'steal', 'steel', 'steps', 'stop', 'storage', 'studio', 'stunning', 'style', 'sublet', 'subway', 'sun', 'sunday', 'sundeck', 'sunfilled', 'sunlight', 'super', 'superintendent', 'swimming', 'table', 'tenant', 'term', 'terrace', 'terraces', 'time', 'tons', 'train', 'true', 'tv', 'unique', 'unit', 'utilities', 'valet', 'venetian', 'video', 'view', 'views', 'village', 'virtual', 'walk', 'wall', 'walls', 'war', 'washer', 'water', 'wheelchair', 'wifi', 'windows', 'wood', 'work', 'yard', 'yoga']
#存为csv格式方便用excel查看
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]
train_new = pd.concat((X_train, y_train), axis=1).reset_index(drop=True)
train_new.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)
X_test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)
from scipy.io import mmwrite
X_train_sparse = train_test_sparse[:ntrain, :]
X_test_sparse = train_test_sparse[ntrain:, :]
train_sparse = sparse.hstack([X_train_sparse, sparse.csr_matrix(y_train).T]).tocsr()
mmwrite(dpath + 'RentListingInquries_FE_train.txt',train_sparse)
mmwrite(dpath + 'RentListingInquries_FE_test.txt',X_test_sparse)
#存为libsvm稀疏格式,直接调用XGBoost的话用稀疏格式更高效
#from sklearn.datasets import dump_svmlight_file
#dump_svmlight_file(, y_train, dpath + 'RentListingInquries_FE_train.txt',X_train_sparse)
#dump_svmlight_file(X_test_sparse, dpath + 'RentListingInquries_FE_test.txt')
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]
train_new = pd.concat((X_train, y_train), axis=1)