# Air-quality analysis: predict PM2.5 from co-measured pollutant and weather
# readings with a linear regression. sklearn pieces are imported later, cell
# by cell (the commented imports below are re-imported where first used).
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LinearRegression
#from sklearn import metrics
# IPython magic: render matplotlib figures inline (notebook-only syntax).
%matplotlib inline
# Readings spanning 2008-11 to 2018-11; 1447 rows x 13 numeric columns
# (per df.info() below). Assumes the CSV sits in the working directory.
df = pd.read_csv('200811-201811.csv')
df.head(20)
SO2 | CO | O3 | PM25 | Nox | NO | NO2 | THC | NMHC | CH4 | WindSpeed | TEMP | Humidity | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 4.4 | 0.47 | 32.2 | 31 | 24 | 3.46 | 20.84 | 2.309 | 0.231 | 2.078 | 1.91 | 24.86 | 77.11 |
1 | 6.4 | 0.52 | 30.2 | 32 | 32 | 5.64 | 26.30 | 2.186 | 0.227 | 1.959 | 1.72 | 26.58 | 71.93 |
2 | 3.2 | 0.45 | 30.5 | 46 | 20 | 2.36 | 18.05 | 0.000 | 0.000 | 0.000 | 2.08 | 24.75 | 76.33 |
3 | 5.2 | 0.47 | 32.5 | 38 | 24 | 3.18 | 20.64 | 2.374 | 0.225 | 2.150 | 1.66 | 24.97 | 79.97 |
4 | 4.5 | 0.72 | 29.9 | 32 | 29 | 5.43 | 23.82 | 2.290 | 0.287 | 2.004 | 1.03 | 26.18 | 73.78 |
5 | 3.7 | 0.48 | 35.3 | 30 | 21 | 2.20 | 18.95 | 2.202 | 0.179 | 2.023 | 2.08 | 24.58 | 79.22 |
6 | 4.8 | 0.63 | 30.4 | 34 | 31 | 5.94 | 25.24 | 2.233 | 0.228 | 2.005 | 1.78 | 25.64 | 72.95 |
7 | 2.9 | 0.56 | 33.1 | 35 | 26 | 2.96 | 23.17 | 2.287 | 0.252 | 2.035 | 1.55 | 25.85 | 70.83 |
8 | 3.0 | 0.45 | 31.2 | 40 | 22 | 3.81 | 17.81 | 2.188 | 0.213 | 1.975 | 1.89 | 24.49 | 77.84 |
9 | 1.8 | 0.35 | 33.1 | 24 | 10 | 1.54 | 8.22 | 0.000 | 0.000 | 0.000 | 0.96 | 25.25 | 76.11 |
10 | 5.0 | 0.40 | 41.0 | 38 | 18 | 1.90 | 16.16 | 2.201 | 0.103 | 2.098 | 2.11 | 25.53 | 76.27 |
11 | 3.7 | 0.51 | 34.6 | 35 | 21 | 3.36 | 17.43 | 2.197 | 0.171 | 2.027 | 2.16 | 24.77 | 77.40 |
12 | 4.5 | 0.44 | 47.1 | 28 | 20 | 1.61 | 18.52 | 2.235 | 0.201 | 2.034 | 1.97 | 26.14 | 73.20 |
13 | 5.0 | 0.44 | 43.7 | 35 | 18 | 1.76 | 16.17 | 2.353 | 0.176 | 2.177 | 1.52 | 26.17 | 78.20 |
14 | 4.1 | 0.38 | 53.8 | 29 | 12 | 0.85 | 11.24 | 2.184 | 0.094 | 2.090 | 2.22 | 26.63 | 74.09 |
15 | 3.8 | 0.53 | 45.0 | 30 | 20 | 1.61 | 18.72 | 2.239 | 0.224 | 2.015 | 1.73 | 27.35 | 66.75 |
16 | 6.7 | 0.47 | 43.7 | 29 | 25 | 3.49 | 21.17 | 2.155 | 0.186 | 1.969 | 1.82 | 27.80 | 67.62 |
17 | 1.9 | 0.34 | 43.3 | 24 | 8 | 0.98 | 6.81 | 0.000 | 0.000 | 0.000 | 1.02 | 26.31 | 75.42 |
18 | 5.0 | 0.64 | 43.7 | 29 | 24 | 3.06 | 20.79 | 2.252 | 0.242 | 2.011 | 0.96 | 27.13 | 70.88 |
19 | 5.1 | 0.57 | 44.3 | 29 | 26 | 3.37 | 22.28 | 2.237 | 0.195 | 2.042 | 1.65 | 27.01 | 69.31 |
# Column dtypes and non-null counts — confirms 1447 rows, no missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1447 entries, 0 to 1446 Data columns (total 13 columns): SO2 1447 non-null float64 CO 1447 non-null float64 O3 1447 non-null float64 PM25 1447 non-null int64 Nox 1447 non-null int64 NO 1447 non-null float64 NO2 1447 non-null float64 THC 1447 non-null float64 NMHC 1447 non-null float64 CH4 1447 non-null float64 WindSpeed 1447 non-null float64 TEMP 1447 non-null float64 Humidity 1447 non-null float64 dtypes: float64(11), int64(2) memory usage: 147.0 KB
# Summary statistics. NOTE(review): every column's minimum is 0.0 (see the
# `min` row), which likely encodes missing readings rather than true zeros —
# TODO confirm with the data provider before trusting the fit.
df.describe()
SO2 | CO | O3 | PM25 | Nox | NO | NO2 | THC | NMHC | CH4 | WindSpeed | TEMP | Humidity | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 | 1447.000000 |
mean | 5.393780 | 0.475073 | 28.741811 | 34.408431 | 22.619903 | 4.491175 | 18.124202 | 1.464091 | 0.148995 | 1.315270 | 0.944368 | 12.400663 | 74.314603 |
std | 2.455167 | 0.181656 | 7.819648 | 16.745091 | 9.629188 | 2.618546 | 7.484773 | 1.005049 | 0.130524 | 0.899127 | 0.994151 | 13.162018 | 4.737597 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 3.700000 | 0.340000 | 22.900000 | 21.000000 | 16.000000 | 2.710000 | 12.395000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 71.595000 |
50% | 4.900000 | 0.470000 | 28.200000 | 35.000000 | 21.000000 | 3.940000 | 17.360000 | 2.017000 | 0.156000 | 1.845000 | 0.820000 | 0.000000 | 74.390000 |
75% | 6.600000 | 0.580000 | 33.400000 | 47.000000 | 28.000000 | 5.525000 | 23.510000 | 2.201500 | 0.233000 | 1.971000 | 1.935000 | 26.630000 | 77.375000 |
max | 17.100000 | 1.230000 | 58.500000 | 79.000000 | 56.000000 | 18.200000 | 40.310000 | 2.760000 | 0.753000 | 2.260000 | 2.940000 | 31.330000 | 90.260000 |
利用distplot來看PM2.5主要集中的區間
# Distribution of PM2.5 to see where values concentrate.
# NOTE(review): sns.distplot is deprecated (removed in newer seaborn);
# sns.histplot(df['PM25'], kde=True) is the modern equivalent — confirm
# the installed seaborn version before changing.
sns.distplot(df['PM25'])
<matplotlib.axes._subplots.AxesSubplot at 0x2eadfd68>
利用df.corr()先做出各變數間的關係係數,再用heatmap作圖
# Pairwise correlation matrix of all 13 numeric columns, drawn as a heatmap
# to spot which features track PM2.5.
sns.heatmap(df.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x2ead6518>
# List the column names (used to pick the feature columns below).
df.columns
Index(['SO2', 'CO', 'O3', 'PM25', 'Nox', 'NO', 'NO2', 'THC', 'NMHC', 'CH4', 'WindSpeed', 'TEMP', 'Humidity'], dtype='object')
X是想探索的自變數,Y是依變數。
# Features (X) are every measured column except the target; the target (y)
# is the PM2.5 reading we want to predict.
feature_columns = ['SO2', 'CO', 'O3', 'Nox', 'NO', 'NO2', 'THC', 'NMHC',
                   'CH4', 'WindSpeed', 'TEMP', 'Humidity']
X = df[feature_columns]
y = df['PM25']
將資料分成訓練組及測試組
# Utility for splitting the data into random train and test subsets.
from sklearn.model_selection import train_test_split
test_size代表測試組比例。random_state代表設定隨機種子,讓測試結果可被重複
# 60/40 train/test split; random_state pins the shuffle so runs are
# reproducible. NOTE(review): test_size=0.4 is larger than the usual
# 0.2-0.3 — deliberate? confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
載入線性迴歸,並訓練模型
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
# (.fit returns the estimator itself, so construction and fitting chain.)
lm = LinearRegression().fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# Model predictions for the held-out test rows.
y_pred = lm.predict(X_test)
取得截距。如果公式是y=ax+b,b即是截距
# Intercept term b of y = ax + b (the printed label is Chinese for
# "intercept b"; the literal must stay as-is since it is runtime output).
print('截距b:',lm.intercept_)
截距b: 4.194703731759336
取得迴歸係數,並用Data Frame顯示
# One fitted coefficient per feature, in X_train column order.
lm.coef_
array([-6.98788202e-01, 4.90313211e+00, 5.89798609e-01, -6.18576364e-01, 5.67911565e-01, 2.21597165e+00, -2.79568749e+02, 2.69027186e+02, 2.79256384e+02, -4.36464663e-01, -4.63757074e-01, -8.03178789e-02])
列出訓練的變數
# Feature names corresponding (positionally) to the coefficients above.
X_train.columns
Index(['SO2', 'CO', 'O3', 'Nox', 'NO', 'NO2', 'THC', 'NMHC', 'CH4', 'WindSpeed', 'TEMP', 'Humidity'], dtype='object')
使用測試組資料來預測結果
# Predict PM2.5 for the test split (same values as y_pred computed earlier).
predictions = lm.predict(X_test)
# BUG FIX: the original rebound `df` here, clobbering the source DataFrame
# loaded from the CSV. Build the actual-vs-predicted comparison table under
# its own name so the raw data remains accessible.
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
df1 = comparison.head(20)
df1
Actual | Predicted | |
---|---|---|
134 | 33 | 38.123666 |
337 | 11 | 10.122959 |
795 | 24 | 32.257548 |
115 | 41 | 30.548681 |
70 | 9 | 17.025202 |
81 | 8 | 13.795380 |
713 | 57 | 67.452836 |
374 | 24 | 15.417207 |
1030 | 44 | 44.050396 |
1141 | 60 | 60.343275 |
475 | 8 | 11.680019 |
224 | 23 | 27.689480 |
18 | 29 | 40.981670 |
856 | 51 | 45.750445 |
90 | 22 | 26.685953 |
1012 | 45 | 45.962751 |
0 | 31 | 34.075856 |
212 | 8 | 2.375937 |
628 | 12 | 11.493624 |
194 | 17 | 5.178435 |
比較實際PM2.5及預測PM2.5的關係
# Actual vs. predicted scatter; a perfect model would fall on the y = x line.
plt.scatter(y_test,predictions)
<matplotlib.collections.PathCollection at 0x2ed6ea58>
# Bar chart of the first 20 actual vs. predicted PM2.5 values.
df1.plot(kind='bar', figsize=(10, 8))
# FIX: pass linewidth as a number, not a string, per the matplotlib API.
plt.grid(which='major', linestyle='-', linewidth=0.5, color='green')
# NOTE(review): the minor grid only renders when minor ticks are enabled
# (plt.minorticks_on()); on this bar plot it appears to be a no-op — confirm
# whether minor gridlines were actually intended.
plt.grid(which='minor', linestyle=':', linewidth=0.5, color='black')
plt.show()
看實際值及預測值之間的殘差分佈圖
# Residual (actual - predicted) distribution; roughly normal residuals
# centered at zero suggest the linear model is a reasonable fit.
# NOTE(review): sns.distplot is deprecated — sns.histplot(..., kde=True)
# is the modern replacement; confirm the installed seaborn version.
sns.distplot((y_test-predictions))
<matplotlib.axes._subplots.AxesSubplot at 0x2edb0748>
載入迴歸常見的評估指標
# Common regression evaluation metrics (MAE / MSE / RMSE used below).
from sklearn import metrics
Mean Absolute Error (MAE)代表平均誤差,公式為所有實際值及預測值相減的絕對值平均。
# MAE: mean of |actual - predicted| over the test set.
metrics.mean_absolute_error(y_test,predictions)
5.694068343810315
Mean Squared Error (MSE)比起MAE可以拉開誤差差距,算是蠻常用的指標,公式為所有實際值及預測值相減的平方的平均
# MSE: mean of (actual - predicted)^2; penalizes large errors more than MAE.
metrics.mean_squared_error(y_test,predictions)
54.77458000595562
Root Mean Squared Error (RMSE)代表MSE的平方根。比起MSE更為常用,因為更容易解釋y。
# RMSE = sqrt(MSE); same units as the target (PM2.5), so easier to interpret.
np.sqrt(metrics.mean_squared_error(y_test,predictions))
7.400985069972484