#!/usr/bin/env python
# coding: utf-8
"""Exploratory plots and a logistic-regression classifier for air-quality data.

Loads ``200811-201811.csv``, visualises missing values and the distributions
of the ``Nox`` and ``PM25`` columns, then fits a logistic-regression model
that predicts the ``Danger`` label and prints its evaluation metrics.

The file was exported from a Jupyter notebook and is kept as a flat script so
that the notebook's module-level names (``train``, ``X``, ``y``, ``logmodel``,
``predictions`` ...) stay available.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# The inline-plot magic only exists inside IPython/Jupyter.  Under a plain
# interpreter ``get_ipython`` is undefined, so the magic is skipped there
# instead of aborting the whole script with a NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')  # noqa: F821
except NameError:
    pass

# ---------------------------------------------------------------------------
# Load the data
# ---------------------------------------------------------------------------
train = pd.read_csv('200811-201811.csv')
print(train.head())

# How the ``Danger`` label is defined: "unhealthy for sensitive groups"
# corresponds to a PM2.5 reading of 35.5 or above.

# ---------------------------------------------------------------------------
# Exploratory plots
# Each plot gets its own figure; outside a notebook every call would
# otherwise draw on top of the previous plot's axes.
# ---------------------------------------------------------------------------

# Heatmap of ``isnull()`` to see which columns contain missing data.
plt.figure()
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

# Count plot of ``Danger`` split by ``Nox`` to see whether NOx relates to the
# health label.
plt.figure()
sns.countplot(x='Danger', hue='Nox', data=train, palette='RdBu_r')

# Histogram of the PM2.5 distribution; missing values are left out.
# NOTE(review): ``sns.distplot`` is deprecated in recent seaborn releases;
# ``sns.histplot(..., bins=30)`` is the replacement once seaborn >= 0.11 can
# be assumed -- confirm the target environment before switching.
plt.figure()
sns.distplot(train['PM25'].dropna(), kde=False, bins=30)

# Histogram of the ``Nox`` distribution.
plt.figure()
train['Nox'].hist(bins=40, figsize=(10, 4))

# ---------------------------------------------------------------------------
# Model: logistic regression predicting ``Danger``
# ---------------------------------------------------------------------------
# NOTE(review): every column except ``Danger`` is used as a feature.  If
# ``Danger`` is derived from ``PM25`` (see the threshold note above), keeping
# ``PM25`` in ``X`` leaks the target -- confirm this is intended.
# NOTE(review): no missing-value handling happens before ``fit``; this assumes
# the feature columns contain no NaN -- verify against the heatmap above.
X = train.drop('Danger', axis=1)
y = train['Danger']

# NOTE(review): ``test_size=0.75`` holds out 75% of the rows for testing and
# trains on only 25%, which is an unusual ratio -- confirm it is deliberate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.75, random_state=42)

logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train, y_train)

predictions = logmodel.predict(X_test)

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

# Confusion matrix noted in the original notebook (from a recorded run; the
# numbers are not recomputed here):
#
#                   Predicted (No)   Predicted (Yes)
#   Actual (No)        TN = 549          FP = 11
#   Actual (Yes)       FN = 20           TP = 506
#
# TN = True Negative, FP = False Positive,
# FN = False Negative, TP = True Positive
#
#   precision = TP / (TP + FP)
#   recall    = TP / (TP + FN)
#   accuracy  = (TP + TN) / (TN + FP + FN + TP)
#
# The F-beta score can be read as a weighted average of precision and recall;
# it lies between 0 and 1, and 1 is the best value.
# ``support`` is the number of occurrences of each class among the actual
# ``y`` values of the test set.

# Display the figures when run as a plain script.
plt.show()