#!/usr/bin/env python
# coding: utf-8

# ### Assignment. 산악지형 기상 정보 대비 Burned Area에 대한 EDA 및 Regression을 활용한 예측
# - 데이터 집합 소스
#   - 설명: https://archive.ics.uci.edu/ml/datasets/Forest+Fires
#     - Features
#       - 1: X - x-axis spatial coordinate within the Montesinho park map: 1 to 9 
#       - 2: Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9 
#       - 3: month - month of the year: 'jan' to 'dec' 
#       - 4: day - day of the week: 'mon' to 'sun' 
#       - 5: FFMC - FFMC index from the FWI system: 18.7 to 96.20 
#       - 6: DMC - DMC index from the FWI system: 1.1 to 291.3 
#       - 7: DC - DC index from the FWI system: 7.9 to 860.6 
#       - 8: ISI - ISI index from the FWI system: 0.0 to 56.10 
#       - 9: temp - temperature in Celsius degrees: 2.2 to 33.30 
#       - 10: RH - relative humidity in %: 15.0 to 100 
#       - 11: wind - wind speed in km/h: 0.40 to 9.40 
#       - 12: rain - outside rain in mm/m2 : 0.0 to 6.4 
#     - Target 
#       - area - the burned area of the forest (in ha): 0.00 to 1090.84
#   - 관련 논문 및 자료
#     - P. Cortez and A. Morais. A Data Mining Approach to Predict Forest Fires using Meteorological Data. In J. Neves, M. F. Santos and J. Machado Eds., New Trends in Artificial Intelligence, Proceedings of the 13th EPIA 2007 - Portuguese Conference on Artificial Intelligence, December, Guimarães, Portugal, pp. 512-523, 2007. APPIA, ISBN-13 978-989-95618-0-9. http://www3.dsi.uminho.pt/pcortez/fires.pdf
#     - 논문 발췌 중요 그림
#     ![attributes](./figures/fire.png)
#     ![map](./figures/fire_map.png)
#     - Fire Weather Index
#       - https://www.frames.gov/files/6014/1576/1411/FWI-history.pdf
#   - 데이터: https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv

# In[1]:


import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Enable inline matplotlib rendering only when an IPython shell is present.
# `get_ipython` is not imported anywhere in this file, so calling it
# unguarded raises NameError when the script is run with plain `python`.
try:
    _ipython_shell = get_ipython()
except NameError:
    _ipython_shell = None
if _ipython_shell is not None:
    _ipython_shell.run_line_magic('matplotlib', 'inline')

# Location of the UCI Forest Fires CSV file.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'

# Column names as listed in the dataset description: 12 features + 1 target.
feature_names = ('X', 'Y', 'month', 'day', "FFMC", "DMC", 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain')
target_name = 'area'
all_names = feature_names + (target_name,)

# Download and parse the CSV. The HTTP response is closed explicitly so the
# connection is released even if parsing fails; `raw_csv` stays bound (as a
# closed handle) for any later cell that refers to the name.
# NOTE(review): read_csv is called with defaults, so the first CSV row is
# taken as the header -- later cells rely on columns such as 'X' and 'area'.
raw_csv = urllib2.urlopen(path)
try:
    df = pd.read_csv(raw_csv)
finally:
    raw_csv.close()


# In[2]:


# Preview the loaded frame: the first five rows, a blank separator line,
# then the rows at positions 140 through 144.
# The parenthesised single-argument print form is valid in Python 2 and 3.
first_rows = df.head(5)
later_rows = df[140:145]

print(first_rows)
print('')
print(later_rows)


# ### [Mission 1] 불이 가장 많이 발생한 지역 좌표 (X, Y) 상위 5개를 제시하시오.
# - numpy 및 pandas에서 제공되는 groupby, sum, stack, sort 등의 메소드 활용 필요

# In[3]:


# Mission 1 working frame: grid coordinates together with the burned area.
coordinate_area_columns = ['X', 'Y', 'area']
sdf = df[coordinate_area_columns]

# Show the first five rows, a blank line, then rows at positions 140-144.
print(sdf.head(5))
print('')
print(sdf[140:145])


# ### [Mission 2] 불이 가장 많이 발생한 월 (month) 상위 2개 및 가장 많이 발생한 날 (day) 상위 2개를 제시하시오.
# - numpy 및 pandas에서 제공되는 groupby, sum, stack, sort 등의 메소드 활용 필요

# In[4]:


# Mission 2 working frame: month of the year together with the burned area.
# NOTE(review): the mission text also asks about 'day', which is not
# selected here -- confirm whether that column should be included.
month_area_columns = ['month', 'area']
sdf = df[month_area_columns]

# Show the default head() preview, a blank line, then rows 140-144.
print(sdf.head())
print('')
print(sdf[140:145])


# ### [Mission 3] 4대 주 요인 (temp, RH, wind, rain) 속성별 기본 통계치 및 Box Plot 산출
# - numpy 및 pandas에서 제공되는 describe() 및 boxplot() 사용
# - describe()가 제공하는 통계치 및 boxplot을 보면서 나름대로의 해석을 반드시 2가지 이상 제시하시오.
# - area와의 관계는 고려하지 말고 4가지 속성만 분석하시오.

# In[5]:


# Mission 3 working frame: the four weather attributes only (no target).
weather_columns = ['temp', 'RH', 'wind', 'rain']
sdf = df[weather_columns]

# Show the default head() preview of the selection.
print(sdf.head())


# ### [Mission 4] 4대요인 및 area를 포함하여 dataframe을 얻어오고 area를 두 가지 부류로 나누어 각 4가지 속성을 비교 분석하기 
# - area의 값의 편차가 너무 심하기 때문에 그러한 편차를 줄이기 위하여 numpy.log1p (자연로그) 사용
#   - Calculates log(1 + x)
#   - http://docs.scipy.org/doc/numpy/reference/generated/numpy.log1p.html#numpy.log1p
# - 두 가지 부류로 나누는 기준은 log(1 + area) 값이 50% percentile 이상인 것과 50% percentile 미만인 것으로 정함
#   - 각각을 sdf_1과 sdf_2라고 명명
# - sdf_1 및 sdf_2에 대하여 4대 주 요인 (temp, RH, wind, rain) 속성별 기본 통계치 및 Box Plot 산출
#   - describe()가 제공하는 통계치 및 boxplot을 보면서 나름대로의 해석을 반드시 2가지이상 제시하시오.

# In[6]:


# Mission 4 working frame: the four weather attributes plus the burned area.
weather_and_area_columns = ['temp', 'RH', 'wind', 'rain', 'area']
sdf = df[weather_and_area_columns]

# Show the default head() preview, a blank line, then rows 140-144.
print(sdf.head())
print('')
print(sdf[140:145])


# In[9]:


# Replace `area` with log(1 + area) to reduce the very large spread of the
# raw burned-area values.
log1p_val = np.log1p(sdf['area'])

# `sdf` was obtained by selecting columns from `df`, and assigning into such
# a selection triggers pandas' SettingWithCopyWarning. Taking an explicit
# copy first makes the write unambiguous and keeps `df` untouched.
sdf = sdf.copy()
sdf['area'] = log1p_val

# Show the transformed values: head(), a blank line, then rows 140-144.
print(sdf.head())
print('')
print(sdf[140:145])


# ### [Mission 5] 가장 영향이 높은 속성을 선택하여 단일변수 선형 회귀분석 수행
# - 다음 요인들 중 area와 가장 연관성이 높은 주요 요인을 선정
#   - FFMC
#   - DMC
#   - DC
#   - ISI
#   - temp
#   - RH
#   - wind
#   - rain
# - 가장 높은 상관관계를 지닌 속성 하나를 선정하여 선형 회귀식을 제시하시오.  

# In[14]:


# Mission 5 working frame: the eight candidate predictors plus the target.
# `.copy()` gives an independent frame so the column assignment below does
# not raise SettingWithCopyWarning and cannot write through to `df`.
sdf = df[['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area']].copy()

# Replace `area` with log(1 + area) to reduce the very large spread of the
# raw burned-area values.
log1p_val = np.log1p(sdf['area'])
sdf['area'] = log1p_val

# Show the transformed values: head(), a blank line, then rows 140-144.
print(sdf.head())
print('')
print(sdf[140:145])


# - 두 벡터의 Correlation 값 구하기

# In[13]:


# Worked example: correlation coefficient between two small vectors.
a = [1, 2, 3, 4, 5]
b = [2, 5, 9, 1, 2]

# np.corrcoef(a, b) returns a 2x2 matrix; compute it once and reuse it
# instead of evaluating the same call twice.
corr_matrix = np.corrcoef(a, b)
print(corr_matrix)
print('')

# The off-diagonal entry [0, 1] is the correlation between `a` and `b`.
corr = corr_matrix[0, 1]
print(corr)


# ### [Mission 6] 가장 영향이 높은 요인 두 개로 다변수 선형 회귀분석 수행
# - 선형 회귀분석 성능이 좋은지 자신의 의견을 제시하시오.

# ### [Mission 7] 가장 영향이 높은 요인 두 개로 다변수 로지스틱 회귀분석 수행
# - area의 값의 편차가 너무 심하기 때문에 그러한 편차를 줄이기 위하여 numpy.log1p (자연로그) 사용
#   - Calculates log(1 + x)
#   - http://docs.scipy.org/doc/numpy/reference/generated/numpy.log1p.html#numpy.log1p
# - 새로운 Categorical Variable로서 0 및 1을 지니는 'fire' 컬럼 생성
# - 두 가지 부류로 나누는 기준은 log(1 + area) 값이 50% percentile 이상인 것과 50% percentile 미만인 것으로 정함
#   - log(1 + area) 값이 50% percentile 이상이면 'fire' 컬럼 값이 1
#   - log(1 + area) 값이 50% percentile 미만이면 'fire' 컬럼 값이 0
# - 로지스틱 회귀분석에 의한 분류의 정확도를 최종적으로 출력하시오.