In [10]:

import pandas as pd
import os
import re

In [11]:

def transpose_dataframe(excel_file_path):
    df = pd.read_excel(excel_file_path, skiprows=[0, 1, 3, 4])
    wide_df = df.iloc[:, 0:6]
    candidates = list(wide_df.columns)[3:]
    wide_df.columns = ['town', 'village', 'office'] + candidates
    districts = wide_df['town'].astype(str)
    districts = districts.str.replace("\u3000|nan", "")
    filled_districts = districts
    for i in range(districts.size):
        if len(districts[i]) == 0:
            filled_districts[i] = districts[i - 1]
    wide_df = wide_df.drop('town', axis=1)
    wide_df.insert(0, 'town', filled_districts)
    wide_df = wide_df.dropna()
    wide_df['office'] = wide_df['office'].astype(int)
    long_df = pd.melt(wide_df, id_vars=['town', 'village', 'office'], var_name='candidate_info', value_name='votes')
    split_candidate_info = long_df['candidate_info'].str.split("\n|\(|\)", expand=True)
    numbers = split_candidate_info[1].str.strip().values.astype(int)
    candidates = ["{}/{}".format(p, vp) for p, vp in zip(split_candidate_info[3].values, split_candidate_info[4].values)]
    long_df['number'] = numbers
    long_df['candidates'] = candidates
    long_df['votes'] = long_df['votes'].astype(str).str.replace(",", "").astype(int)
    long_df = long_df[['town', 'village', 'office', 'number', 'candidates', 'votes']]
    return long_df

In [12]:

def append_dataframes(excel_file_list, folder_name):
    df = pd.DataFrame()
    for efp in excel_file_list:
        if '.xls' in efp:
            city_county_name = re.split("\(|\)", efp)[1]
            long_dataframe = transpose_dataframe("{}/{}".format(folder_name, efp))
            long_dataframe.insert(0, 'county', city_county_name)
            df = df.append(long_dataframe)
            print("Appending {}...".format(city_county_name))
        else:
            continue
    df = df.reset_index(drop=True)
    return df

In [13]:

def sort_dataframe(df):
    column_order = ['county', 'town', 'village', 'office']
    return df.sort_values(column_order).reset_index(drop=True)

In [14]:

excel_file_list_2016 = os.listdir("presidential-2016")
excel_file_list_2020 = os.listdir("presidential-2020")

In [15]:

president_2016 = append_dataframes(excel_file_list_2016, "presidential-2016")

Appending 臺北市...
Appending 新北市...
Appending 桃園市...
Appending 苗栗縣...
Appending 臺南市...
Appending 雲林縣...
Appending 南投縣...
Appending 高雄市...
Appending 金門縣...
Appending 宜蘭縣...
Appending 彰化縣...
Appending 澎湖縣...
Appending 臺東縣...
Appending 臺中市...
Appending 屏東縣...
Appending 基隆市...
Appending 連江縣...
Appending 嘉義縣...
Appending 嘉義市...
Appending 花蓮縣...
Appending 新竹市...
Appending 新竹縣...

In [16]:

president_2020 = append_dataframes(excel_file_list_2020, "presidential-2020")

Appending 宜蘭縣...
Appending 彰化縣...
Appending 金門縣...
Appending 桃園市...
Appending 苗栗縣...
Appending 臺南市...
Appending 雲林縣...
Appending 南投縣...
Appending 高雄市...
Appending 臺北市...
Appending 新北市...
Appending 花蓮縣...
Appending 新竹市...
Appending 新竹縣...
Appending 基隆市...
Appending 連江縣...
Appending 嘉義縣...
Appending 嘉義市...
Appending 屏東縣...
Appending 澎湖縣...
Appending 臺東縣...
Appending 臺中市...

In [17]:

president_2016_sorted = sort_dataframe(president_2016)
president_2016_sorted

Out[17]:

	county	town	village	office	number	candidates	votes
0	南投縣	中寮鄉	中寮村	199	1	朱立倫/王如玄	132
1	南投縣	中寮鄉	中寮村	199	2	蔡英文/陳建仁	189
2	南投縣	中寮鄉	中寮村	199	3	宋楚瑜/徐欣瑩	46
3	南投縣	中寮鄉	內城村	206	1	朱立倫/王如玄	87
4	南投縣	中寮鄉	內城村	206	2	蔡英文/陳建仁	140
...	...	...	...	...	...	...	...
46741	高雄市	鼓山區	龍水里	926	2	蔡英文/陳建仁	713
46742	高雄市	鼓山區	龍水里	926	3	宋楚瑜/徐欣瑩	178
46743	高雄市	鼓山區	龍水里	927	1	朱立倫/王如玄	376
46744	高雄市	鼓山區	龍水里	927	2	蔡英文/陳建仁	665
46745	高雄市	鼓山區	龍水里	927	3	宋楚瑜/徐欣瑩	125

46746 rows × 7 columns

In [18]:

president_2020_sorted = sort_dataframe(president_2020)
president_2020_sorted

Out[18]:

	county	town	village	office	number	candidates	votes
0	南投縣	中寮鄉	中寮村	366	1	宋楚瑜/余湘	18
1	南投縣	中寮鄉	中寮村	366	2	韓國瑜/張善政	217
2	南投縣	中寮鄉	中寮村	366	3	蔡英文/賴清德	208
3	南投縣	中寮鄉	內城村	373	1	宋楚瑜/余湘	17
4	南投縣	中寮鄉	內城村	373	2	韓國瑜/張善政	141
...	...	...	...	...	...	...	...
51673	高雄市	鼓山區	龍水里	1268	2	韓國瑜/張善政	477
51674	高雄市	鼓山區	龍水里	1268	3	蔡英文/賴清德	647
51675	高雄市	鼓山區	龍水里	1269	1	宋楚瑜/余湘	39
51676	高雄市	鼓山區	龍水里	1269	2	韓國瑜/張善政	520
51677	高雄市	鼓山區	龍水里	1269	3	蔡英文/賴清德	673

51678 rows × 7 columns

In [19]:

president_2016_sorted.to_csv('presidential_2016.csv', index=False)
president_2020_sorted.to_csv('presidential_2020.csv', index=False)

In [1]:

import pymysql
from sqlalchemy import create_engine
import pandas as pd

In [6]:

host = "rsqltrain.ced04jhfjfgi.ap-northeast-1.rds.amazonaws.com"
port = 3306
user = "trainadmin"
passwd = "csietrain"
dbname = "tw_election_2016"
conn = create_engine('mysql+pymysql://{}:{}@{}:{}/{}'.format(user, passwd, host, port, dbname))

In [4]:

president_2016 = pd.read_csv("presidential_2016.csv")

In [7]:

president_2016.to_sql('president', conn, index=False)

In [6]:

host = "rsqltrain.ced04jhfjfgi.ap-northeast-1.rds.amazonaws.com"
port = 3306
user = "trainadmin"
passwd = "csietrain"
dbname = "tw_election_20"
conn = create_engine('mysql+pymysql://{}:{}@{}:{}/{}'.format(user, passwd, host, port, dbname))

In [17]:

president_2016_sorted.to_sql('presidential_2016', conn, index=False)

In [18]:

president_2020_sorted.to_sql('presidential_2020', conn, index=False)

In [ ]: