# Suppress warnings globally, purely to keep the notebook readable: warnings were adding too much unwanted clutter to the output.
options(warn=-1)

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(repr))
suppressMessages(library(corrplot))
suppressMessages(library(gridExtra))

# Load the raw Kiva loan records and the Kiva MPI region lookup table.
kiva_loans <- read.csv("kiva_loans.csv", header = TRUE)
kiva_regions <- read.csv("kiva_mpi_region_locations.csv", header = TRUE)

# Keep only the rows whose country is the Philippines.
philp_loans <- kiva_loans %>% filter(country == "Philippines")
philp_regions <- kiva_regions %>% filter(country == "Philippines")

head(philp_loans, 2)

# Number of distinct region strings among the Philippine loans.
# levels() is not used here: it returns NULL when the column is read as
# character, and when the column is a factor filter() keeps the levels of
# every country, so the count would not describe the Philippines.
n_distinct(philp_loans$region)

# Loan counts for the first ten regions.
philp_loans %>%
  group_by(region) %>%
  summarise(count = n()) %>%
  head(10)

# Derive `sub_region` from the free-text region: drop everything up to and
# including the last comma (plus any whitespace after it), keeping only the
# trailing component. Strings without a comma are kept unchanged.
# sub() is vectorised, so the whole column is processed in one call instead
# of growing a vector row by row; this also behaves correctly on zero rows.
philp_loans$loan_region <- as.character(philp_loans$region)
philp_loans$sub_region <- sub(".*,\\s*", "", philp_loans$loan_region)
philp_loans$sub_region[1:20]

# External lookup tables: one of provinces, one of regions.
province <- read.csv("philippines_province.csv", header = TRUE)
region <- read.csv("philippines_region.csv", header = TRUE)

# Attach region attributes to each province via the shared region code,
# keeping only provinces whose regCode is present in the region table.
philippines_regions <- province %>%
  inner_join(region, by = "regCode")

# Drop the column named X if the join produced one.
philippines_regions[["X"]] <- NULL

head(philippines_regions, n = 2)

# Join philp_loans to philippines_regions on "sub_region" == "provDesc".
# The province-name column is renamed to `sub_region` so both tables share
# the join key. It is renamed by name rather than by position, so a change
# in column order cannot silently rename the wrong column; rename() fails
# loudly if `provDesc` is missing.
philippines_regions <- philippines_regions %>%
  rename(sub_region = provDesc)

# Left join keeps every loan; loans whose sub_region has no match in the
# external regions file get NA in the joined columns.
philp_loans_new <- left_join(philp_loans, philippines_regions, by = "sub_region")
# prop.table(table(is.na(philp_loans_new$regDesc)))
# Original author's note: currently missing ~ 17% of values.

head(philp_loans_new, 2)

# Import the poverty indicators, keyed by region description (regDesc).
philp_poverty_indicators <- read.csv(
  "consolidated_philippines_poverty_data.csv",
  header = TRUE
)
head(philp_poverty_indicators, 5)

# Attach the indicators to every loan through its region description.
# `philp_loans` is rebound to the enriched table here, so the explicit rm()
# beforehand is unnecessary.
philp_loans <- philp_loans_new %>%
  left_join(philp_poverty_indicators, by = "regDesc")

# Same table without the duplicated id / PSGC code columns created by the
# earlier joins; `philp_loans_new` is rebound to this trimmed version.
philp_loans_new <- philp_loans %>%
  select(-c(id.x, id.y, psgcCode.x, psgcCode.y))

head(philp_loans_new, 2)

# Report the share of loans with (TRUE) and without (FALSE) a missing
# regDesc, i.e. loans whose sub_region found no match in the regions file.
# The is.na() call is kept inline so the printed table header is unchanged.
cat("The proportion of REGION join matches missing due to a bit of unclean data are")
prop.table(table(is.na(philp_loans_new[["regDesc"]])))

# Keep only the rows of `data` that have no missing value in the given
# columns.
#
# data:        a data frame.
# desiredCols: column name(s) or index(es) that must be non-missing.
#
# Returns `data` restricted to the rows that are complete in `desiredCols`;
# all columns are retained.
completeFun <- function(data, desiredCols) {
  rows_without_na <- complete.cases(data[, desiredCols])
  data[rows_without_na, ]
}

# Keep only the loans that were matched to a region.
philp_data <- completeFun(philp_loans_new, "regDesc")

# Classify each loan by the gender(s) of its borrowers.
# The previous rule labelled everything that was not exactly "male" as
# "female", which also swept in blank values and multi-borrower strings such
# as "male, male" or "female, male".
# NOTE(review): this assumes borrower_genders is a comma-separated list of
# "male"/"female" entries (one per borrower) - confirm against the data.
# Word boundaries are required because "female" contains "male".
borrower_genders_chr <- as.character(philp_data$borrower_genders)
has_female_borrower <- grepl("\\bfemale\\b", borrower_genders_chr, perl = TRUE)
has_male_borrower <- grepl("\\bmale\\b", borrower_genders_chr, perl = TRUE)

philp_data$gender <- case_when(
  has_female_borrower & has_male_borrower ~ "mixed",
  has_male_borrower ~ "male",
  has_female_borrower ~ "female",
  TRUE ~ NA_character_ # blank / missing borrower information
)

# Segment: Female-Agriculture

# Loans to female borrowers in the Agriculture sector.
female_agr_loans <- philp_data %>%
  filter(gender == "female", sector == "Agriculture")

# One row per region: loan statistics followed by the averaged poverty
# indicators. Column order matters to code that indexes this table by
# position, so it is kept as: regDesc, num_loans, med_loan_amt,
# med_loan_term, then the six indicators below.
female_agr_summary <- female_agr_loans %>%
  group_by(regDesc) %>%
  summarise(
    num_loans = n_distinct(id),
    med_loan_amt = median(loan_amount),
    med_loan_term = median(term_in_months),
    agr_wage_farm_workers_female_2015 =
      mean(agr_wage_farm_workers_female_2015),
    avg_annual_total_incm_farm_households_02_03 =
      mean(avg_annual_total_incm_farm_households_02_03),
    avg_annual_farm_incm_farm_households_02_03 =
      mean(avg_annual_farm_incm_farm_households_02_03),
    avg_annual_off_farm_incm_farm_households_02_03 =
      mean(avg_annual_off_farm_incm_farm_households_02_03),
    avg_rural_income_2000 = mean(avg_rural_income_2000),
    total_emply_2016 = mean(total_emply_2016)
  )

# Columns entering the correlation matrix, selected by name. These are the
# same eight columns as positions 2-8 and 10 of female_agr_summary, i.e.
# every numeric summary column except avg_rural_income_2000.
corr_vars <- c(
  "num_loans",
  "med_loan_amt",
  "med_loan_term",
  "agr_wage_farm_workers_female_2015",
  "avg_annual_total_incm_farm_households_02_03",
  "avg_annual_farm_incm_farm_households_02_03",
  "avg_annual_off_farm_incm_farm_households_02_03",
  "total_emply_2016"
)
corr_1 <- cor(female_agr_summary[, corr_vars])

# Diverging red-white-blue palette generator.
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

options(repr.plot.width = 10, repr.plot.height = 10)

# Upper-triangle colour map, variables ordered by hierarchical clustering,
# coefficients printed in black, diagonal hidden.
# NOTE(review): sig.level / insig are passed without a p.mat, so they
# presumably have no effect on the plot - confirm, and supply p-values if
# blanking insignificant cells is actually wanted.
corrplot(
  corr_1,
  method = "color",
  col = col(200),
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  sig.level = 0.01,
  insig = "blank",
  diag = FALSE
)

options(repr.plot.width = 7, repr.plot.height = 7)

# Horizontal bar chart of one per-region metric from female_agr_summary.
#
# mapping: an aes() mapping that supplies x (the region) and y (the metric).
# y_label: axis title for the metric.
#
# Returns the ggplot object; bars are filled by region (regDesc).
plot_region_bars <- function(mapping, y_label) {
  ggplot(data = female_agr_summary, mapping = mapping) +
    geom_bar(aes(fill = regDesc), stat = "identity", width = 0.6) +
    scale_x_discrete("New Region") +
    scale_y_continuous(y_label) +
    coord_flip() +
    theme_grey()
}

# Farm income vs. number of loans, per region.
plt_5 <- plot_region_bars(
  aes(x = regDesc, y = avg_annual_farm_incm_farm_households_02_03),
  "Average Annual Farm Income - Farm household - 2002/03"
)
plt_6 <- plot_region_bars(
  aes(x = regDesc, y = num_loans),
  "Number of loans"
)

# grid.arrange() draws the stacked plots itself and returns the layout
# object invisibly. Printing that object afterwards only emits a text
# summary of the grob table, so it is not printed again.
grid_1 <- grid.arrange(plt_5, plt_6, nrow = 2, ncol = 1)

# Female farm-worker wage vs. number of loans, per region.
plt_7 <- plot_region_bars(
  aes(x = regDesc, y = agr_wage_farm_workers_female_2015),
  "Average Wage for Female Farm Workers"
)
plt_8 <- plot_region_bars(
  aes(x = regDesc, y = num_loans),
  "Number of loans"
)

grid_1 <- grid.arrange(plt_7, plt_8, nrow = 2, ncol = 1)

library(ggplot2)
library(rgdal)
library(maptools)
library(rgeos)
library(dplyr)

# Read the Philippines administrative-boundary object and flatten it into a
# data frame of polygon vertices, with polygons keyed (column `id`) by the
# NAME_1 attribute.
# NOTE(review): the file name says "adm2" while the variables say "adm1" -
# confirm which administrative level is intended.
ph.adm1.spdf <- readRDS("PHL_adm2.rds")
ph.adm1.df <- fortify(ph.adm1.spdf, region = "NAME_1")

# Names present both in the map and among the loans' sub-regions.
phadmunique <- unique(ph.adm1.df$id)
subregunique <- unique(philp_loans$sub_region)
common_regions <- intersect(phadmunique, subregunique)

# Median loan amount per matched sub-region.
gb_loan_amt_ph <- philp_loans %>%
  filter(sub_region %in% common_regions) %>%
  group_by(sub_region) %>%
  summarise(loan_amt_median = median(loan_amount))

gb_loan_amt_ph <- data.frame(gb_loan_amt_ph, stringsAsFactors = FALSE)
names(gb_loan_amt_ph)[1] <- "id"

# Attach the medians to the polygon vertices. left_join() keeps the rows of
# the left table in their original order, which matters because
# geom_polygon() connects vertices in row order; merge() gives no such
# guarantee and can scramble the outlines. The joined column is appended at
# the end instead of `id` being moved to the front; the map code refers to
# columns by name only.
ph.adm1.df <- left_join(ph.adm1.df, gb_loan_amt_ph, by = "id")

# Areas without any matched loan are given a median of 0.
ph.adm1.df$loan_amt_median[is.na(ph.adm1.df$loan_amt_median)] <- 0

# One coordinate pair per polygon of the boundary object (presumably its
# label point / centroid - confirm), computed once and reused for both axes.
centroid_xy <- coordinates(ph.adm1.spdf)
ph.adm1.centroids.df <- data.frame(
  long = centroid_xy[, 1],
  lat = centroid_xy[, 2]
)

# Carry over the id numbers and names of the corresponding administrative
# areas from the boundary object's attribute table.
ph.adm1.centroids.df[["ID_1"]] <- ph.adm1.spdf@data[["ID_1"]]
ph.adm1.centroids.df[["NAME_1"]] <- ph.adm1.spdf@data[["NAME_1"]]


options(repr.plot.width = 10, repr.plot.height = 10)

# Choropleth of the median loan amount: each polygon is filled according to
# which of five equal-width bins its loan_amt_median falls into.
# Grid lines, axis ticks/text and the panel border are removed so that only
# the map and its legend remain.
p <- ggplot(ph.adm1.df, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = cut(loan_amt_median, 5))) +
  # geom_text(data = ph.adm1.centroids.df, aes(label = NAME_1, x = long, y = lat, group = NAME_1), size = 1) +
  scale_fill_brewer('Loan Amount Distribution', palette = 'OrRd') +
  coord_map() +
  labs(x = " ", y = " ") +
  theme_bw() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    panel.border = element_blank()
  )

print(p)