# Kiva loans in the Philippines: join loans to regional poverty indicators,
# summarise female agricultural loans per region, and map median loan amounts.

# Suppress warnings globally, purely for better viewing of the notebook:
# warnings were producing too much unwanted clutter in the output.
options(warn = -1)

suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(repr))
suppressMessages(library(corrplot))
suppressMessages(library(gridExtra))

# Load data ----

kiva_loans <- read.csv("kiva_loans.csv", header = TRUE)
kiva_regions <- read.csv("kiva_mpi_region_locations.csv", header = TRUE)

philp_loans <- kiva_loans %>% filter(country == "Philippines")
philp_regions <- kiva_regions %>% filter(country == "Philippines")
head(philp_loans, 2)

# Number of distinct region values among the Philippine loans.
# (levels() is NULL for character columns and, for factors, still lists the
# levels of rows that filter() removed, so it is not used here.)
length(unique(as.character(philp_loans$region)))

philp_loans %>%
  group_by(region) %>%
  summarise(count = n()) %>%
  head(10)

# Derive sub_region ----

# Keep only the text after the last comma of the region string (the regex is
# greedy, so everything up to and including the final ", " is removed).
# sub() is vectorised, so no row-by-row loop is needed.
philp_loans$loan_region <- as.character(philp_loans$region)
vec <- sub(".*,\\s*", "", philp_loans$loan_region)
philp_loans$sub_region <- vec
philp_loans$sub_region[1:20]

# External province / region lookup ----

province <- read.csv("philippines_province.csv", header = TRUE)
region <- read.csv("philippines_region.csv", header = TRUE)

philippines_regions <- inner_join(province, region, by = "regCode")
philippines_regions$X <- NULL
head(philippines_regions, 2)

# Join philp_loans to philippines_regions on sub_region == provDesc.
# Rename provDesc by name (not by column position) so the join key stays
# correct even if the column order of the lookup files changes.
names(philippines_regions)[names(philippines_regions) == "provDesc"] <-
  "sub_region"

# Left join keeps every loan; regDesc is NA wherever the loan's sub_region has
# no match in the external regions file.
philp_loans_new <- left_join(philp_loans, philippines_regions,
                             by = "sub_region")
# prop.table(table(is.na(philp_loans_new$regDesc)))  # currently ~17% missing
head(philp_loans_new, 2)

# Poverty indicators per region ----

philp_poverty_indicators <- read.csv(
  "consolidated_philippines_poverty_data.csv",
  header = TRUE
)
head(philp_poverty_indicators, 5)

# philp_loans now holds loans + region lookup + poverty indicators;
# philp_loans_new is the same table without the lookup's id/psgc columns.
philp_loans <- left_join(philp_loans_new, philp_poverty_indicators,
                         by = "regDesc")
philp_loans_new <- philp_loans %>%
  select(-id.x, -id.y, -psgcCode.y, -psgcCode.x)
head(philp_loans_new, 2)

cat("The proportion of REGION join matches missing due to a bit of unclean data are")
prop.table(table(is.na(philp_loans_new$regDesc)))

# Keep only the rows of `data` that have no NA in the columns `desiredCols`.
#
# data        : a data frame.
# desiredCols : column names (or indices) that must be non-missing.
# Returns `data` restricted to the complete rows, all columns retained.
completeFun <- function(data, desiredCols) {
  # drop = FALSE keeps a data frame even when a single column is requested.
  completeVec <- complete.cases(data[, desiredCols, drop = FALSE])
  data[completeVec, ]
}

philp_data <- completeFun(philp_loans_new, "regDesc")

# NOTE(review): only rows whose borrower_genders is exactly "male" are
# labelled "male"; every other non-missing value becomes "female" -- confirm
# this is the intended treatment of any multi-borrower values.
philp_data$gender <- if_else(
  philp_data$borrower_genders == "male",
  "male",
  "female"
)

# Segment: Female-Agriculture ----

# One row per region: loan counts/medians plus the regional indicators.
# mean() of an indicator simply collapses the per-loan copies of that
# indicator within the region to a single value.
female_agr_summary <- philp_data %>%
  filter(gender == "female" & sector == "Agriculture") %>%
  group_by(regDesc) %>%
  summarise(
    num_loans = n_distinct(id),
    med_loan_amt = median(loan_amount),
    med_loan_term = median(term_in_months),
    agr_wage_farm_workers_female_2015 =
      mean(agr_wage_farm_workers_female_2015),
    avg_annual_total_incm_farm_households_02_03 =
      mean(avg_annual_total_incm_farm_households_02_03),
    avg_annual_farm_incm_farm_households_02_03 =
      mean(avg_annual_farm_incm_farm_households_02_03),
    avg_annual_off_farm_incm_farm_households_02_03 =
      mean(avg_annual_off_farm_incm_farm_households_02_03),
    avg_rural_income_2000 = mean(avg_rural_income_2000),
    total_emply_2016 = mean(total_emply_2016)
  )

# Correlate every summary column except regDesc (column 1) and
# avg_rural_income_2000 (column 9).
corr_1 <- cor(female_agr_summary[, c(2:8, 10)])

col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
)

options(repr.plot.width = 10, repr.plot.height = 10)
# NOTE(review): sig.level / insig are passed without a p.mat argument --
# confirm whether significance filtering was actually intended here.
corrplot(
  corr_1,
  method = "color",
  col = col(200),
  type = "upper",
  order = "hclust",
  addCoef.col = "black",       # add coefficient of correlation
  tl.col = "black",            # text label colour
  tl.srt = 90,                 # text label rotation
  sig.level = 0.01,
  insig = "blank",
  diag = FALSE                 # hide coefficients on the principal diagonal
)

# Per-region bar charts ----

# Layers shared by all per-region bar charts: one horizontal bar per region,
# filled by region, with `y_label` as the title of the value axis.
region_bar_layers <- function(y_label) {
  list(
    geom_bar(aes(fill = regDesc), stat = "identity", width = 0.6),
    scale_x_discrete("New Region"),
    scale_y_continuous(y_label),
    coord_flip(),
    theme_grey()
  )
}

options(repr.plot.width = 7, repr.plot.height = 7)

plt_5 <- ggplot(
  data = female_agr_summary,
  aes(x = regDesc, y = avg_annual_farm_incm_farm_households_02_03)
) +
  region_bar_layers("Average Annual Farm Income - Farm household - 2002/03")

plt_6 <- ggplot(
  data = female_agr_summary,
  aes(x = regDesc, y = num_loans)
) +
  region_bar_layers("Number of loans")

grid_1 <- grid.arrange(plt_5, plt_6, nrow = 2, ncol = 1)
grid_1

plt_7 <- ggplot(
  data = female_agr_summary,
  aes(x = regDesc, y = agr_wage_farm_workers_female_2015)
) +
  region_bar_layers("Average Wage for Female Farm Workers")

plt_8 <- ggplot(
  data = female_agr_summary,
  aes(x = regDesc, y = num_loans)
) +
  region_bar_layers("Number of loans")

grid_1 <- grid.arrange(plt_7, plt_8, nrow = 2, ncol = 1)
grid_1

# Choropleth of median loan amount ----

library(ggplot2)
library(rgdal)
library(maptools)
library(rgeos)
library(dplyr)

ph.adm1.spdf <- readRDS("PHL_adm2.rds")
ph.adm1.df <- fortify(ph.adm1.spdf, region = "NAME_1")

# Only sub-regions whose name also appears as a polygon id can be mapped.
phadmunique <- unique(ph.adm1.df$id)
subregunique <- unique(philp_loans$sub_region)
common_regions <- intersect(phadmunique, subregunique)

gb_loan_amt_ph <- philp_loans %>%
  filter(sub_region %in% common_regions) %>%
  group_by(sub_region) %>%
  summarise(loan_amt_median = median(loan_amount))
gb_loan_amt_ph <- data.frame(gb_loan_amt_ph)
names(gb_loan_amt_ph)[1] <- "id"

# left_join() keeps the rows of the fortified frame in their original order.
# geom_polygon() connects vertices in row order, and merge() does not
# guarantee that order is preserved, which can scramble the outlines.
ph.adm1.df <- left_join(ph.adm1.df, gb_loan_amt_ph, by = "id")

# Polygons without any matching loans are shown with a median of 0.
ph.adm1.df$loan_amt_median[is.na(ph.adm1.df$loan_amt_median)] <- 0

# Centroids of the administrative areas, with their ids and names
# (only used by the optional label layer below).
ph.adm1.centroids <- coordinates(ph.adm1.spdf)
ph.adm1.centroids.df <- data.frame(
  long = ph.adm1.centroids[, 1],
  lat = ph.adm1.centroids[, 2]
)
ph.adm1.centroids.df[, "ID_1"] <- ph.adm1.spdf@data[, "ID_1"]
ph.adm1.centroids.df[, "NAME_1"] <- ph.adm1.spdf@data[, "NAME_1"]

options(repr.plot.width = 10, repr.plot.height = 10)
p <- ggplot(ph.adm1.df, aes(x = long, y = lat, group = group)) +
  # Fill by median loan amount cut into 5 equal-width intervals.
  geom_polygon(aes(fill = cut(loan_amt_median, 5))) +
  # Optional area labels:
  # geom_text(data = ph.adm1.centroids.df,
  #           aes(label = NAME_1, x = long, y = lat, group = NAME_1),
  #           size = 1) +
  labs(x = " ", y = " ") +
  theme_bw() +
  scale_fill_brewer("Loan Amount Distribution", palette = "OrRd") +
  coord_map() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  ) +
  theme(
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  ) +
  theme(panel.border = element_blank())
print(p)