# Data-wrangling walkthrough on the INDO-DAPOER csv: import, inspect, subset,
# rename, join, recode missing values, reshape, then save and reload.
#
# Packages used through `pkg::` only (must be installed, are not attached):
# janitor, stringr, summarytools, tidyr.

# install.packages("rio")
library(rio)
library(magrittr)
library(dplyr)

# Import and first look ----

datakab <- import(
  "https://raw.githubusercontent.com/msaidf/statek/master/content/indo-dapoer_data.csv"
)
datakab
nrow(datakab)
head(datakab, n = 3)
tail(datakab)

# Dropping trailing rows ----
# Four ways to look at the data without rows 12013-12017
# (presumably non-data footer rows with an empty `Series Code` -- confirm).

tail(datakab[1:12012, ], 2)

datakab[-(12013:12017), ] %>%
  tail(3)

which(datakab$`Series Code` != "") %>%
  datakab[., ] %>%
  tail(2)

dplyr::filter(datakab, `Series Code` != "") %>%
  tail(2)

# Keep only the rows that carry a series code.
datakab <- dplyr::filter(datakab, `Series Code` != "")

# Selecting columns ----

c(1, 3:5, ncol(datakab)) %>%
  datakab[, .] %>%
  tail(4)

datakab[, c("Region Code", "Series Name")] %>%
  tail()

select(
  datakab,
  "Region Code":"Series Code",
  -"Series Name",
  region_name = "Region Name",
  contains("YR"),
  -contains("YR201")
) %>%
  tail(4)

# Renaming columns ----

rename(
  datakab,
  region_name = "Region Name",
  series_code = "Series Code"
)[100:102, ]

names(datakab)[1:3] <- toupper(names(datakab)[1:3])
names(datakab) %>% t()

datakab <- janitor::clean_names(datakab)
names(datakab) %>% t()

# Strip the prefix matched by "x20[0-9][0-9]_" from every column whose name
# starts with "x20". A formula lambda replaces the deprecated funs().
datakab %<>%
  rename_at(
    vars(starts_with("x20")),
    ~ stringr::str_replace(., "x20[0-9][0-9]_", "")
  )
datakab %>%
  names() %>%
  t()

# Splitting and re-joining ----

data1 <- datakab %>%
  select(region_name:series_code, contains("201"))
names(data1) %>% t()

data2 <- datakab %>%
  select(-contains("201"))
names(data2) %>% t()

merge(data1, data2, by = c("region_code", "series_code")) %>%
  tail(3)

# No `by`: joins on every column the two tables share.
dplyr::inner_join(data1, data2) %>%
  tail(3)

# Missing values ----

datakab[2, 5]
# The outer parentheses make the assignment print the assigned value.
(datakab[2, 5] <- NA)

datakab %>%
  select(yr2000, yr2001) %>%
  summarytools::dfSummary(graph.col = FALSE, na.col = FALSE) %>%
  print()

# Preview only (result is not assigned): recode ".." to NA in two columns.
datakab %>%
  mutate(
    yr2000 = ifelse(yr2000 == "..", NA, yr2000),
    yr2001 = ifelse(yr2001 == "..", NA, yr2001)
  ) %>%
  select(yr2000:yr2002) %>%
  summarytools::dfSummary(graph.col = FALSE, na.col = FALSE) %>%
  print()

# Recode ".." to NA in every column starting with "yr20", then convert those
# columns to numeric.
datakab %<>%
  mutate_at(vars(starts_with("yr20")), ~ ifelse(. == "..", NA, .))
datakab %<>%
  mutate_at(vars(starts_with("yr20")), as.numeric)

datakab %>%
  select(starts_with("yr201")) %>%
  summarytools::dfSummary(graph.col = FALSE, na.col = FALSE) %>%
  print()

# Reshaping ----
# tidyr and stringr are never attached in this script, so their functions are
# namespace-qualified.

# Wide to long: stack the columns starting with "yr" into year/val pairs.
datakab %<>% tidyr::gather(year, val, starts_with("yr"))
datakab %>% head(2)

# Long to wide: turn `year` into an integer, drop `series_code`, and spread
# `series_name` into one column per series.
datakab %<>%
  mutate(year = as.integer(stringr::str_replace(year, "yr", ""))) %>%
  select(-series_code) %>%
  tidyr::spread(key = series_name, value = val)
datakab %>% head(2)

datakab %<>% janitor::clean_names()

# Drop everything from "_in_" to the end of each column name.
names(datakab) %<>% stringr::str_replace_all("_in_.+$", "")
datakab %>%
  names() %>%
  t()

# Saving and reloading ----

save.image(file = ".RData")
save(datakab, file = "datakab.rda")

rm(data1, data2)
ls()

# Remove only the object this script still owns; rm(list = ls()) would also
# delete anything else the user has in the workspace.
rm(datakab)
ls()

load("datakab.rda")
ls()

saveRDS(datakab, file = "datakab.rds")
indodapoer <- readRDS("datakab.rds")
ls()