Castile and Leon: Crops

Description:

TODO

Author:

Sergio García Prado (garciparedes.me)

Setting up environment:

Cleaning up

In [1]:
rm(list = ls())

Library Imports

In [2]:
library(readr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(RSocrata)
library(ca)
library(forcats)
library(reshape2)
library(lubridate)
library(repr)
library(stringr)
Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘reshape2’

The following object is masked from ‘package:tidyr’:

    smiths


Attaching package: ‘lubridate’

The following object is masked from ‘package:base’:

    date

Data Adquisition:

In [3]:
my.read.csv <- function(url, filename) {
    if (!file.exists(filename)) {
        write_csv(read.socrata(url), filename)
    }
    return(read_csv(filename))
}
In [4]:
crops.herbaceous <- my.read.csv("https://analisis.datosabiertos.jcyl.es/resource/agu2-cspz.csv",
                               "./data/crops-herbaceous.csv")
Parsed with column specification:
cols(
  a_o = col_integer(),
  codigo_comarca = col_integer(),
  codigo_muncipio = col_integer(),
  codigo_producto = col_character(),
  codigo_provincia = col_integer(),
  comarca = col_character(),
  cultivo = col_character(),
  grupo_de_cultivo = col_character(),
  municipio = col_character(),
  ocupaci_n_primera_regad_o = col_integer(),
  ocupaci_n_primera_secano = col_integer(),
  ocupaciones_asociadas_regad_o = col_character(),
  ocupaciones_asociadas_secano = col_character(),
  ocupaciones_posteriores_regad_o = col_character(),
  ocupaciones_posteriores_secano = col_character()
)
In [5]:
crops.woody <- my.read.csv("https://analisis.datosabiertos.jcyl.es/resource/2vwa-si9n.csv",
                           "./data/crops-woody.csv")
Parsed with column specification:
cols(
  a_o = col_datetime(format = ""),
  codigo_comarca = col_integer(),
  codigo_muncipio = col_integer(),
  codigo_producto = col_character(),
  codigo_provincia = col_integer(),
  comarca = col_character(),
  cultivo = col_character(),
  grupo_de_cultivo = col_character(),
  municipio = col_character(),
  n_total_arboles_diseminados = col_integer(),
  superficie_regad_o_en_producci_n = col_integer(),
  superficie_regad_o_que_no_pruduce = col_integer(),
  superficie_secano_en_producci_n = col_integer(),
  superficie_secano_que_no_pruduce = col_integer()
)

Data Cleaning:

Remove Unnecesary Columns and Rename Interesting Columns

In [6]:
code.province.to.province <- function(code) {
    recode(code,
           "47" = "Valladolid",
           "24" = "León",
           "34" = "Palencia",
           "37" = "Salamanca",
           "9"  = "Burgos",
           "49" = "Zamora",
           "5"  = "Ávila",
           "42" = "Soria",
           "40" = "Segovia")
}
In [7]:
crops.herbaceous.use <- crops.herbaceous %>%
    select(a_o, codigo_provincia:ocupaci_n_primera_secano) %>%
    rename(year = a_o, 
           code.province = codigo_provincia, 
           crop = cultivo, 
           crop.group = grupo_de_cultivo, 
           region = comarca, 
           town = municipio, 
           irrigation = ocupaci_n_primera_regad_o, 
           dry = ocupaci_n_primera_secano) %>%
    gather(c(dry, irrigation), 
           key = "crop.technique", value = "area") %>%
    filter(!is.na(area) & area > 0 ) %>%
    mutate(crop = factor(crop),
           crop.group = as.factor(crop.group),
           region = as.factor(region),
           town = as.factor(town),
           province = as.factor(code.province.to.province(code.province))) %>%
    select(-code.province)
In [8]:
crops.woody.use <- crops.woody %>%
    select(a_o, codigo_provincia:municipio,
           superficie_regad_o_en_producci_n,
           superficie_secano_en_producci_n) %>%
    rename(year = a_o, 
           code.province = codigo_provincia, 
           crop = cultivo, 
           crop.group = grupo_de_cultivo, 
           region = comarca, 
           town = municipio, 
           irrigation = superficie_regad_o_en_producci_n, 
           dry = superficie_secano_en_producci_n) %>%
    gather(c(dry, irrigation), 
           key = "crop.technique", value = "area") %>%
    filter(!is.na(area) & area > 0 ) %>%
    mutate(crop = as.factor(crop),
           crop.group = as.factor(crop.group),
           region = as.factor(region),
           town = as.factor(town),
           province = as.factor(code.province.to.province(code.province)),
           year = year(round(as.POSIXct(year), "days")),
           crop.group = recode(crop.group, 
                               "VIÑEDO OCUPACIÓN PRINCIPAL" = "VIÑEDO",
                               "CITRICOS" = "FRUTALES"),
           crop.group = replace(crop.group, crop == "MIMBRERO" | 
                                    crop == "MORERA Y OTROS", "OTROS CULTIVOS LEÑOSOS"),
           crop.group = replace(crop.group, crop == "OLI ACEITUNA ACEITE" | 
                                    crop == "OLIVAR ACEIUNA MESA", "OLIVAR")) %>%
    select(-code.province)
In [9]:
crops.woody.use %>%
    select(crop.group) %>%
    pull(crop.group) %>%
    droplevels() %>%
    levels()
# crops.woody.use %>%
#    select(crop.group, crop) %>%
#    filter(crop.group == "VIÑEDO") %>%
#    pull(crop) %>%
#    droplevels() %>%
#    levels()
#
# crops.woody.use %>%
#    select(crop.group, crop) %>%
#    filter(crop.group == "VIÑEDO OCUPACIÓN PRINCIPAL") %>%
#    pull(crop) %>%
#    droplevels() %>%
#    levels()
#
# crops.woody.use %>%
#     select(crop.group, crop) %>%
#     filter(crop.group == "CITRICOS") %>%
#     pull(crop) %>%
#     droplevels() %>%
#     levels()
crops.woody.use %>%
    select(crop.group, crop) %>%
    filter(crop.group == "OLIVAR") %>%
    pull(crop) %>%
    droplevels() %>%
    levels()
crops.woody.use %>%
    select(crop.group, crop) %>%
    filter(crop.group == "OLIVAR Y OTROS C.I.") %>%
    pull(crop) %>%
    droplevels() %>%
    levels()

crops.woody.use %>%
    select(crop.group, crop) %>%
    filter(crop.group == "OTROS CULTIVOS LEÑOSOS") %>%
    pull(crop) %>%
    droplevels() %>%
    levels()
  1. 'FRUTALES'
  2. 'OLIVAR'
  3. 'OTROS CULTIVOS LEÑOSOS'
  4. 'VIÑEDO'
  5. 'VIVEROS'
  1. 'OLI ACEITUNA ACEITE'
  2. 'OLIVAR ACEIUNA MESA'
  1. 'MIMBRERO'
  2. 'MORERA Y OTROS'
In [10]:
crops.use <- rbind(crops.woody.use %>% mutate(crop.type = factor("woody")), 
                   crops.herbaceous.use %>% mutate(crop.type = factor("herbaceous")))

Data Exploration:

View Data Sample

In [11]:
sample_n(crops.use, 10)
yearregioncropcrop.grouptowncrop.techniqueareaprovincecrop.type
2015 AGUILAR VALLICO CULTIVOS FORRAJEROS BRAÑOSERA dry 12 Palencia herbaceous
2012 BENAVENTE Y LOS VALLES PATATA MED. ESTACION TUBERCULOS SAN CRISTOBAL DE ENTREVIÑASirrigation 5 Zamora herbaceous
2013 SANABRIA MANZANO FRUTALES PEQUE dry 3 Zamora woody
2013 BENAVENTE Y LOS VALLES COL Y REPOLLO HORTALIZAS PUEBLICA DE VALVERDE dry 2 Zamora herbaceous
2014 BOEDO-OJEDA ALFALFA CULTIVOS FORRAJEROS SANTIBAÑEZ DE ECLA dry 6 Palencia herbaceous
2014 SORIA OTROS C.INDUSTRIALES CULTIV. INDUSTRIALES GARRAY irrigation 13 Soria herbaceous
2010 CAMPOS GUISANTE SECO LEGUMINOSAS GRANO MARCILLA DE CAMPOS dry 161 Palencia herbaceous
2010 CAMPOS TRIGO CEREALES GRANO ESPINOSA DE VILLAGONZALO dry 985 Palencia herbaceous
2010 SEPULVEDA I CENTENO CEREALES GRANO SACRAMENIA irrigation 10 Segovia herbaceous
2015 CAMPOS-PAN MAIZ CEREALES GRANO VEZDEMARBAN irrigation 111 Zamora herbaceous
In [12]:
summary(crops.use)
      year                         region            crop       
 Min.   :2010   CAMPOS                : 10689   CEBADA : 19305  
 1st Qu.:2011   CUELLAR               :  9783   TRIGO  : 17031  
 Median :2013   DUERO BAJO            :  8400   GIRASOL: 14123  
 Mean   :2013   CAMPOS-PAN            :  8350   AVENA  : 12975  
 3rd Qu.:2015   AREVALO-MADRIGAL      :  8343   CENTENO: 11301  
 Max.   :2016   BENAVENTE Y LOS VALLES:  8063   ALFALFA: 10584  
                (Other)               :150839   (Other):119148  
                crop.group             town        crop.technique    
 CEREALES GRANO      :73516   TORO       :   499   Length:204467     
 CULTIVOS FORRAJEROS :41015   CUELLAR    :   387   Class :character  
 LEGUMINOSAS GRANO   :29436   VALLADOLID :   366   Mode  :character  
 CULTIV. INDUSTRIALES:25781   TORDESILLAS:   343                     
 HORTALIZAS          :12645   FUENTESAUCO:   328                     
 TUBERCULOS          : 6903   ZAMORA     :   322                     
 (Other)             :15171   (Other)    :202222                     
      area                province          crop.type     
 Min.   :    1.00   Valladolid:33191   woody     : 15139  
 1st Qu.:    4.00   Zamora    :29338   herbaceous:189328  
 Median :   16.00   Burgos    :28705                      
 Mean   :   96.64   Salamanca :24236                      
 3rd Qu.:   67.00   Palencia  :23666                      
 Max.   :10191.00   León      :20747                      
                    (Other)   :44584                      
In [13]:
options(repr.plot.width = 10, repr.plot.height = 5)
crops.use %>%
    select(year, area, crop.type, crop.technique, province) %>%
    group_by(year, crop.type, crop.technique, province) %>%
    summarise(area = sum(area)) %>%
    group_by(crop.type, crop.technique, province) %>%
    mutate(area = area / sum(area)) %>%
    ggplot(aes(x=year, y = area, 
               group = province,
               col = province)) +
        labs(title = "Crop Area Ratio by Year and irrigation type", 
             x = "Year", y = "Area") + 
        facet_wrap( ~ crop.type + crop.technique, scales = "free_y") +
        geom_line()
In [14]:
options(repr.plot.width = 10, 
        repr.plot.height = 15)
crops.use %>%
    select(crop, crop.type, crop.group, crop.technique, area) %>%
    group_by(crop.group, crop.type, crop.technique, crop) %>%
    summarise(area = sum(area) / (max(crops.use$year) - min(crops.use$year))) %>%
    top_n(5, area) %>%
    ggplot(aes(x = crop.technique, y = area, fill = crop, label = str_to_title(crop) )) + 
        geom_bar(width = 1, stat = "identity") +
        theme(legend.position='none') + 
        labs(title = "Woody Crop Decomposition", 
             x = "", y = "Area") + 
        facet_wrap(crop.group ~ crop.type, scales = "free_y", ncol = 3)+ 
        geom_text(size = 3, position = position_stack(vjust = 0.5), 
                  colour = "white")
In [15]:
options(repr.plot.width = 10, 
        repr.plot.height = 4)
crops.use %>%
    select(area, crop.type, crop.technique, province) %>%
    group_by(province, crop.type, crop.technique) %>%
    summarise(area = sum(area)) %>%
    group_by(province, crop.type) %>%
    mutate(area = area / sum(area)) %>%
    ggplot(aes(x = "", y = area, fill = crop.technique)) +
        geom_bar(stat = "identity", width = 1) +
        labs(title = "Crop Method Predominance", 
             x = "", y = "", fill = "Irrigated") +
        coord_polar(theta = "y") + 
        facet_wrap(crop.type ~ province, ncol= 9) +
        theme(axis.text = element_blank(),
            axis.ticks = element_blank(),
            panel.grid  = element_blank())
In [16]:
options(repr.plot.width = 10, 
        repr.plot.height = 35)
crops.use %>%
  select(year, crop.technique, crop.group,crop.type, crop, area) %>%
  group_by(year, crop.technique, crop.group,crop.type, crop) %>%
  summarise(area = sum(area)) %>%
  group_by(year, crop.technique, crop.type) %>%
  top_n(10, area) %>%
  ggplot(aes(x = interaction(crop, area), y = area, fill = crop.group)) + 
    geom_col() + 
    theme(legend.position = 'top') +  
    guides(fill = guide_legend(nrow = 2)) +
    labs(title = "Dry Crop's Area by Year", 
         x = "Crop", y = "Area", fill = "Crop Group") +
    coord_flip() + 
    facet_wrap( ~ crop.type + year + crop.technique, scales = "free", ncol = 2) +
    scale_x_discrete(labels = function(x) str_to_title(gsub("\\..+", "", x)))