From 136cdf3a0635a4c0f280a9d50fef1cf535969eb5 Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Mon, 2 Mar 2020 17:56:38 +0100
Subject: [PATCH] Cleaned up 04_buildHeader

---
 code/04_buildHeader.Rmd | 656 ++++++++++++++++++++++++----------------
 1 file changed, 393 insertions(+), 263 deletions(-)

diff --git a/code/04_buildHeader.Rmd b/code/04_buildHeader.Rmd
index f65feaf..9e7e932 100644
--- a/code/04_buildHeader.Rmd
+++ b/code/04_buildHeader.Rmd
@@ -26,8 +26,11 @@ This report documents the construction of the header file for sPlot 3.0. It is b
 ```{r results="hide", message=F, warning=F}
 knitr::opts_chunk$set(echo = TRUE)
 library(tidyverse)
+library(viridis)
 library(readr)
 library(xlsx)
+library(knitr)
+library(kableExtra)
 
 ## Spatial packages
 library(rgdal)
@@ -35,6 +38,10 @@ library(sp)
 library(rgeos)
 library(raster)
 library(rworldmap)
+library(elevatr)
+library(sf)
+library(rnaturalearth)
+library(dggridR)
 
 #save temporary files
 write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron'))
@@ -42,6 +49,7 @@ write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_
 rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")
 ```
 
+ 
 ```{bash, eval=F}
 # escape all double quotation marks. Run in Linux terminal
 #sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv
@@ -53,7 +61,7 @@ rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")
 
 ```
 
-
+## 1 Import data
 Import header data
 ```{r import}
 header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv", locale = locale(encoding = 'UTF-8'),
@@ -111,7 +119,8 @@ header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv",
 )) %>% 
   rename(Sparse.vegetation=`Sparse vegetation`, 
          ESY=EUNIS) %>% 
-  dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE)  #too sparse information to be useful
+  dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE) %>%   #too sparse information to be useful
+  dplyr::select(-PlotID) #identical to PlotObservationID
 ```
 
 The following column names occurred in the header of sPlot v2.1 and are currently missing from the header of v3.0  
@@ -130,7 +139,7 @@ The following column names occurred in the header of sPlot v2.1 and are currentl
 Columns #1, #2, #3, #10, #11 will be dropped. The others will be derived below.
 
 
-## 1 Solve spatial problems
+### 1.1 Solve spatial problems
 There are 2020 plots in the Nile dataset without spatial coordinates. Assign manually with wide (90km) location uncertainty.
 ```{r}
 header <- header0 %>% 
@@ -151,19 +160,19 @@ toswap <- c(which(header$Dataset=="Japan" & header$Latitude>90),
 header[toswap, c("Latitude", "Longitude")] <- header[toswap, c("Longitude", "Latitude")]
 ```
 
-There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value to indicate this is an estimation, rather than a measure.
+There are `r nrow(header %>% filter(is.na("Location uncertainty (m)")))`. As a first approximation, we assign the median of the respective dataset, as a negative value to indicate this is an estimation, rather than a measure.
 ```{r}
 header <- header %>% 
   left_join(header %>% 
               group_by(Dataset) %>% 
               summarize(loc.uncer.median=median(`Location uncertainty (m)`, na.rm=T)), 
             by="Dataset") %>% 
-  mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)`), 
+  mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)` & !is.na(Latitude)), 
                                             -abs(loc.uncer.median), 
                                             `Location uncertainty (m)`)) %>% 
   dplyr::select(-loc.uncer.median)
 ```
-There are still `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))` plots with no estimation of location uncertainty.  
+There are still `r nrow(header %>% filter(is.na("Location uncertainty (m)")))` plots with no estimation of location uncertainty.  
 \newline
 Assign plot size to plots in the Patagonia dataset (input of Ana Cingolani)
 ```{r}
@@ -173,12 +182,11 @@ header <- header %>%
 ```
 
 ## 2 Formations
-Fill out the columns `Forest:Sparse.vegetation` with 0\\NAs, where necessary. Create columns `is.forest` and `is.non.forest` using script developed for sPlot 2.1  
+Fill out the columns `Forest:Sparse.vegetation` with NAs, where necessary. Create columns `is.forest` and `is.non.forest` using script developed for sPlot 2.1  
 ~~
 I am not assigning plots to Faber-Langedon formation at this stage, as this is only possible for European plots having an ESY classification.
 
 
-
 ```{r}
 eunis.key <- read.xlsx("../_input/EUNIS_WFT.xlsx", sheetIndex = "Sheet1", endRow = 246) %>% 
   dplyr::select(EUNIS_code, NATURALNESS:SPARSE_VEG) %>% 
@@ -193,8 +201,6 @@ eunis.key <- read.xlsx("../_input/EUNIS_WFT.xlsx", sheetIndex = "Sheet1", endRow
 
 header.backup <- header
 
-#test <- header.backup %>% slice(100000:110000) %>% dplyr::select(PlotObservationID, Dataset, ESY, Forest:Shrubland)
-
 header <- header %>% # header.backup %>% 
   mutate(ESY=as.character(ESY)) %>% 
   #mutate(ESY=ifelse(ESY=="?", NA, ESY)) %>% 
@@ -249,7 +255,7 @@ header <- header %>%
   mutate(`Plants recorded`=factor(`Plants recorded`, exclude = "#N/A"))
 ```
 Align labels consortium 
-```{r}
+```{r message=F}
 databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv")
 
 header <- header %>% 
@@ -338,7 +344,7 @@ continent.out$continent[which(is.na(continent.out$continent))] <- as.character(c
 save(continent.out, file = "../_derived/continent.out")
 stopCluster(cl)
 ```
-Reload and manipulate continent. Correct header.sRoot
+Reload, manipulate continent and attach to header
 ```{r}
 load("../_derived/continent.out")
 header <- header %>% 
@@ -354,97 +360,52 @@ header <- header %>%
 ```
 
 ## 4.2 Assign to sBiomes
-```{r, eval=F}
+Performed in EVE HPC cluster using function `A98_PredictorsExtract.R`. Divided in 99 chunks.
+```{r eval=F}
 sBiomes <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/Biomes_sPlot/sBiomes.shp")
 crs(sBiomes) <- crs(header.shp)
 
-library(parallel)
-library(doParallel)
-ncores=10
-cl <- makeForkCluster(ncores, outfile="" )
-registerDoParallel(cl)
-
-#divide in chunks #should take around 40'
-indices <- 1:length(header.shp)
-chunks <- split(indices, sort(indices%%99))
-chunkn <- length(chunks)
-sBiomes.out <- foreach(i=1:chunkn, .packages=c('raster'), .combine=rbind) %dopar% { 
-  sBiomes.tmp <- sp::over(x=header.shp[chunks[[i]],], y=sBiomes)
-}
-sum(is.na(sBiomes.out$Name))
-sBiomes.out.backup <- sBiomes.out
-stopCluster(cl)
-```
-There are `r sum(is.na(sBiomes.out$Name))` still to assign. Using the `dist2Line` is not feasible due to long computing times (3' for each plot!). Fall back on raster version of sBiomes (res 0.1°).
-
-```{r}
-load("../_derived/sBiome.out.RData")
-sBiomes01 <- raster("/data/sPlot/users/Francesco/Ancillary_Data/Biomes_sPlot/sBiome_raster01/sBiomes_raster01.tif")
-##  Fix NAs
-toassign <- header.shp[which(is.na(sBiomes.out$Name)),] #116878 not assigned (!)
-crs(sBiomes01) <- crs(toassign)
-
-system.time(sBiomes.out2 <- extract(sBiomes01, toassign))
-
-
-toassign3 <- toassign[which(sBiomes.out2==0),] #91793 not assigned (!)
-robust.mode <- function(x){if(any(x!=0)) {
-                              a <- x[which(x!=0)] #exclude zero (i.e. NAs)
-                              return(as.numeric(names(sort(table(a), decreasing=T))[1]))} else
-                                return(NA)        
-  }
-
-ncores=10
-cl <- makeForkCluster(ncores, outfile="" )
-registerDoParallel(cl)
-sBiomes.out3 <- foreach(i=1:length(toassign3), .packages=c('raster'), .combine=rbind) %dopar% { 
-  sBiomes.out.tmp <- extract(sBiomes01, toassign3[i,], buffer=10000, fun=robust.mode)
-}
-
-stopCluster(cl) 
-save(sBiomes.out, sBiomes.out2, sBiomes.out3, file = "../_derived/sBiome.out.RData")
-
-
-toassign4 <- toassign3[which(is.na(sBiomes.out3)),] ##37887 still to classify
-ncores=10
-cl <- makeForkCluster(ncores, outfile="" )
+cl <- makeForkCluster(5, outfile="")
 registerDoParallel(cl)
-sBiomes.out4 <- foreach(i=1:length(toassign4), .packages=c('raster'), .combine=rbind) %dopar% { 
-  sBiomes.out.tmp <- extract(sBiomes01, toassign4[i,], buffer=50000, fun=robust.mode)
-}
-
-stopCluster(cl) 
-save(sBiomes.out, sBiomes.out2, sBiomes.out3, sBiomes.out4, file = "../_derived/sBiome.out.RData")
 
+clusterEvalQ(cl, {
+  library(rgdal)
+  library(raster)
+  library(sp)
+  library(elevatr)
+  library(dplyr)
+  })
 
-toassign5 <- toassign4[which(is.na(sBiomes.out4)),] #4158 still to assign
-nearestBiome <- foreach(i=1:1000, .packages=c('raster'), .combine=rbind) %dopar% { 
-  nearestBiome.tmp <- geosphere::dist2Line(toassign[i,], sBiomes)
+sBiomes.out <- foreach(i=1:99, .combine=rbind) %dopar% {
+  source("A98_PredictorsExtract.R")
+  PredExtr(header.shp, myfunction=NA, output="../_derived/sBiomes/",  
+                     toextract=sBiomes, typp="shp", ncores=1, chunkn=99, chunk.i=i)
 }
-sBiomes.out[which(is.na(sBiomes.out$Name))] <- as.character(sBiomes@data[nearestBiome[,"ID"],])
-sum(is.na(sBiomes.out$Name))
-save(sBiomes.out, file = "../_derived/sBiome.out.RData")
-
 stopCluster(cl)
 
-
 ```
 
-
 Reimport output and join to header
-```{r}
-
+```{r, message=F, warning=F}
+sBiome.files <- list.files("../_derived/sBiomes", pattern="sBiomes-[0-9]+.csv", full.names=T)
+sBiomes.out <- do.call(rbind, lapply(sBiome.files, function(x) {read_csv(x)}))
 header <- header %>% 
-  bind_rows(sBiomes.out %>% 
-              dplyr::select(Name, BiomeID) %>% 
-              dplyr::rename(sBioem=Name, s))
-
+  left_join(sBiomes.out %>% 
+              rename(PlotObservationID=X1) %>% 
+              dplyr::select(PlotObservationID, Name, BiomeID) %>% 
+              dplyr::rename(sBiome=Name, sBiomeID=BiomeID), 
+            by="PlotObservationID")
 
 ```
 
+There are `r sum(is.na(sBiomes.out$Name))` unassigned plots. 
+
 
 ## 4.3 Extract elevation
-Split data into tiles of 1 x 1 degrees, and create sp::SpatialPointsDataFrame files. Only for plots having a location uncertainty < 50 km. (Include also plots without location uncertainty, arbitrarily set to 100 m)
+
+Extract elevation for each plot. Loops over tiles of 1 x 1°, projects to mercator, and extract elevation for plot coordinates, as well as 2.5, 50, and 97.5 quantiles for a buffer area having a radius equal to the location uncertainty of each plot (but only if location uncertainty < 50 km). DEM derive from package [elevatr](https://cran.r-project.org/web/packages/elevatr/vignettes/introduction_to_elevatr.html#get_raster_elevation_data), which uses the [Terrain Tiles on Amazon Web Services](https://registry.opendata.aws/terrain-tiles/). Resolutions of DEM rasters vary by region. I set a zoom factor z=10, which corresponds to ~ 75-150 m. Sources are: SRTM, data.gov.at in Austria, NRCAN in Canada, SRTM, NED/3DEP 1/3 arcsec, data.gov.uk in United Kingdom, INEGI in Mexico, ArcticDEM in latitudes above 60°, LINZ in New Zealand, Kartverket in Norway, as described [here](https://github.com/tilezen/joerd/blob/master/docs/data-sources.md).  
+\newline
+Split data into tiles of 1 x 1 degrees, and create `sp::SpatialPointsDataFrame` files. Only for plots having a location uncertainty < 50 km.
 ```{r create tiles}
 header.tiles <- header %>%
   dplyr::select(PlotObservationID, Dataset, Longitude, Latitude, `Location uncertainty (m)`) %>%
@@ -456,20 +417,8 @@ header.tiles <- header %>%
   mutate(tilenam=factor(paste(Longitude_tile, Latitude_tile)))
   
 ```
-There are `r nrow(header.tiles)` plots out of `r nrow(header)` plots with Location uncertainty <= 50km (or absent).  
-  
-
-
-Extract elevation for each plot. Loops over tiles of 1 x 1°, projects to mercator, and extract elevation for plot coordinates, as well as 2.5, 50, and 97.5 quantiles for a buffer area having a radius equal to the location uncertainty of each plot (but only if location uncertainty < 50 km). DEM derive from package [elevatr](https://cran.r-project.org/web/packages/elevatr/vignettes/introduction_to_elevatr.html#get_raster_elevation_data), which uses the [Terrain Tiles on Amazon Web Services](https://registry.opendata.aws/terrain-tiles/). Resolutions of DEM rasters vary by region. I set a zoom factor z=10, which corresponds to ~ 75-150 m. Sources are: SRTM, data.gov.at in Austria, NRCAN in Canada, SRTM, NED/3DEP 1/3 arcsec, data.gov.uk in United Kingdom, INEGI in Mexico, ArcticDEM in latitudes above 60°, LINZ in New Zealand, Kartverket in Norway, as described [here](https://github.com/tilezen/joerd/blob/master/docs/data-sources.md)
-
-```{r message==F}
-library(elevatr)
-
-continent.high.merc <- spTransform(continent.high, CRS( "+init=epsg:3857 +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m
-+nadgrids=@null +no_defs"))
-
-require(parallel)
-require(doParallel)
+There are `r nrow(header.tiles)` plots out of `r nrow(header)` plots with Location uncertainty <= 50km (or absent). The total number of tiles is `r nlevels(header.tiles$tilenam)`.    
+```{r eval=F}
 cl <- makeForkCluster(5, outfile="")
 registerDoParallel(cl)
 
@@ -481,75 +430,15 @@ clusterEvalQ(cl, {
   library(dplyr)
   })
 
-#create list of tiles for which dem could not be extracted
-myfiles <- list.files("../_derived/elevatr/", pattern = "[A-Za-z]*_[0-9]+\\.RData$")
-done <- NULL
-done <- as.numeric(unlist(regmatches(myfiles, gregexpr("[[:digit:]]+", myfiles))))
-todo <- 1:nlevels(header.tiles$tilenam)
-todo <- todo[-which(todo %in% done)]
-#todo <- sample(todo, replace=F)
-#foreach(i = 1:nlevels(header.sp$tilenam)) %do% {
-#foreach(i = todo) %dopar% {  
-for(i in todo){
-  #create sp and project data
-  if(nrow(header.tiles %>% 
-          filter(tilenam %in% levels(header.tiles$tilenam)[i])) ==0 ) next()
-  sp.tile <- SpatialPointsDataFrame(coords=header.tiles %>% 
-                                    filter(tilenam %in% levels(header.tiles$tilenam)[i]) %>%
-                                    dplyr::select(Longitude, Latitude),
-                                  data=header.tiles %>% 
-                                    filter(tilenam %in% levels(header.tiles$tilenam)[i]) %>%
-                                    dplyr::select(-Longitude, -Latitude),
-                                  proj4string = CRS("+init=epsg:4326"))
-  sp.tile <- spTransform(sp.tile, CRSobj = CRS("+init=epsg:3857 +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0
-                                               +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null
-                                               +no_defs ")) #project to mercator
-
-  #retrieve dem raster
-  tryCatch(raster.tile <- get_elev_raster(sp.tile, z=9, expand=max(sp.tile$`Location uncertainty (m)`), clip="bbox"),
-        error = function(e){e}
-  )
-  if(!exists("raster.tile")) {
-    raster.tile <- NA
-    save(raster.tile, file = paste("../_derived/elevatr/elevation_tile_", i, "failed.RData", sep=""))
-    message(paste("tile", i, "doesn't work!, skip to next"))
-    rm(raster.tile)
-    next
-  }
-  # clip dem tile with continent shape
-  raster.tile <- mask(raster.tile, continent.high.merc)
-  
-  #extract and summarize elevation data
-  elev.tile <- raster::extract(raster.tile, sp.tile, small=T)
-  elev.tile.buffer <- raster::extract(raster.tile, sp.tile, 
-                                      buffer=sp.tile@data$`Location uncertainty (m)`, 
-                                      small=T)
-  tmp <- round(mapply( quantile, 
-                            x=elev.tile.buffer,
-                            #center=elev.tile,
-                            probs=rep(c(0.025, 0.5, 0.975), each=length(elev.tile)), 
-                            #loc.uncert=sp.tile$`Location uncertainty (m)`, 
-                       na.rm=T))
-  elev.q95 <- setNames(data.frame(matrix(tmp, ncol = 3, nrow = length(elev.tile.buffer))), 
-         c("Elevation_q2.5", "Elevation_median", "Elevation_q97.5"))
-  output.tile <- data.frame(PlotObservationID=sp.tile$PlotObservationID, 
-                          elevation=round(elev.tile), 
-                          elev.q95, 
-                          DEM.res=res(raster.tile)[1]) 
-
- #save output
-  save(output.tile, file = paste("../_derived/elevatr/elevation_tile_", i, ".RData", sep=""))
-  print(i)
-}
+# Divided in 99 chunks
+elevation.out <- foreach(i=1:99, .combine=rbind) %dopar% {
+  source("A97_ElevationExtract.R")
+  ElevationExtract(header.shp, output, ncores=1, chunk.i=1)}
 stopCluster(cl)
-
 ```
 
-
-###### TO CHECK BELOW HERE
-
 For those tiles that failed, extract elevation of remaining plots one by one
-```{r}
+```{r, eval=F}
 #create list of tiles for which dem could not be extracted
 myfiles <- list.files("../_derived/elevatr/")
 failed <- list.files("../_derived/elevatr/", pattern = "[A-Za-z]*_[0-9]+failed\\.RData$")
@@ -567,20 +456,36 @@ sp.tile0 <- spTransform(sp.tile0, CRSobj = CRS("+init=epsg:3857 +proj=merc +a=63
                                                +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null
                                                +no_defs ")) #project to mercator
 output.tile <- data.frame(NULL)
+
+cl <- makeForkCluster(5, outfile="")
+registerDoParallel(cl)
+
+clusterEvalQ(cl, {
+  library(rgdal)
+  library(raster)
+  library(sp)
+  library(elevatr)
+  library(dplyr)
+  })
+
 #Loop over all plots
-for(i in 1:10){#nrow(sp.tile0)){
+#for(i in nrow(sp.tile0)){
+elevation.failed <- foreach(i=1:nrow(sp.tile0), .packages=c('raster'), .combine=rbind) %dopar% { 
   sp.tile <- sp.tile0[i,]
   tryCatch(raster.tile <- get_elev_raster(sp.tile, z=10, 
                                           expand=max(sp.tile$`Location uncertainty (m)`)),
-        error = function(e){#bind_rows(output.tile, 
-                            #          data.frame(PlotObservationID=sp.tile$PlotObservationID,
-                            #                     elevation=NA,
-                            #                     Elevation_q2.5=NA,
-                            #                     Elevation_median=NA,
-                            #                     Elevation_q97.5=NA,
-                            #                     DEM.res=NA))
+        error = function(e){
           print(paste("could not retrieve DEM for", sp.tile$PlotObservationID))}
           )
+   if(!exists("raster.tile")) {
+    output.tile <- data.frame(PlotObservationID==sp.tile$PlotObservationID, 
+                              elevation=NA, 
+                              Elevation_q2.5=NA, 
+                              Elevation_median=NA,
+                              Elevation_q97.5=NA,
+                              DEM.res=NA)
+    return(output.tile)
+  } else {
   # clip dem tile with continent shape
   raster.tile <- mask(raster.tile, continent.high.merc)
   
@@ -595,63 +500,100 @@ for(i in 1:10){#nrow(sp.tile0)){
                             probs=rep(c(0.025, 0.5, 0.975), each=length(elev.tile)), 
                             #loc.uncert=sp.tile$`Location uncertainty (m)`, 
                        na.rm=T)))
-   output.tile <- bind_rows(output.tile, 
-                        data.frame(PlotObservationID=sp.tile$PlotObservationID, 
+   output.tile <- data.frame(PlotObservationID=sp.tile$PlotObservationID, 
                             elevation=round(elev.tile), 
                             elev.q95, 
                             DEM.res=res(raster.tile)[1]) %>%
-    rename(Elevation_q2.5=X2.5., Elevation_median=X50., Elevation_q97.5=X97.5.))
+    rename(Elevation_q2.5=X2.5., Elevation_median=X50., Elevation_q97.5=X97.5.)
+   return(output.tile)
+   }
 }
-save(output.tile, file = paste("../_derived/elevatr/elevation_tile_", 0, ".RData", sep=""))
+stopCluster(cl)
+save(elevation.failed, file = "../_derived/elevatr/elevation_missing.RData")
 ```
 
-
 Compose tiles into a single output, and export
-```{r }
-myfiles <- list.files(path)
-myfiles <- myfiles[grep(pattern="*.RData$", myfiles)]
+```{r eval=F}
+myfiles <- list.files("../_derived/elevatr/", pattern = "elevation_tile_[0-9]+\\.RData$", full.names = T)
+
 #create empty data.frame
-elevation.out <- data.frame(PlotObservationID=header$PlotObservationID, 
-                        elevation=NA, 
-                        Elevation_q2.5=NA, 
-                        Elevation_median=NA, 
-                        Elevation_q97.5=NA, 
-                        DEM.res=NA)
+elevation.out <- matrix(NA, nrow=nrow(header.tiles), ncol=6)
+elevation.out <- as.data.frame(elevation.out)
+colnames(elevation.out) <- c("PlotObservationID", "elevation", "Elevation_q2.5", "Elevation_median", "Elevation_q97.5","DEM.res")
+elevation.out$PlotObservationID <- header.tiles$PlotObservationID
+
+tmp <- NULL
 for(i in 1:length(myfiles)){
-  load(paste(path, myfiles[i], sep=""))
+  load(myfiles[i])
   #attach results to empty data.frame
-  mymatch <- match(output.tile$PlotObservationID, elevation.out$PlotObservationID)
-  elevation.out[mymatch,] <- output.tile
-  if(i %in% seq(1,length(myfiles), by=25)){print(i)}
+  tmp <- bind_rows(tmp, output.tile)
+  if(i %in% seq(5000, length(myfiles), by=5000)){
+    mymatch <- base::match(x=tmp$PlotObservationID, table=elevation.out$PlotObservationID)
+    mymatch <- mymatch[!is.na(mymatch)]
+    elevation.out[mymatch,] <- tmp
+    tmp <- NULL
+    print(paste("Attached first", i, "files"))
+  }
+  if(i %in% seq(1,length(myfiles), by=250)){print(i)}
 }
 
-write_csv(elevation.out, path ="../_derived/elevatr/Elevation_out_v301.csv")
+load(file = "../_derived/elevatr/elevation_missing.RData")
+
+mymatch <- base::match(x=elevation.failed$PlotObservationID, table=elevation.out$PlotObservationID)
+mymatch <- mymatch[!is.na(mymatch)]
+elevation.out[mymatch,] <- elevation.failed
 
+write_csv(elevation.out, path ="../_derived/elevatr/elevation.out.csv")
 ```
-Reimport output and check
+
+
+Reimport output, attach to header and check
 ```{r message=F}
-elevation.out <- read_csv("../_derived/elevatr/Elevation_out.csv")
+elevation.out <- read_csv("../_derived/elevatr/elevation.out.csv")
 knitr::kable(head(elevation.out,10), 
-  caption="Example of elevation output") %>%
+  caption="Example of elevation output (only first 10 shown)") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                   full_width = F, position = "center")
-
 summary(elevation.out)
 ```
 
-There are `r sum(elevation.out$elevation < -100, na.rm=T)` plots with elevation < -100 !  
+There are `r sum(is.na(elevation.out$elevation))` plots without elevation info, corresponding to `r round(sum(is.na(elevation.out$elevation))/nrow(header)*100,1)`% of total.  
+There are `r sum(elevation.out$elevation < -1, na.rm=T)` plots with elevation below sea level. 
+
+```{r message=F}
+elevation.out <- read_csv("../_derived/elevatr/elevation.out.csv")
+knitr::kable(header %>% 
+               dplyr::select(PlotObservationID, Dataset, `GIVD ID`) %>% 
+               left_join(elevation.out, 
+                         by="PlotObservationID") %>% 
+               filter(elevation< -1) %>% 
+               group_by( `GIVD ID`, Dataset) %>% 
+               summarize(num.plot=n()) %>% 
+               ungroup() %>% 
+               arrange(desc(num.plot)) %>% 
+               slice(1:10), 
+  caption="Dataset with highest number of plots below sea level") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+                  full_width = F, position = "center")
+
+```
+
+```{r}
+summary(elevation.out %>% 
+          dplyr::select(-PlotObservationID))
+
+```
 
 Create Scatterplot between measured elevation in the field, and elevation derived from DEM  
 ```{r scatterplot, cache=T}
 #join measured and derived elevation
-mydata <- header %>%
-  dplyr::select(PlotObservationID, `Altitude (m)`) %>%
-  filter(!is.na(`Altitude (m)`)) %>%
-  rename(elevation_measured=`Altitude (m)`) %>%
+mydata <- header %>% 
   left_join(elevation.out %>% 
-              dplyr::select(PlotObservationID, elevation) %>% 
-              rename(elevation_dem=elevation),
-            by="PlotObservationID")
+              rename(elevation_dem=Elevation_median), by="PlotObservationID") %>%
+  dplyr::select(PlotObservationID, `Altitude (m)`, elevation_dem) %>%
+  filter(!is.na(`Altitude (m)`) & !is.na(elevation_dem)) %>%
+  rename(elevation_measured=`Altitude (m)`)
+
 ggplot(data=mydata) + 
   geom_point(aes(x=elevation_measured, y=elevation_dem), alpha=1/10, cex=0.8) + 
   theme_bw() + 
@@ -661,22 +603,9 @@ ggplot(data=mydata) +
 ```
 
 
-
-
-
-
-
-
-
 # 5 Map of plots
 
-```{r}
-library(sf)
-library(rnaturalearth)
-library(viridis)
-library(dggridR)
-#### BRT predicting
-#### alternative plotting
+```{r, message=F, warning=F}
 countries <- ne_countries(returnclass = "sf") %>% 
   st_transform(crs = "+proj=eck4") %>% 
   st_geometry()
@@ -689,10 +618,6 @@ bb <- ne_download(type = "wgs84_bounding_box", category = "physical",
   st_transform(crs = "+proj=eck4") %>% 
   st_geometry()
 
-label_parse <- function(breaks) {
-  parse(text = breaks)
-}
-
 ## basic graph of the world in Eckert projection
 w3a <- ggplot() +
   geom_sf(data = bb, col = "grey20", fill = "white") +
@@ -706,65 +631,270 @@ w3a <- ggplot() +
         legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 
         legend.key.height = unit(1.1, "cm"), 
         legend.key.width = unit(1.1, "cm")) +
-  scale_fill_viridis(label=label_parse)
+  scale_fill_viridis()
+```
 
-## Plotting of plot density in hexagons
+Graph of plot density (hexagons)
+```{r, fig.align="center", fig.width=8, fig.height=4, cache=T}
 header2 <- header %>% 
   filter(!is.na(Longitude) | !is.na(Latitude)) %>% 
   dplyr::select(PlotObservationID, Latitude, Longitude, `GIVD ID`) %>% 
   filter(!(abs(Longitude) >171 & abs(Latitude>70)))
-dggs          <- dgconstruct(spacing=300, metric=T, resround='down')
-  #Get the corresponding grid cells for each earthquake epicenter (lat-long pair)
-  header2$cell <- dgGEO_to_SEQNUM(dggs, header2$Longitude, header2$Latitude)$seqnum
-  
-  #Calculate number of plots for each cell
-  header.out   <- header2 %>% 
-    group_by(cell) %>% 
-    summarise(value.out=log(n(), 10))
-  #Get the grid cell boundaries for cells 
-  grid   <- dgcellstogrid(dggs, header.out$cell, frame=F) %>%
-    st_as_sf() %>% 
-    mutate(cell = header.out$cell) %>% 
-    mutate(value.out=header.out$value.out) %>% 
-    st_transform("+proj=eck4") %>% 
-    st_wrap_dateline(options = c("WRAPDATELINE=YES"))
-  
-  ## plotting
-  legpos <- c(0.160, .24)
-  
-  (w3 <- w3a + 
-         geom_sf(data=grid, aes(fill=value.out),lwd=0, alpha=0.9)    +
-         geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 
-         scale_fill_viridis(
-           name="# plots", breaks=0:5, labels = c("1", "10", "100",
-                                                  "1,000", "10,000", "100,000"), option="viridis" ) + 
-      #labs(fill="# plots") + 
-      theme(legend.position = legpos +c(-0.06, 0.25))
-  )
+dggs <- dgconstruct(spacing=300, metric=T, resround='down')
+#Get the corresponding grid cells for each earthquake epicenter (lat-long pair)
+header2$cell <- dgGEO_to_SEQNUM(dggs, header2$Longitude, header2$Latitude)$seqnum
 
-  ggsave(filename="../_pics/PlotDensityLog10_vir.png", width = 15, height = 7, units="in", dpi=300, plot=w3)
+#Calculate number of plots for each cell
+header.out   <- header2 %>% 
+  group_by(cell) %>% 
+  summarise(value.out=log(n(), 10))
 
-  
-## Graph of plot location by Dataset
+#Get the grid cell boundaries for cells 
+grid   <- dgcellstogrid(dggs, header.out$cell, frame=F) %>%
+  st_as_sf() %>% 
+  mutate(cell = header.out$cell) %>% 
+  mutate(value.out=header.out$value.out) %>% 
+  st_transform("+proj=eck4") %>% 
+  st_wrap_dateline(options = c("WRAPDATELINE=YES"))
+
+## plotting
+legpos <- c(0.160, .24)
+(w3 <- w3a + 
+       geom_sf(data=grid, aes(fill=value.out),lwd=0, alpha=0.9)    +
+       geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 
+       scale_fill_viridis(
+         name="# plots", breaks=0:5, labels = c("1", "10", "100",
+                                                "1,000", "10,000", "100,000"), option="viridis" ) + 
+    #labs(fill="# plots") + 
+    theme(legend.position = legpos +c(-0.06, 0.25))
+)
+ggsave(filename="../_pics/PlotDensityLog10_vir.png", width = 15, height = 7, units="in", dpi=300, plot=w3)
+```
+
+Graph of plot location by Dataset
+```{r, fig.align="center", fig.width=8, fig.height=4, cache=T}
 header.sf <- header.shp %>% 
   st_as_sf() %>% 
-  #sample_frac(0.01) %>%
-  st_transform(crs = "+proj=eck4") #%>% 
-  #st_geometry()
+  st_transform(crs = "+proj=eck4")
 
-set.seed(1984)
-w3 <- w3a + 
+(w4 <- w3a + 
          geom_sf(data=header.sf %>% 
                    mutate(GIVD.ID=fct_shuffle(GIVD.ID)), aes(col=factor(GIVD.ID)), pch=16, size=0.8, alpha=0.6) +
          geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 
-         #scale_color_brewer(palette = "Dark2") + 
-         #labs(fill="# plots") + 
-         theme(legend.position = "none")
+         theme(legend.position = "none"))
+
+ggsave(filename="../_pics/PlotDistrib_Dark2_shuffle1984.png", width = 15, height = 7, units="in", dpi=300, plot=w4) ## takes ~40' to render
+```
 
+## 6 Fix output and export
 
+```{r}
+header.out <- header %>% 
+  dplyr::select(
+    #Metadata
+    PlotObservationID, Dataset, "GIVD ID", "TV2 relevé number", "ORIG_NUM", "GUID", 
+    #Geographic info
+    Longitude:"Location uncertainty (m)", Country, CONTINENT, sBiome, sBiomeID, Locality,
+    #Methodological info
+    "Relevé area (m²)", "Cover abundance scale", "Date of recording", "Plants recorded", 
+    "Herbs identified (y/n)","Mosses identified (y/n)","Lichens identified (y/n)",
+    #Topographical
+    elevation_dem, "Altitude (m)", "Aspect (°)", "Slope (°)", 
+    #Vegetation type
+    Forest:Naturalness, ESY, 
+    #Vegetation structure
+                "Cover total (%)":"Maximum height cryptogams (mm)")
+save(header.out, file = "../_output/header_splot3.0")
+
+```
 
-ggsave(filename="../_pics/PlotDistrib_Dark2_shuffle1984.png", width = 15, height = 7, units="in", dpi=300, plot=w3) ## takes ~40' to render
 
+## Supplementary Material
+### ANNEX 1 - Ancillary function - `PredExtr`
+```{r}
+PredExtr <- function(x.shp, myfunction=NA, output,  
+                     toextract, typp=c("raster", "shp"), ncores, chunkn=1, chunk.i=NA){
+  print("Load Packages")
+  require(foreach)
+  require(parallel)
+  require(doParallel)
+  require(raster)
+  require(rgeos)
+  require(rgdal) 
+ 
+  print(paste("Loading", typp, "data :", toextract))
+  print(paste("output will be:", output))
+  if(typp=="raster"){ mypredictor <- raster(toextract)} else
+    mypredictor <- readOGR(toextract)
+  header.shp <-readOGR(x.shp)
+  crs(mypredictor) <- crs(header.shp) #should be verified beforehand!
+
+  ## Divide in chunks if requested
+  if(chunkn>1 & !is.na(chunk.i)){
+    print(paste("divide in chunks and run on chunk n.", chunk.i))
+    indices <- 1:length(header.shp)
+    chunks <- split(indices, sort(indices%%chunkn))
+    header.shp <- header.shp[chunks[[chunk.i]],]
+    } 
+
+  # define ancillary functions
+  robust.mean <- function(x){mean(x, rm.na=T)}
+  robust.mode <- function(x){if(any(x!=0)) {
+ 	 a <- x[which(x!=0)] #exclude zero (i.e. NAs)
+  	return(as.numeric(names(sort(table(a), decreasing=T))[1]))} else
+	    return(NA)                          }
+
+  print("go parallel")
+  cl <- makeForkCluster(ncores, outfile="" )
+  registerDoParallel(cl)
+
+  print("start main foreach loop")
+  if(typp=="raster"){
+  out <- foreach(i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { 
+    tmp <- raster::extract(mypredictor, header.shp[i,], 
+                         buffer=min(10000,  max(250, header.shp@data$loc.uncert[i])), fun=myfunction) 
+    }
+  write.csv(out, file = output)
+  } else {
+    out <- sp::over(x=header.shp, y=mypredictor) 
+    toassign <- header.shp[which(is.na(out[,1])),]
+    print(paste(length(toassign), "plots not matched directely - seek for NN"))
+  if(length(toassign)>0){
+    
+    nearest <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { 
+      print(i)
+      nearest.tmp <- tryCatch(geosphere::dist2Line(header.shp[i,], mypredictor),
+                              error = function(e){data.frame(distance=NA, lon=NA, lat=NA,ID=NA)}
+                              )
+      #nearest.tmp <- geosphere::dist2Line(toassign[i,], mypredictor)
+      return(nearest.tmp)
+    }
+    out[which(is.na(out[,1])),] <- mypredictor@data[nearest[,"ID"],]
+   }
+    write.csv(out, file=output)
+  }
+  stopCluster(cl)
+  }
+```
+### ANNEX 2 - Ancillary function - `ElevationExtract`
+```{r}
+
+ElevationExtract <- function(header, output, ncores, chunk.i){
+  print("load packages")
+  require(tidyverse)
+  
+  require(rgdal)
+  require(sp)
+  require(rgeos)
+  require(raster)
+  require(rworldmap)
+  require(elevatr)
+  
+  require(parallel)
+  require(doParallel)
+  
+  print("Import header and divide in tiles")
+  header.shp <- readOGR(header)
+  header.tiles <- header.shp@data %>% 
+    bind_cols(as.data.frame(header.shp@coords)) %>% 
+    rename(PlotObservationID=PltObID, Longitude=coords.x1, Latitude=coords.x2) %>% 
+    mutate(lc_ncrt=abs(lc_ncrt)) %>% 
+    filter(lc_ncrt <= 50000) %>%
+    mutate_at(.vars=vars(Longitude, Latitude), 
+              .funs=list(tile=~cut(., breaks = seq(-180,180, by=.2)))) %>%
+    mutate(tilenam=factor(paste(Longitude_tile, Latitude_tile)))
+    
+  
+  print("Get continent")
+  sPDF <- rworldmap::getMap(resolution="high")
+  continent.high <- sPDF[,"continent"]
+  crs(continent.high) <- CRS("+init=epsg:4326")
+  continent.high@data$continent <- fct_recode(continent.high@data$continent, "South America"="South America and the Caribbean")
+  continent.high.merc <- spTransform(continent.high, CRS( "+init=epsg:3857 +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m
++nadgrids=@null +no_defs"))
+
+  print("go parallel")
+  cl <- makeForkCluster(ncores, outfile="")
+  registerDoParallel(ncores)
+  
+  clusterEvalQ(cl, {
+    library(rgdal)
+    library(raster)
+    library(sp)
+    library(elevatr)
+    library(dplyr)
+  })
+
+  print("create list of tiles still to do")
+  myfiles <- list.files(path = output, pattern = "[A-Za-z]*_[0-9]+\\.RData$")
+  done <- NULL
+  done <- as.numeric(unlist(regmatches(myfiles, gregexpr("[[:digit:]]+", myfiles))))
+  todo <- 1:nlevels(header.tiles$tilenam)
+  if(length(done)>0) {todo <- todo[-which(todo %in% done)]}
+  todo <- sample(todo, replace=F)
+  print(paste(length(todo), "tiles to do"))
+  
+  print("divide in chunks")
+  #divide in chunks #should take around 40'
+  indices <- 1:length(todo)
+  todo.chunks <- split(indices, sort(indices%%99))
+  
+  print(paste("start main foreach loop on chunk n=", chunk.i))
+  foreach(i = todo.cunks[[chunk.i]]) %dopar% {  
+	print(paste("doing", i))
+    #create sp and project data
+    if(nrow(header.tiles %>% 
+            filter(tilenam %in% levels(header.tiles$tilenam)[i])) ==0 ) next()
+    sp.tile <- SpatialPointsDataFrame(coords=header.tiles %>% 
+                                        filter(tilenam %in% levels(header.tiles$tilenam)[i]) %>%
+                                        dplyr::select(Longitude, Latitude),
+                                      data=header.tiles %>% 
+                                        filter(tilenam %in% levels(header.tiles$tilenam)[i]) %>%
+                                        dplyr::select(-Longitude, -Latitude),
+                                      proj4string = CRS("+init=epsg:4326"))
+    sp.tile <- spTransform(sp.tile, CRSobj = CRS("+init=epsg:3857 +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0
+                                                 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null
+                                                 +no_defs ")) #project to mercator
+    
+    #retrieve dem raster
+    tryCatch(raster.tile <- get_elev_raster(sp.tile, z=9, expand=max(sp.tile$lc_ncrt), clip="bbox"),
+             error = function(e){e}
+    )
+    if(!exists("raster.tile")) {
+      raster.tile <- NA
+      save(raster.tile, file = paste(output, "elevation_tile_", i, "failed.RData", sep=""))
+      message(paste("tile", i, "doesn't work!, skip to next"))
+      rm(raster.tile)
+      next
+    }
+    # clip dem tile with continent shape
+    raster.tile <- mask(raster.tile, continent.high.merc)
+    
+    #extract and summarize elevation data
+    elev.tile <- raster::extract(raster.tile, sp.tile, small=T)
+    elev.tile.buffer <- raster::extract(raster.tile, sp.tile, 
+                                        buffer=sp.tile@data$lc_ncrt, 
+                                        small=T)
+    tmp <- round(mapply( quantile, 
+                         x=elev.tile.buffer,
+                         #center=elev.tile,
+                         probs=rep(c(0.025, 0.5, 0.975), each=length(elev.tile)), 
+                         #lc_ncrt=sp.tile$lc_ncrt, 
+                         na.rm=T))
+    elev.q95 <- setNames(data.frame(matrix(tmp, ncol = 3, nrow = length(elev.tile.buffer))), 
+                         c("Elevation_q2.5", "Elevation_median", "Elevation_q97.5"))
+    output.tile <- data.frame(PlotObservationID=sp.tile$PlotObservationID, 
+                              elevation=round(elev.tile), 
+                              elev.q95, 
+                              DEM.res=res(raster.tile)[1]) 
+    
+    #save output
+    save(output.tile, file = paste(output, "elevation_tile_", i, ".RData", sep=""))
+    print(paste(i, "done"))
+  }
+  stopCluster(cl)
+  }
+  
 
 ```
 
-- 
GitLab