From e26c043e830ec6083b99acc9eddcb8f3cd680c8c Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Tue, 10 Mar 2020 16:25:05 +0100
Subject: [PATCH] Various bugs fixed, increased cross-consistency

---
 code/04_buildHeader.Rmd        | 202 ++-------------------------------
 code/05_ExtractEnvironment.Rmd |  13 +--
 code/06_buildDT.Rmd            |  26 +++--
 code/07_buildCWMs.Rmd          |   5 +-
 4 files changed, 36 insertions(+), 210 deletions(-)

diff --git a/code/04_buildHeader.Rmd b/code/04_buildHeader.Rmd
index 54aac93..3d4a3f8 100644
--- a/code/04_buildHeader.Rmd
+++ b/code/04_buildHeader.Rmd
@@ -282,7 +282,7 @@ header <- header %>%
 ```
 
 
-# 4 Assign plots to spatial descriptors
+## 4 Assign plots to spatial descriptors
 Create a spatial points dataframe of the sPlot data to intersect with spatial layers
 ```{r}
 header.shp <- header %>%
@@ -293,10 +293,10 @@ header.shp <- SpatialPointsDataFrame(coords= header.shp %>%
                                data=data.frame(PlotObservationID= header.shp$PlotObservationID, 
                                                loc.uncert=header.shp$`Location uncertainty (m)`, 
                                                `GIVD ID`=header.shp$`GIVD ID`))
-#writeOGR(header.shp, dsn="../derived", layer="header.shp", driver="ESRI Shapefile", overwrite_layer = T)
+writeOGR(header.shp, dsn="../_derived/", layer="header.shp", driver="ESRI Shapefile", overwrite_layer = T)
 ```
 
-## 4.1 Assign to Continents
+### 4.1 Assign to Continents
 Download and manipulate map of continents
 ```{r}
 sPDF <- rworldmap::getMap(resolution="coarse")
@@ -369,7 +369,7 @@ knitr::kable(header %>%
                   full_width = F, position = "center")
 ```
 
-## 4.2 Assign to sBiomes
+### 4.2 Assign to sBiomes
 Performed on the EVE HPC cluster using the function `A98_PredictorsExtract.R`. Divided into 99 chunks.
 ```{r eval=F}
 sBiomes <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/Biomes_sPlot/sBiomes.shp")
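# Illustrative only (not part of the original scripts): how a single HPC array-job task
# might launch the chunked extraction, assuming the PredExtr() signature shown in Annex 1.
# Paths, output name, core number and the way the chunk index is passed are placeholders.
chunk.i <- as.numeric(commandArgs(trailingOnly = TRUE)[1])   # chunk number passed by the scheduler
PredExtr(x.shp      = "path/to/header.shp",                  # plot coordinates exported above
         myfunction = NA,                                    # not used when typp = "shp"
         output     = paste0("path/to/output/sBiomes_", chunk.i, ".csv"),
         toextract  = "/data/sPlot/users/Francesco/Ancillary_Data/Biomes_sPlot/sBiomes.shp",
         typp       = "shp",
         ncores     = 1,
         chunkn     = 99,
         chunk.i    = chunk.i)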
@@ -418,7 +418,7 @@ knitr::kable(header %>%
                   full_width = F, position = "center")
 ```
 
-## 4.3 Extract WWF Ecoregion
+### 4.3 Extract WWF Ecoregions
 Extract ecoregion name and ID from [Ecoregions of the World](https://www.worldwildlife.org/publications/terrestrial-ecoregions-of-the-world). Olson et al. 2001 [(BioScience)](https://academic.oup.com/bioscience/article/51/11/933/227116).  
 Computation was performed on the EVE HPC cluster using the function `A98_PredictorsExtract.R`. Divided into 99 chunks.
 ```{r eval=F}
@@ -470,15 +470,17 @@ Summarize:
 ```{r message=F,  echo=F}
 knitr::kable(header %>% 
                group_by(Ecoregion) %>% 
-               summarize(num.plot=n()), 
-  caption="Number of plots per Ecoregion") %>%
+               summarize(num.plot=n()) %>% 
+               arrange(desc(num.plot)) %>% 
+               slice(1:30), 
+  caption="Number of plots in the 30 best represented Ecoregions") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                   full_width = F, position = "center")
 ```
 
 
 
-## 4.4 Extract elevation
+### 4.4 Extract elevation
 
 Extract elevation for each plot. The routine loops over tiles of 1 x 1°, projects them to Mercator, and extracts elevation at the plot coordinates, as well as the 2.5, 50, and 97.5 percent quantiles over a buffer area with a radius equal to the location uncertainty of each plot (but only if the location uncertainty is < 50 km). DEMs derive from the package [elevatr](https://cran.r-project.org/web/packages/elevatr/vignettes/introduction_to_elevatr.html#get_raster_elevation_data), which uses the [Terrain Tiles on Amazon Web Services](https://registry.opendata.aws/terrain-tiles/). Resolutions of the DEM rasters vary by region. I set a zoom factor z=10, which corresponds to ~75-150 m. Sources are: SRTM, data.gov.at in Austria, NRCAN in Canada, SRTM, NED/3DEP 1/3 arcsec, data.gov.uk in the United Kingdom, INEGI in Mexico, ArcticDEM at latitudes above 60°, LINZ in New Zealand, and Kartverket in Norway, as described [here](https://github.com/tilezen/joerd/blob/master/docs/data-sources.md).  
 \newline
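A minimal sketch of the per-tile extraction described above (illustrative only; it assumes the `elevatr` and `raster` packages, and `sp.tile` / `lc_ncrt` are placeholder names for one tile's plots, projected to Mercator, and their location uncertainty in metres):
```{r eval=F}
library(elevatr)
library(raster)
# DEM for the tile, expanded by the largest location uncertainty and clipped to the bounding box
raster.tile <- get_elev_raster(sp.tile, z = 10, expand = max(sp.tile$lc_ncrt), clip = "bbox")
# elevation at the plot coordinates
elev.plot   <- raster::extract(raster.tile, sp.tile, small = TRUE)
# all DEM cells within each plot's uncertainty buffer
elev.buffer <- raster::extract(raster.tile, sp.tile, buffer = sp.tile$lc_ncrt, small = TRUE)
# 2.5%, 50% and 97.5% elevation quantiles within each buffer
elev.q <- t(sapply(elev.buffer, quantile, probs = c(0.025, 0.5, 0.975), na.rm = TRUE))
```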
@@ -794,190 +796,10 @@ knitr::kable(header %>% slice(1:20),
 
 ## Supplementary Material
 ### ANNEX 1 - Ancillary function - `PredExtr`
-```{r}
-PredExtr <- function(x.shp, myfunction=NA, output,  
-                     toextract, typp=c("raster", "shp"), ncores, chunkn=1, chunk.i=NA){
-  print("Load Packages")
-  require(foreach)
-  require(parallel)
-  require(doParallel)
-  require(raster)
-  require(rgeos)
-  require(rgdal) 
- 
-  print(paste("Loading", typp, "data :", toextract))
-  print(paste("output will be:", output))
-  if(typp=="raster"){ mypredictor <- raster(toextract)} else
-    mypredictor <- readOGR(toextract)
-  header.shp <-readOGR(x.shp)
-  crs(mypredictor) <- crs(header.shp) #should be verified beforehand!
-
-  ## Divide in chunks if requested
-  if(chunkn>1 & !is.na(chunk.i)){
-    print(paste("divide in chunks and run on chunk n.", chunk.i))
-    indices <- 1:length(header.shp)
-    chunks <- split(indices, sort(indices%%chunkn))
-    header.shp <- header.shp[chunks[[chunk.i]],]
-    } 
-
-  # define ancillary functions
-  robust.mean <- function(x){mean(x, rm.na=T)}
-  robust.mode <- function(x){if(any(x!=0)) {
- 	 a <- x[which(x!=0)] #exclude zero (i.e. NAs)
-  	return(as.numeric(names(sort(table(a), decreasing=T))[1]))} else
-	    return(NA)                          }
-
-  print("go parallel")
-  cl <- makeForkCluster(ncores, outfile="" )
-  registerDoParallel(cl)
-
-  print("start main foreach loop")
-  if(typp=="raster"){
-  out <- foreach(i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { 
-    tmp <- raster::extract(mypredictor, header.shp[i,], 
-                         buffer=min(10000,  max(250, header.shp@data$loc.uncert[i])), fun=myfunction) 
-    }
-  write.csv(out, file = output)
-  } else {
-    out <- sp::over(x=header.shp, y=mypredictor) 
-    toassign <- header.shp[which(is.na(out[,1])),]
-    print(paste(length(toassign), "plots not matched directely - seek for NN"))
-  if(length(toassign)>0){
-    
-    nearest <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { 
-      print(i)
-      nearest.tmp <- tryCatch(geosphere::dist2Line(header.shp[i,], mypredictor),
-                              error = function(e){data.frame(distance=NA, lon=NA, lat=NA,ID=NA)}
-                              )
-      #nearest.tmp <- geosphere::dist2Line(toassign[i,], mypredictor)
-      return(nearest.tmp)
-    }
-    out[which(is.na(out[,1])),] <- mypredictor@data[nearest[,"ID"],]
-   }
-    write.csv(out, file=output)
-  }
-  stopCluster(cl)
-  }
+```{r, code = readLines("A98_PredictorsExtract.R")}
 ```
 ### ANNEX 2 - Ancillary function - `ElevationExtract`
-```{r}
-
-ElevationExtract <- function(header, output, ncores, chunk.i){
-  print("load packages")
-  require(tidyverse)
-  
-  require(rgdal)
-  require(sp)
-  require(rgeos)
-  require(raster)
-  require(rworldmap)
-  require(elevatr)
-  
-  require(parallel)
-  require(doParallel)
-  
-  print("Import header and divide in tiles")
-  header.shp <- readOGR(header)
-  header.tiles <- header.shp@data %>% 
-    bind_cols(as.data.frame(header.shp@coords)) %>% 
-    rename(PlotObservationID=PltObID, Longitude=coords.x1, Latitude=coords.x2) %>% 
-    mutate(lc_ncrt=abs(lc_ncrt)) %>% 
-    filter(lc_ncrt <= 50000) %>%
-    mutate_at(.vars=vars(Longitude, Latitude), 
-              .funs=list(tile=~cut(., breaks = seq(-180,180, by=.2)))) %>%
-    mutate(tilenam=factor(paste(Longitude_tile, Latitude_tile)))
-    
-  
-  print("Get continent")
-  sPDF <- rworldmap::getMap(resolution="high")
-  continent.high <- sPDF[,"continent"]
-  crs(continent.high) <- CRS("+init=epsg:4326")
-  continent.high@data$continent <- fct_recode(continent.high@data$continent, "South America"="South America and the Caribbean")
-  continent.high.merc <- spTransform(continent.high, CRS( "+init=epsg:3857 +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m
-+nadgrids=@null +no_defs"))
-
-  print("go parallel")
-  cl <- makeForkCluster(ncores, outfile="")
-  registerDoParallel(ncores)
-  
-  clusterEvalQ(cl, {
-    library(rgdal)
-    library(raster)
-    library(sp)
-    library(elevatr)
-    library(dplyr)
-  })
-
-  print("create list of tiles still to do")
-  myfiles <- list.files(path = output, pattern = "[A-Za-z]*_[0-9]+\\.RData$")
-  done <- NULL
-  done <- as.numeric(unlist(regmatches(myfiles, gregexpr("[[:digit:]]+", myfiles))))
-  todo <- 1:nlevels(header.tiles$tilenam)
-  if(length(done)>0) {todo <- todo[-which(todo %in% done)]}
-  todo <- sample(todo, replace=F)
-  print(paste(length(todo), "tiles to do"))
-  
-  print("divide in chunks")
-  #divide in chunks #should take around 40'
-  indices <- 1:length(todo)
-  todo.chunks <- split(indices, sort(indices%%99))
-  
-  print(paste("start main foreach loop on chunk n=", chunk.i))
-  foreach(i = todo.cunks[[chunk.i]]) %dopar% {  
-	print(paste("doing", i))
-    #create sp and project data
-    if(nrow(header.tiles %>% 
-            filter(tilenam %in% levels(header.tiles$tilenam)[i])) ==0 ) next()
-    sp.tile <- SpatialPointsDataFrame(coords=header.tiles %>% 
-                                        filter(tilenam %in% levels(header.tiles$tilenam)[i]) %>%
-                                        dplyr::select(Longitude, Latitude),
-                                      data=header.tiles %>% 
-                                        filter(tilenam %in% levels(header.tiles$tilenam)[i]) %>%
-                                        dplyr::select(-Longitude, -Latitude),
-                                      proj4string = CRS("+init=epsg:4326"))
-    sp.tile <- spTransform(sp.tile, CRSobj = CRS("+init=epsg:3857 +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0
-                                                 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null
-                                                 +no_defs ")) #project to mercator
-    
-    #retrieve dem raster
-    tryCatch(raster.tile <- get_elev_raster(sp.tile, z=9, expand=max(sp.tile$lc_ncrt), clip="bbox"),
-             error = function(e){e}
-    )
-    if(!exists("raster.tile")) {
-      raster.tile <- NA
-      save(raster.tile, file = paste(output, "elevation_tile_", i, "failed.RData", sep=""))
-      message(paste("tile", i, "doesn't work!, skip to next"))
-      rm(raster.tile)
-      next
-    }
-    # clip dem tile with continent shape
-    raster.tile <- mask(raster.tile, continent.high.merc)
-    
-    #extract and summarize elevation data
-    elev.tile <- raster::extract(raster.tile, sp.tile, small=T)
-    elev.tile.buffer <- raster::extract(raster.tile, sp.tile, 
-                                        buffer=sp.tile@data$lc_ncrt, 
-                                        small=T)
-    tmp <- round(mapply( quantile, 
-                         x=elev.tile.buffer,
-                         #center=elev.tile,
-                         probs=rep(c(0.025, 0.5, 0.975), each=length(elev.tile)), 
-                         #lc_ncrt=sp.tile$lc_ncrt, 
-                         na.rm=T))
-    elev.q95 <- setNames(data.frame(matrix(tmp, ncol = 3, nrow = length(elev.tile.buffer))), 
-                         c("Elevation_q2.5", "Elevation_median", "Elevation_q97.5"))
-    output.tile <- data.frame(PlotObservationID=sp.tile$PlotObservationID, 
-                              elevation=round(elev.tile), 
-                              elev.q95, 
-                              DEM.res=res(raster.tile)[1]) 
-    
-    #save output
-    save(output.tile, file = paste(output, "elevation_tile_", i, ".RData", sep=""))
-    print(paste(i, "done"))
-  }
-  stopCluster(cl)
-  }
-  
+```{r, code = readLines("A97_ElevationExtract.R")}
 ```
 ### ANNEX 3 - SessionInfo()
 ```{r}
diff --git a/code/05_ExtractEnvironment.Rmd b/code/05_ExtractEnvironment.Rmd
index 466af83..8997cbe 100644
--- a/code/05_ExtractEnvironment.Rmd
+++ b/code/05_ExtractEnvironment.Rmd
@@ -17,7 +17,6 @@ output: html_document
 
 This report documents how environmental variables were extracted when constructing sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens.    
 
-
 ```{r results="hide", message=F, warning=F}
 knitr::opts_chunk$set(echo = TRUE)
 library(tidyverse)
@@ -60,8 +59,6 @@ get.summary <- function(x){x %>%
     mutate(stat=factor(stat, levels=c("no.NAs", "min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
     spread(key=stat, value=value)
 }
-
-
 ```
 
 ## 1 Import header data
@@ -158,7 +155,7 @@ chelsa.sd.out <- do.call(cbind, lapply(chelsa.sd.files,
                                        function(x) {read_csv(x, col_types = cols(X1 = col_character(),
                                                                                  V1 = col_double())) %>% 
                                            column_to_rownames("X1")}))
-colnames(chelsa.sd.out) <- paste0("bio", stringr::str_pad(1:19, width=2, side="left", pad="0"), "_sd")
+colnames(chelsa.sd.out) <- paste0("bio", stringr::str_pad(1:19, width=2, side="left", pad="0"), "sd")
 ```
 Show summaries
 ```{r, echo=F}
@@ -237,9 +234,9 @@ for(i in 1:8){
 Reimport and assemble output.
 ```{r, message=F, warning=F}
 ISRIC.layer.names <- c("BLDFIE", "CECSOL","CLYPPT","CRFVOL","ORCDRC","PHIHOX","SLTPPT","SNDPPT")
-ISRIC.layer.names <- paste0(ISRIC.layer.names, "_M_sl2_250m_ll")
+ISRIC.layer.names1 <- paste0(ISRIC.layer.names, "_M_sl2_250m_ll")
 isric.files <- list.files(path="../_derived/output_pred/", 
-                          pattern=paste0(paste0(ISRIC.layer.names, ".csv$"), collapse="|"), 
+                          pattern=paste0(paste0(ISRIC.layer.names1, ".csv$"), collapse="|"), 
                           full.names=T)
 
 isric.out <- do.call(cbind, lapply(isric.files,  
@@ -249,13 +246,13 @@ isric.out <- do.call(cbind, lapply(isric.files,
 colnames(isric.out) <- ISRIC.layer.names
 #same for sd values
 isric.sd.files <- list.files(path="../_derived/output_pred/", 
-                          pattern=paste0(paste0(ISRIC.layer.names, "_sd.csv$"), collapse="|"), 
+                          pattern=paste0(paste0(ISRIC.layer.names1, "_sd.csv$"), collapse="|"), 
                           full.names=T)
 isric.sd.out <- do.call(cbind, lapply(isric.sd.files, 
                                       function(x) {read_csv(x, col_types = cols(X1 = col_character(),
                                                                                  V1 = col_double())) %>% 
                                            column_to_rownames("X1")}))
-colnames(isric.sd.out) <- paste0(ISRIC.layer.names, "_sd")
+colnames(isric.sd.out) <- paste0(ISRIC.layer.names, "sd")
 ```
 Show summaries
 ```{r, echo=F}
diff --git a/code/06_buildDT.Rmd b/code/06_buildDT.Rmd
index 599322a..1c8e0a3 100644
--- a/code/06_buildDT.Rmd
+++ b/code/06_buildDT.Rmd
@@ -13,6 +13,7 @@ output: html_document
 
 MEMO!! WHAT TO DO WITH LAYER WHEN IS CONSISTENTLY ZERO IN A PLOT? CHANGE TO NA?
 WHAT TO DO INSTEAD WHEN LAYER==0 IN A PLOT WHERE LAYER INFO IS OTHERWISE AVAILABLE?
+!!! ADD Explanation of fields!!!
 
 
 **Timestamp:** `r date()`  
@@ -41,7 +42,7 @@ write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_
 Search and replace unclosed quotation marks and escape them. Run in Linux terminal 
 ```{bash, eval=F}
 # escape all double quotation marks. Run in Linux terminal
-# sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv
+# sed 's/"/\\"/g' sPlot_3_0_2_species.csv > sPlot_3_0_2_species_test.csv
 ```
 
 ## Import data Table
@@ -98,7 +99,10 @@ DT1 <- DT0 %>%
   # Simplify Rank_correct
   mutate(Rank_correct=fct_collapse(Rank_correct, 
                                    lower=c("subspecies", "variety", "infraspecies", "race", "forma"))) %>% 
-  mutate(Rank_correct=fct_explicit_na(Rank_correct, "No_match"))
+  mutate(Rank_correct=fct_explicit_na(Rank_correct, "No_match")) %>% 
+  mutate(Name_short=replace(Name_short, 
+                            list=Name_short=="No suitable", 
+                            values=NA))
 ```
 
 ## Explore name matching based on Backbone v1.2
@@ -306,9 +310,10 @@ To make the cover data more user friendly, I simplify the way cover is stored, s
 ```{r}
 # Create Ab_scale field
 DT1 <- DT1 %>% 
-  mutate(Ab_scale = ifelse(`Cover code` %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF", "x") & !is.na(x_), 
-                               `Cover code`, 
-                               "CoverPerc")) %>% 
+  mutate(Ab_scale = ifelse(`Cover code` %in% 
+                             c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF", "x") & !is.na(x_), 
+                           `Cover code`, 
+                           "CoverPerc")) %>% 
   mutate(Ab_scale = ifelse(Ab_scale =="x", "pa", Ab_scale)) 
 ```
 
@@ -341,16 +346,18 @@ Transform these plots to p\\a and correct field `Ab_scale`. Note: the column `Ab
 DT1 <- DT1 %>% 
   mutate(Ab_scale=replace(Ab_scale, 
                            list=PlotObservationID %in% mixed, 
-                           values="pa")) %>% 
+                           values="mixed")) %>% 
   #Create additional field Abundance to avoid overwriting original data
-  mutate(Abundance =ifelse(Ab_scale %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF"), 
+  mutate(Abundance =ifelse(Ab_scale %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF", "x"), 
                           x_, `Cover %`)) %>% 
   mutate(Abundance=replace(Abundance, 
                            list=PlotObservationID %in% mixed, 
                            values=1))
 ```
 
-Double check and summarize abundance_scales
+
+
+Double-check and summarize `Ab_scale`
 ```{r}
 scale_check <- DT1 %>% 
   distinct(PlotObservationID, Layer, Ab_scale) %>% 
@@ -359,7 +366,7 @@ scale_check <- DT1 %>%
                                      unique(Ab_scale), 
                                      "Multiple_scales"))
 
-nrow(scale_check)== length(unique(DT2$PlotObservationID))
+nrow(scale_check)== length(unique(DT1$PlotObservationID))
 table(scale_check$Ab_scale_combined)
 ```
 
@@ -405,7 +412,6 @@ knitr::kable(DT2 %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                   full_width = F, position = "center")
 ```
-***!!! ADD Explanation of fields!!!***
 
 
 ```{r}
diff --git a/code/07_buildCWMs.Rmd b/code/07_buildCWMs.Rmd
index 1c4aa8c..0a19bfd 100644
--- a/code/07_buildCWMs.Rmd
+++ b/code/07_buildCWMs.Rmd
@@ -763,7 +763,7 @@ Cross check with sPlot's 5-class (incomplete) native classification deriving fro
 ```{r}
 cross.check <- header %>% 
   dplyr::select(PlotObservationID, Forest) %>% 
-  left_join(plot.gf %>% 
+  left_join(plot.vegtype %>% 
               dplyr::select(PlotObservationID, is.forest, is.non.forest) %>% 
               rename(Forest=is.forest, Other=is.non.forest) %>% 
               gather(isfor_isnonfor, value, -PlotObservationID) %>% 
@@ -831,7 +831,8 @@ save(header, file="../_output/header_splot3.0.RData")
 
 # Appendix
 ## Growth forms of most common species
-```{r, code = readLines("../_derived/Species_missingGF_complete.csv")}
+```{r comment=''}
+cat(readLines("../_derived/Species_missingGF_complete.csv"), sep = '\n')
 ```
 ## SessionInfo
 ```{r}
-- 
GitLab