From 420d97c87dbaab9c31ab6b6299c0e942371a2dd0 Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Wed, 5 Feb 2020 18:00:46 +0100
Subject: [PATCH] Additional data cleaning, GIVD codes, standardized formations

---
 code/04_buildHeader.Rmd | 150 ++++++++++++++++++++++++++++++++--------
 1 file changed, 123 insertions(+), 27 deletions(-)

diff --git a/code/04_buildHeader.Rmd b/code/04_buildHeader.Rmd
index 1ea7be7..8fe8f86 100644
--- a/code/04_buildHeader.Rmd
+++ b/code/04_buildHeader.Rmd
@@ -27,6 +27,7 @@ This report documents the construction of the header file for sPlot 3.0. It is b
 knitr::opts_chunk$set(echo = TRUE)
 library(tidyverse)
 library(readr)
+library(xlsx)
 
 ## Spatial packages
 library(sp)
@@ -109,9 +110,9 @@ header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv",
   dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE)  #too sparse information to be useful
 ```
 
-The following column names occurred in the header of sPlot v2.1
+The following column names occurred in the header of sPlot v2.1 and are currently missing from the header of v3.0  
 1. Syntaxon  
-2. Cover cryptogams (%)  
+2. Cover cryptogams (%)   
 3. Cover bare soil (%)  
 4. is.forest  
 5. is.non.forest  
@@ -121,7 +122,7 @@ The following column names occurred in the header of sPlot v2.1
 9. CONTINENT  
 10. POINT_X  
 11. POINT_Y  
-
+~~
 Columns #1, #2, #3, #10, #11 will be dropped. The others will be derived below.
 
 
@@ -139,27 +140,14 @@ header <- header0 %>%
                           list=(is.na(`Location uncertainty (m)`) & Dataset=="Egypt Nile delta"), 
                           values=-90000))
 ```
-There are two plots in the `Romanina Grassland Databse', whose lat\long are inverted. Correct.
+There are two plots in the `Romanina Grassland Databse` and ~4442 plots in the `Japan` database, whose lat\\long are inverted. Correct.
 ```{r}
-new.coords <- header %>% 
-  filter(Dataset=="Romania Grassland Database" & Longitude>40) %>% 
-  dplyr::select(PlotObservationID, Latitude, Longitude) %>% 
-  rename(Longitude=Latitude, Latitude=Longitude)
-
-header <- header %>% 
-    ##only works because the two plots have identical coords 
-    mutate(Longitude=replace(Longitude, 
-                             list=PlotObservationID %in% (new.coords %>% 
-                                                            pull(PlotObservationID)), 
-                             values=new.coords %>% pull(Longitude)))%>% 
-  mutate(Latitude=replace(Latitude, 
-                          list=PlotObservationID %in% (new.coords %>% 
-                                                            pull(PlotObservationID)), 
-                          values=new.coords %>% pull(Latitude)))
-rm(new.coords)
+toswap <- c(which(header$Dataset=="Japan" & header$Latitude>90), 
+            which(header$Dataset=="Romania Grassland Database" & header$Longitude>40))
+header[toswap, c("Latitude", "Longitude")] <- header[toswap, c("Longitude", "Latitude")]
 ```
 
-There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value, to indicate this is an estimation, rather than a measure.
+There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value to indicate this is an estimation, rather than a measure.
 ```{r}
 header <- header %>% 
   left_join(header %>% 
@@ -168,19 +156,125 @@ header <- header %>%
             by="Dataset") %>% 
   mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)`), 
                                             -abs(loc.uncer.median), 
-                                            `Location uncertainty (m)`))
+                                            `Location uncertainty (m)`)) %>% 
+  dplyr::select(-loc.uncer.median)
 ```
 There are still `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))` plots with no estimation of location uncertainty.
 
+## 2 Formations
+Fill out the columns `Forest:Sparse.vegetation` with 0\\NAs, where necessary. Create columns `is.forest` and `is.non.forest` using script developed for sPlot 2.1  
+~~
+I am not assigning plots to Faber-Langedon formation at this stage, as this is only possible for European plots having an ESY classification.
+
+
+
+```{r}
+eunis.key <- read.xlsx("../_input/EUNIS_WFT.xlsx", sheetIndex = "Sheet1", endRow = 246) %>% 
+  dplyr::select(EUNIS_code, NATURALNESS:SPARSE_VEG) %>% 
+  mutate(EUNIS_code=as.character(EUNIS_code)) %>% 
+  rename(ESY=EUNIS_code, 
+         Naturalness=NATURALNESS, 
+         Forest=FOREST,
+         Shrubland=SCRUBLAND,
+         Grassland=GRASSLAND,
+         Wetland=WETLAND,
+         Sparse.vegetation=SPARSE_VEG)#,
+
+header.backup <- header
+
+#test <- header.backup %>% slice(100000:110000) %>% dplyr::select(PlotObservationID, Dataset, ESY, Forest:Shrubland)
+
+header <- header %>% # header.backup %>% 
+  mutate(ESY=as.character(ESY)) %>% 
+  #mutate(ESY=ifelse(ESY=="?", NA, ESY)) %>% 
+  # Systematically assign some databases to forest
+  mutate(Forest=ifelse(Dataset %in% 
+                         c("Turkey Oak_Forest Database", "Turkey Forest Database", "Chile_forest", "Ethiopia"), 
+                       T, Forest)) %>% 
+  #fill up with F those rows where at least one column on formation is assigned
+  rowwise() %>% 
+  mutate(Any=any(Forest, Shrubland, Grassland, Wetland, Sparse.vegetation)) %>% 
+  mutate(Forest=ifelse( (is.na(Forest) & Any), F, Forest))  %>%
+  mutate(Shrubland=ifelse( (is.na(Shrubland) & Any), F, Shrubland))  %>% 
+  mutate(Grassland=ifelse( (is.na(Grassland) & Any), F, Grassland))  %>% 
+  mutate(Wetland=ifelse( (is.na(Wetland) & Any), F, Wetland))  %>% 
+  mutate(Sparse.vegetation=ifelse( (is.na(Sparse.vegetation) & Any), F, Sparse.vegetation))  %>%
+  ungroup() %>% 
+  dplyr::select(-Any) %>% 
+  mutate_at(vars(Forest:Shrubland), .funs=list(~.*1)) %>% 
+  mutate(Naturalness=as.numeric(as.character(Naturalness))) %>% 
+  ##join and coalesce with eunis.key
+  left_join(eunis.key, by = "ESY") %>% 
+    mutate(
+        Forest = dplyr:::coalesce(Forest.x, Forest.y), 
+        Shrubland = coalesce(Shrubland.x, Shrubland.y),
+        Grassland = coalesce(Grassland.x, Grassland.y),
+        Wetland = coalesce(Wetland.x, Wetland.y),
+        Sparse.vegetation = coalesce(Sparse.vegetation.x, Sparse.vegetation.y),
+        Naturalness = coalesce(Naturalness.x, Naturalness.y)
+    ) %>% 
+  dplyr::select(-ends_with(".x"), -ends_with(".y"))
+```
+
+
+## 3 Fix header and attach GIVD codes
 
+Reduce number of factor levels for the column `Plants recorded`
+```{r}
+header <- header %>% 
+  mutate(`Plants recorded`=fct_recode(`Plants recorded`, 
+                                      "All vascular plants"="complete vegetation",
+                                      "All vascular plants"="Complete vegetation",
+                                      "All vascular plants"="all vascular plants", 
+                                      "All vascular plants"="complete", 
+                                      "All vascular plants"="Complete vegetation (including non-terricolous tax",
+                                      "All vascular plants"="Vascular plants",
+                                      "All woody plants"="Woody plants",
+                                      "All woody plants"="All woody species",
+                                      "Woody plants >= 10 cm dbh"= "trees>=10cm dbh",
+                                      "All trees & dominant understory"="All trees & dominant shrubs",
+                                      "Woody plants >= 1 cm dbh" = "Plants >= 1 cm dbh"
+                                      )) %>% 
+  mutate(`Plants recorded`=factor(`Plants recorded`, exclude = "#N/A"))
+```
 
-## 2 Assign to Continents
+```{r}
+databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv")
+## align labels to consortium 
+
+header <- header %>% 
+  mutate(Dataset=fct_recode(Dataset,
+                            "BIOTA_South_Africa" = "BIOTA_South_Africa_3", 
+                            "Kriti"="Cyprus_Bergmeier", 
+                            "European Boreal Forest Database"="European Boreal Forest Database 1", 
+                            "European Boreal Forest Database"="European Boreal Forest Database 2", 
+                            "European Coastal Vegetation Database"= "European Coastal Vegetation Database-A", 
+                            "Germany_vegetweb"="Germany_vegetweb2", 
+                            "Germany_vegetweb"="Germany_vegetweb3",
+                            "Ladakh"="Ladakh_2", 
+                            "Netherlands"="Netherlands Military sites",
+                            "NSW Australia" = "NSW Austalia",
+                            )) %>% 
+  left_join(databases %>% 
+              dplyr::select(`GIVD ID`, label) %>% 
+              rename(Dataset=label),
+            by="Dataset")
+
+
+
+```
+
+
+
+
+
+## 4 Assign to Continents
 Download and manipulate map of continents
 ```{r}
 sPDF <- rworldmap::getMap(resolution="coarse")
 continent <- sPDF[,"continent"]
 crs(continent) <- CRS("+init=epsg:4326")
-#continent@data[243,"continent"] <- "South America" ## Manually correct missing data
+continent@data[243,"continent"] <- "South America" ## Manually correct missing data
 # create clipped version of continent to avoid going beyond 180 lON
 coords <- data.frame(x=c(-180,180,180,-180),
                      y=c(-90,-90,90,90))
@@ -188,7 +282,7 @@ bboxc = Polygon(coords)
 bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent))
 continent_clipped <- gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... workaround
 
-## same but high resolution (works better for plots along coastlines)
+## same but high resolution (slower, but works better for plots along coastlines)
 sPDF <- rworldmap::getMap(resolution="high")
 continent.high <- sPDF[,"continent"]
 crs(continent.high) <- CRS("+init=epsg:4326")
@@ -221,10 +315,12 @@ crs(toassign) <- crs(continent)
 
 
 #go parallel
+ncores=6
 library(parallel)
 library(doParallel)
-cl <- makeCluster(6)
+cl <- makeForkCluster(ncores, outfile="" )
 registerDoParallel(cl)
+  
 nearestContinent <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { 
   nearestContinent.tmp <- geosphere::dist2Line(toassign[i,], continent_clipped)
 }
@@ -233,7 +329,7 @@ save(continent.out, file = "../_derived/continent.out")
 ```
 Reload and manipulate continent. Correct header.sRoot
 ```{r}
-load("_derived/continent.out")
+load("../_derived/continent.out")
 header <- header %>% 
   left_join(header %>% 
               filter(!is.na(Longitude) | !is.na(Latitude)) %>% 
-- 
GitLab