Started building header file

a0d70d77 · Francesco Sabatini · 72f87f65 · a0d70d77
Commit a0d70d77 authored 5 years ago by Francesco Sabatini
--- a/code/04_buildHeader.Rmd
+++ b/code/04_buildHeader.Rmd
+---
+title: "sPlot3.0 - Build Header"
+author: "Francesco Maria Sabatini"
+date: "2/4/2020"
+output: html_document
+---
+
+
+
+<center>
+  ![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")
+</center>
+
+  
+    
+      
+        
+**Timestamp:** `r date()`  
+**Drafted:** Francesco Maria Sabatini  
+**Revised:**  
+**version:**
+
+This report documents the construction of the header file for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens.    
+
+
+```{r results="hide", message=F, warning=F}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(readr)
+
+## Spatial packages
+library(sp)
+library(rgdal)
+library(rgeos)
+library(raster)
+library(rworldmap)
+
+```
+
+```{bash, eval=F}
+# escape all double quotation marks. Run in Linux terminal
+sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv
+
+#more general alternative in case some " are already escaped
+##first removing \s before all "s, and then adding \ before all ":
+#sed 's/\([^\\]\)"/\1\\\"/g; s/"/\\"/g'
+
+
+```
+
+
+Import header data
+```{r import}
+header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv", locale = locale(encoding = 'UTF-8'),
+                            delim="\t", col_types=cols(
+  PlotObservationID = col_double(),
+  PlotID = col_double(),
+  `TV2 relevé number` = col_double(),
+  Country = col_factor(),
+  `Cover abundance scale` = col_factor(),
+  `Date of recording` = col_date(format="%d-%m-%Y"),
+  `Relevé area (m²)` = col_double(),
+  `Altitude (m)` = col_double(),
+  `Aspect (°)` = col_double(),
+  `Slope (°)` = col_double(),
+  `Cover total (%)` = col_double(),
+  `Cover tree layer (%)` = col_double(),
+  `Cover shrub layer (%)` = col_double(),
+  `Cover herb layer (%)` = col_double(),
+  `Cover moss layer (%)` = col_double(),
+  `Cover lichen layer (%)` = col_double(),
+  `Cover algae layer (%)` = col_double(),
+  `Cover litter layer (%)` = col_double(),
+  `Cover open water (%)` = col_double(),
+  `Cover bare rock (%)` = col_double(),
+  `Height (highest) trees (m)` = col_double(),
+  `Height lowest trees (m)` = col_double(),
+  `Height (highest) shrubs (m)` = col_double(),
+  `Height lowest shrubs (m)` = col_double(),
+  `Aver. height (high) herbs (cm)` = col_double(),
+  `Aver. height lowest herbs (cm)` = col_double(),
+  `Maximum height herbs (cm)` = col_double(),
+  `Maximum height cryptogams (mm)` = col_double(),
+  `Mosses identified (y/n)` = col_factor(),
+  `Lichens identified (y/n)` = col_factor(),
+  COMMUNITY = col_character(),
+  SUBSTRATE = col_character(),
+  Locality = col_character(),
+  ORIG_NUM = col_double(),
+  ALLIAN_REV = col_character(),
+  REV_AUTHOR = col_character(),
+  Forest = col_logical(),
+  Grassland = col_logical(),
+  Wetland = col_logical(),
+  `Sparse vegetation` = col_logical(),
+  Shrubland = col_logical(),
+  `Plants recorded` = col_factor(),
+  `Herbs identified (y/n)` = col_factor(),
+  Naturalness = col_factor(),
+  EUNIS = col_factor(),
+  Longitude = col_double(),
+  Latitude = col_double(),
+  `Location uncertainty (m)` = col_double(),
+  Dataset = col_factor(),
+  GUID = col_character()
+)) %>% 
+  rename(Sparse.vegetation=`Sparse vegetation`, 
+         ESY=EUNIS) %>% 
+  dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE)  #too sparse information to be useful
+```
+
+The following column names occurred in the header of sPlot v2.1
+1. Syntaxon  
+2. Cover cryptogams (%)  
+3. Cover bare soil (%)  
+4. is.forest  
+5. is.non.forest  
+6. EVA  
+7. Biome  
+8. BiomeID  
+9. CONTINENT  
+10. POINT_X  
+11. POINT_Y  
+
+Columns #1, #2, #3, #10, #11 will be dropped. The others will be derived below.
+
+
+## 1 Solve spatial problems
+There are 2020 plots in the Nile dataset without spatial coordinates. Assign manually with wide (90km) location uncertainty.
+```{r}
+header <- header0 %>% 
+  mutate(Latitude=replace(Latitude, 
+                          list=(is.na(Latitude) & Dataset=="Egypt Nile delta"), 
+                          values=30.917351)) %>% 
+  mutate(Longitude=replace(Longitude, 
+                          list=(is.na(Longitude) & Dataset=="Egypt Nile delta"), 
+                          values=31.138534)) %>% 
+  mutate(`Location uncertainty (m)`=replace(`Location uncertainty (m)`, 
+                          list=(is.na(`Location uncertainty (m)`) & Dataset=="Egypt Nile delta"), 
+                          values=-90000))
+```
+There are two plots in the `Romanina Grassland Databse', whose lat\long are inverted. Correct.
+```{r}
+new.coords <- header %>% 
+  filter(Dataset=="Romania Grassland Database" & Longitude>40) %>% 
+  dplyr::select(PlotObservationID, Latitude, Longitude) %>% 
+  rename(Longitude=Latitude, Latitude=Longitude)
+
+header <- header %>% 
+    ##only works because the two plots have identical coords 
+    mutate(Longitude=replace(Longitude, 
+                             list=PlotObservationID %in% (new.coords %>% 
+                                                            pull(PlotObservationID)), 
+                             values=new.coords %>% pull(Longitude)))%>% 
+  mutate(Latitude=replace(Latitude, 
+                          list=PlotObservationID %in% (new.coords %>% 
+                                                            pull(PlotObservationID)), 
+                          values=new.coords %>% pull(Latitude)))
+rm(new.coords)
+```
+
+There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value, to indicate this is an estimation, rather than a measure.
+```{r}
+header <- header %>% 
+  left_join(header %>% 
+              group_by(Dataset) %>% 
+              summarize(loc.uncer.median=median(`Location uncertainty (m)`, na.rm=T)), 
+            by="Dataset") %>% 
+  mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)`), 
+                                            -abs(loc.uncer.median), 
+                                            `Location uncertainty (m)`))
+```
+There are still `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))` plots with no estimation of location uncertainty.
+
+
+
+## 2 Assign to Continents
+Download and manipulate map of continents
+```{r}
+sPDF <- rworldmap::getMap(resolution="coarse")
+continent <- sPDF[,"continent"]
+crs(continent) <- CRS("+init=epsg:4326")
+#continent@data[243,"continent"] <- "South America" ## Manually correct missing data
+# create clipped version of continent to avoid going beyond 180 lON
+coords <- data.frame(x=c(-180,180,180,-180),
+                     y=c(-90,-90,90,90))
+bboxc = Polygon(coords)
+bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent))
+continent_clipped <- gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... workaround
+
+## same but high resolution (works better for plots along coastlines)
+sPDF <- rworldmap::getMap(resolution="high")
+continent.high <- sPDF[,"continent"]
+crs(continent.high) <- CRS("+init=epsg:4326")
+continent.high@data$continent <- fct_recode(continent.high@data$continent, "South America"="South America and the Caribbean")
+```
+Create spatial point dataframe for sPlot data to intersect with spatial layers
+```{r}
+header.shp <- header %>%
+  filter(!is.na(Longitude) | !is.na(Latitude))
+header.shp <- SpatialPointsDataFrame(coords= header.shp %>% 
+                                        dplyr::select(Longitude, Latitude),
+                               proj4string = CRS("+init=epsg:4326"), 
+                               data=data.frame(PlotObservationID= header.shp$PlotObservationID, 
+                                               loc.uncert=header.shp$`Location uncertainty (m)`))
+```
+
+Assign plots to continent
+```{r, eval=T}
+continent.out <- sp::over(x=header.shp, y=continent)
+#overlay unassigned points to the high resolution layer of continent
+toassign <- header.shp[which(is.na(continent.out$continent)),] #154782 remain to assign
+crs(toassign) <- crs(continent)
+continent.out2 <- sp::over(x=toassign, y=continent.high)
+#merge first and second overlay 
+continent.out$continent[is.na(continent.out$continent)] <- continent.out2$continent
+
+#correct unassigned points to closest continent
+toassign <- header.shp[which(is.na(continent.out$continent)),] #47610 remain to assign
+crs(toassign) <- crs(continent)
+
+
+#go parallel
+library(parallel)
+library(doParallel)
+cl <- makeCluster(6)
+registerDoParallel(cl)
+nearestContinent <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { 
+  nearestContinent.tmp <- geosphere::dist2Line(toassign[i,], continent_clipped)
+}
+continent.out$continent[which(is.na(continent.out$continent))] <- as.character(continent[-137,]@data[nearestContinent[,"ID"],])
+save(continent.out, file = "../_derived/continent.out")
+```
+Reload and manipulate continent. Correct header.sRoot
+```{r}
+load("_derived/continent.out")
+header <- header %>% 
+  left_join(header %>% 
+              filter(!is.na(Longitude) | !is.na(Latitude)) %>% 
+              dplyr::select(PlotObservationID) %>% 
+              bind_cols(continent.out), 
+            by="PlotObservationID")
+  mutate(CONTINENT=factor(continent, 
+                            levels=c("Africa", "Antarctica", "Australia", "Eurasia", "North America", "South America"),
+                            labels=c("AF", "AN", "AU", "EU", "NA", "SA"))) %>% 
+  dplyr::select(-continent)
+```
+
+
+
+
+