Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
title: "sPlot3.0 - Build Header"
author: "Francesco Maria Sabatini"
date: "2/4/2020"
output: html_document
![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")

\newline

Timestamp: r date()
Drafted: Francesco Maria Sabatini
Revised: Helge Bruelheide
Version: 1.2

This report documents the construction of the header file for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens.

Changes in version 1.1

  1. Excluded plots from Canada, as recommended by Custodian
  2. Filled missing info from most of the ~2000 plots without country information from these datasets.
  3. Corrected mismatched sBiomes and ecoregions
    Changes in version 1.2
  4. Reassigned coordinates to ~19.000 misplaced plots (mostly from SOPHY or in Hungary). Assigned country level centroids
  5. Corrected mismatched CONTINENTS & Countries
  6. Added graphs to check assignment to continents or countries
# Echo the source code of every chunk in the rendered report
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(purrr)
library(viridis)
library(readr)
library(xlsx)
library(knitr)
library(kableExtra)

## Spatial packages
library(rgdal)
library(sp)
library(rgeos)
library(raster)
library(rworldmap)
library(elevatr)
library(sf)
library(rnaturalearth)
library(dggridR)
library(shotGroups) #minCircle

#save temporary files
# Each call writes a one-line '.Renviron' file into the directory currently
# returned by Sys.getenv('TMPDIR') / Sys.getenv('R_USER').
# NOTE(review): if either variable is unset, Sys.getenv() returns "" and the
# target becomes "/.Renviron" - confirm both are defined where this is run.
write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron'))
write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron'))
# Use the same directory for temporary files of the raster package
rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")

1 Import data

Import header data. Clean header data from quotation and double quotation marks from linux terminal.

# escape all double quotation marks. Run in Linux terminal
#sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv

#more general alternative in case some " are already escaped
##first removing \s before all "s, and then adding \ before all ":
#sed 's/\([^\\]\)"/\1\\\"/g; s/"/\\"/g'

Import cleaned header data.

# Read the tab-delimited header export with an explicit type for every column,
# then rename two columns and drop the ones that are not carried forward.
header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv", 
                             locale = locale(encoding = 'UTF-8'),
                            delim="\t", col_types=cols(
  PlotObservationID = col_double(),
  PlotID = col_double(),
  `TV2 relevé number` = col_double(),
  Country = col_character(),
  `Cover abundance scale` = col_factor(),
  # dates are parsed as day-month-year
  `Date of recording` = col_date(format="%d-%m-%Y"),
  `Relevé area (m²)` = col_double(),
  `Altitude (m)` = col_double(),
  `Aspect (°)` = col_double(),
  `Slope (°)` = col_double(),
  `Cover total (%)` = col_double(),
  `Cover tree layer (%)` = col_double(),
  `Cover shrub layer (%)` = col_double(),
  `Cover herb layer (%)` = col_double(),
  `Cover moss layer (%)` = col_double(),
  `Cover lichen layer (%)` = col_double(),
  `Cover algae layer (%)` = col_double(),
  `Cover litter layer (%)` = col_double(),
  `Cover open water (%)` = col_double(),
  `Cover bare rock (%)` = col_double(),
  `Height (highest) trees (m)` = col_double(),
  `Height lowest trees (m)` = col_double(),
  `Height (highest) shrubs (m)` = col_double(),
  `Height lowest shrubs (m)` = col_double(),
  `Aver. height (high) herbs (cm)` = col_double(),
  `Aver. height lowest herbs (cm)` = col_double(),
  `Maximum height herbs (cm)` = col_double(),
  `Maximum height cryptogams (mm)` = col_double(),
  `Mosses identified (y/n)` = col_factor(),
  `Lichens identified (y/n)` = col_factor(),
  COMMUNITY = col_character(),
  SUBSTRATE = col_character(),
  Locality = col_character(),
  ORIG_NUM = col_character(),
  ALLIAN_REV = col_character(),
  REV_AUTHOR = col_character(),
  # formation flags are read as logicals
  Forest = col_logical(),
  Grassland = col_logical(),
  Wetland = col_logical(),
  `Sparse vegetation` = col_logical(),
  Shrubland = col_logical(),
  `Plants recorded` = col_factor(),
  `Herbs identified (y/n)` = col_factor(),
  Naturalness = col_factor(),
  EUNIS = col_factor(),
  Longitude = col_double(),
  Latitude = col_double(),
  `Location uncertainty (m)` = col_double(),
  Dataset = col_factor(),
  GUID = col_character()
)) %>% 
  # syntactic name for the formation flag; EUNIS is referred to as ESY from here on
  rename(Sparse.vegetation=`Sparse vegetation`, 
         ESY=EUNIS) %>% 
  dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE) %>%   #too sparse information to be useful
  dplyr::select(-PlotID) #identical to PlotObservationID

The following column names occurred in the header of sPlot v2.1 and are currently missing from the header of v3.0

  1. Syntaxon
  2. Cover cryptogams (%)
  3. Cover bare soil (%)
  4. is.forest
  5. is.non.forest
  6. EVA
  7. Biome
  8. BiomeID
  9. CONTINENT
  10. POINT_X
  11. POINT_Y
    ~~ Columns #1 (closed), #2 (closed), #3 (closed), #10, #11 will be dropped. The others will be derived below.

1.1 Exclude unreliable plots

Some Canadian plots need to be removed, as indicated by Laura Boisvert-Marsh (GIVD NA-CA-004). The plots (and corresponding PlotObservationID) are:
\newline

Fabot01 - 1707776
Fadum01, 02 & 03 - 1707779:1707781
Faers01 - 1707782
Pfe-f-08 - 1707849
Pfe-o-05- 1707854

# Remove the listed PlotObservationIDs and the two "$Coastal_*" datasets.
# All three conditions must hold for a row to be kept.
header0 <- filter(
  header0,
  !PlotObservationID %in% c(1707776, 1707779:1707782, 1707849, 1707854),
  Dataset != "$Coastal_Borja",
  Dataset != "$Coastal_Poland"
)

1.2 Solve spatial problems

There are 2020 plots in the Nile dataset without spatial coordinates. Assign manually with wide (90km) location uncertainty.

# Fill in coordinates and location uncertainty for the "Egypt Nile delta" plots
# where the respective field is missing. The uncertainty is stored as a negative
# number (-90000 m).
header <- mutate(
  header0,
  Latitude = replace(
    Latitude,
    list = (is.na(Latitude) & Dataset == "Egypt Nile delta"),
    values = 30.917351
  ),
  Longitude = replace(
    Longitude,
    list = (is.na(Longitude) & Dataset == "Egypt Nile delta"),
    values = 31.138534
  ),
  `Location uncertainty (m)` = replace(
    `Location uncertainty (m)`,
    list = (is.na(`Location uncertainty (m)`) & Dataset == "Egypt Nile delta"),
    values = -90000
  )
)

There are two plots in the Romania Grassland Database, ~4442 plots in the Japan database, and a few in the European Weed Vegetation Database whose latitude and longitude are inverted. Correct them.

# Row indices of plots whose latitude and longitude have to be exchanged
toswap <- with(header, c(
  which(Dataset == "Japan" & Latitude > 90),
  which(Dataset == "Romania Grassland Database" & Longitude > 40),
  which(PlotObservationID == 525283)
))
# Exchange the two coordinate columns for those rows
header[toswap, c("Latitude", "Longitude")] <- header[toswap, c("Longitude", "Latitude")]
# Number of plots still lacking a location uncertainty (used in the text below)
nouncert <- header %>%
  filter(is.na(`Location uncertainty (m)`)) %>%
  nrow()

There are r nouncert plots without location uncertainty. As a first approximation, we assign the median of the respective dataset, as a negative value to indicate this is an estimation, rather than a measure.

# Fill missing location uncertainty with the median of the respective dataset.
# The value is stored as a negative number to mark it as an estimate rather
# than a measurement.
header <- header %>% 
  left_join(header %>% 
              group_by(Dataset) %>% 
              summarize(loc.uncer.median=median(`Location uncertainty (m)`, na.rm=TRUE)), 
            by="Dataset") %>% 
  # Only plots that have coordinates but no uncertainty are filled.
  # The closing parenthesis of is.na() used to wrap the whole condition, which
  # gave the same result only through NA propagation in `&`.
  mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)`) & !is.na(Latitude), 
                                            -abs(loc.uncer.median), 
                                            `Location uncertainty (m)`)) %>% 
  dplyr::select(-loc.uncer.median)
# Plots that still have no uncertainty (used in the text below)
nouncert <- nrow(header %>% filter(is.na(`Location uncertainty (m)`)))

There are still r nouncert plots with no estimation of location uncertainty.
\newline Assign plot size to plots in the Patagonia dataset (input of Ana Cingolani)

# Patagonia plots without a recorded plot size get -900
header <- header %>%
  mutate(`Relevé area (m²)` = replace(
    `Relevé area (m²)`,
    which(Dataset == "Patagonia" & is.na(`Relevé area (m²)`)),
    -900
  ))

There are 518 plots from the dataset Germany_gvrd (EU-DE-014) having a location uncertainty equal to 2,147,483 km (!). These plots have a location reported. Replace with a more likely estimate (20 km)

# Replace the placeholder uncertainty 2147483647 with 20000 m
header <- header %>%
  mutate(`Location uncertainty (m)` = ifelse(
    `Location uncertainty (m)` == 2147483647,
    20000,
    `Location uncertainty (m)`
  ))

There are 103 plots in the "Balkan Vegetation Database" which are erroneously assigned to Bahrain, instead of Bosnia

# Plots of the Balkan Vegetation Database labelled "Bahrain" are relabelled
header <- header %>%
  mutate(Country = replace(
    Country,
    which(Dataset == "Balkan Vegetation Database" & Country == "Bahrain"),
    "Bosnia-Herzegovina"
  ))

There is one plot in the Czech dataset which was sampled in Italy (near Civitella Alfedena) but has the centroid of CZ republic as coordinates

# IDs of the plots in Czechia_nvd whose country is Italy
plot.alfedena <- header %>%
  filter(Dataset == "Czechia_nvd" & Country == "Italy") %>%
  pull(PlotObservationID)

# Give those plots the coordinates of Civitella Alfedena (41.764975, 13.940494)
# and a location uncertainty of 30000 m
header <- header %>%
  mutate(
    Latitude = ifelse(PlotObservationID %in% plot.alfedena, 41.764975, Latitude),
    Longitude = ifelse(PlotObservationID %in% plot.alfedena, 13.940494, Longitude),
    `Location uncertainty (m)` = ifelse(PlotObservationID %in% plot.alfedena,
                                        30000,
                                        `Location uncertainty (m)`)
  )

Many plots in SOPHY were collected from neighbouring countries, but are located at the centroid of France. Similarly, there is a batch of 108 plots from the European Mire VDB, European Weed Vegetation Database, and WetVegEurope Database that although being located in Hungary, have spatial coordinates in Chad. To all these plots, I reassign the centroid of their respective country as coordinate, and the radius of the minimum circle enclosing the country as location uncertainty

#Centroid of France
#  Longitude Latitude
#   2.55196  46.5645

# Plots to relocate: SOPHY plots outside France sitting exactly on the French
# centroid, plus Hungarian plots with a latitude below 20
plot.to.correct <- header %>%
  filter(
    (Dataset == "France_SOPHY" & Country != "France" &
       Longitude == 2.55196 & Latitude == 46.5645) |
      (Country == "Hungary" & Latitude < 20)
  )

# Report how many plots are affected, overall and per country
nrow(plot.to.correct)
count(plot.to.correct, Country)

# Countries involved, in order of first appearance
countries.to.correct <- unique(plot.to.correct$Country)

# From here on only the plot IDs are needed
plot.to.correct <- plot.to.correct$PlotObservationID


## Import country polygons (Natural Earth, scale 110), harmonise two country
## names with the labels used in header$Country, and keep the countries to correct
countries.sel <- ne_countries(returnclass = "sf", scale=110) %>% 
  dplyr::select(geometry, name) %>% 
  mutate(name=ifelse(name=="Andorra", "Andorra, Principality of", name)) %>% 
  mutate(name=ifelse(name=="Czech Rep.", "Czech Republic", name)) %>% 
  filter(name %in% countries.to.correct)

# Some smaller countries cannot be resolved to the low res layer of countries above.
# Add them from the scale-50 layer instead
small.countries <- ne_countries(returnclass = "sf", scale=50) %>%  
  dplyr::select(geometry, name) %>% 
  filter(name %in% c("Andorra", "Monaco", "Liechtenstein")) %>% 
  mutate(name=ifelse(name=="Andorra", "Andorra, Principality of", name))
  

## For Norway, delete the Svalbard islands so the centroid does not fall in the sea
## NOTE(review): slice(1) assumes the first exploded polygon is the mainland - confirm
norway <- countries.sel %>% 
  filter(name=="Norway") %>% 
  as_Spatial() %>% 
  spatialEco::explode() %>% 
  st_as_sf() %>% 
  slice(1)   # extract only norway mainland

# Bind all countries together, and replace Norway
countries.sel <- countries.sel %>% 
  filter(name != "Norway") %>% 
  bind_rows(norway) %>% 
  bind_rows(small.countries) 


# Get, for every country to correct, the centroid in lat/long and the radius of
# the minimum enclosing circle (later used as location uncertainty), computed
# in the Eckert IV projection ("eck4").

# Results are preallocated rather than grown inside the loop
centroids <- vector("list", length(countries.to.correct))
radius <- numeric(length(countries.to.correct))

# The reprojection does not depend on the country, so it is done once here
countries.sel.eck <- countries.sel %>% 
  st_transform(crs = "+proj=eck4")

for(cc in seq_along(countries.to.correct)){
  cnt <- countries.sel %>% 
    filter(name == countries.to.correct[cc])

  # centroid of the country in geographic coordinates
  centroids[[cc]] <- gCentroid(cnt %>% 
                                 as_Spatial(), byid = FALSE)@coords
  
  ## transform polygons to points and get the minimum enclosing circle
  ## to determine the radius (in km)
  cnt.eck <- countries.sel.eck %>% 
    filter(name == countries.to.correct[cc]) %>% 
    st_cast(to="MULTIPOINT") %>% 
    st_coordinates() %>% 
    as.data.frame() %>% 
    mutate_all(~(.)/1000) %>% #to km
    dplyr::select(point.x=X, point.y=Y)
  radius[cc] <- shotGroups::getMinCircle(cnt.eck)$rad
}

## One row per country: centroid coordinates, radius converted from km to
## meters, and the country name
cnt.centroids <- centroids %>%
  lapply(as.data.frame) %>%
  bind_rows() %>%
  mutate(radius = radius * 1000,
         Country = countries.to.correct)

Reassign coordinates to plots to correct, based on the centroid of their country

# Attach the country centroids, overwrite coordinates and uncertainty of the
# plots to correct, then drop the helper columns again
header <- header %>%
  left_join(cnt.centroids, by = "Country") %>%
  mutate(
    Latitude = ifelse(PlotObservationID %in% plot.to.correct, y, Latitude),
    Longitude = ifelse(PlotObservationID %in% plot.to.correct, x, Longitude),
    `Location uncertainty (m)` = ifelse(PlotObservationID %in% plot.to.correct,
                                        radius,
                                        `Location uncertainty (m)`)
  ) %>%
  dplyr::select(-x, -y, -radius)

2 Formations

Fill out the columns Forest:Sparse.vegetation with NAs, where necessary. Create columns is.forest and is.non.forest using script developed for sPlot 2.1
I am not assigning plots to Faber-Langendoen formations at this stage, as this is only possible for European plots having an ESY classification.

# Lookup table from EUNIS (ESY) code to naturalness and formation flags,
# with columns renamed to match those of the header
eunis.key <- read.xlsx("../_input/EUNIS_WFT.xlsx", sheetIndex = "Sheet1", endRow = 246) %>%
  dplyr::select(ESY = EUNIS_code, NATURALNESS:SPARSE_VEG) %>%
  rename(Naturalness = NATURALNESS,
         Forest = FOREST,
         Shrubland = SCRUBLAND,
         Grassland = GRASSLAND,
         Wetland = WETLAND,
         Sparse.vegetation = SPARSE_VEG) %>%
  mutate(ESY = as.character(ESY))

# Complete the formation flags (Forest ... Sparse.vegetation) and Naturalness:
#  1. datasets known to contain only forests are flagged as Forest;
#  2. where at least one formation flag is TRUE, missing flags become FALSE;
#  3. flags are converted to 0/1 and remaining gaps are filled from eunis.key.
header <- header %>% # header.backup %>% 
  mutate(ESY=as.character(ESY)) %>% 
  #mutate(ESY=ifelse(ESY=="?", NA, ESY)) %>% 
  # Systematically assign some databases to forest
  mutate(Forest=ifelse(Dataset %in% 
                         c("Turkey Oak_Forest Database", "Turkey Forest Database", "Chile_forest", "Ethiopia"), 
                       TRUE, Forest)) %>% 
  # fill up with FALSE those rows where at least one column on formation is assigned.
  # The element-wise `|` has the same NA semantics as a row-wise any(): TRUE if any
  # flag is TRUE, NA if none is TRUE but some are missing, FALSE otherwise.
  mutate(Any=Forest | Shrubland | Grassland | Wetland | Sparse.vegetation) %>% 
  mutate(Forest=ifelse( (is.na(Forest) & Any), FALSE, Forest))  %>%
  mutate(Shrubland=ifelse( (is.na(Shrubland) & Any), FALSE, Shrubland))  %>% 
  mutate(Grassland=ifelse( (is.na(Grassland) & Any), FALSE, Grassland))  %>% 
  mutate(Wetland=ifelse( (is.na(Wetland) & Any), FALSE, Wetland))  %>% 
  mutate(Sparse.vegetation=ifelse( (is.na(Sparse.vegetation) & Any), FALSE, Sparse.vegetation))  %>%
  dplyr::select(-Any) %>% 
  # logical -> 0/1 so the flags can be coalesced with the columns of eunis.key
  mutate_at(vars(Forest:Shrubland), .funs=list(~.*1)) %>% 
  mutate(Naturalness=as.numeric(as.character(Naturalness))) %>% 
  ##join and coalesce with eunis.key: header values (.x) take precedence
  left_join(eunis.key %>% 
              distinct(), by = "ESY") %>% 
    mutate(
        Forest = dplyr::coalesce(Forest.x, Forest.y), 
        Shrubland = coalesce(Shrubland.x, Shrubland.y),
        Grassland = coalesce(Grassland.x, Grassland.y),
        Wetland = coalesce(Wetland.x, Wetland.y),
        Sparse.vegetation = coalesce(Sparse.vegetation.x, Sparse.vegetation.y),
        Naturalness = coalesce(Naturalness.x, Naturalness.y)
    ) %>% 
  dplyr::select(-ends_with(".x"), -ends_with(".y"))

3 Fix header and attach GIVD codes

Reduce number of factor levels for the column Plants recorded

# Collapse spelling variants of `Plants recorded` into a smaller set of labels
# (fct_recode syntax is "new label" = "old level"), then rebuild the factor
# so that the level "#N/A" is excluded.
header <- header %>% 
  mutate(`Plants recorded`=fct_recode(`Plants recorded`, 
                                      "All vascular plants"="complete vegetation",
                                      "All vascular plants"="Complete vegetation",
                                      "All vascular plants"="all vascular plants", 
                                      "All vascular plants"="complete", 
                                      "All vascular plants"="Complete vegetation (including non-terricolous tax",
                                      "All vascular plants"="Vascular plants",
                                      "All woody plants"="Woody plants",
                                      "All woody plants"="All woody species",
                                      "Woody plants >= 10 cm dbh"= "trees>=10cm dbh",
                                      "All trees & dominant understory"="All trees & dominant shrubs",
                                      "Woody plants >= 1 cm dbh" = "Plants >= 1 cm dbh"
                                      )) %>% 
  mutate(`Plants recorded`=factor(`Plants recorded`, exclude = "#N/A"))

Align consortium labels to those in sPlot's consortium archive

# Consortium archive; the columns `GIVD ID` and `label` are used below
databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv")

# Rename dataset labels ("new label" = "old level") so that they match the
# `label` column of the consortium archive, then attach the GIVD ID by label
header <- header %>% 
  mutate(Dataset=fct_recode(Dataset,
                            "BIOTA_South_Africa" = "BIOTA_South_Africa_3", 
                            "Kriti"="Cyprus_Bergmeier", 
                            "European Boreal Forest Database"="European Boreal Forest Database 1", 
                            "European Boreal Forest Database"="European Boreal Forest Database 2", 
                            "European Coastal Vegetation Database"= "European Coastal Vegetation Database-A", 
                            "Germany_vegetweb"="Germany_vegetweb2", 
                            "Germany_vegetweb"="Germany_vegetweb3",
                            "Ladakh"="Ladakh_2", 
                            "Netherlands"="Netherlands Military sites",
                            "NSW Australia" = "NSW Austalia",
                            )) %>% 
  # NOTE(review): Dataset is a factor while label is read from csv - confirm the
  # column type of Dataset after this join is what downstream code expects
  left_join(databases %>% 
              dplyr::select(`GIVD ID`, label) %>% 
              rename(Dataset=label),
            by="Dataset")

4 Assign plots to spatial descriptors

Create spatial point dataframe for sPlot data to intersect with spatial layers

# Build a SpatialPointsDataFrame (WGS84) from the plots that have coordinates
# and export it as a shapefile.
# Both coordinates must be present: with `|` a plot having only one of them
# would be kept, and SpatialPointsDataFrame() does not accept NA coordinates.
header.shp <- header %>%
  filter(!is.na(Longitude) & !is.na(Latitude))
header.shp <- SpatialPointsDataFrame(coords= header.shp %>% 
                                        dplyr::select(Longitude, Latitude),
                               proj4string = CRS("+init=epsg:4326"), 
                               data=data.frame(PlotObservationID= header.shp$PlotObservationID, 
                                               loc.uncert=header.shp$`Location uncertainty (m)`, 
                                               `GIVD ID`=header.shp$`GIVD ID`))
writeOGR(header.shp, dsn="../_derived/", layer="header.shp", driver="ESRI Shapefile", overwrite_layer = TRUE)

Reimport shapefile

# Read the shapefile back and restore the attribute names that were
# abbreviated in the exported file; then set the CRS to EPSG:4326
header.shp <- readOGR("../_derived/header.shp.shp")
header.shp@data <- rename(header.shp@data,
                          PlotObservationID = PltObID,
                          loc.uncert = lc_ncrt,
                          `GIVD ID` = GIVD_ID)
crs(header.shp) <- CRS("+init=epsg:4326")

4.1 Assign to Continents

Download and manipulate map of continents

# Coarse world map from rworldmap, keeping only the "continent" attribute
sPDF <- rworldmap::getMap(resolution="coarse")
continent <- sPDF[,"continent"]
crs(continent) <- CRS("+init=epsg:4326")
# NOTE(review): row 243 is addressed by position - confirm it is still the
# intended polygon if the rworldmap version changes
continent@data[243,"continent"] <- "South America" ## Manually correct missing data
# create clipped version of continent to avoid going beyond 180 longitude
coords <- data.frame(x=c(-180,180,180,-180),
                     y=c(-90,-90,90,90))
# bounding box covering the whole globe, as a SpatialPolygons in the CRS of continent
bboxc = Polygon(coords)
bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent))
continent_clipped <- gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... workaround
# convert to SpatialPolygonDataFrame
pid <- sapply(slot(continent_clipped, "polygons"), function(x) slot(x, "ID"))
# Create dataframe with correct rownames
p.df <- data.frame( ID=1:length(continent_clipped), row.names = pid)
# Coerce and re-assign
continent_clipped <- SpatialPolygonsDataFrame(continent_clipped, p.df)
# replace the placeholder attribute table with the continent names (row 137 excluded as above)
continent_clipped@data <- continent@data[-137,, drop=F]
 

## same but high resolution (slower, but works better for plots along coastlines)
sPDF <- rworldmap::getMap(resolution="high")
continent.high <- sPDF[,"continent"]
crs(continent.high) <- CRS("+init=epsg:4326")
# relabel so that the level name matches the one used in the coarse layer
continent.high@data$continent <- fct_recode(continent.high@data$continent, "South America"="South America and the Caribbean")

Assign plots to continent

# First pass: overlay all plots on the coarse continent layer
continent.out <- sp::over(x=header.shp, y=continent)
#overlay unassigned points to the high resolution layer of continent
toassign <- header.shp[which(is.na(continent.out$continent)),] #154782 remain to assign
crs(toassign) <- crs(continent)
continent.out2 <- sp::over(x=toassign, y=continent.high)
#merge first and second overlay 
# (continent.out2 has one row per plot in toassign, i.e. per NA of the first overlay)
continent.out$continent[is.na(continent.out$continent)] <- continent.out2$continent
# plots still without a continent after both overlays
toassign <- header.shp[which(is.na(continent.out$continent)),] #47610 remain to assign
crs(toassign) <- crs(continent)

There are r length(toassign) plots remaining unassigned.

Match unassigned points to closest continent

#go parallel
ncores <- 12
library(parallel)
library(doParallel)
cl <- makeForkCluster(ncores, outfile="" )
registerDoParallel(cl)
  
clusterEvalQ(cl, {
  library(rgdal)
  library(raster)
  library(sp)
  library(elevatr)
  library(dplyr)
  })

# For every plot still unassigned, find the continent polygon closest to it.
# Each iteration yields exactly one character value (continent name or NA).
nearestContinent <- foreach(i=seq_len(length(toassign)), .packages=c('raster'), .combine=rbind) %dopar% {  
  ## create a subset of geoentities based on a 5° buffer radius around each target plot.
  tmp.buff <- gBuffer(toassign[i,], width=5) 
  # The result of tryCatch() is assigned (rather than assigning inside it), so a
  # failure can never leave a missing or stale 'tmp.mypredictor' behind.
  tmp.mypredictor <- tryCatch(
    spatialEco::spatial.select(
      x = tmp.buff,
      y = continent_clipped,
      distance = 0.1,
      predicate = "intersect"
    ),
    error = function(e) {
      print(paste("Nothing close enough for plot", toassign@data$PlotObservationID[i]))
      NULL
    }
  )
  if (is.null(tmp.mypredictor)) {
    nearest.tmp <- NA_character_
  } else {
    # find nearest neighbour; any failure yields NA for this plot
    nearest.tmp <- tryCatch(
      as.character(tmp.mypredictor@data[geosphere::dist2Line(toassign[i,],
                                                             tmp.mypredictor)[,"ID"],]),
      error = function(e) NA_character_
    )
    # guard against an empty or multi-valued result, which would break rbind
    if (length(nearest.tmp) != 1) nearest.tmp <- NA_character_
  }
  nearest.tmp
}

stopCluster(cl)
# as.vector() handles both the n x 1 matrix returned for several plots and the
# plain vector returned when a single plot was left; nothing to do for zero plots
if (length(toassign) > 0) {
  continent.out$continent[is.na(continent.out$continent)] <- as.vector(nearestContinent)
}
save(continent.out, file = "../_derived/continent.out.RData")

Reload, manipulate continent and attach to header

# Reload the overlay result, pair it with the plot IDs of header.shp (same row
# order), join it to the header and recode the continent into short labels
load("../_derived/continent.out.RData")
header <- header %>%
  left_join(
    bind_cols(header.shp@data %>% dplyr::select(PlotObservationID),
              continent.out),
    by = "PlotObservationID"
  ) %>%
  mutate(CONTINENT = factor(
    continent,
    levels = c("Africa", "Antarctica", "Australia", "Eurasia", "North America", "South America"),
    labels = c("AF", "AN", "AU", "EU", "N-A", "S-A")
  )) %>%
  dplyr::select(-continent)

Summarize

# Table with the number of plots per continent
header %>%
  group_by(CONTINENT) %>%
  summarize(num.plot = n()) %>%
  knitr::kable(caption = "Number of plots per continent") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, position = "center")

4.2 Assign to sBiomes

Extract unique sets of coordinates to speed up matching

# Distinct coordinate pairs of header.shp as SpatialPoints, in the same CRS
unique.coord.shp <- header.shp@coords %>%
  as.data.frame() %>%
  distinct() %>%
  SpatialPoints(proj4string = header.shp@proj4string)

Extract sBiomes for each unique coordinate.

# Path to the sBiomes shapefile (passed to PredExtr as 'toextract')
sBiomes <- "/data/sPlot/users/Francesco/Ancillary_Data/Biomes_sPlot/sBiomes.shp"

library(parallel)
library(doParallel)
# 12 forked workers; worker output goes to the log file below
cl <- makeForkCluster(12, outfile=paste0("../_derived/sBiomes/sBiomes_.log"))
registerDoParallel(cl)

# The extraction is split into nchunks pieces; each one writes its own csv.
# PredExtr is defined in A98_PredictorsExtract.R, which is not shown here.
# NOTE(review): presumably chunk.i selects the i-th slice of unique.coord.shp - confirm
nchunks <- 120
foreach(i=1:nchunks) %dopar% {
  source("A98_PredictorsExtract.R")
  PredExtr(unique.coord.shp, myfunction=NA,
                          output=paste0("../_derived/sBiomes/sBiomes_",i, ".csv"),
                          toextract=sBiomes, typp="shp", ncores=1, chunkn=nchunks, chunk.i=i)
}
stopCluster(cl)

Reimport and join to header

# List the chunk files and sort them by chunk number, so that the stacked
# output lines up row by row with unique.coord.shp.
# The pattern is anchored to the chunk file names (an unescaped ".csv" matches
# any character followed by "csv"), and the number is taken from the file name
# only, so digits in the directory path cannot be picked up.
filelist <- list.files("../_derived/sBiomes/", pattern="sBiomes_[0-9]+\\.csv$", full.names = TRUE)
filelist.n <- as.numeric(str_extract(basename(filelist), pattern="[0-9]+"))
filelist.order <- order(filelist.n)
filelist <- filelist[filelist.order]

# Bind the coordinates column-wise to the stacked chunk outputs
sBiomes.out <- unique.coord.shp@coords %>% 
  as.data.frame() %>% 
  bind_cols(lapply(filelist, read_csv) %>% 
              bind_rows())

# Attach biome name and ID to each plot, matching on the coordinate pair
header <- left_join(
  header,
  sBiomes.out %>%
    dplyr::select(Latitude = coords.x2,
                  Longitude = coords.x1,
                  sBiome = Name,
                  sBiomeID = BiomeID),
  by = c("Latitude", "Longitude")
)

There are r sum(is.na(header$sBiome)) unassigned plots.

Summarize:

# Table with the number of plots per biome
header %>%
  group_by(sBiome) %>%
  summarize(num.plot = n()) %>%
  knitr::kable(caption = "Number of plots per Biome") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, position = "center")

4.3 Extract WWF Ecoregions

Extract ecoregion name and ID from Ecoregions of the World. Olson et al. 2001 (BioScience).
Computation was performed in EVE HPC cluster using function A98_PredictorsExtract.R. Divided in 99 chunks.

# Path to the WWF terrestrial ecoregions shapefile (passed to PredExtr as 'toextract')
ecoreg.path <- "../../Ancillary_Data/Ecoregions_WWF/wwf_terr_ecos.shp"
#ecoreg <- readOGR("../../Ancillary_Data/Ecoregions_WWF", layer="wwf_terr_ecos")
#ecoreg@data <- ecoreg@data %>% 
#  dplyr::select(OBJECTID, ECO_NAME, REALM, BIOME, ECO_NUM, ECO_ID, eco_code)

library(parallel)
library(doParallel)
# 14 forked workers; worker output goes to the log file below
cl <- makeForkCluster(14, outfile="../_derived/wwf_ecoregions/wwf_ecoregions.log")
registerDoParallel(cl)

# The extraction is split into nchunks pieces; each one writes its own csv.
# PredExtr is defined in A98_PredictorsExtract.R, which is not shown here.
nchunks <- 98
foreach(i=1:nchunks) %dopar% {
  source("A98_PredictorsExtract.R")
 # options(echo = F, message=F)
  PredExtr(unique.coord.shp, myfunction=NA, 
           output=paste0("../_derived/wwf_ecoregions/wwf_ecoregions_", i, ".csv"),  
           toextract=ecoreg.path, typp="shp", ncores=1, chunkn=nchunks, chunk.i=i)
}
stopCluster(cl)

Reimport output and join to header

# List the chunk outputs and sort them by the number in the file name
ecoreg.files <- list.files("../_derived/wwf_ecoregions/", pattern="wwf_ecoregions_[0-9]+.csv", full.names=T)
ecoreg.files <- ecoreg.files[order(as.numeric(str_extract(ecoreg.files, pattern="[0-9]+")))]
# Read every chunk with the same column types (double unless listed) and stack them
ecoreg.out <- do.call(rbind, lapply(ecoreg.files, function(x) {read_csv(x, 
  col_types=cols(
    .default = col_double(),
    ECO_NAME = col_character(),
    REALM = col_character(),
    G200_REGIO = col_character(),
    eco_code = col_character()))}))


# Attach ecoregion name and ID to each plot, matching on the coordinate pair
header <- left_join(
  header,
  ecoreg.out %>%
    dplyr::select(Latitude, Longitude,
                  Ecoregion = ECO_NAME,
                  EcoregionID = ECO_ID),
  by = c("Latitude", "Longitude")
)

There are r sum(is.na(header$Ecoregion)) unassigned plots.

Summarize:

# Table of the 30 ecoregions with the highest number of plots
header %>%
  group_by(Ecoregion) %>%
  summarize(num.plot = n()) %>%
  arrange(desc(num.plot)) %>%
  slice(1:30) %>%
  knitr::kable(caption = "Number of plots in the 30 best represented Ecoregions") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = F, position = "center")

4.4 Extract elevation

Extract elevation for each plot. Loops over tiles of 1 x 1°, projects to Mercator, and extracts elevation for plot coordinates, as well as 2.5, 50, and 97.5 quantiles for a buffer area having a radius equal to the location uncertainty of each plot (but only if location uncertainty < 50 km). DEM derives from package elevatr, which uses the Terrain Tiles on Amazon Web Services. Resolutions of DEM rasters vary by region. I set a zoom factor z=10, which corresponds to ~ 75-150 m. Sources are: SRTM, data.gov.at in Austria, NRCAN in Canada, SRTM, NED/3DEP 1/3 arcsec, data.gov.uk in United Kingdom, INEGI in Mexico, ArcticDEM in latitudes above 60°, LINZ in New Zealand, Kartverket in Norway, as described here.
\newline Split data into tiles of 1 x 1 degrees, and create sp::SpatialPointsDataFrame files. Only for plots having a location uncertainty < 50 km, which corresponds to r header.shp@data %>% mutate(lc_ncrt=abs(loc.uncert)) %>% filter(lc_ncrt <= 50000) %>% nrow() plots.

# Plots with an (absolute) location uncertainty of at most 50 km, each labelled
# with the tile it falls in. Rows with missing uncertainty are dropped by filter().
header.tiles <- header %>%
  dplyr::select(PlotObservationID, Dataset, Longitude, Latitude, `Location uncertainty (m)`) %>%
  # negative values mark estimated uncertainties; only the magnitude matters here
  mutate(`Location uncertainty (m)`=abs(`Location uncertainty (m)`)) %>% 
  filter(`Location uncertainty (m)`<= 50000) %>%
  # bin both coordinates into intervals; creates Longitude_tile and Latitude_tile
  # NOTE(review): the breaks are 0.2 degrees wide, while the text mentions 1 x 1 degree tiles - confirm
  mutate_at(.vars=vars(Longitude, Latitude), 
            .funs=list(tile=~cut(., breaks = seq(-180,180, by=.2)))) %>%
  filter(!is.na(Longitude_tile) & !is.na(Latitude_tile) ) %>%
  # one factor level per combination of longitude and latitude bin
  mutate(tilenam=factor(paste(Longitude_tile, Latitude_tile)))
  

There are r nrow(header.tiles) plots out of r nrow(header) plots with Location uncertainty <= 50km (or absent). The total number of tiles is r nlevels(header.tiles$tilenam).
Performed in EVE HPC cluster using function A97_ElevationExtract.R. Divided in 99 chunks.

# 14 forked workers, with the packages needed for the extraction loaded on each
cl <- makeForkCluster(14, outfile="")
registerDoParallel(cl)

clusterEvalQ(cl, {
  library(rgdal)
  library(raster)
  library(sp)
  library(elevatr)
  library(dplyr)})

# Divided in 99 chunks
# ElevationExtract is defined in A97_ElevationExtract.R, which is not shown here.
# NOTE(review): 'output' is not defined in the code visible here - confirm where it comes from
elevation.out <- foreach(i=1:99, .combine=rbind) %dopar% {
  source("A97_ElevationExtract.R")
  ElevationExtract(header.shp, output, ncores=1, chunk.i=i)}
stopCluster(cl)

For those tiles that failed, extract elevation of remaining plots one by one

#create list of tiles for which dem could not be extracted
# (myfiles is not referenced again in this chunk)
myfiles <- list.files("../_derived/elevatr/")
failed <- list.files("../_derived/elevatr/", pattern = "[A-Za-z]*_[0-9]+failed\\.RData$")
# keep only the numbers contained in the file names
failed <- as.numeric(unlist(regmatches(failed, gregexpr("[[:digit:]]+", failed))))

#create SpatialPointsDataFrame
# with the plots of the failed tiles
# NOTE(review): the numbers are used as positions in levels(header.tiles$tilenam) -
# confirm the files were numbered by tile level index
sp.tile0 <- SpatialPointsDataFrame(coords=header.tiles %>% 
                                    filter(tilenam %in% levels(header.tiles$tilenam)[failed]) %>%
                                    dplyr::select(Longitude, Latitude),
                                  data=header.tiles %>% 
                                    filter(tilenam %in% levels(header.tiles$tilenam)[failed]) %>%
                                    dplyr::select(-Longitude, -Latitude),
                                  proj4string = CRS("+init=epsg:4326"))
sp.tile0 <- spTransform(sp.tile0, CRSobj = CRS("+init=epsg:3857 +proj=merc +a=6378137 +b=6378137 +lat_ts=0.0
                                               +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null
                                               +no_defs ")) #project to mercator
# empty data.frame as placeholder for the per-plot output
output.tile <- data.frame(NULL)

# 5 forked workers, with the packages needed for the extraction loaded on each
cl <- makeForkCluster(5, outfile="")
registerDoParallel(cl)

clusterEvalQ(cl, {
  library(rgdal)
  library(raster)
  library(sp)
  library(elevatr)
  library(dplyr)})

#Loop over all plots of the failed tiles, one plot at a time.
# Each iteration returns a one-row data.frame with the same six columns, so
# the results can be stacked with rbind.
elevation.failed <- foreach(i=seq_len(nrow(sp.tile0)), .packages=c('raster'), .combine=rbind) %dopar% { 
  sp.tile <- sp.tile0[i,]
  # The result of tryCatch() is assigned (rather than assigning inside it and
  # testing with exists()), so an object named 'raster.tile' left over from a
  # previous iteration or found in an enclosing environment cannot be mistaken
  # for a successful download.
  raster.tile <- tryCatch(
    get_elev_raster(sp.tile, z=10, 
                    expand=max(sp.tile$`Location uncertainty (m)`)),
    error = function(e){
      print(paste("could not retrieve DEM for", sp.tile$PlotObservationID))
      NULL
    }
  )
  if(is.null(raster.tile)) {
    # no DEM available: return a row of NAs for this plot
    # (the column used to be written with '==', which made this branch fail)
    data.frame(PlotObservationID=sp.tile$PlotObservationID, 
               elevation=NA, 
               Elevation_q2.5=NA, 
               Elevation_median=NA,
               Elevation_q97.5=NA,
               DEM.res=NA)
  } else {
    # clip dem tile with continent shape
    # NOTE(review): continent.high.merc is not defined in the code visible here - confirm
    raster.tile <- mask(raster.tile, continent.high.merc)
    
    #extract and summarize elevation data: value at the plot, and quantiles
    # within a buffer equal to the location uncertainty
    elev.tile <- raster::extract(raster.tile, sp.tile, small=T)
    elev.tile.buffer <- raster::extract(raster.tile, sp.tile, 
                                        buffer=sp.tile$`Location uncertainty (m)`, small=T)
    elev.q95 <- t(round(mapply( quantile, 
                              x=elev.tile.buffer,
                              probs=rep(c(0.025, 0.5, 0.975), each=length(elev.tile)), na.rm=T)))
    data.frame(PlotObservationID=sp.tile$PlotObservationID, 
               elevation=round(elev.tile), 
               elev.q95, 
               DEM.res=res(raster.tile)[1]) %>%
      rename(Elevation_q2.5=X2.5., Elevation_median=X50., Elevation_q97.5=X97.5.)
  }
}
stopCluster(cl)
save(elevation.failed, file = "../_derived/elevatr/elevation_missing.RData")

Compose tiles into a single output, and export

# Per-tile outputs; each file is expected to hold a data.frame 'output.tile'
myfiles <- list.files("../_derived/elevatr/", pattern = "elevation_tile_[0-9]+\\.RData$", full.names = TRUE)

#create empty data.frame with one row per plot in header.tiles
elevation.out <- matrix(NA, nrow=nrow(header.tiles), ncol=6)
elevation.out <- as.data.frame(elevation.out)
colnames(elevation.out) <- c("PlotObservationID", "elevation", "Elevation_q2.5", "Elevation_median", "Elevation_q97.5","DEM.res")
elevation.out$PlotObservationID <- header.tiles$PlotObservationID

# Tiles are accumulated in 'tmp' and written into elevation.out in batches of
# 5000 files. The last (possibly incomplete) batch is flushed as well, which
# also covers the case of fewer than 5000 files.
tmp <- NULL
for(i in seq_along(myfiles)){
  load(myfiles[i])
  #attach results to empty data.frame
  tmp <- bind_rows(tmp, output.tile)
  if(i %% 5000 == 0 || i == length(myfiles)){
    mymatch <- base::match(x=tmp$PlotObservationID, table=elevation.out$PlotObservationID)
    # drop unmatched plots on BOTH sides, so that rows stay aligned
    matched <- !is.na(mymatch)
    elevation.out[mymatch[matched],] <- tmp[matched, , drop=FALSE]
    tmp <- NULL
    print(paste("Attached first", i, "files"))
  }
  # progress: print the counter at files 1, 251, 501, ...
  if(i %% 250 == 1){print(i)}
}

# Add the plots that were extracted one by one after their tile had failed
load(file = "../_derived/elevatr/elevation_missing.RData")

if(!is.null(elevation.failed) && nrow(elevation.failed) > 0){
  mymatch <- base::match(x=elevation.failed$PlotObservationID, table=elevation.out$PlotObservationID)
  matched <- !is.na(mymatch)
  elevation.out[mymatch[matched],] <- elevation.failed[matched, , drop=FALSE]
}

write_csv(elevation.out, path ="../_derived/elevatr/elevation.out.csv")

Reimport output, attach to header and check

# Re-read the exported elevation table
elevation.out <- read_csv("../_derived/elevatr/elevation.out.csv")

# Show ten randomly drawn rows as a formatted table
elevation.out %>%
  sample_n(10) %>%
  knitr::kable(caption = "Example of elevation output (10 randomly selected plots shown)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")

# Summary statistics of all columns except the plot ID and the point elevation
elevation.out %>%
  dplyr::select(-PlotObservationID, -elevation) %>%
  summary()

There are r sum(is.na(elevation.out$Elevation_median)) plots without elevation info, corresponding to r round(sum(is.na(elevation.out$Elevation_median))/nrow(header)*100,1)% of the number of matched plots. Please note that elevation was extracted only for plots with location uncertainty <50 km, i.e., r header.shp@data %>% mutate(lc_ncrt=abs(loc.uncert)) %>% filter(lc_ncrt <= 50000) %>% nrow() plots.
There are r sum(elevation.out$Elevation_median < -1, na.rm=T) plots with elevation below sea level.
\newline Join elevation data (only median)

# Attach the median DEM elevation to the header as `elevation_dem`.
# Only the first row per PlotObservationID is kept, so the join cannot
# duplicate header rows.
header <- left_join(
  header,
  elevation.out %>%
    distinct(PlotObservationID, .keep_all = TRUE) %>%
    dplyr::select(PlotObservationID, elevation_dem = Elevation_median),
  by = "PlotObservationID")

Summary and check

# Number of plots with DEM elevation below -1 m, per dataset, most affected
# datasets first
header %>%
  filter(elevation_dem < -1) %>%
  dplyr::select(PlotObservationID, elevation_dem, Dataset, `GIVD ID`) %>%
  group_by(`GIVD ID`, Dataset) %>%
  summarize(num.plot = n()) %>%
  ungroup() %>%
  arrange(desc(num.plot)) %>%
  knitr::kable(caption = "Dataset with highest number of plots below sea level") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")

Create a scatterplot of the elevation measured in the field against the elevation derived from the DEM

# join measured and derived elevation; keep only plots having both values
mydata <- header %>%
  dplyr::select(PlotObservationID, `Altitude (m)`, elevation_dem) %>%
  filter(!is.na(`Altitude (m)`) & !is.na(elevation_dem)) %>%
  rename(elevation_measured = `Altitude (m)`)

# Field-measured (x) vs. DEM-derived (y) elevation
ggplot(data = mydata) +
  geom_point(aes(x = elevation_measured, y = elevation_dem), alpha = 1/10, cex = 0.8) +
  theme_bw() +
  # dashed horizontal reference line at a DEM elevation of 0
  geom_abline(slope = 0, intercept = 0, col = 2, lty = 2) +
  # 1:1 line (perfect agreement): the intercept must be 0, not 1
  geom_abline(slope = 1, intercept = 0, col = "Dark green")

4.5 Assign to countries

A small number of plots (r header %>% filter(is.na(Country)) %>% nrow()) are not assigned to any country. Fix that.

# Country polygons; their CRS is overwritten with the one of header.shp
# (no reprojection is performed)
countries <- readOGR("../../Ancillary_Data/naturalearth/ne_110m_admin_0_countries.shp")
crs(countries) <- crs(header.shp)

# IDs of the plots whose country is missing or recorded as "Former USSR"
tmp.sel <- header %>%
  filter(is.na(Country) | Country == "Former USSR") %>%
  pull(PlotObservationID)

# Spatial overlay of those plots with the country polygons
header.shp.nocountry <- header.shp[header.shp$PlotObservationID %in% tmp.sel, ]
countries.out <- over(header.shp.nocountry, countries)
countries.out$PlotObservationID <- header.shp.nocountry@data$PlotObservationID

# Fill the gaps with the overlaid country name (NAME), then harmonize two
# country labels
country.lookup <- countries.out %>%
  dplyr::select(PlotObservationID, Country2 = NAME)
header <- header %>%
  mutate(Country = ifelse(Country == "Former USSR", NA, Country)) %>%
  left_join(country.lookup, by = "PlotObservationID") %>%
  mutate(Country = coalesce(Country, Country2)) %>%
  dplyr::select(-Country2) %>%
  mutate(Country = ifelse(Country == "Great Britain", "United Kingdom", Country),
         Country = ifelse(Country == "Russia", "Russian Federation", Country))

Plots without country info are now only r header %>% filter(is.na(Country)) %>% nrow().

5 Map of plots

Update header.shp

# Add the biome, continent, country, ecoregion and dataset ID columns of the
# header to the attribute table of the spatial object
header.shp@data <- left_join(
  header.shp@data,
  header %>%
    dplyr::select(PlotObservationID, sBiome, CONTINENT, Country,
                  Ecoregion, GIVD.ID = `GIVD ID`),
  by = "PlotObservationID")

# sf version of the plots, in Eckert IV projection
header.sf <- st_transform(st_as_sf(header.shp), crs = "+proj=eck4")

Basic Map of the world in Eckert projection

# Background layers from Natural Earth (countries, the `graticules_15` layer
# and the WGS84 bounding box), each reprojected to Eckert IV and reduced to
# its geometry
countries <- st_geometry(
  st_transform(ne_countries(returnclass = "sf"),
               crs = "+proj=eck4"))
graticules <- st_geometry(
  st_transform(ne_download(type = "graticules_15", category = "physical",
                           returnclass = "sf"),
               crs = "+proj=eck4"))
bb <- st_geometry(
  st_transform(ne_download(type = "wgs84_bounding_box", category = "physical",
                           returnclass = "sf"),
               crs = "+proj=eck4"))


# Empty world map that the following maps add their layers to
w3a <- ggplot() +
  geom_sf(data = bb, col = "grey20", fill = "white") +
  geom_sf(data = graticules, col = "grey20", lwd = 0.1) +
  geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) +
  coord_sf(crs = "+proj=eck4") +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 12),
    legend.background = element_rect(size = 0.1, linetype = "solid", colour = 1),
    legend.key.height = unit(1.1, "cm"),
    legend.key.width = unit(1.1, "cm")
  ) +
  scale_fill_viridis()

Graph of plot density (hexagons)

# Plots with complete coordinates. Both Longitude AND Latitude must be
# available; plots beyond 171 degrees of longitude and 70 degrees of latitude
# (in absolute values) are left out.
header2 <- header %>%
  filter(!is.na(Longitude) & !is.na(Latitude)) %>%
  dplyr::select(PlotObservationID, Latitude, Longitude, `GIVD ID`) %>%
  filter(!(abs(Longitude) > 171 & abs(Latitude) > 70))
dggs <- dgconstruct(spacing = 300, metric = TRUE, resround = 'down')
# Get the corresponding grid cell for each plot (lat-long pair)
header2$cell <- dgGEO_to_SEQNUM(dggs, header2$Longitude, header2$Latitude)$seqnum

# Calculate number of plots for each cell (log10 scale)
header.out   <- header2 %>%
  group_by(cell) %>%
  summarise(value.out = log(n(), 10))

# Get the grid cell boundaries for cells
# NOTE(review): assumes dgcellstogrid() returns the cells in the same order as
# header.out$cell - confirm
grid   <- dgcellstogrid(dggs, header.out$cell, frame = FALSE) %>%
  st_as_sf() %>%
  mutate(cell = header.out$cell) %>%
  mutate(value.out = header.out$value.out) %>%
  st_transform("+proj=eck4") %>%
  st_wrap_dateline(options = c("WRAPDATELINE=YES"))

## plotting
legpos <- c(0.160, .24)
(w3 <- w3a +
       geom_sf(data = grid, aes(fill = value.out), lwd = 0, alpha = 0.9)    +
       geom_sf(data = countries, col = "grey10", fill = NA, lwd = 0.3) +
       # legend breaks are on the log10 scale, labels show the plot counts
       scale_fill_viridis(
         name = "# plots", breaks = 0:5, labels = c("1", "10", "100",
                                                    "1,000", "10,000", "100,000"), option = "viridis") +
    #labs(fill="# plots") +
    theme(legend.position = legpos + c(-0.06, 0.25))
)
ggsave(filename = "../_pics/PlotDensityLog10_vir.png", width = 15, height = 7, units = "in", dpi = 300, plot = w3)

Graph of plot location by Dataset

# All plots on the base map, coloured by dataset (GIVD ID); the factor levels
# are shuffled at random before plotting, and no legend is drawn
(w4 <- w3a +
    geom_sf(data = mutate(header.sf, GIVD.ID = fct_shuffle(GIVD.ID)),
            aes(colour = factor(GIVD.ID)),
            shape = 16, size = 0.8, alpha = 0.6) +
    geom_sf(data = countries, colour = "grey10", fill = NA, lwd = 0.3) +
    theme(legend.position = "none"))

## takes ~40' to render
ggsave(filename = "../_pics/PlotDistrib_Dark2_shuffle1984.png",
       width = 15, height = 7, units = "in", dpi = 300, plot = w4)

Double check attribution to continents, Biomes and Ecoregions. Do it only on a subset of plots

# IDs of 1000 randomly drawn plots for each sBiome
tmp.sel <- pull(
  sample_n(group_by(header, sBiome), 1000),
  PlotObservationID)

# sBiomes: map the sampled plots, coloured by biome
(w5 <- w3a +
    geom_sf(data = filter(header.sf, PlotObservationID %in% tmp.sel),
            aes(colour = factor(sBiome)),
            shape = 16, size = 0.8, alpha = 0.6) +
    geom_sf(data = countries, colour = "grey10", fill = NA, lwd = 0.3) +
    scale_color_brewer(name = "Biome", palette = "Paired") +
    guides(color = guide_legend(override.aes = list(size = 5))))

# continent: IDs of 1000 randomly drawn plots for each continent,
# leaving out continent code "AN"
tmp.sel <- pull(
  sample_n(group_by(filter(header, CONTINENT != "AN"), CONTINENT), 1000),
  PlotObservationID)
# map the sampled plots, coloured by continent
(w6 <- w3a +
    geom_sf(data = filter(header.sf, PlotObservationID %in% tmp.sel),
            aes(colour = factor(CONTINENT)),
            shape = 16, size = 0.8, alpha = 0.6) +
    geom_sf(data = countries, colour = "grey10", fill = NA, lwd = 0.3) +
    scale_color_brewer(name = "Continent", palette = "Paired") +
    guides(color = guide_legend(override.aes = list(size = 5))))

#Country - sample up to 50 plots per country, then keep the plots of 10
# randomly chosen countries
tmp.sel <- header %>%
  nest(-Country) %>% 
  # per-country sample size: all plots if fewer than 50, otherwise 50
  left_join(header %>% 
              count(Country) %>% 
              mutate(samplesize=ifelse(n<50, n,50)), 
            by="Country") %>% 
  mutate(Sample = purrr::map2(data, samplesize, sample_n)) %>% 
  unnest(Sample) %>% 
  filter(Country %in% (header %>% 
                         distinct(Country) %>% 
                         sample_n(10) %>% 
                         pull(Country))) %>%   
  pull(PlotObservationID)
# map the sampled plots, coloured by country
(w7 <- w3a + 
         geom_sf(data=header.sf %>% 
                   filter(PlotObservationID %in% tmp.sel), 
                 aes(col=factor(Country)), pch=16, size=0.8, alpha=0.6) +
         geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3)+ 
         # the legend shows countries, so it is titled "Country" (was "Biome")
         scale_color_brewer(name="Country", palette="Paired") + 
         guides(color  = guide_legend(override.aes = list(size = 5))))

#Ecoregion - Only 10 random ecoregions tested
tmp.sel <- header %>%
  filter(Ecoregion %in% sample(unique(header$Ecoregion), 10)) %>% 
  pull(PlotObservationID)
# Stored as w8 so that the country map held in w7 is not overwritten
(w8 <- w3a + 
         geom_sf(data=header.sf %>% 
                   filter(PlotObservationID %in% tmp.sel) %>% 
                   mutate(Ecoregion=factor(Ecoregion)), 
                 aes(col=factor(Ecoregion)), pch=16, size=0.8, alpha=0.6) +
         geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 
    # legend below the map, arranged in three rows
    theme(legend.position="bottom") + 
    guides(color  = guide_legend(override.aes = list(size = 5), nrow = 3)))

6 Fix output and export

# check: prints whether header still has as many rows as header0
nrow(header) == nrow(header0)

# Keep and order the columns of the final header
header <- dplyr::select(
  header,
  # Metadata
  PlotObservationID, Dataset, "GIVD ID", "TV2 relevé number", "ORIG_NUM", "GUID",
  # Geographic info
  Longitude:"Location uncertainty (m)", Country, CONTINENT,
  sBiome, sBiomeID, Ecoregion, EcoregionID, Locality,
  # Methodological info
  "Relevé area (m²)", "Cover abundance scale", "Date of recording", "Plants recorded",
  "Herbs identified (y/n)", "Mosses identified (y/n)", "Lichens identified (y/n)",
  # Topographical
  elevation_dem, "Altitude (m)", "Aspect (°)", "Slope (°)",
  # Vegetation type
  Forest:Naturalness, ESY,
  # Vegetation structure
  "Cover total (%)":"Maximum height cryptogams (mm)"
)
save(header, file = "../_output/header_sPlot3.0.RData")

# Show 20 randomly drawn rows of the exported header
header %>%
  sample_n(20) %>%
  knitr::kable(caption = "Example of header (20 random rows shown)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center")

Supplementary Material

ANNEX 1 - Ancillary function - PredExtr

ANNEX 2 - Ancillary function - ElevationExtract

ANNEX 3 - SessionInfo()

# Print the R version, platform and attached packages used for this report
sessionInfo()