diff --git a/code/04_buildHeader.Rmd b/code/04_buildHeader.Rmd new file mode 100644 index 0000000000000000000000000000000000000000..1ea7be7e503851346cd0b990af012e60ac4fd714 --- /dev/null +++ b/code/04_buildHeader.Rmd @@ -0,0 +1,252 @@ +--- +title: "sPlot3.0 - Build Header" +author: "Francesco Maria Sabatini" +date: "2/4/2020" +output: html_document +--- + + + +<center> +  +</center> + + + + + +**Timestamp:** `r date()` +**Drafted:** Francesco Maria Sabatini +**Revised:** +**version:** + +This report documents the construction of the header file for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens. + + +```{r results="hide", message=F, warning=F} +knitr::opts_chunk$set(echo = TRUE) +library(tidyverse) +library(readr) + +## Spatial packages +library(sp) +library(rgdal) +library(rgeos) +library(raster) +library(rworldmap) + +``` + +```{bash, eval=F} +# escape all double quotation marks. Run in Linux terminal +sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv + +#more general alternative in case some " are already escaped +##first removing \s before all "s, and then adding \ before all ": +#sed 's/\([^\\]\)"/\1\\\"/g; s/"/\\"/g' + + +``` + + +Import header data +```{r import} +header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv", locale = locale(encoding = 'UTF-8'), + delim="\t", col_types=cols( + PlotObservationID = col_double(), + PlotID = col_double(), + `TV2 relevé number` = col_double(), + Country = col_factor(), + `Cover abundance scale` = col_factor(), + `Date of recording` = col_date(format="%d-%m-%Y"), + `Relevé area (m²)` = col_double(), + `Altitude (m)` = col_double(), + `Aspect (°)` = col_double(), + `Slope (°)` = col_double(), + `Cover total (%)` = col_double(), + `Cover tree layer (%)` = col_double(), + `Cover shrub layer (%)` = col_double(), + `Cover herb layer (%)` = col_double(), + `Cover moss layer (%)` = col_double(), + `Cover lichen layer (%)` = col_double(), + `Cover algae layer (%)` = col_double(), + `Cover litter layer (%)` = col_double(), + `Cover open water (%)` = col_double(), + `Cover bare rock (%)` = col_double(), + `Height (highest) trees (m)` = col_double(), + `Height lowest trees (m)` = col_double(), + `Height (highest) shrubs (m)` = col_double(), + `Height lowest shrubs (m)` = col_double(), + `Aver. height (high) herbs (cm)` = col_double(), + `Aver. height lowest herbs (cm)` = col_double(), + `Maximum height herbs (cm)` = col_double(), + `Maximum height cryptogams (mm)` = col_double(), + `Mosses identified (y/n)` = col_factor(), + `Lichens identified (y/n)` = col_factor(), + COMMUNITY = col_character(), + SUBSTRATE = col_character(), + Locality = col_character(), + ORIG_NUM = col_double(), + ALLIAN_REV = col_character(), + REV_AUTHOR = col_character(), + Forest = col_logical(), + Grassland = col_logical(), + Wetland = col_logical(), + `Sparse vegetation` = col_logical(), + Shrubland = col_logical(), + `Plants recorded` = col_factor(), + `Herbs identified (y/n)` = col_factor(), + Naturalness = col_factor(), + EUNIS = col_factor(), + Longitude = col_double(), + Latitude = col_double(), + `Location uncertainty (m)` = col_double(), + Dataset = col_factor(), + GUID = col_character() +)) %>% + rename(Sparse.vegetation=`Sparse vegetation`, + ESY=EUNIS) %>% + dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE) #too sparse information to be useful +``` + +The following column names occurred in the header of sPlot v2.1 +1. Syntaxon +2. Cover cryptogams (%) +3. Cover bare soil (%) +4. is.forest +5. is.non.forest +6. EVA +7. Biome +8. BiomeID +9. CONTINENT +10. POINT_X +11. POINT_Y + +Columns #1, #2, #3, #10, #11 will be dropped. The others will be derived below. + + +## 1 Solve spatial problems +There are 2020 plots in the Nile dataset without spatial coordinates. Assign manually with wide (90km) location uncertainty. +```{r} +header <- header0 %>% + mutate(Latitude=replace(Latitude, + list=(is.na(Latitude) & Dataset=="Egypt Nile delta"), + values=30.917351)) %>% + mutate(Longitude=replace(Longitude, + list=(is.na(Longitude) & Dataset=="Egypt Nile delta"), + values=31.138534)) %>% + mutate(`Location uncertainty (m)`=replace(`Location uncertainty (m)`, + list=(is.na(`Location uncertainty (m)`) & Dataset=="Egypt Nile delta"), + values=-90000)) +``` +There are two plots in the `Romanina Grassland Databse', whose lat\long are inverted. Correct. +```{r} +new.coords <- header %>% + filter(Dataset=="Romania Grassland Database" & Longitude>40) %>% + dplyr::select(PlotObservationID, Latitude, Longitude) %>% + rename(Longitude=Latitude, Latitude=Longitude) + +header <- header %>% + ##only works because the two plots have identical coords + mutate(Longitude=replace(Longitude, + list=PlotObservationID %in% (new.coords %>% + pull(PlotObservationID)), + values=new.coords %>% pull(Longitude)))%>% + mutate(Latitude=replace(Latitude, + list=PlotObservationID %in% (new.coords %>% + pull(PlotObservationID)), + values=new.coords %>% pull(Latitude))) +rm(new.coords) +``` + +There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value, to indicate this is an estimation, rather than a measure. +```{r} +header <- header %>% + left_join(header %>% + group_by(Dataset) %>% + summarize(loc.uncer.median=median(`Location uncertainty (m)`, na.rm=T)), + by="Dataset") %>% + mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)`), + -abs(loc.uncer.median), + `Location uncertainty (m)`)) +``` +There are still `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))` plots with no estimation of location uncertainty. + + + +## 2 Assign to Continents +Download and manipulate map of continents +```{r} +sPDF <- rworldmap::getMap(resolution="coarse") +continent <- sPDF[,"continent"] +crs(continent) <- CRS("+init=epsg:4326") +#continent@data[243,"continent"] <- "South America" ## Manually correct missing data +# create clipped version of continent to avoid going beyond 180 lON +coords <- data.frame(x=c(-180,180,180,-180), + y=c(-90,-90,90,90)) +bboxc = Polygon(coords) +bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent)) +continent_clipped <- gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... workaround + +## same but high resolution (works better for plots along coastlines) +sPDF <- rworldmap::getMap(resolution="high") +continent.high <- sPDF[,"continent"] +crs(continent.high) <- CRS("+init=epsg:4326") +continent.high@data$continent <- fct_recode(continent.high@data$continent, "South America"="South America and the Caribbean") +``` +Create spatial point dataframe for sPlot data to intersect with spatial layers +```{r} +header.shp <- header %>% + filter(!is.na(Longitude) | !is.na(Latitude)) +header.shp <- SpatialPointsDataFrame(coords= header.shp %>% + dplyr::select(Longitude, Latitude), + proj4string = CRS("+init=epsg:4326"), + data=data.frame(PlotObservationID= header.shp$PlotObservationID, + loc.uncert=header.shp$`Location uncertainty (m)`)) +``` + +Assign plots to continent +```{r, eval=T} +continent.out <- sp::over(x=header.shp, y=continent) +#overlay unassigned points to the high resolution layer of continent +toassign <- header.shp[which(is.na(continent.out$continent)),] #154782 remain to assign +crs(toassign) <- crs(continent) +continent.out2 <- sp::over(x=toassign, y=continent.high) +#merge first and second overlay +continent.out$continent[is.na(continent.out$continent)] <- continent.out2$continent + +#correct unassigned points to closest continent +toassign <- header.shp[which(is.na(continent.out$continent)),] #47610 remain to assign +crs(toassign) <- crs(continent) + + +#go parallel +library(parallel) +library(doParallel) +cl <- makeCluster(6) +registerDoParallel(cl) +nearestContinent <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { + nearestContinent.tmp <- geosphere::dist2Line(toassign[i,], continent_clipped) +} +continent.out$continent[which(is.na(continent.out$continent))] <- as.character(continent[-137,]@data[nearestContinent[,"ID"],]) +save(continent.out, file = "../_derived/continent.out") +``` +Reload and manipulate continent. Correct header.sRoot +```{r} +load("_derived/continent.out") +header <- header %>% + left_join(header %>% + filter(!is.na(Longitude) | !is.na(Latitude)) %>% + dplyr::select(PlotObservationID) %>% + bind_cols(continent.out), + by="PlotObservationID") + mutate(CONTINENT=factor(continent, + levels=c("Africa", "Antarctica", "Australia", "Eurasia", "North America", "South America"), + labels=c("AF", "AN", "AU", "EU", "NA", "SA"))) %>% + dplyr::select(-continent) +``` + + + + +