From 420d97c87dbaab9c31ab6b6299c0e942371a2dd0 Mon Sep 17 00:00:00 2001 From: Francesco Sabatini <francesco.sabatini@idiv.de> Date: Wed, 5 Feb 2020 18:00:46 +0100 Subject: [PATCH] Additional data cleaning, GIVD codes, standardized formations --- code/04_buildHeader.Rmd | 150 ++++++++++++++++++++++++++++++++-------- 1 file changed, 123 insertions(+), 27 deletions(-) diff --git a/code/04_buildHeader.Rmd b/code/04_buildHeader.Rmd index 1ea7be7..8fe8f86 100644 --- a/code/04_buildHeader.Rmd +++ b/code/04_buildHeader.Rmd @@ -27,6 +27,7 @@ This report documents the construction of the header file for sPlot 3.0. It is b knitr::opts_chunk$set(echo = TRUE) library(tidyverse) library(readr) +library(xlsx) ## Spatial packages library(sp) @@ -109,9 +110,9 @@ header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv", dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE) #too sparse information to be useful ``` -The following column names occurred in the header of sPlot v2.1 +The following column names occurred in the header of sPlot v2.1 and are currently missing from the header of v3.0 1. Syntaxon -2. Cover cryptogams (%) +2. Cover cryptogams (%) 3. Cover bare soil (%) 4. is.forest 5. is.non.forest @@ -121,7 +122,7 @@ The following column names occurred in the header of sPlot v2.1 9. CONTINENT 10. POINT_X 11. POINT_Y - +~~ Columns #1, #2, #3, #10, #11 will be dropped. The others will be derived below. @@ -139,27 +140,14 @@ header <- header0 %>% list=(is.na(`Location uncertainty (m)`) & Dataset=="Egypt Nile delta"), values=-90000)) ``` -There are two plots in the `Romanina Grassland Databse', whose lat\long are inverted. Correct. +There are two plots in the `Romanina Grassland Databse` and ~4442 plots in the `Japan` database, whose lat\\long are inverted. Correct. ```{r} -new.coords <- header %>% - filter(Dataset=="Romania Grassland Database" & Longitude>40) %>% - dplyr::select(PlotObservationID, Latitude, Longitude) %>% - rename(Longitude=Latitude, Latitude=Longitude) - -header <- header %>% - ##only works because the two plots have identical coords - mutate(Longitude=replace(Longitude, - list=PlotObservationID %in% (new.coords %>% - pull(PlotObservationID)), - values=new.coords %>% pull(Longitude)))%>% - mutate(Latitude=replace(Latitude, - list=PlotObservationID %in% (new.coords %>% - pull(PlotObservationID)), - values=new.coords %>% pull(Latitude))) -rm(new.coords) +toswap <- c(which(header$Dataset=="Japan" & header$Latitude>90), + which(header$Dataset=="Romania Grassland Database" & header$Longitude>40)) +header[toswap, c("Latitude", "Longitude")] <- header[toswap, c("Longitude", "Latitude")] ``` -There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value, to indicate this is an estimation, rather than a measure. +There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value to indicate this is an estimation, rather than a measure. ```{r} header <- header %>% left_join(header %>% @@ -168,19 +156,125 @@ header <- header %>% by="Dataset") %>% mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)`), -abs(loc.uncer.median), - `Location uncertainty (m)`)) + `Location uncertainty (m)`)) %>% + dplyr::select(-loc.uncer.median) ``` There are still `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))` plots with no estimation of location uncertainty. +## 2 Formations +Fill out the columns `Forest:Sparse.vegetation` with 0\\NAs, where necessary. Create columns `is.forest` and `is.non.forest` using script developed for sPlot 2.1 +~~ +I am not assigning plots to Faber-Langedon formation at this stage, as this is only possible for European plots having an ESY classification. + + + +```{r} +eunis.key <- read.xlsx("../_input/EUNIS_WFT.xlsx", sheetIndex = "Sheet1", endRow = 246) %>% + dplyr::select(EUNIS_code, NATURALNESS:SPARSE_VEG) %>% + mutate(EUNIS_code=as.character(EUNIS_code)) %>% + rename(ESY=EUNIS_code, + Naturalness=NATURALNESS, + Forest=FOREST, + Shrubland=SCRUBLAND, + Grassland=GRASSLAND, + Wetland=WETLAND, + Sparse.vegetation=SPARSE_VEG)#, + +header.backup <- header + +#test <- header.backup %>% slice(100000:110000) %>% dplyr::select(PlotObservationID, Dataset, ESY, Forest:Shrubland) + +header <- header %>% # header.backup %>% + mutate(ESY=as.character(ESY)) %>% + #mutate(ESY=ifelse(ESY=="?", NA, ESY)) %>% + # Systematically assign some databases to forest + mutate(Forest=ifelse(Dataset %in% + c("Turkey Oak_Forest Database", "Turkey Forest Database", "Chile_forest", "Ethiopia"), + T, Forest)) %>% + #fill up with F those rows where at least one column on formation is assigned + rowwise() %>% + mutate(Any=any(Forest, Shrubland, Grassland, Wetland, Sparse.vegetation)) %>% + mutate(Forest=ifelse( (is.na(Forest) & Any), F, Forest)) %>% + mutate(Shrubland=ifelse( (is.na(Shrubland) & Any), F, Shrubland)) %>% + mutate(Grassland=ifelse( (is.na(Grassland) & Any), F, Grassland)) %>% + mutate(Wetland=ifelse( (is.na(Wetland) & Any), F, Wetland)) %>% + mutate(Sparse.vegetation=ifelse( (is.na(Sparse.vegetation) & Any), F, Sparse.vegetation)) %>% + ungroup() %>% + dplyr::select(-Any) %>% + mutate_at(vars(Forest:Shrubland), .funs=list(~.*1)) %>% + mutate(Naturalness=as.numeric(as.character(Naturalness))) %>% + ##join and coalesce with eunis.key + left_join(eunis.key, by = "ESY") %>% + mutate( + Forest = dplyr:::coalesce(Forest.x, Forest.y), + Shrubland = coalesce(Shrubland.x, Shrubland.y), + Grassland = coalesce(Grassland.x, Grassland.y), + Wetland = coalesce(Wetland.x, Wetland.y), + Sparse.vegetation = coalesce(Sparse.vegetation.x, Sparse.vegetation.y), + Naturalness = coalesce(Naturalness.x, Naturalness.y) + ) %>% + dplyr::select(-ends_with(".x"), -ends_with(".y")) +``` + + +## 3 Fix header and attach GIVD codes +Reduce number of factor levels for the column `Plants recorded` +```{r} +header <- header %>% + mutate(`Plants recorded`=fct_recode(`Plants recorded`, + "All vascular plants"="complete vegetation", + "All vascular plants"="Complete vegetation", + "All vascular plants"="all vascular plants", + "All vascular plants"="complete", + "All vascular plants"="Complete vegetation (including non-terricolous tax", + "All vascular plants"="Vascular plants", + "All woody plants"="Woody plants", + "All woody plants"="All woody species", + "Woody plants >= 10 cm dbh"= "trees>=10cm dbh", + "All trees & dominant understory"="All trees & dominant shrubs", + "Woody plants >= 1 cm dbh" = "Plants >= 1 cm dbh" + )) %>% + mutate(`Plants recorded`=factor(`Plants recorded`, exclude = "#N/A")) +``` -## 2 Assign to Continents +```{r} +databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") +## align labels to consortium + +header <- header %>% + mutate(Dataset=fct_recode(Dataset, + "BIOTA_South_Africa" = "BIOTA_South_Africa_3", + "Kriti"="Cyprus_Bergmeier", + "European Boreal Forest Database"="European Boreal Forest Database 1", + "European Boreal Forest Database"="European Boreal Forest Database 2", + "European Coastal Vegetation Database"= "European Coastal Vegetation Database-A", + "Germany_vegetweb"="Germany_vegetweb2", + "Germany_vegetweb"="Germany_vegetweb3", + "Ladakh"="Ladakh_2", + "Netherlands"="Netherlands Military sites", + "NSW Australia" = "NSW Austalia", + )) %>% + left_join(databases %>% + dplyr::select(`GIVD ID`, label) %>% + rename(Dataset=label), + by="Dataset") + + + +``` + + + + + +## 4 Assign to Continents Download and manipulate map of continents ```{r} sPDF <- rworldmap::getMap(resolution="coarse") continent <- sPDF[,"continent"] crs(continent) <- CRS("+init=epsg:4326") -#continent@data[243,"continent"] <- "South America" ## Manually correct missing data +continent@data[243,"continent"] <- "South America" ## Manually correct missing data # create clipped version of continent to avoid going beyond 180 lON coords <- data.frame(x=c(-180,180,180,-180), y=c(-90,-90,90,90)) @@ -188,7 +282,7 @@ bboxc = Polygon(coords) bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent)) continent_clipped <- gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... workaround -## same but high resolution (works better for plots along coastlines) +## same but high resolution (slower, but works better for plots along coastlines) sPDF <- rworldmap::getMap(resolution="high") continent.high <- sPDF[,"continent"] crs(continent.high) <- CRS("+init=epsg:4326") @@ -221,10 +315,12 @@ crs(toassign) <- crs(continent) #go parallel +ncores=6 library(parallel) library(doParallel) -cl <- makeCluster(6) +cl <- makeForkCluster(ncores, outfile="" ) registerDoParallel(cl) + nearestContinent <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { nearestContinent.tmp <- geosphere::dist2Line(toassign[i,], continent_clipped) } @@ -233,7 +329,7 @@ save(continent.out, file = "../_derived/continent.out") ``` Reload and manipulate continent. Correct header.sRoot ```{r} -load("_derived/continent.out") +load("../_derived/continent.out") header <- header %>% left_join(header %>% filter(!is.na(Longitude) | !is.na(Latitude)) %>% -- GitLab