Skip to content
Snippets Groups Projects
Commit 420d97c8 authored by Francesco Sabatini's avatar Francesco Sabatini
Browse files

Additional data cleaning, GIVD codes, standardized formations

parent a0d70d77
No related branches found
No related tags found
No related merge requests found
......@@ -27,6 +27,7 @@ This report documents the construction of the header file for sPlot 3.0. It is b
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(readr)
library(xlsx)
## Spatial packages
library(sp)
......@@ -109,9 +110,9 @@ header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv",
dplyr::select(-COMMUNITY, -ALLIAN_REV, -REV_AUTHOR, -SUBSTRATE) #too sparse information to be useful
```
The following column names occurred in the header of sPlot v2.1
The following column names occurred in the header of sPlot v2.1 and are currently missing from the header of v3.0
1. Syntaxon
2. Cover cryptogams (%)
2. Cover cryptogams (%)
3. Cover bare soil (%)
4. is.forest
5. is.non.forest
......@@ -121,7 +122,7 @@ The following column names occurred in the header of sPlot v2.1
9. CONTINENT
10. POINT_X
11. POINT_Y
~~
Columns #1, #2, #3, #10, #11 will be dropped. The others will be derived below.
......@@ -139,27 +140,14 @@ header <- header0 %>%
list=(is.na(`Location uncertainty (m)`) & Dataset=="Egypt Nile delta"),
values=-90000))
```
There are two plots in the `Romanina Grassland Databse', whose lat\long are inverted. Correct.
There are two plots in the `Romanina Grassland Databse` and ~4442 plots in the `Japan` database, whose lat\\long are inverted. Correct.
```{r}
new.coords <- header %>%
filter(Dataset=="Romania Grassland Database" & Longitude>40) %>%
dplyr::select(PlotObservationID, Latitude, Longitude) %>%
rename(Longitude=Latitude, Latitude=Longitude)
header <- header %>%
##only works because the two plots have identical coords
mutate(Longitude=replace(Longitude,
list=PlotObservationID %in% (new.coords %>%
pull(PlotObservationID)),
values=new.coords %>% pull(Longitude)))%>%
mutate(Latitude=replace(Latitude,
list=PlotObservationID %in% (new.coords %>%
pull(PlotObservationID)),
values=new.coords %>% pull(Latitude)))
rm(new.coords)
toswap <- c(which(header$Dataset=="Japan" & header$Latitude>90),
which(header$Dataset=="Romania Grassland Database" & header$Longitude>40))
header[toswap, c("Latitude", "Longitude")] <- header[toswap, c("Longitude", "Latitude")]
```
There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value, to indicate this is an estimation, rather than a measure.
There are `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))`. As a first approximation, we assign the median of the respective dataset, as a negative value to indicate this is an estimation, rather than a measure.
```{r}
header <- header %>%
left_join(header %>%
......@@ -168,19 +156,125 @@ header <- header %>%
by="Dataset") %>%
mutate(`Location uncertainty (m)`=ifelse( is.na(`Location uncertainty (m)`),
-abs(loc.uncer.median),
`Location uncertainty (m)`))
`Location uncertainty (m)`)) %>%
dplyr::select(-loc.uncer.median)
```
There are still `r nrow(header %>% filter(is.na(`Location uncertainty (m)`)))` plots with no estimation of location uncertainty.
## 2 Formations
Fill out the columns `Forest:Sparse.vegetation` with 0\\NAs, where necessary. Create columns `is.forest` and `is.non.forest` using script developed for sPlot 2.1
~~
I am not assigning plots to Faber-Langedon formation at this stage, as this is only possible for European plots having an ESY classification.
```{r}
eunis.key <- read.xlsx("../_input/EUNIS_WFT.xlsx", sheetIndex = "Sheet1", endRow = 246) %>%
dplyr::select(EUNIS_code, NATURALNESS:SPARSE_VEG) %>%
mutate(EUNIS_code=as.character(EUNIS_code)) %>%
rename(ESY=EUNIS_code,
Naturalness=NATURALNESS,
Forest=FOREST,
Shrubland=SCRUBLAND,
Grassland=GRASSLAND,
Wetland=WETLAND,
Sparse.vegetation=SPARSE_VEG)#,
header.backup <- header
#test <- header.backup %>% slice(100000:110000) %>% dplyr::select(PlotObservationID, Dataset, ESY, Forest:Shrubland)
header <- header %>% # header.backup %>%
mutate(ESY=as.character(ESY)) %>%
#mutate(ESY=ifelse(ESY=="?", NA, ESY)) %>%
# Systematically assign some databases to forest
mutate(Forest=ifelse(Dataset %in%
c("Turkey Oak_Forest Database", "Turkey Forest Database", "Chile_forest", "Ethiopia"),
T, Forest)) %>%
#fill up with F those rows where at least one column on formation is assigned
rowwise() %>%
mutate(Any=any(Forest, Shrubland, Grassland, Wetland, Sparse.vegetation)) %>%
mutate(Forest=ifelse( (is.na(Forest) & Any), F, Forest)) %>%
mutate(Shrubland=ifelse( (is.na(Shrubland) & Any), F, Shrubland)) %>%
mutate(Grassland=ifelse( (is.na(Grassland) & Any), F, Grassland)) %>%
mutate(Wetland=ifelse( (is.na(Wetland) & Any), F, Wetland)) %>%
mutate(Sparse.vegetation=ifelse( (is.na(Sparse.vegetation) & Any), F, Sparse.vegetation)) %>%
ungroup() %>%
dplyr::select(-Any) %>%
mutate_at(vars(Forest:Shrubland), .funs=list(~.*1)) %>%
mutate(Naturalness=as.numeric(as.character(Naturalness))) %>%
##join and coalesce with eunis.key
left_join(eunis.key, by = "ESY") %>%
mutate(
Forest = dplyr:::coalesce(Forest.x, Forest.y),
Shrubland = coalesce(Shrubland.x, Shrubland.y),
Grassland = coalesce(Grassland.x, Grassland.y),
Wetland = coalesce(Wetland.x, Wetland.y),
Sparse.vegetation = coalesce(Sparse.vegetation.x, Sparse.vegetation.y),
Naturalness = coalesce(Naturalness.x, Naturalness.y)
) %>%
dplyr::select(-ends_with(".x"), -ends_with(".y"))
```
## 3 Fix header and attach GIVD codes
Reduce number of factor levels for the column `Plants recorded`
```{r}
header <- header %>%
mutate(`Plants recorded`=fct_recode(`Plants recorded`,
"All vascular plants"="complete vegetation",
"All vascular plants"="Complete vegetation",
"All vascular plants"="all vascular plants",
"All vascular plants"="complete",
"All vascular plants"="Complete vegetation (including non-terricolous tax",
"All vascular plants"="Vascular plants",
"All woody plants"="Woody plants",
"All woody plants"="All woody species",
"Woody plants >= 10 cm dbh"= "trees>=10cm dbh",
"All trees & dominant understory"="All trees & dominant shrubs",
"Woody plants >= 1 cm dbh" = "Plants >= 1 cm dbh"
)) %>%
mutate(`Plants recorded`=factor(`Plants recorded`, exclude = "#N/A"))
```
## 2 Assign to Continents
```{r}
databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv")
## align labels to consortium
header <- header %>%
mutate(Dataset=fct_recode(Dataset,
"BIOTA_South_Africa" = "BIOTA_South_Africa_3",
"Kriti"="Cyprus_Bergmeier",
"European Boreal Forest Database"="European Boreal Forest Database 1",
"European Boreal Forest Database"="European Boreal Forest Database 2",
"European Coastal Vegetation Database"= "European Coastal Vegetation Database-A",
"Germany_vegetweb"="Germany_vegetweb2",
"Germany_vegetweb"="Germany_vegetweb3",
"Ladakh"="Ladakh_2",
"Netherlands"="Netherlands Military sites",
"NSW Australia" = "NSW Austalia",
)) %>%
left_join(databases %>%
dplyr::select(`GIVD ID`, label) %>%
rename(Dataset=label),
by="Dataset")
```
## 4 Assign to Continents
Download and manipulate map of continents
```{r}
sPDF <- rworldmap::getMap(resolution="coarse")
continent <- sPDF[,"continent"]
crs(continent) <- CRS("+init=epsg:4326")
#continent@data[243,"continent"] <- "South America" ## Manually correct missing data
continent@data[243,"continent"] <- "South America" ## Manually correct missing data
# create clipped version of continent to avoid going beyond 180 lON
coords <- data.frame(x=c(-180,180,180,-180),
y=c(-90,-90,90,90))
......@@ -188,7 +282,7 @@ bboxc = Polygon(coords)
bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent))
continent_clipped <- gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... workaround
## same but high resolution (works better for plots along coastlines)
## same but high resolution (slower, but works better for plots along coastlines)
sPDF <- rworldmap::getMap(resolution="high")
continent.high <- sPDF[,"continent"]
crs(continent.high) <- CRS("+init=epsg:4326")
......@@ -221,10 +315,12 @@ crs(toassign) <- crs(continent)
#go parallel
ncores=6
library(parallel)
library(doParallel)
cl <- makeCluster(6)
cl <- makeForkCluster(ncores, outfile="" )
registerDoParallel(cl)
nearestContinent <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% {
nearestContinent.tmp <- geosphere::dist2Line(toassign[i,], continent_clipped)
}
......@@ -233,7 +329,7 @@ save(continent.out, file = "../_derived/continent.out")
```
Reload and manipulate continent. Correct header.sRoot
```{r}
load("_derived/continent.out")
load("../_derived/continent.out")
header <- header %>%
left_join(header %>%
filter(!is.na(Longitude) | !is.na(Latitude)) %>%
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment