-
Francesco Sabatini authoredFrancesco Sabatini authored
title: 'sPlot 3.0 - Validity Check'
author: "Francesco Maria Sabatini"
output:
html_document: default
always_allow_html: yes
Timestamp: r date()
Drafted: Francesco Maria Sabatini
Revised: Stephan Hennekens
Version: 1.3
This report checks for consistency of the dataset used for constructing sPlot 3.0.
Changes to v1.1 - Added check to species data. Created To Do list.
Changes to v1.2 - based on dataset sPlot_3.0.1, received on 29/06/2019 from SH
Changes to v1.3 - based on dataset sPlot_3.0.2, received on 24/07/2019 from SH
Key Problems:
The is still a high proportion of plots without location uncertainty
library(reshape2)
library(tidyverse)
library(readr)
library(dplyr)
library(data.table)
library(knitr)
library(kableExtra)
library(viridis)
library(grid)
library(gridExtra)
library(ggforce)
library(xlsx)
Check Header file
Import with parse
header <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header.csv", locale = locale(encoding = 'UTF-8'),
delim="\t", col_types=cols(
PlotObservationID = col_double(),
PlotID = col_double(),
`TV2 relevé number` = col_double(),
Country = col_factor(),
`Cover abundance scale` = col_factor(),
`Date of recording` = col_date(format="%d-%m-%Y"),
`Relevé area (m²)` = col_double(),
`Altitude (m)` = col_double(),
`Aspect (°)` = col_double(),
`Slope (°)` = col_double(),
`Cover total (%)` = col_double(),
`Cover tree layer (%)` = col_double(),
`Cover shrub layer (%)` = col_double(),
`Cover herb layer (%)` = col_double(),
`Cover moss layer (%)` = col_double(),
`Cover lichen layer (%)` = col_double(),
`Cover algae layer (%)` = col_double(),
`Cover litter layer (%)` = col_double(),
`Cover open water (%)` = col_double(),
`Cover bare rock (%)` = col_double(),
`Height (highest) trees (m)` = col_double(),
`Height lowest trees (m)` = col_double(),
`Height (highest) shrubs (m)` = col_double(),
`Height lowest shrubs (m)` = col_double(),
`Aver. height (high) herbs (cm)` = col_double(),
`Aver. height lowest herbs (cm)` = col_double(),
`Maximum height herbs (cm)` = col_double(),
`Maximum height cryptogams (mm)` = col_double(),
`Mosses identified (y/n)` = col_factor(),
`Lichens identified (y/n)` = col_factor(),
COMMUNITY = col_character(),
SUBSTRATE = col_character(),
Locality = col_character(),
ORIG_NUM = col_double(),
ALLIAN_REV = col_character(),
REV_AUTHOR = col_character(),
Forest = col_logical(),
Grassland = col_logical(),
Wetland = col_logical(),
`Sparse vegetation` = col_logical(),
Shrubland = col_logical(),
`Plants recorded` = col_factor(),
`Herbs identified (y/n)` = col_factor(),
Naturalness = col_factor(),
EUNIS = col_factor(),
Longitude = col_double(),
Latitude = col_double(),
`Location uncertainty (m)` = col_double(),
Dataset = col_factor(),
GUID = col_character()
))
This version of sPlot 3.0.1 is composed of r length(unique(header$Dataset))
data sets, for a total of r nrow(header)
plots.
Show remaining problems
knitr::kable(problems(header) %>%
mutate(Dataset=header$Dataset[problems(header)$row]) %>%
dplyr::select(Dataset, row, col) %>%
distinct(),
caption="Problems when importing header data") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), latex_options = "basic",
full_width = F, position = "center")
There seem to be some encoding problems with these plots
Plots without coordinates (by dataset)
knitr::kable(header %>%
dplyr::select(PlotObservationID, `TV2 relevé number`, Dataset, Longitude, Latitude) %>%
filter(is.na(Longitude) | is.na(Latitude)) %>%
group_by(Dataset) %>%
summarize(n()),
caption="Number of plots without coordinates per dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
Plots without location uncertainty (by dataset).
We could probably assign at least a broad location, with wide uncertainty to the 2020 plots in the Egypt Nile delta dataset.
knitr::kable(header %>%
filter(is.na(`Location uncertainty (m)`)) %>%
group_by(Dataset) %>%
summarize(n()),
caption="Number of plots without Location Uncertainty per dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
Big datasets without coordinate uncertainty are:
Czechia_nvd
Germany_vegetweb2
Poland
Slovenia
--> Talk to contributors and ask for 'average' uncertainty (e.g. 100 m)
nNAs <- nrow(header %>% filter(is.na(`Location uncertainty (m)`)))
There are still r nNAs
plots without location uncertainty.
Previously known problems:
- Import field 'Plants Recorded' into header (SH)
knitr::kable(table(header$`Plants recorded`, exclude=NULL),
caption="Number of records for each level in Plants recorded") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
--> Values not available for EVA's datasets. Ask Milan if we can simply assume 'All vascular plants'
- Import field 'Herbs identified (y/n)' into header (SH)
knitr::kable(table(header$`Herbs identified (y/n)`, exclude=NULL),
caption="Number of records for each level in Plants recorded") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
--> Values not available for EVA's datasets. According to SH, we can simply assume Y.
Check DT table
DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species.csv",
delim="\t",
col_type = cols(
PlotObservationID = col_double(),
Taxonomy = col_character(),
`Taxon group` = col_character(),
`Taxon group ID` = col_double(),
`Turboveg2 concept` = col_character(),
`Matched concept` = col_character(),
Match = col_double(),
Layer = col_double(),
`Cover %` = col_double(),
`Cover code` = col_character(),
x_ = col_double()
)
)
Show problems in DT import
knitr::kable(problems(DT0) %>%
mutate(Dataset=DT0$Taxonomy[problems(DT0)$row]) %>%
dplyr::select(Dataset, col, expected, actual) %>%
distinct(),
caption="Problems when importing Species data") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
id <- as.character(DT0$PlotObservationID[(problems(DT0) %>% dplyr::select(row) %>% distinct())$row])
relnum <- (header %>% filter(PlotObservationID == DT0$PlotObservationID[(problems(DT0) %>% dplyr::select(row) %>% distinct())$row]))$`TV2 relevé number`
db <- (problems(DT0) %>% mutate(Dataset=DT0$Taxonomy[problems(DT0)$row]) %>% dplyr::select(Dataset) %>% distinct())[1,1, drop=T]
All problems seem to be concentrated in PlotID = r id
which corresponds to TV2 relevé Number = r relnum
in r db
.
Other known problems:
#There are some plots without the appropriate cover code
knitr::kable(DT0 %>%
filter(`Cover %` ==0 & is.na(`Cover code`)) %>%
group_by(Taxonomy) %>%
summarize(n()),
caption="Summary of DBs without appropriate cover codes") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
# in db British_Columbia_meadows this seems to depend on lichen\moss species
knitr::kable(head(DT0 %>%
filter(`Cover %` ==0 & is.na(`Cover code`) & Taxonomy=="British_Columbia_meadows") %>%
dplyr::select(PlotObservationID, Taxonomy, `Matched concept`:x_),10),
caption="Example from British Columbia meadows db (only first 10 rows)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
# in db USA_VegBank there seem to be species with only p\a values, next to species with also cover (FMS will have to solve this)
knitr::kable(head(DT0 %>%
filter(`Cover %` ==0 & is.na(`Cover code`) & Taxonomy=="USA_VegBank")%>%
dplyr::select(PlotObservationID, Taxonomy, `Matched concept`:x_), 10),
caption="Example from USA_VegBank db (only first 10 rows)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = F, position = "center")
FMS will solve these problems, by assigning an arbitrary low (e.g. 0.1%) cover value to species sampled only as pa. Ideally, FMS should also add a note in the _x field
Distribution of plots across datasets:
knitr::kable(table(header$Dataset), caption="Plots per dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
Check geographic coordinates
countries <- map_data("world")
ggworld <- ggplot(countries, aes(x=long, y=lat, group = group)) +
geom_polygon(col=gray(0.3), lwd=0.3, fill = gray(0.9)) +
theme_bw() +
theme(axis.title = element_blank())
robust.range <- function(x){
return(c(floor((min(x, na.rm=T)-0.01)/5)*5,
ceiling((max(x, na.rm=T)+0.01)/5)*5))
}
ggdataset <- list()
tick <- 1
for(d in levels(header$Dataset)){
datasel <- header %>%
filter(Dataset==d)
ggdataset[[tick]] <- ggworld +
geom_point(data=datasel, aes(x=Longitude, y=Latitude, group=1),
col="red", alpha=0.5, cex=1, shape="+") +
coord_equal(ylim=robust.range(datasel$Latitude),
xlim=robust.range(datasel$Longitude)) +
ggtitle(d)
tick <- tick + 1
}
for(dd in 1:nlevels(header$Dataset)){
d <- levels(header$Dataset)[dd]
print(ggdataset[[dd]])
}
#Depreated below
#Some plots in the Hungary dataset have a altitude >5000 m (!)
print(ggplot(data=datasel %>%
melt()) +
geom_histogram(aes(x=value)) +
facet_wrap(.~variable, scale="free", ncol=3) +
theme_minimal() +
theme(axis.text = element_text(size = 8)))
#Fix known problems
header.fix <- header %>%
mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern=" ", replacement="")) %>%
mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="-", replacement=NA)) %>%
mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.$", replacement=NA)) %>%
mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.", replacement="0\\.")) %>%
mutate(`Slope (°)`=gsub(`Slope (°)`, pattern="-", replacement=NA)) %>%
mutate(`Aspect (°)`=gsub(`Aspect (°)`, pattern="-", replacement=NA)) %>%
mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`,
list=`Lichens identified (y/n)` %in% c("0","F", "n", "N" ),
values="FALSE")) %>%
mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`,
list=`Lichens identified (y/n)` %in% c("1","y", "Y"),
values="TRUE")) %>%
mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`,
list=`Mosses identified (y/n)` %in% c("0","F","f", "n", "N" ),
values="FALSE")) %>%
mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`,
list=`Mosses identified (y/n)` %in% c("1", "j", "J", "T", "y", "Y" ),
values="TRUE"))
write_csv(header.fix, path = "../sPlot_data_export/sPlot_data_header_fix1.csv")