00_CheckData.Rmd

title: 'sPlot 3.0 - Validity Check'
author: "Francesco Maria Sabatini"
output:
html_document: default
always_allow_html: yes
library(reshape2)
library(tidyverse)
library(readr)
library(dplyr)
library(data.table)
library(knitr)
library(kableExtra)
library(viridis)
library(grid)
library(gridExtra)
library(ggforce)
library(xlsx)
header <- readr::read_delim("../sPlot_data_export/sPlot 3.0.1_header.csv", locale = locale(encoding = 'UTF-8'),
                            delim="\t", col_types=cols(
  PlotObservationID = col_double(),
  PlotID = col_double(),
  `TV2 relevé number` = col_double(),
  Country = col_factor(),
  `Cover abundance scale` = col_factor(),
  `Date of recording` = col_date(format="%d-%m-%Y"),
  `Relevé area (m²)` = col_double(),
  `Altitude (m)` = col_double(),
  `Aspect (°)` = col_double(),
  `Slope (°)` = col_double(),
  `Cover total (%)` = col_double(),
  `Cover tree layer (%)` = col_double(),
  `Cover shrub layer (%)` = col_double(),
  `Cover herb layer (%)` = col_double(),
  `Cover moss layer (%)` = col_double(),
  `Cover lichen layer (%)` = col_double(),
  `Cover algae layer (%)` = col_double(),
  `Cover litter layer (%)` = col_double(),
  `Cover open water (%)` = col_double(),
  `Cover bare rock (%)` = col_double(),
  `Height (highest) trees (m)` = col_double(),
  `Height lowest trees (m)` = col_double(),
  `Height (highest) shrubs (m)` = col_double(),
  `Height lowest shrubs (m)` = col_double(),
  `Aver. height (high) herbs (cm)` = col_double(),
  `Aver. height lowest herbs (cm)` = col_double(),
  `Maximum height herbs (cm)` = col_double(),
  `Maximum height cryptogams (mm)` = col_double(),
  `Mosses identified (y/n)` = col_factor(),
  `Lichens identified (y/n)` = col_factor(),
  COMMUNITY = col_character(),
  SUBSTRATE = col_character(),
  Locality = col_character(),
  ORIG_NUM = col_double(),
  ALLIAN_REV = col_character(),
  REV_AUTHOR = col_character(),
  Forest = col_logical(),
  Grassland = col_logical(),
  Wetland = col_logical(),
  `Sparse vegetation` = col_logical(),
  Shrubland = col_logical(),
  `Plants recorded` = col_factor(),
  `Herbs identified (y/n)` = col_factor(),
  Naturalness = col_factor(),
  EUNIS = col_factor(),
  Longitude = col_double(),
  Latitude = col_double(),
  `Location uncertainty (m)` = col_double(),
  Dataset = col_factor(),
  GUID = col_character()
))
knitr::kable(problems(header) %>%
  mutate(Dataset=header$Dataset[problems(header)$row]) %>%
  dplyr::select(Dataset, row, col) %>%
  distinct(),
  caption="Problems when importing header data") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), latex_options = "basic",
                  full_width = F, position = "center")

knitr::kable(header %>%
  dplyr::select(PlotObservationID, `TV2 relevé number`, Dataset, Longitude, Latitude) %>%
  filter(is.na(Longitude) | is.na(Latitude)) %>%
  group_by(Dataset) %>%
  summarize(n()),
  caption="Number of plots without coordinates per dataset") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
knitr::kable(header %>%
               filter(is.na(`Location uncertainty (m)`)) %>%
               group_by(Dataset) %>%
               summarize(n()),
  caption="Number of plots without Location Uncertainty per dataset") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
nNAs <- nrow(header %>% filter(is.na(`Location uncertainty (m)`)))
knitr::kable(table(levels(header$`Plants recorded`), exclude=NULL),
  caption="Number of records for each level in Plants recorded") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
knitr::kable(table(levels(header$`Herbs identified (y/n)`), exclude=NULL),
  caption="Number of records for each level in Plants recorded") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
DT0 <- readr::read_delim("../sPlot_data_export/sPlot 3.0.1_species.csv",
                            delim="\t",
                         col_type = cols(
                                PlotObservationID = col_double(),
                                Taxonomy = col_character(),
                                `Taxon group` = col_character(),
                                `Taxon group ID` = col_double(),
                                `Turboveg2 concept` = col_character(),
                                `Matched concept` = col_character(),
                                Match = col_double(),
                                Layer = col_double(),
                                `Cover %` = col_double(),
                                `Cover code` = col_character(),
                                x_ = col_double()
                              )
                         )
knitr::kable(problems(DT0) %>%
  mutate(Dataset=DT0$Taxonomy[problems(DT0)$row]) %>%
  dplyr::select(Dataset, col, expected, actual) %>%
  distinct(),
  caption="Problems when importing Species data") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")

id <- as.character(DT0$PlotObservationID[(problems(DT0) %>% dplyr::select(row) %>% distinct())$row])
relnum <- (header %>% filter(PlotObservationID == DT0$PlotObservationID[(problems(DT0) %>% dplyr::select(row) %>% distinct())$row]))$`TV2 relevé number`
db <- (problems(DT0) %>% mutate(Dataset=DT0$Taxonomy[problems(DT0)$row]) %>% dplyr::select(Dataset) %>% distinct())[1,1, drop=T]

#There are some plots without the appropriate cover code
knitr::kable(DT0 %>%
  filter(`Cover %` ==0  & is.na(`Cover code`)) %>%
  group_by(Taxonomy) %>%
  summarize(n()),
  caption="Summary of DBs without appropriate cover codes") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")

# in db British_Columbia_meadows this seems to depend on lichen\moss species
knitr::kable(head(DT0 %>%
  filter(`Cover %` ==0  & is.na(`Cover code`) & Taxonomy=="British_Columbia_meadows") %>%
  dplyr::select(PlotObservationID, Taxonomy, `Matched concept`:x_),10),
  caption="Example from British Columbia meadows db (only first 10 rows)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")


# in db USA_VegBank there seem to be species with only p\a values, next to species with also cover (FMS will have to solve this)
knitr::kable(head(DT0 %>%
  filter(`Cover %` ==0  & is.na(`Cover code`) & Taxonomy=="USA_VegBank")%>%
  dplyr::select(PlotObservationID, Taxonomy, `Matched concept`:x_), 10),
  caption="Example from USA_VegBank db (only first 10 rows)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
knitr::kable(table(header$Dataset), caption="Plots per dataset") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
countries <- map_data("world")
ggworld <- ggplot(countries, aes(x=long, y=lat, group = group)) +
  geom_polygon(col=gray(0.3), lwd=0.3, fill = gray(0.9)) +
  theme_bw()  +
  theme(axis.title = element_blank())

robust.range <- function(x){
  return(c(floor((min(x, na.rm=T)-0.01)/5)*5,
           ceiling((max(x, na.rm=T)+0.01)/5)*5))
}
ggdataset <- list()
tick <- 1
for(d in levels(header$Dataset)){
  datasel <- header %>%
               filter(Dataset==d)

  ggdataset[[tick]] <- ggworld +
    geom_point(data=datasel, aes(x=Longitude, y=Latitude, group=1),
               col="red", alpha=0.5, cex=1, shape="+") +
    coord_equal(ylim=robust.range(datasel$Latitude),
                  xlim=robust.range(datasel$Longitude)) +
    ggtitle(d)
  tick <- tick + 1
  }

for(dd in 1:nlevels(header$Dataset)){
  d <- levels(header$Dataset)[dd]
  print(ggdataset[[dd]])
  }
#Depreated below
#Some plots in the Hungary dataset have a altitude >5000 m (!)

print(ggplot(data=datasel %>%
           melt()) +
    geom_histogram(aes(x=value)) +
    facet_wrap(.~variable, scale="free", ncol=3) +
    theme_minimal() +
    theme(axis.text = element_text(size = 8)))


#Fix known problems
header.fix <- header %>%
  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern=" ", replacement="")) %>%
  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="-", replacement=NA)) %>%
  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.$", replacement=NA)) %>%
  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.", replacement="0\\.")) %>%
  mutate(`Slope (°)`=gsub(`Slope (°)`, pattern="-", replacement=NA)) %>%
  mutate(`Aspect (°)`=gsub(`Aspect (°)`, pattern="-", replacement=NA)) %>%
  mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`,
                                            list=`Lichens identified (y/n)` %in% c("0","F", "n", "N" ),
                                            values="FALSE")) %>%
  mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`,
                                              list=`Lichens identified (y/n)` %in% c("1","y", "Y"),
                                              values="TRUE")) %>%
  mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`,
                                            list=`Mosses identified (y/n)` %in% c("0","F","f", "n", "N" ),
                                            values="FALSE")) %>%
  mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`,
                                            list=`Mosses identified (y/n)` %in% c("1", "j", "J", "T", "y", "Y" ),
                                            values="TRUE"))

write_csv(header.fix, path = "../sPlot_data_export/sPlot_data_header_fix1.csv")