Created 00_CheckData.Rmd for data validity check

f60c5130 · Francesco Sabatini · 3ad9f753 · f60c5130
Commit f60c5130 authored 6 years ago by Francesco Sabatini
--- a/code/00_CheckData.Rmd
+++ b/code/00_CheckData.Rmd
+---
+title: 'sPlot 3.0 - Validity Check'
+output: 
+  html_document: default
+always_allow_html: yes
+---
+
+<center>
+  ![](../splot-long-rgb.png "sPlot Logo")
+</center>
+
+  
+    
+      
+        
+**Timestamp:** `r date()`  
+**Drafted:** Francesco Maria Sabatini  
+**Version:** 1.0
+  
+
+This report checks the consistency of the dataset used to construct sPlot 3.0.
+  
+```{r results="hide", message=F, warning=F}
+library(reshape2)
+library(tidyverse)
+library(readr)
+library(dplyr)
+library(data.table)
+library(knitr)
+library(kableExtra)
+library(viridis)
+library(grid)
+library(gridExtra)
+library(ggforce)
+library(xlsx)
+```
+
+```{r}
+# Import the raw sPlot header export. The file is tab-delimited even though
+# it carries a .csv extension; up to 100000 rows are used to guess column types.
+header <- readr::read_delim(
+  file = "sPlot_data_export/sPlot_data_header.csv",
+  delim = "\t",
+  guess_max = 100000
+)
+```
+
+This version of sPlot 3.0 is composed of `r length(unique(header$Dataset))` datasets, for a total of `r nrow(header)` plots.
+
+
+Fix known problems
+```{r}
+# Normalise known data-entry quirks in the header so that it can be written
+# out and re-imported with strict column types.
+
+# Turn a cell that holds only a dash (used as a missing-value placeholder)
+# into NA. gsub() returns NA for every element that matches when the
+# replacement is NA, so the pattern is anchored: an unanchored "-" would also
+# wipe out legitimate negative numbers such as an altitude of "-12".
+dash_to_na <- function(x) {
+  gsub(pattern = "^\\s*-\\s*$", replacement = NA, x = x)
+}
+
+# Recode the various spellings of no/yes to the strings "FALSE"/"TRUE".
+# The same code lists are applied to every y/n column so that they cannot
+# drift apart ("j"/"J" are presumably "ja" - confirm with data providers).
+# Values outside both lists, including NA, are returned unchanged.
+recode_yn <- function(x) {
+  x <- as.character(x)
+  x[x %in% c("0", "F", "f", "n", "N")] <- "FALSE"
+  x[x %in% c("1", "j", "J", "T", "y", "Y")] <- "TRUE"
+  x
+}
+
+header.fix <- header %>%
+  # Altitude: drop embedded blanks, blank out placeholder cells ("-" or "."),
+  # and give values like ".5" a leading zero
+  mutate(`Altitude (m)` = gsub(`Altitude (m)`, pattern = " ", replacement = "")) %>%
+  mutate(`Altitude (m)` = dash_to_na(`Altitude (m)`)) %>%
+  mutate(`Altitude (m)` = gsub(`Altitude (m)`, pattern = "^\\.$", replacement = NA)) %>%
+  mutate(`Altitude (m)` = gsub(`Altitude (m)`, pattern = "^\\.", replacement = "0\\.")) %>%
+  # Slope and aspect: only the dash placeholder needs handling
+  mutate(`Slope (°)` = dash_to_na(`Slope (°)`)) %>%
+  mutate(`Aspect (°)` = dash_to_na(`Aspect (°)`)) %>%
+  # Yes/no flags
+  mutate(`Lichens identified (y/n)` = recode_yn(`Lichens identified (y/n)`)) %>%
+  mutate(`Mosses identified (y/n)` = recode_yn(`Mosses identified (y/n)`))
+
+# The output file is passed positionally: the `path` argument name is
+# deprecated in newer readr releases (renamed to `file`), while the positional
+# form works with both old and new versions.
+write_csv(header.fix, "sPlot_data_export/sPlot_data_header_fix1.csv")
+```
+
+Re-import the cleaned file with explicit column types
+```{r}
+# Read the cleaned header back in, this time declaring the type of every
+# column explicitly. Values that do not fit the declared type are recorded by
+# readr and can be inspected afterwards with problems(header).
+header <- readr::read_csv(
+  "sPlot_data_export/sPlot_data_header_fix1.csv",
+  col_types = cols(
+    # Identifiers and provenance
+    PlotObservationID = col_double(),
+    PlotID = col_double(),
+    `TV2 relevé number` = col_double(),
+    Country = col_factor(),
+    `Cover abundance scale` = col_factor(),
+    Author = col_character(),
+    # Parsed with a year-only format
+    `Date of recording` = col_date(format = "%Y"),
+    # Plot size and topography
+    `Relevé area (m²)` = col_double(),
+    `Altitude (m)` = col_double(),
+    `Aspect (°)` = col_double(),
+    `Slope (°)` = col_double(),
+    # Cover values
+    `Cover total (%)` = col_double(),
+    `Cover tree layer (%)` = col_double(),
+    `Cover shrub layer (%)` = col_double(),
+    `Cover herb layer (%)` = col_double(),
+    `Cover moss layer (%)` = col_double(),
+    `Cover lichen layer (%)` = col_double(),
+    `Cover algae layer (%)` = col_double(),
+    `Cover litter layer (%)` = col_double(),
+    `Cover open water (%)` = col_double(),
+    `Cover bare rock (%)` = col_double(),
+    # Vegetation heights
+    `Height (highest) trees (m)` = col_double(),
+    `Height lowest trees (m)` = col_double(),
+    `Height (highest) shrubs (m)` = col_double(),
+    `Height lowest shrubs (m)` = col_double(),
+    `Aver. height (high) herbs (cm)` = col_double(),
+    `Aver. height lowest herbs (cm)` = col_double(),
+    `Maximum height herbs (cm)` = col_double(),
+    `Maximum height cryptogams (mm)` = col_double(),
+    # Yes/no flags
+    `Mosses identified (y/n)` = col_logical(),
+    `Lichens identified (y/n)` = col_logical(),
+    # Location description and habitat classification
+    Locality = col_character(),
+    Naturalness = col_factor(),
+    Forest = col_logical(),
+    Shrubland = col_logical(),
+    Wetland = col_logical(),
+    Grassland = col_logical(),
+    `Sparse vegetation` = col_logical(),
+    EUNIS = col_factor(),
+    # Coordinates
+    Longitude = col_double(),
+    Latitude = col_double(),
+    `Location uncertainty (m)` = col_double(),
+    # Source dataset
+    Dataset = col_factor()
+  )
+)
+```
+
+Show remaining problems
+```{r}
+# Tabulate the distinct parsing problems left after the typed re-import,
+# labelled with the dataset of the offending row.
+# NOTE(review): assumes the `row` column of problems() indexes rows of
+# `header` directly - confirm for the installed readr version.
+problems(header) %>%
+  mutate(Dataset = header$Dataset[row]) %>%
+  dplyr::select(Dataset, col, expected, actual) %>%
+  distinct() %>%
+  knitr::kable(caption = "Problems when importing header data") %>%
+  kable_styling(
+    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
+    full_width = FALSE,
+    position = "center"
+  )
+```
+
+
+After fixing problems there are `r nrow(header)` plots remaining.  
+
+
+Distribution of plots across datasets:
+
+```{r, echo=F}
+# Number of plots contributed by each dataset, rendered as a styled table
+knitr::kable(
+  table(header$Dataset),
+  caption = "Plots per dataset"
+) %>%
+  kable_styling(
+    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
+    full_width = FALSE,
+    position = "center"
+  )
+```
+
+Check geographic coordinates and print summary for each dataset
+```{r, message=F, eval=T, cache=T}
+# Build one map per dataset: plot locations drawn as red crosses on a world
+# basemap, zoomed to the extent of that dataset's coordinates. The maps are
+# stored in `ggdataset`, in the order of levels(header$Dataset).
+
+# World basemap shared by all dataset maps
+countries <- map_data("world")
+ggworld <- ggplot(countries, aes(x = long, y = lat, group = group)) +
+  geom_polygon(col = gray(0.3), lwd = 0.3, fill = gray(0.9)) +
+  theme_bw() +
+  theme(axis.title = element_blank())
+
+# Axis limits for a set of coordinates, widened outwards to the nearest
+# multiple of 10 degrees. Missing and non-finite values are ignored, so a few
+# plots without coordinates no longer turn the limits into NA. Returns NULL
+# (no explicit limits) when no usable coordinate is available.
+padded_limits <- function(x) {
+  x <- x[is.finite(x)]
+  if (length(x) == 0) {
+    return(NULL)
+  }
+  c(floor(min(x) / 10) * 10, ceiling(max(x) / 10) * 10)
+}
+
+# Preallocate the result list instead of growing it inside the loop
+ggdataset <- vector("list", nlevels(header$Dataset))
+tick <- 1
+for (d in levels(header$Dataset)) {
+  datasel <- header %>%
+    filter(Dataset == d)
+
+  ggdataset[[tick]] <- ggworld +
+    # group = 1 overrides the basemap's `group` aesthetic, which does not
+    # exist in the plot data
+    geom_point(data = datasel, aes(x = Longitude, y = Latitude, group = 1),
+               col = "red", alpha = 0.5, cex = 1, shape = "+") +
+    coord_equal(ylim = padded_limits(datasel$Latitude),
+                xlim = padded_limits(datasel$Longitude)) +
+    ggtitle(d)
+  tick <- tick + 1
+}
+  
+```
+
+```{r}
+# Predicate for select_if(): TRUE when a column holds at least one
+# non-missing value.
+not_all_na <- function(x) {
+  !all(is.na(x))
+}
+
+# For every dataset, print its map from `ggdataset` followed by a summary of
+# the columns that contain any data for that dataset. seq_len() is used so
+# that the loop body is skipped entirely when there are no dataset levels
+# (1:0 would iterate over c(1, 0)).
+for (dd in seq_len(nlevels(header$Dataset))) {
+  d <- levels(header$Dataset)[dd]
+  datasel <- header %>%
+    filter(Dataset == d) %>%
+    select_if(not_all_na)
+  print(ggdataset[[dd]])
+  print(summary(datasel))
+}
+```
+
+
+Other observed problems:
+Some plots in the Hungary dataset have an altitude > 5000 m (!)
\ No newline at end of file