Skip to content
Snippets Groups Projects
Commit f60c5130 authored by Francesco Sabatini's avatar Francesco Sabatini
Browse files

Created 00_CheckData.Rmd for data validity check

parent 3ad9f753
No related branches found
No related tags found
No related merge requests found
---
title: 'sPlot 3.0 - Validity Check'
output:
  html_document: default
always_allow_html: yes
---
<center>
![](../splot-long-rgb.png "sPlot Logo")
</center>
**Timestamp:** `r date()`
**Drafted:** Francesco Maria Sabatini
**Version:** 1.0
This report checks the consistency of the dataset used to construct sPlot 3.0.
```{r results="hide", message=F, warning=F}
library(reshape2)
library(tidyverse)
library(readr)
library(dplyr)
library(data.table)
library(knitr)
library(kableExtra)
library(viridis)
library(grid)
library(gridExtra)
library(ggforce)
library(xlsx)
```
```{r}
# Import the raw sPlot header table. The export is tab-delimited; a large
# `guess_max` lets readr look at many rows before guessing column types.
header <- readr::read_delim(
  file = "sPlot_data_export/sPlot_data_header.csv",
  delim = "\t",
  guess_max = 100000
)
```
This version of sPlot 3.0 is composed of `r length(unique(header$Dataset))` datasets, for a total of `r nrow(header)` plots.
Fix known problems
```{r}
# Clean known formatting problems in the raw header, then write the result to
# disk so that it can be re-imported with explicit column types.

# A cell that holds nothing but a dash (optionally surrounded by whitespace)
# is treated as a missing value. The pattern is anchored on purpose: an
# unanchored "-" would also blank out legitimate negative numbers such as
# altitudes below sea level.
lone_dash <- "^[[:space:]]*-[[:space:]]*$"

# Recode free-text yes/no flags to the strings "FALSE" / "TRUE".
# Values found in neither set are returned unchanged.
recode_yes_no <- function(x, no_values, yes_values) {
  x <- replace(x, list = x %in% no_values, values = "FALSE")
  replace(x, list = x %in% yes_values, values = "TRUE")
}

header.fix <- header %>%
  mutate(
    # Altitude: strip blanks, turn placeholder cells ("-" or ".") into NA and
    # give values such as ".5" a leading zero.
    `Altitude (m)` = gsub(" ", "", `Altitude (m)`, fixed = TRUE),
    # gsub() returns NA for every element that matches when the replacement
    # is NA; non-matching elements are left as they are.
    `Altitude (m)` = gsub(lone_dash, NA_character_, `Altitude (m)`),
    `Altitude (m)` = gsub("^\\.$", NA_character_, `Altitude (m)`),
    `Altitude (m)` = sub("^\\.", "0.", `Altitude (m)`),
    `Slope (°)` = gsub(lone_dash, NA_character_, `Slope (°)`),
    `Aspect (°)` = gsub(lone_dash, NA_character_, `Aspect (°)`),
    # The accepted spellings differ between the two columns; both lists are
    # kept exactly as they were.
    `Lichens identified (y/n)` = recode_yes_no(
      `Lichens identified (y/n)`,
      no_values = c("0", "F", "n", "N"),
      yes_values = c("1", "y", "Y")
    ),
    `Mosses identified (y/n)` = recode_yes_no(
      `Mosses identified (y/n)`,
      no_values = c("0", "F", "f", "n", "N"),
      yes_values = c("1", "j", "J", "T", "y", "Y")
    )
  )

# The output location is passed positionally because readr renamed this
# argument from `path` to `file`; positional use works with both.
write_csv(header.fix, "sPlot_data_export/sPlot_data_header_fix1.csv")
```
Re-import the fixed header, parsing each column with an explicit type
```{r}
# Re-read the cleaned header, this time forcing an explicit type on every
# listed column. Values that do not fit the declared type are recorded by
# readr and can be retrieved afterwards with problems().
header <- readr::read_csv(
  file = "sPlot_data_export/sPlot_data_header_fix1.csv",
  col_types = cols(
    # Identifiers and general plot information
    PlotObservationID = col_double(),
    PlotID = col_double(),
    `TV2 relevé number` = col_double(),
    Country = col_factor(),
    `Cover abundance scale` = col_factor(),
    Author = col_character(),
    # Parsed with a year-only format string
    `Date of recording` = col_date(format = "%Y"),
    `Relevé area (m²)` = col_double(),
    # Topography
    `Altitude (m)` = col_double(),
    `Aspect (°)` = col_double(),
    `Slope (°)` = col_double(),
    # Cover values
    `Cover total (%)` = col_double(),
    `Cover tree layer (%)` = col_double(),
    `Cover shrub layer (%)` = col_double(),
    `Cover herb layer (%)` = col_double(),
    `Cover moss layer (%)` = col_double(),
    `Cover lichen layer (%)` = col_double(),
    `Cover algae layer (%)` = col_double(),
    `Cover litter layer (%)` = col_double(),
    `Cover open water (%)` = col_double(),
    `Cover bare rock (%)` = col_double(),
    # Heights
    `Height (highest) trees (m)` = col_double(),
    `Height lowest trees (m)` = col_double(),
    `Height (highest) shrubs (m)` = col_double(),
    `Height lowest shrubs (m)` = col_double(),
    `Aver. height (high) herbs (cm)` = col_double(),
    `Aver. height lowest herbs (cm)` = col_double(),
    `Maximum height herbs (cm)` = col_double(),
    `Maximum height cryptogams (mm)` = col_double(),
    # Yes/no flags
    `Mosses identified (y/n)` = col_logical(),
    `Lichens identified (y/n)` = col_logical(),
    # Locality and vegetation classification
    Locality = col_character(),
    Naturalness = col_factor(),
    Forest = col_logical(),
    Shrubland = col_logical(),
    Wetland = col_logical(),
    Grassland = col_logical(),
    `Sparse vegetation` = col_logical(),
    EUNIS = col_factor(),
    # Coordinates and source dataset
    Longitude = col_double(),
    Latitude = col_double(),
    `Location uncertainty (m)` = col_double(),
    Dataset = col_factor()
  )
)
```
Show remaining problems
```{r}
# Table of the distinct parsing problems readr recorded for `header`, each
# labelled with the dataset found at the reported row.
# problems() is evaluated once; its `row` column is reused inside mutate()
# through the `.data` pronoun instead of calling problems() a second time.
knitr::kable(
  problems(header) %>%
    mutate(Dataset = header$Dataset[.data$row]) %>%
    dplyr::select(Dataset, col, expected, actual) %>%
    distinct(),
  caption = "Problems when importing header data"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )
```
After fixing problems there are `r nrow(header)` plots remaining.
Distribution of plots across datasets:
```{r, echo=F}
# Number of plots contributed by each dataset, rendered as a styled table.
knitr::kable(
  table(header$Dataset),
  caption = "Plots per dataset"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    full_width = FALSE,
    position = "center"
  )
```
Check geographic coordinates and print summary for each dataset
```{r, message=F, eval=T, cache=T}
# Base world map shared by every per-dataset panel.
countries <- map_data("world")
ggworld <- ggplot(countries, aes(x = long, y = lat, group = group)) +
  geom_polygon(col = gray(0.3), lwd = 0.3, fill = gray(0.9)) +
  theme_bw() +
  theme(axis.title = element_blank())

# Range of `x` widened outwards to the nearest multiple of `step`.
# Missing values are ignored, so that a single plot without coordinates does
# not turn the limits of its whole dataset into NA. Returns NULL when `x` has
# no non-missing value, which lets the coord keep its default limits.
padded_limits <- function(x, step = 10) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(NULL)
  }
  c(floor(min(x) / step) * step, ceiling(max(x) / step) * step)
}

# One map per dataset, stored in the order of the factor levels so that
# ggdataset[[i]] corresponds to levels(header$Dataset)[i].
dataset_names <- levels(header$Dataset)
ggdataset <- vector("list", length(dataset_names))
for (i in seq_along(dataset_names)) {
  d <- dataset_names[i]
  datasel <- header %>%
    filter(Dataset == d)
  ggdataset[[i]] <- ggworld +
    geom_point(
      data = datasel,
      aes(x = Longitude, y = Latitude, group = 1),
      col = "red", alpha = 0.5, cex = 1, shape = "+"
    ) +
    # Zoom to the extent of the dataset, rounded outwards to 10 degrees.
    coord_equal(
      ylim = padded_limits(datasel$Latitude),
      xlim = padded_limits(datasel$Longitude)
    ) +
    ggtitle(d)
}
}
```
```{r}
# Predicate: TRUE when `x` holds at least one non-missing value.
# An empty vector yields FALSE.
not_all_na <- function(x) {
  any(!is.na(x))
}
# For every dataset, print its map followed by a summary of the columns that
# contain at least one non-missing value.
# seq_along() is used instead of 1:n so that the loop body is skipped, rather
# than run with indices 1 and 0, when the factor has no levels.
dataset_names <- levels(header$Dataset)
for (dd in seq_along(dataset_names)) {
  d <- dataset_names[dd]
  datasel <- header %>%
    filter(Dataset == d) %>%
    select_if(not_all_na)
  print(ggdataset[[dd]])
  print(summary(datasel))
}
```
Other observed problems:
Some plots in the Hungary dataset have an altitude > 5000 m (!)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment