Started modifying 00_checkData to sPlot3.0.1 delivered from SH 29/06/2019

20daf8cb · Francesco Sabatini · c3cd5374 · 20daf8cb
Commit 20daf8cb authored Jul 4, 2019 by Francesco Sabatini
--- a/code/00_CheckData.Rmd
+++ b/code/00_CheckData.Rmd
@@ -17,10 +17,11 @@ always_allow_html: yes
 **Timestamp:** `r date()`  
 **Drafted:** Francesco Maria Sabatini  
 **Revised:** Stephan Hennekens  
-**Version:** 1.1
+**Version:** 1.2
 This report checks for consistency of the dataset used for constructing sPlot 3.0.  
 *Changes to v1.1* - Added check to species data. Created To Do list.  
+*Changes in v1.2* - Based on dataset sPlot_3.0.1, received on 29/06/2019 from SH
 ```{r results="hide", message=F, warning=F}
 library(reshape2)
@@ -38,51 +39,16 @@ library(xlsx)
 ```
 # Check Header file
+Import with parse
 ```{r}
-#Import sPlot data
+header <- readr::read_delim("../sPlot_data_export/sPlot 3.0.1_header.csv", locale = locale(encoding = 'UTF-8'),
-header <- readr::read_delim("../sPlot_data_export/sPlot_data_header.csv", 
+                            delim="\t", col_types=cols(
-                            delim="\t", guess_max = 100000)
-```
-This version of sPlot 3.0 is composed of `r length(unique(header$Dataset))` data sets, for a total of `r nrow(header)` plots.
-Fix known problems
-```{r}
-header.fix <- header %>%
-  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern=" ", replacement="")) %>%
-  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="-", replacement=NA)) %>%
-  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.$", replacement=NA)) %>% 
-  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.", replacement="0\\.")) %>%
-  mutate(`Slope (°)`=gsub(`Slope (°)`, pattern="-", replacement=NA)) %>%
-  mutate(`Aspect (°)`=gsub(`Aspect (°)`, pattern="-", replacement=NA)) %>%
-  mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`, 
-                                            list=`Lichens identified (y/n)` %in% c("0","F", "n", "N" ),
-                                            values="FALSE")) %>%
-  mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`, 
-                                              list=`Lichens identified (y/n)` %in% c("1","y", "Y"), 
-                                              values="TRUE")) %>%
-  mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`, 
-                                            list=`Mosses identified (y/n)` %in% c("0","F","f", "n", "N" ),
-                                            values="FALSE")) %>%
-  mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`, 
-                                            list=`Mosses identified (y/n)` %in% c("1", "j", "J", "T", "y", "Y" ),
-                                            values="TRUE"))
-write_csv(header.fix, path = "../sPlot_data_export/sPlot_data_header_fix1.csv")
-```
-Reimport with parse
-```{r}
-header <- readr::read_csv("../sPlot_data_export/sPlot_data_header_fix1.csv",
-                            col_types=cols(
  PlotObservationID = col_double(),
  PlotID = col_double(),
  `TV2 relevé number` = col_double(),
  Country = col_factor(),
  `Cover abundance scale` = col_factor(),
-  Author = col_character(),
+  `Date of recording` = col_date(format="%d-%m-%Y"),
-  `Date of recording` = col_date(format="%Y"),
  `Relevé area (m²)` = col_double(),
  `Altitude (m)` = col_double(),
  `Aspect (°)` = col_double(),
@@ -105,35 +71,46 @@ header <- readr::read_csv("../sPlot_data_export/sPlot_data_header_fix1.csv",
  `Aver. height lowest herbs (cm)` = col_double(),
  `Maximum height herbs (cm)` = col_double(),
  `Maximum height cryptogams (mm)` = col_double(),
-  `Mosses identified (y/n)` = col_logical(),
+  `Mosses identified (y/n)` = col_factor(),
-  `Lichens identified (y/n)` = col_logical(),
+  `Lichens identified (y/n)` = col_factor(),
+  COMMUNITY = col_character(),
+  SUBSTRATE = col_character(),
  Locality = col_character(),
-  Naturalness = col_factor(),
+  ORIG_NUM = col_double(),
+  ALLIAN_REV = col_character(),
+  REV_AUTHOR = col_character(),
  Forest = col_logical(),
-  Shrubland = col_logical(),
-  Wetland = col_logical(),
  Grassland = col_logical(),
+  Wetland = col_logical(),
  `Sparse vegetation` = col_logical(),
+  Shrubland = col_logical(),
+  `Plants recorded` = col_factor(),
+  `Herbs identified (y/n)` = col_factor(),
+  Naturalness = col_factor(),
  EUNIS = col_factor(),
  Longitude = col_double(),
  Latitude = col_double(),
  `Location uncertainty (m)` = col_double(),
-  Dataset = col_factor()
+  Dataset = col_factor(),
+  GUID = col_character()
 ))
 ```
-After fixing problems there are `r nrow(header)` plots remaining.  
+This version of sPlot 3.0.1 is composed of `r length(unique(header$Dataset))` data sets, for a total of `r nrow(header)` plots.
 Show remaining problems
 ```{r}
 knitr::kable(problems(header) %>% 
  mutate(Dataset=header$Dataset[problems(header)$row]) %>%
-  dplyr::select(Dataset, col, expected, actual) %>% 
+  dplyr::select(Dataset, row, col) %>% 
  distinct(), 
  caption="Problems when importing header data") %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), latex_options = "basic", 
                  full_width = F, position = "center")
 ```
+There seem to be some encoding problems with these plots
 Plots without coordinates (by dataset)
 ```{r}
@@ -161,10 +138,36 @@ knitr::kable(header %>%
 ```
-Other known problems still to be fixed:  
+## Previously known problems still to be fixed:  
 1) Import field 'Plants Recorded' into header (SH) - create dictionary of possible factors (FMS)  
+```{r}
+# Tabulate the column itself, not levels(): levels() lists each level once, so every count would be 1.
+# exclude=NULL keeps missing values in the tally.
+knitr::kable(table(header$`Plants recorded`, exclude=NULL), 
+  caption="Number of records for each level in Plants recorded") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+                  full_width = FALSE, position = "center")
+```
+The field is mostly empty!!
 2) Import field 'Herbs identified' into header (SH)  
+```{r}
+# Tabulate the column itself, not levels(): levels() lists each level once, so every count would be 1.
+# exclude=NULL keeps missing values in the tally.
+knitr::kable(table(header$`Herbs identified (y/n)`, exclude=NULL), 
+  caption="Number of records for each level in Herbs identified (y/n)") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+                  full_width = FALSE, position = "center")
+```
+The field is mostly empty!!
 3) Formations - Assign zeros to columns (Forest, Grassland, Shrubland, Wetland, Sparse), when at least one 1 is present (FMS)  
+```{r}
+# Formation flags (columns Forest:Shrubland): when at least one formation is flagged for a plot,
+# the missing flags of that plot are set to 0. Plots with no flagged formation keep their values.
+# The flags are returned as numeric 0/1, and all other header columns and rows are retained.
+header <- header %>%
+  # any1 = number of formations flagged per plot (NA counted as not flagged)
+  mutate(any1=rowSums(dplyr::select(., Forest:Shrubland), na.rm=TRUE)) %>%
+  mutate_at(.vars = vars(Forest:Shrubland), 
+            .funs = ~ifelse(any1>0 & is.na(.), 0, as.numeric(.))) %>%
+  # drop the helper column only; do not subset header, later chunks need the full table
+  dplyr::select(-any1)
+```
 4) Link to EUNIS cross-link table, and assign Faber-Langedon Formation (FMS)  
 5) Assign plot elevation using external sources (FMS)  
@@ -172,8 +175,9 @@ Other known problems still to be fixed:
 # Check DT table
-```{r}
+```{r, eval=F}
 DT0 <- readr::read_delim("../sPlot_data_export/sPlot_data_species.csv", 
                            delim="\t", 
                         col_type = cols(
@@ -193,7 +197,7 @@ DT0 <- readr::read_delim("../sPlot_data_export/sPlot_data_species.csv",
 ```
 Show problems in DT import
-```{r}
+```{r, eval=F}
 knitr::kable(problems(DT0) %>% 
  mutate(Dataset=DT0$Taxonomy[problems(DT0)$row]) %>%
  dplyr::select(Dataset, col, expected, actual) %>% 
@@ -204,7 +208,7 @@ knitr::kable(problems(DT0) %>%
 ```
-```{r, echo=F}
+```{r, echo=F, eval=F}
 id <- as.character(DT0$PlotObservationID[(problems(DT0) %>% dplyr::select(row) %>% distinct())$row])
 relnum <- (header %>% filter(PlotObservationID == DT0$PlotObservationID[(problems(DT0) %>% dplyr::select(row) %>% distinct())$row]))$`TV2 relevé number`
 db <- (problems(DT0) %>% mutate(Dataset=DT0$Taxonomy[problems(DT0)$row]) %>% dplyr::select(Dataset) %>% distinct())[1,1, drop=T]
@@ -215,7 +219,7 @@ db <- (problems(DT0) %>% mutate(Dataset=DT0$Taxonomy[problems(DT0)$row]) %>% dpl
 All problems seem to be concentrated in PlotID = `r id` which corresponds to TV2 relevé Number = `r relnum ` in `r db`. 
 Other known problems:
-```{r}
+```{r, eval=F}
 #There are some plots without the appropriate cover code
 knitr::kable(DT0 %>% 
  filter(`Cover %` ==0  & is.na(`Cover code`)) %>% 
@@ -254,7 +258,7 @@ knitr::kable(head(DT0 %>%
 Distribution of plots across datasets:
-```{r, echo=F}
+```{r, echo=F, eval=F}
 knitr::kable(table(header$Dataset), caption="Plots per dataset") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
 ```
@@ -309,3 +313,31 @@ print(ggplot(data=datasel %>%
 ```
+Deprecated below
+Fix known problems
+```{r, eval=F}
+# Legacy clean-up of the header table, kept for reference; the enclosing chunk is fenced with eval=F.
+# Rewrites entries in Altitude, Slope and Aspect, recodes the Lichens/Mosses flags to the strings
+# "TRUE"/"FALSE", and writes the result to sPlot_data_header_fix1.csv.
+header.fix <- header %>%
+  # Altitude: strip spaces, then blank entries matching "-" or consisting of a single ".", and prefix a leading "." with 0
+  # NOTE(review): pattern "-" is unanchored, so it also matches the sign of negative values - confirm this is intended
+  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern=" ", replacement="")) %>%
+  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="-", replacement=NA)) %>%
+  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.$", replacement=NA)) %>% 
+  mutate(`Altitude (m)`=gsub(`Altitude (m)`, pattern="^\\.", replacement="0\\.")) %>%
+  # Slope and Aspect: same "-" handling as Altitude
+  mutate(`Slope (°)`=gsub(`Slope (°)`, pattern="-", replacement=NA)) %>%
+  mutate(`Aspect (°)`=gsub(`Aspect (°)`, pattern="-", replacement=NA)) %>%
+  # Lichens flag: map the listed no/yes codes to "FALSE"/"TRUE"; other values are left as they are
+  mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`, 
+                                            list=`Lichens identified (y/n)` %in% c("0","F", "n", "N" ),
+                                            values="FALSE")) %>%
+  mutate(`Lichens identified (y/n)`=replace(`Lichens identified (y/n)`, 
+                                              list=`Lichens identified (y/n)` %in% c("1","y", "Y"), 
+                                              values="TRUE")) %>%
+  # Mosses flag: same recoding, with a longer list of accepted codes ("f", "j", "J", "T")
+  mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`, 
+                                            list=`Mosses identified (y/n)` %in% c("0","F","f", "n", "N" ),
+                                            values="FALSE")) %>%
+  mutate(`Mosses identified (y/n)`=replace(`Mosses identified (y/n)`, 
+                                            list=`Mosses identified (y/n)` %in% c("1", "j", "J", "T", "y", "Y" ),
+                                            values="TRUE"))
+# Persist the cleaned table
+write_csv(header.fix, path = "../sPlot_data_export/sPlot_data_header_fix1.csv")
+```