From de09f453df1f3f4cee3112721e1d29cc9848adad Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Wed, 4 Mar 2020 16:42:23 +0100
Subject: [PATCH] First version of 06_buildDT

---
 code/03_TaxonomicBackbone.Rmd |  82 +++++-----
 code/06_buildDT.Rmd           | 288 ++++++++++++++++++++++++++++++++++
 2 files changed, 333 insertions(+), 37 deletions(-)
 create mode 100644 code/06_buildDT.Rmd

diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd
index 2611f12..b7cf96e 100644
--- a/code/03_TaxonomicBackbone.Rmd
+++ b/code/03_TaxonomicBackbone.Rmd
@@ -51,25 +51,7 @@ library(vegdata)
 ## Read in taxon names from [sPlot](https://www.idiv.de/sdiv/working_groups/wg_pool/splot/splot_database.html) and [TRY](https://www.try-db.org/TryWeb/Home.php)
 
 
-```{r, echo=F}
-## fungi genera #NOT COMPLETE LIST
-mushroom <- c("Mycena", "Boletus", "Russula","Calocybe","Collybia","Amanita","Amanitopsis","Coprinus",
-  "Galerina","Geoglossum","Hebeloma","Hydnum","Lactarius","Leucocarpia","Naucoria","Otidea","Polyporus", "Involucrothele",
-  "Sarcodom","Sarcoscyphus","Scleroderma","Stropharia","Tylopilus","Typhula", "Calyptella", "Chrysopsora", "Lacrymaria", "Dermoloma", 
-   "Agaricus","Alnicola", "Amanitina", "Bovista", "Cheilymenia","Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe",
-  "Laccaria", "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius", "Panaeolus", "Psathyrella", "Psilocybe", 
-  "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria","Amphoroblasia", "Amphoroblastia", "Agrocybe", 
-  "Flammulaster", "Phaeocollybia", "Cortinarius", "Lepiota", "Cystoderma", 
-  "Armillaria", "Athelia", "Ceraceomyces", "Chlorociboria", "Clavariaceae", 
-  "Cystoderma", "Dacrymyces","Dendrographa","Dirina", "Flammulaster","Fomes","Gyrophora",         
-  "Kirschsteiniothelia", "Lasallia","Lepiota","Llimoniella","Mazosia","Microthelia","Mollisia",     
-  "Multiclavula","Phaeocollybia","Phellinus","Plectocarpon","Pleospora","Ramariopsis","Reinkella",
-  "Roccella","Roccellina","Sigridea","Stereum","Tremella","Tulostoma","Umbilicaria","Unguiculariopsis" ,
-  "Xanthoconium")
-```
-
-
-```{r, eval=F}
+```{r}
 #import and save splot names from DT table
 DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv", 
                             delim="\t", 
@@ -87,7 +69,25 @@ DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv",
                                 x_ = col_double()
                               )
                          ) 
-## Exclude fungi
+
+```
+Create list of species, and exclude fungi
+```{r}
+## fungi genera #NOT COMPLETE LIST
+mushroom <- c("Mycena", "Boletus", "Russula","Calocybe","Collybia","Amanita","Amanitopsis","Coprinus",
+  "Galerina","Geoglossum","Hebeloma","Hydnum","Lactarius","Leucocarpia","Naucoria","Otidea","Polyporus", 
+  "Sarcodom","Sarcoscyphus","Scleroderma","Stropharia","Tylopilus","Typhula", "Calyptella", "Chrysopsora", "Lacrymaria", "Dermoloma", 
+   "Agaricus","Alnicola", "Amanitina", "Bovista", "Cheilymenia","Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe", "Paxillus",
+  "Laccaria", "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius", "Panaeolus", "Psathyrella", "Psilocybe", 
+  "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria", "Agrocybe", 
+  "Flammulaster", "Phaeocollybia", "Cortinarius", "Lepiota", "Cystoderma", 
+  "Armillaria", "Athelia", "Ceraceomyces", "Chlorociboria", "Clavariaceae", 
+  "Cystoderma", "Dacrymyces","Dendrographa","Dirina", "Flammulaster","Fomes","Gyrophora",         
+  "Kirschsteiniothelia", "Lepiota","Llimoniella","Mazosia","Mollisia",     
+  "Multiclavula","Phaeocollybia","Phellinus","Plectocarpon","Pleospora","Ramariopsis","Reinkella",
+  "Roccella","Roccellina","Sigridea","Stereum","Tremella","Tulostoma","Unguiculariopsis" ,
+  "Xanthoconium")
+
 splot.species <- DT0 %>%
   rename(Species.original=`Turboveg2 concept`, Matched.concept=`Matched concept`) %>%
   filter(`Taxon group`!="Mushroom") %>%
@@ -99,10 +99,10 @@ splot.species <- DT0 %>%
   filter(fungi==F) %>%
   dplyr::select(Species.original, Matched.concept)
 
-  
 write_csv(splot.species, path = "../_derived/splot3.0.2.species.csv")
 ```
 
+
 !!! I used the column from TRY with the full species name, not the column with only two-word name strings
 
 ```{r, message=F}
@@ -330,11 +330,11 @@ tnrs.res <- tnrs.res0 %>%
                                 levels=c("Accepted","Synonym", "No opinion","Invalid",
                                          "Illegitimate","Misapplied","Rejected name"))) %>%
   #filter(Taxonomic_status %in% c("Accepted", "Synonym")) %>%
-  arrange(Name_number, 
+  arrange(Name_number,
+          desc(Genus_score), 
+          desc(Specific_epithet_score),
           desc(Infraspecific_epithet_2_score),
           desc(Infraspecific_epithet_score), 
-          desc(Specific_epithet_score),
-          desc(Genus_score), 
           desc(Family_score),
           desc(Name_score),
           desc(Overall_score), 
@@ -979,15 +979,16 @@ tnrs.res.iter2 <- tnrs.res.iter2.raw %>%
                                 levels=c("Accepted","Synonym", "No opinion",
                                          "Invalid","Illegitimate","Misapplied",
                                          "Rejected name"))) %>%
- arrange(Name_number, 
+ arrange(Name_number,
+          desc(Genus_score), 
+          desc(Specific_epithet_score),
           desc(Infraspecific_epithet_2_score),
           desc(Infraspecific_epithet_score), 
-          desc(Specific_epithet_score),
-          desc(Genus_score), 
           desc(Family_score),
           desc(Name_score),
           desc(Overall_score), 
-          Source) %>%
+          Source, 
+          Taxonomic_status) %>%
   group_by(Name_submitted) %>%
   slice(1)
 ```
@@ -1107,13 +1108,15 @@ tnrs.ncbi <- tnrs.res.iter3.raw %>%
                                 levels=c("Accepted","Synonym", "No opinion","Invalid",
                                          "Illegitimate","Misapplied","Rejected name"))) %>%
  arrange(Name_number, 
+          desc(Genus_score), 
+          desc(Specific_epithet_score),
           desc(Infraspecific_epithet_2_score),
           desc(Infraspecific_epithet_score), 
-          desc(Specific_epithet_score),
-          desc(Genus_score), 
           desc(Family_score),
           desc(Name_score),
-          desc(Overall_score)) %>%
+          desc(Overall_score), 
+          Source, 
+          Taxonomic_status) %>%
   group_by(Name_submitted) %>%
   slice(1)
 ```
@@ -1560,14 +1563,16 @@ Backbone <- Backbone %>%
 
 #Records with missing family info
 sum(is.na(Backbone$Family_correct))
+
+nrow(Backbone) == nrow(spec.list.TRY.sPlot)
 ```
 
 ### Delete records assigned to mushroom families, if any
 ```{r}
 mushroom.families <- c("Physalacriaceae", "Clavariaceae","Agaricaceae","Roccellaceae",
                        "Atheliaceae","Meruliaceae","Helotiaceae", "Dacrymycetaceae", "Boletaceae",
-                       "Cortinariaceae", "Polyporaceae",  "Umbilicariaceae" , "Pleosporaceae",
-                       "Leotiaceae","Dermateaceae", "Hymenochaetaceae","Stereaceae","Tremellaceae")
+                       "Cortinariaceae", "Polyporaceae",   "Pleosporaceae",
+                       "Leotiaceae","Dermateaceae","Hymenochaetaceae","Stereaceae","Tremellaceae")
 Backbone <- Backbone %>% 
   filter(!Genus_correct %in% mushroom) %>% 
   filter(!Family_correct %in% mushroom.families)
@@ -1625,12 +1630,14 @@ lichens <- c("Acarosporaceae" , "Parmeliaceae", "Physciaceae", "Lichinaceae",
              "Teloschistaceae","Candelariaceae","Rhizocarpaceae","Lecideaceae",
              "Icmadophilaceae","Cladoniaceae","Collemataceae","Pannariaceae" ,
              "Lobariaceae", "Ophioparmaceae" ,"Psoraceae","Stereocaulaceae",
-             "Massalongiaceae","Peltigeraceae","Nephromataceae")
+             "Massalongiaceae","Peltigeraceae","Nephromataceae", "Umbilicariaceae" )
 lichen.genera <- c("Amygdalaria", "Anamylospora", "Arthonia", "Pertusaria", "Pyrenula","Opegrapha", 
                    "Ochrolechia", "Graphis", "Micarea", "Porpidia", "Arthopyrenia", "Graphina", "Anisomeridium",
                    "Mycobilimbia","Peltula", "Thelotrema", "Arthothelium", "Diploschistes", "Strigula",
                    "Trichothelium", "Melaspilea", "Phaeographis", "Thelenella", "Chaenothecopsis","Fuscidea",
-                   "Dactylospora", "Gyalecta", "Myriotrema", "Placynthium")
+                   "Dactylospora", "Gyalecta", "Myriotrema", "Placynthium", "Umbilicaria", 
+                   "Lasallia", "Microthelia", "Lichenochora", "Roselliniopsis", "Homostegia", 
+                   "Verrucaria", "Leptorhaphis")
 mosses <- c("Pilotrichaceae", "Chonecoleaceae", "Hypopterygiaceae", "Scorpidiaceae",
             "Balantiopsaceae", "Mesoptychiaceae","Octoblepharaceae" ,"Takakiaceae")
 algae_diatoms <- c("Sargassaceae", "Chordaceae", "Cocconeidaceae", "Desmarestiaceae",
@@ -1648,7 +1655,7 @@ Backbone <- Backbone %>%
                                      values=T)) %>% 
   mutate(`Taxon group`="Unknown") %>% 
   mutate(`Taxon group`=ifelse((!is.na(is_vascular_species) & is_vascular_species==T), 
-                              "Vascular Plant", `Taxon group`)) %>% 
+                              "Vascular plant", `Taxon group`)) %>% 
   mutate(`Taxon group`=replace(`Taxon group`, 
                                list=Family_correct %in% lichens, 
                                values="Lichen")) %>% 
@@ -1737,7 +1744,8 @@ knitr::kable(Backbone %>%
 *`Taxon group`* -  Taxon group, as in Turboveg. 'Vascular plant', 'Moss' (include liverworts), 'Lichen', 'Algae', 'Unknown
 
 ```{r}
-save(Backbone, file="../_output/Backbone3.0.RData")
+save(Backbone, mushroom, mushroom.families, lichen.genera,
+     file="../_output/Backbone3.0.RData")
 ```
 
 ## Export species list to request in TRY
diff --git a/code/06_buildDT.Rmd b/code/06_buildDT.Rmd
new file mode 100644
index 0000000..26f642a
--- /dev/null
+++ b/code/06_buildDT.Rmd
@@ -0,0 +1,288 @@
+---
+title: "sPlot 3.0 - Build DT"
+author: "Francesco Maria Sabatini"
+date: "2/24/2020"
+output: html_document
+---
+
+
+<center>
+  ![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")
+</center>
+
+
+MEMO CHECK Field cover code! It seems to have species characters
+    
+      
+        
+**Timestamp:** `r date()`  
+**Drafted:** Francesco Maria Sabatini  
+**Revised:**  
+**version:** 1.0
+
+This report documents the construction of the DT table for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens.    
+
+
+```{r results="hide", message=F, warning=F}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(readr)
+library(xlsx)
+library(knitr)
+library(kableExtra)
+
+#save temporary files
+write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron'))
+write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron'))
+#rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")
+```
+
+
+Search and replace unclosed quotation marks and escape them. Run in Linux terminal 
+```{bash, eval=F}
+# escape all double quotation marks. Run in Linux terminal
+# sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv
+```
+
+## Import data Table
+DT table is the species x plot matrix, in long format.
+```{r }
+DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv", 
+                            delim="\t", 
+                         col_type = cols(
+                                PlotObservationID = col_double(),
+                                Taxonomy = col_character(),
+                                `Taxon group` = col_character(),
+                                `Taxon group ID` = col_double(),
+                                `Turboveg2 concept` = col_character(),
+                                `Matched concept` = col_character(),
+                                Match = col_double(),
+                                Layer = col_double(),
+                                `Cover %` = col_double(),
+                                `Cover code` = col_character(),
+                                x_ = col_double()
+                              )
+                         ) 
+nplots <- length(unique(DT0$PlotObservationID))
+nspecies <- length(unique(DT0$`Matched concept`))
+```
+Species data include `r nrow(DT0)` species * plot records, across `r nplots` plots and including `r nspecies` non-resolved species.  
+\newline
+
+
+```{r, echo=F}
+set.seed <- 1984
+sampled <- sample(unique(DT0$PlotObservationID), 10, replace=F)
+
+knitr::kable(DT0 %>%
+               filter(PlotObservationID %in% sampled[1:3]), 
+  caption="Example of initial DT table (3 randomly selected plots shown)") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+                  full_width = F, position = "center")
+```
+
+Import taxonomic backbone
+```{r}
+load("../_output/Backbone3.0.RData")
+```
+
+## Match species names from DT0 to those in Backbone
+```{r}
+DT1 <- DT0 %>% 
+  left_join(Backbone %>% 
+              dplyr::select(Name_sPlot_TRY, Name_short, `Taxon group`) %>%
+              rename(`Matched concept`=Name_sPlot_TRY,
+                     Taxongroup_BB=`Taxon group`), 
+            by="Matched concept")  
+```
+
+## Explore name matching based on Backbone v1.2
+Select species entries that changed after taxonomic standardization, as a way to check the backbone. 
+```{r}
+name.check <- DT1 %>% 
+  dplyr::select(`Turboveg2 concept`:`Matched concept`, Name_short) %>% 
+  rename(Name_TNRS=Name_short) %>% 
+  distinct() %>% 
+  mutate(Matched_short=word(`Matched concept`, start = 1L, end=2L)) %>% 
+  filter(is.na(Name_TNRS) | Matched_short != Name_TNRS) %>%
+  dplyr::select(-Matched_short) %>% 
+  arrange(Name_TNRS)
+```
+
+```{r, echo=F}
+knitr::kable(name.check %>% sample_n(30), 
+  caption="Check 30 random species names from DT after matching to backbone") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+                  full_width = F, position = "center")
+```
+
+Check the most common species names from DT after matching to backbone
+```{r}
+name.check.freq <- DT1 %>% 
+  dplyr::select(`Turboveg2 concept`:`Matched concept`, Name_short) %>% 
+  rename(Name_TNRS=Name_short) %>% 
+  group_by(`Turboveg2 concept`, `Matched concept`, Name_TNRS) %>% 
+  summarize(n=n()) %>% 
+  mutate(Matched_short=word(`Matched concept`, start = 1L, end=2L)) %>% 
+  filter(is.na(Name_TNRS) | Matched_short != Name_TNRS) %>%
+  dplyr::select(-Matched_short) %>% 
+  ungroup() %>% 
+  arrange(desc(n)) 
+```
+
+```{r, echo=F}
+knitr::kable(name.check.freq %>% slice(1:40), 
+  caption="Check 40 most common species names from DT after matching to backbone") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+                  full_width = F, position = "center")
+```
+
+## Complete field `taxon group` 
+
+Coalesce `Taxon group` info from Backbone
+```{r}
+table(DT1$`Taxon group`, exclude=NULL)
+
+DT1 <- DT1 %>% 
+  mutate(`Taxon group`=ifelse(`Taxon group`=="Unknown", NA, `Taxon group`)) %>% 
+  mutate(Taxongroup_BB=ifelse(Taxongroup_BB=="Unknown", NA, Taxongroup_BB)) %>% 
+  mutate(`Taxon group`=coalesce(`Taxon group`, Taxongroup_BB)) %>% 
+  dplyr::select(-Taxongroup_BB)
+
+
+table(DT1$`Taxon group`, exclude=NULL)
+```
+
+
+
+Cross-complement
+```{r}
+DT1 <- DT1 %>% 
+  left_join(DT1 %>% 
+              filter(!is.na(Name_short)) %>% 
+              filter(`Taxon group` != "Unknown") %>% 
+              dplyr::select(Name_short, `Taxon group`) %>% 
+              distinct(Name_short, .keep_all=T) %>% 
+              rename(TaxonGroup_compl=`Taxon group`),
+            by="Name_short") %>% 
+  mutate(`Taxon group`=coalesce(`Taxon group`, TaxonGroup_compl)) %>% 
+  dplyr::select(-TaxonGroup_compl)
+
+table(DT1$`Taxon group`, exclude=NULL)
+```
+
+Check species with conflicting `Taxon group` information and fix manually.
+```{r}
+#Attach genus info
+DT1 <- DT1 %>% 
+    left_join(Backbone %>% 
+              dplyr::select(Name_sPlot_TRY, Name_short) %>%
+              mutate(Genus=word(Name_short, 1, 1)) %>%
+              dplyr::select(-Name_short) %>% 
+              rename(`Matched concept`=Name_sPlot_TRY),
+            by="Matched concept") %>% 
+    mutate(`Taxon group`=fct_collapse(`Taxon group`, 
+                                    Alga_Stonewort=c("Alga", "Stonewort")))
+#manually fix some know problems
+mosses.gen <- c("Hypnum", "Brachytheciastrum", 
+           "Brachythecium","Hypnum",  "Zygodon", "Oxymitra", "Bryophyta", "Musci", '\\\"Moos\\\"')
+vascular.gen <- c("Polystichum", "Hypericum", "Peltaria", "Pancovia", "Calythrix", "Ripogonum",
+                  "Notogrammitis", "Fuscospora", "Lophozonia",  "Rostellularia", 
+                  "Hesperostipa", "Microsorium", 
+                  "Angiosperm","Dicotyledonae", "Spermatophy")
+alga.gen <- c("Chara", "Characeae", "Tonina", "Nostoc", "Entermorpha", "Hydrocoleum" )
+ 
+DT1 <- DT1 %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% mosses.gen, 
+                               values="Moss")) %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% vascular.gen, 
+                               values="Vascular plant")) %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% alga.gen, 
+                               values="Alga_Stonewort")) %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% c(lichen.genera, "Lichenes"),
+                               values="Lichen")) %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% mushroom, 
+                               values="Mushroom"))
+  
+table(DT1$`Taxon group`, exclude=NULL)
+```
+
+```{r, eval=F, echo=F}
+#check for conflicts in attribution of genera to Taxon groups
+conflict <- DT1 %>% 
+  filter(!is.na(Name_short)) %>% 
+  dplyr::select(Genus, `Taxon group`) %>% 
+  filter(!is.na(`Taxon group`)) %>% 
+  distinct() %>% 
+  group_by(Genus) %>% 
+  summarize(n=n()) %>% 
+  filter(n>1) %>% 
+  arrange(desc(n)) %>% 
+  pull(Genus)
+```
+
+
+Delete all records of fungi
+```{r}
+DT1f <- DT1 %>% 
+  dplyr::select(-Genus) %>% 
+  left_join(DT1 %>% 
+              distinct(`Matched concept`) %>% 
+              mutate(Genus=word(`Matched concept`, 1)), 
+            by="Matched concept") %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                                 list=Genus %in% mushroom, 
+                                 values = "Mushroom")) %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% lichen.genera, 
+                               values="Lichen")) %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% mosses.gen, 
+                               values="Moss")) %>% 
+  mutate(`Taxon group`=replace(`Taxon group`, 
+                               list=Genus %in% vascular.gen, 
+                               values="Vascular plant")) %>% 
+  mutate(`Taxon group` = fct_explicit_na(`Taxon group`, "Unknown")) %>% 
+  filter(`Taxon group`!="Mushroom")# %>% 
+  #dplyr::select(-Genus)
+
+table(DT1f$`Taxon group`, exclude=NULL)
+```
+
+Check the most frequent species for which we don't have taxon group info
+```{r, echo=F, eval=F}
+DT1f %>% 
+  filter(`Taxon group` == "Unknown") %>% 
+  group_by(Genus) %>% 
+  summarize(n=n()) %>% 
+  arrange(desc(n)) %>% 
+    slice(1:40)
+```
+
+Clean DT and export
+```{r}
+DT2 <- DT1 %>% 
+  dplyr::select(PlotObservationID, Name_short, `Turboveg2 concept`, `Taxon group`, Layer:x_ ) %>% 
+  rename(Species_original=`Turboveg2 concept`, 
+         Species_matched=Name_short,
+         Taxon_group=`Taxon group`, 
+         Cover_perc=`Cover %`, 
+         Cover_code=`Cover code`)
+```
+
+The output of the DT table contains `r nrow(DT2)` records, over `r length(unique(DT2$PlotObservationID))` plots. The total number of taxa is `r length(unique(DT2$Species_original))` and `r length(unique(DT2$Species_matched))`, before and after standardization, respectively. Information on the `Taxon group` is available for `r DT2 %>% filter(Taxon_group!="Unknown") %>% distinct(Species_matched) %>% nrow()` standardized species.
+
+```{r}
+save(DT2, file = "../_output/DT_sPlot3.0.RData")
+```
+
+***!!! ADD Explanation of fields!!!***
+
+
+
+!! It seems there are a few species in Name_sPlot_TRY which are duplicated!!! Check in Backbone
-- 
GitLab