From d46a7381a3bf615929ff335e704de1800697e6c2 Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Tue, 23 Jul 2019 23:04:02 +0200
Subject: [PATCH] Created 03 Taxonomic Backbone

---
 code/01_Extract_elevation.Rmd |  14 +-
 code/02_Compile_dataset.Rmd   |   7 +-
 code/03_TaxonomicBackbone.Rmd | 275 ++++++++++++++++++++++++++++++++++
 3 files changed, 291 insertions(+), 5 deletions(-)
 create mode 100644 code/03_TaxonomicBackbone.Rmd

diff --git a/code/01_Extract_elevation.Rmd b/code/01_Extract_elevation.Rmd
index f00d64e..09ebec9 100644
--- a/code/01_Extract_elevation.Rmd
+++ b/code/01_Extract_elevation.Rmd
@@ -17,11 +17,12 @@ output: html_document
         
 **Timestamp:** `r date()`  
 **Drafted:** Francesco Maria Sabatini  
-**Version:** 1.1
+**Version:** 1.2
 
 *Changes to v1.2* - based on dataset sPlot_3.0.1, received on 29/06/2019 from SH. Calculates elevation also for plots without location uncertainty (arbitrarily set to 100 m).
-  
+
 This report describes how elevation data was matched to plot locations. 
+
 ```{r results="hide", message=F, warning=F}
 knitr::opts_chunk$set(echo = TRUE)
 library(reshape2)
@@ -144,8 +145,13 @@ clusterEvalQ(cl, {
       }
     }
   }
-
-foreach(i = 1:nlevels(header.sp$tilenam)) %do% {
+#create list of tiles for which dem could not be extracted
+myfiles <- list.files("../_derived/elevatr/")
+done <- as.numeric(unlist(regmatches(myfiles, gregexpr("[[:digit:]]+", myfiles))))
+todo <- 1:nlevels(header.sp$tilenam)
+todo <- todo[-which(todo %in% done)]
+#foreach(i = 1:nlevels(header.sp$tilenam)) %do% {
+foreach(i = todo) %do% {  
   #create sp and project data
   if(nrow(header.sp %>% filter(tilenam %in% levels(header.sp$tilenam)[i])) ==0 ) next()
   sp.tile <- SpatialPointsDataFrame(coords=header.sp %>% 
diff --git a/code/02_Compile_dataset.Rmd b/code/02_Compile_dataset.Rmd
index e039a6b..133b3b9 100644
--- a/code/02_Compile_dataset.Rmd
+++ b/code/02_Compile_dataset.Rmd
@@ -19,7 +19,12 @@ knitr::opts_chunk$set(echo = TRUE)
 7) Add GIVD codes  
 
 
-
+plots to delete from: (GIVD / NA-CA-004) (see email from Laura Boisver)
+Fabot01
+Fadum01, 02 et 03
+Faers01
+Pfe-f-08
+Pfe-o-05
 
 
 
diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd
new file mode 100644
index 0000000..ed3ea08
--- /dev/null
+++ b/code/03_TaxonomicBackbone.Rmd
@@ -0,0 +1,275 @@
+---
+title: "Taxonomic Backbone - sPlot 3.0"
+author: "Francesco Maria Sabatini"
+date: "`r format(Sys.time(), '%d %B, %Y')`"
+tags: 
+- database
+- big data
+- traits
+- taxonomy
+output: 
+  html_document:
+    number_sections: true
+    toc: true
+    toc_depth: 2
+abstract: "This document describes the workflow (with contributions from Oliver Purschke, Jürgen Dengler and Florian Jansen) that was used to generate the taxonomic backbone that standardizes taxon names across the (i) global vegetation plot database sPlot version 3.0 and (ii) the global plant trait data base TRY version 5."
+urlcolor: blue
+---
+
+
+<center>
+  ![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")
+</center>
+
+
+***
+
+**Timestamp:** `r date()`  
+**Drafted:** Francesco Maria Sabatini  
+**Revised:** 
+**Version:** 1.0
+
+***
+
+# Load packages 
+
+```{r results="hide", message=F, warning=F}
+library(reshape2)
+library(tidyverse)
+library(readr)
+library(data.table)
+library(knitr)
+library(kableExtra)
+library(stringr)
+library(taxize)
+library(Taxonstand)
+library(vegdata)
+```
+
+## Read in taxon names from [sPlot](https://www.idiv.de/sdiv/working_groups/wg_pool/splot/splot_database.html) and [TRY](https://www.try-db.org/TryWeb/Home.php)
+
+```{r, echo=F, eval=F}
+#import and save splot names from DT table
+DT0 <- readr::read_delim("../sPlot_data_export/sPlot 3.0.1_species.csv", 
+                            delim="\t", 
+                         col_type = cols(
+                                PlotObservationID = col_double(),
+                                Taxonomy = col_character(),
+                                `Taxon group` = col_character(),
+                                `Taxon group ID` = col_double(),
+                                `Turboveg2 concept` = col_character(),
+                                `Matched concept` = col_character(),
+                                Match = col_double(),
+                                Layer = col_double(),
+                                `Cover %` = col_double(),
+                                `Cover code` = col_character(),
+                                x_ = col_double()
+                              )
+                         ) 
+splot.species <- DT0 %>%
+  rename(Species.original=`Turboveg2 concept`, Matched.concept=`Matched concept`) %>%
+  dplyr::select(Species.original, Matched.concept) %>%
+  distinct()
+write_csv(splot.species, path = "../_derived/splot3.0.1.species.csv")
+```
+
+!!! Should I use the column from TRY using the full species name, or the column with only the two words from name strings?
+
+```{r, message=F}
+splot.species <- read_csv("../_derived/splot3.0.1.species.csv")
+
+try.species <- readr::read_csv("../_input/AccSpecies_TRY5.csv", col_names = F, locale = locale(encoding = 'Latin1')) %>%
+  dplyr::select(-X6, -X7) %>%
+  rename(try.ID=X1, FullSpecies=X2, Species=X3, Genus=X4, Family=X5, GrowthForm=X8)
+# Sneak in  species from the Alpine database (Borja & Riccardo), as a courtesy to Project #18
+
+alpine.species <- read_delim("../_input/new_alpine_species.txt", col_names = F, delim = "\t", locale = locale(encoding = 'Latin1')) %>% 
+  rename(Species=X1)
+```
+Use the `Matched.concept` column, as it already contains some standardization by [Stephan Hennekkens](https://www.wur.nl/de/Personen/drs.-SM-Stephan-Hennekens.htm) according to [synbiosys](https://www.synbiosys.alterra.nl/turboveg/contact.htm).  
+  
+sPlot 3.0.1 contains `r nrow(unique(splot.species[,2]))` different species names.  
+TRY 5. contains `r nrow(try.species)`.  
+I add to this a list of `r nrow(alpine.species)` alpine species delivered from Riccardo Testolin, within sPlot Project #18.
+
+## Combine species lists
+
+```{r, message=F}
+spec.list.TRY.sPlot <- splot.species %>%
+  dplyr::select(Matched.concept) %>%
+  rename(Species=Matched.concept) %>%
+  mutate(Source="S") %>%
+  bind_rows(try.species %>% 
+              dplyr::select(Species) %>%
+              mutate(Source="T")) %>%
+  bind_rows(alpine.species %>% 
+              mutate(Source="A")) %>%
+  reshape2::dcast(Species ~ Source) %>%
+  mutate(A=ifelse(A>=1, "A", "")) %>%
+  mutate(S=ifelse(S>=1, "S", "")) %>%
+  mutate(T=ifelse(T>=1, "T", "")) %>%
+  mutate(Source=paste(S, T, A, sep="")) %>%
+  dplyr::select(-A, -S, -T)
+            
+length(spec.list.TRY.sPlot)
+table(spec.list.TRY.sPlot$Source) #Number of species unique and in common across databases
+```
+
+# A-priori cleaning of names
+## Manual cleaning
+When constructing the backbone for sPlot 2.1, a list of 4,093 "weird" species names (consisting mainly of trivial names) was generated, and corrected manually by [Jürgen Dengler](https://www.uni-bayreuth.de/de/forschung/profilfelder/advanced-fields/oekologie-und-umweltwissenschaften/mitwirkende/Dengler/index.php) (thereafter JD).  
+**This step is not performed anymore**
+
+
+## String manipulation routines
+Stripping unwanted characters as well as abbreviation (such as hybrid markers) which would prevent name matching:
+
+```{r, eval = F}
+spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
+  mutate(OriginalNames=Species) %>%
+  dplyr::select(OriginalNames, Species, Source) %>%
+  mutate(Species=gsub('*', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('cf. ', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('Cf. ', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('[', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub(']', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub(' x ', ' ', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('Ã—', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('aff ', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('(', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub(')', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub(' cf ', ' ', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub(' aff. ', ' ', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('c‚e', 'ceae', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('    ', ' ', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('   ', ' ', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('  ', ' ', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('x-', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('X-', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('×', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('like ', '', Species, fixed=TRUE)) %>% 
+  mutate(Species=gsub(',', '', Species, fixed=TRUE))
+         
+         
+```
+
+For all names, that have a number in their first word, and consist of $>$ 1 words, remove that word:
+
+
+```{r, eval = F}
+spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>% 
+  mutate(firstWordWithNumbers=grepl('[0-9]', word(Species, 1))) %>%
+  mutate(numberOfWords= sapply(gregexpr("\\W+", Species), length) + 1) %>%
+  mutate(Species=ifelse((firstWordWithNumbers & numberOfWords > 1), 
+                        sapply(Species, function(x) substr(x, start=regexpr(pattern =' ', text=x)+1, stop=nchar(x))), 
+                        Species))
+
+```
+Correct some name abbreviations using `taxname.abbr` in `vegdata`:
+
+
+```{r, eval = F}
+spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>% 
+  mutate(testout=taxname.abbr(spec.list.TRY.sPlot$Species))
+
+write.csv(spec.list.TRY.sPlot$Species,  file = "../_derived/TNRS_submit/tnrs_submit_all.csv")
+
+seq1 <- c(seq(from =1, to = nrow(spec.list.TRY.sPlot), 50000), nrow(spec.list.TRY.sPlot)+1)
+for(i in 2:length(seq1)){
+  write.csv(spec.list.TRY.sPlot$Species[seq1[i-1]:(seq1[i]-1)],
+  file = paste(paste("../_derived/TNRS_submit/tnrs_submit", seq1[i], sep = "_"), "csv", sep = "."))
+}
+
+```
+
+
+# Match names against Taxonomic Name Resolution Service ([TNRS](http://tnrs.iplantcollaborative.org))
+
+## Slice `CleanedNames` into chunks
+... of 1000 species, and query both TPL and TNRS
+
+```{r, eval = F}
+
+
+for(i in 2:length(seq1)){
+  sel.species <- spec.list.TRY.sPlot$Species[seq1[i-1]:(seq1[i]-1)]
+  tpl.tmp <- TPL(sel.species)
+  if(!is.null(nrow(tpl.tmp))){
+    write_csv(tpl.tmp, path = paste("../_derived/TPL/tpl_out_", i, ".csv", sep=""))}
+  print(i)
+}
+
+for(i in 2:length(seq1)){
+  sel.species <- spec.list.TRY.sPlot$Species[seq1[i-1]:(seq1[i]-1)]
+  tnrs.tmp <- gnr_resolve(sel.species)
+  if(!is.null(nrow(tnrs.tmp))){
+    write_csv(tnrs.tmp, path = paste("../_derived/TNRS/tnrs_out_", i, ".csv", sep=""))}
+  print(i)
+}
+
+
+
+```
+
+Compile output from TNRS, and sort matches
+```{r, message=F}
+myfiles.tnrs <- list.files("../_derived/TNRS/", full.names = T)
+myfiles.tpl <- list.files("../_derived/TPL/", full.names = T)
+
+tpl.out <- read_csv(myfiles.tpl[1])
+for(i in 2:length(myfiles.tpl)) {
+  tpl.out <- tpl.out %>%
+    bind_rows(read_csv(myfiles.tpl[i]))
+  print(i)
+}
+
+tnrs.out <- read_csv(myfiles.tnrs[1])
+for(i in 2:length(myfiles.tnrs)) {
+  tnrs.out <- tnrs.out %>%
+    bind_rows(read_csv(myfiles.tnrs[i]))
+  print(i)
+}
+
+
+
+```
+
+
+
+
+
+## TNRS settings {#ID}
+The following settings were used for resolving names on TNRS.
+
+### Sources for name resolution {#ID}
+The initial TNRS name resolution run was based on the **five standard sources** that were **ranked according to preference** in the following order (default of TNRS):
+
+1. The Plant List ([TPL](http://www.theplantlist.org/))[@TPL2013]
+2. The Global Compositae Checklist ([GCC](http://compositae.landcareresearch.co.nz/))[@Flann2009]
+3. The International Legume Database and Information Service ([ILDIS](http://www.ildis.org/))[@ILDIS2006]
+4. [Tropicos](http://www.tropicos.org/) [@TROPICOS2013]
+5. PLANTS Database ([USDA](https://plants.usda.gov/java/))[@USDA2012]
+
+Because it is possible that the best match is found in lower ranked sources, see section [TNRS settings], two additional name resolution runs were realized in which the highest ranking was given to **(1) Tropicos**, or **(2)** the sixth source available in TNRS, **NCBI** (The National Center for Biotechnology Information's Taxonomy database; [@NCBI2003]), respectively, see section [TNRS settings].
+
+### Family Classification
+Resolved names were assigned to families based on the [APGIII classification](http://onlinelibrary.wiley.com/doi/10.1111/j.1095-8339.2009.00996.x/abstract) [@Chase2009], the same classification system used by Tropicos.
+
+## Retrieve results
+Once the  matching process was finished, results were retrieved from TNRS using the `Detailed Download` option that included the full name information (parsed components, warnings, links to sources, etc.).  We retrived the single best match for each species, constrained by source (TNRS default), where the name in the first source was selected as best match, unless there was `no suitable match found` in that source, the match from the next lower-ranked source was selected, until all resources where exhausted.
+
+# Manually inspecting name matching results {#ID}
+Manually inspect the TNRS-results table in a spreadsheat application (i.e. LibreOffice or Excel). Starting with the highest taxonomic rank considered (i.e. Family). For instance, if manual checking of the TRNS output reveals that all accepted names or synonyms that have accuracy scores >0.9 are correct taxon names, use the following selection procedure:
+
+* Name_matched_rank (==Family)
+* Taxonomic_status (==Accepted, Synomyn)
+* Family_score (>0.9)
+
+Continue this selection procedure for entries that were matched at lower taxonomic ranks, i.e. genus, species, etc..
+
+
+
+
+
+
+
-- 
GitLab