From f51252471f91f02d71a9f34f6d5fb51c9bc5dbec Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Thu, 12 Mar 2020 16:26:33 +0100
Subject: [PATCH] Complement families in Backbone using TRY

---
 code/03_TaxonomicBackbone.Rmd | 56 ++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 14 deletions(-)

diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd
index 58a3076..26eb62f 100644
--- a/code/03_TaxonomicBackbone.Rmd
+++ b/code/03_TaxonomicBackbone.Rmd
@@ -74,7 +74,7 @@ DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv",
 ## Import lists of species classified into groups 
 These objects are defined in the appendix
 ```{r}
-load("../derived/taxa_manual.RData")
+load("../_derived/taxa_manual.RData")
 ```
 
 
@@ -916,6 +916,7 @@ save(tnrs.res.certain, tnrs.res.uncertain, file="../_derived/TNRS_submit/tnrs.it
 Many unmatched records do contain subspecies information which could not be retrieved in TNRS, although genus and species seem to be spelled correctly. Also, sometimes the mismatch derives from having the word 'species' or 'sp' at the end of the name. 
 ```{r}
 tnrs.submit.iter2 <- data.frame(old=tnrs.res.uncertain$Name_submitted) %>%
+  mutate(old=as.character(old)) %>% 
   mutate(new=old)
 
 # delete remaining records of mushroom species
@@ -1199,10 +1200,17 @@ After iteration 3, there are still `r nrow(tnrs.ncbi.uncertain)` unresolved taxa
 Generate names list from `tnrs.ncbi.uncertain` to be matched against `The Plant List`, using `Taxonstand::TPL`. Add to this list, also all those species that in the first iterations did not return an accepted name.  
 
 ```{r, eval = F}
-tpl.submit <- tnrs.res.certain %>% filter(is.na(Accepted_name)) %>% dplyr::select(Name_submitted) %>% 
-  bind_rows(tnrs.res.iter2.certain %>% filter(is.na(Accepted_name)) %>% dplyr::select(Name_submitted)) %>% 
-  bind_rows(tnrs.ncbi.certain %>% filter(is.na(Accepted_name)) %>% dplyr::select(Name_submitted)) %>% 
-  bind_rows(tnrs.ncbi.uncertain %>% dplyr::select(Name_submitted)) %>% 
+tpl.submit <- tnrs.res.certain %>% 
+  filter(is.na(Accepted_name)) %>% 
+  dplyr::select(Name_submitted) %>% 
+  bind_rows(tnrs.res.iter2.certain %>% 
+              filter(is.na(Accepted_name)) %>% 
+              dplyr::select(Name_submitted)) %>% 
+  bind_rows(tnrs.ncbi.certain %>% 
+              filter(is.na(Accepted_name)) %>% 
+              dplyr::select(Name_submitted)) %>% 
+  bind_rows(tnrs.ncbi.uncertain %>% 
+              dplyr::select(Name_submitted)) %>% 
   distinct()
 nrow(tpl.submit)
 write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
@@ -1213,8 +1221,8 @@ chunks <- split(indices, sort(indices%%99))
 
 library(doParallel)
 library(parallel)
-cl <- makeForkCluster(3, outfile="")
-registerDoParallel(3)
+cl <- makeForkCluster(5, outfile="")
+registerDoParallel(cl)
 
 tpl.ncbi <- foreach(i=1:length(chunks), .combine=rbind) %dopar% {
   tmp <- (TPL(tpl.submit$Name_submitted[chunks[[i]]]))
@@ -1370,7 +1378,6 @@ sum((is.na(Backbone$Family_correct)))
 ```
 ### Resolve genera with missing family info with `TNRS`
 ```{r, eval=F}
-
 Genera_submit <- Backbone %>% 
   filter(is.na(Family_correct))  %>% 
   dplyr::select(Genus_correct) %>% 
@@ -1413,8 +1420,31 @@ Backbone <- Backbone %>%
 #Records with missing family info
 sum(is.na(Backbone$Family_correct))
 ```
+### Complement with data from `TRY 5.0`
+Data from TRY were received from [Jens Kattge](mailto:jkattge@bgc-jena.mpg.de) on Jan 21, 2020.
+```{r, warning=F, message=F}
+# Genus -> Family lookup taken from the TRY hierarchy table (file read as latin1)
+try.species <- read_csv(
+  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/hierarchy.info.csv",
+  locale = locale(encoding = "latin1"))
+
+Backbone <- Backbone %>%
+  left_join(try.species %>%
+      dplyr::select(Genus_correct=Genus, family=Family) %>%
+      filter(!is.na(family), family != "") %>%
+      # keep one family per genus (first listed) so the join cannot add rows
+      distinct(Genus_correct, .keep_all=TRUE),
+    by="Genus_correct") %>%
+  # fill gaps only: an existing Family_correct is never overwritten
+  mutate(Family_correct=coalesce(Family_correct, family)) %>%
+  dplyr::select(-family)
+# Remaining records with missing family info
+sum(is.na(Backbone$Family_correct))
+```
 
-### Complement with data from `The Catalogue of Life`.
+
+
+### Complement with data from `The Catalogue of Life`
 ```{r, eval=F}
 #Download data from Catalogue of Life - 2019
 download.file("http://www.catalogueoflife.org/DCA_Export/zip/archive-kingdom-plantae-bl3.zip",
@@ -1459,9 +1489,7 @@ Backbone <- Backbone %>%
       filter(genus %in% Genera_missing$Genus_correct) %>% 
       rename(Genus_correct=genus), 
     by="Genus_correct") %>% 
-  mutate(Family_correct=ifelse(is.na(Family_correct) & !is.na(family), 
-                                family, 
-                                Family_correct)) %>%
+  mutate(Family_correct=coalesce(Family_correct, family)) %>%
   dplyr::select(-family)
 
 #Records with missing family info
@@ -1652,7 +1680,7 @@ table(Backbone$is_vascular_species, exclude=NULL)
 ```{r echo=F}
 knitr::kable(Backbone %>% 
                sample_n(20), 
-  caption="Example of Backbone (only 20 randomly selected taxa shown") %>%
+  caption="Example of Backbone (only 20 randomly selected taxa shown)") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                   latex_options = "basic", 
                   full_width = F, position = "center")
@@ -1987,7 +2015,7 @@ algae_diatoms <- c("Sargassaceae", "Chordaceae", "Cocconeidaceae", "Desmarestiac
                    #diatoms below
                    "Thalassiosiraceae", "Cymbellaceae", "Naviculaceae","Bacillariaceae")
 
-save(mushroom, vascular, lichens, lichen.genera, mosses, algae_diatoms, file="../derived/taxa_manual.RData")
+save(mushroom, vascular, lichens, lichen.genera, mosses, algae_diatoms, file="../_derived/taxa_manual.RData")
 
 ```
 
-- 
GitLab