From 460c81150d46a6a871917a2b289bf9eb54ec5199 Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Wed, 4 Mar 2020 01:33:47 +0100
Subject: [PATCH] Brushed up Backbone, Created Taxon group. Manual clean
 families

---
 code/03_TaxonomicBackbone.Rmd | 261 ++++++++++++++++++++++++++--------
 1 file changed, 200 insertions(+), 61 deletions(-)

diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd
index 27e83ce..2611f12 100644
--- a/code/03_TaxonomicBackbone.Rmd
+++ b/code/03_TaxonomicBackbone.Rmd
@@ -58,8 +58,14 @@ mushroom <- c("Mycena", "Boletus", "Russula","Calocybe","Collybia","Amanita","Am
   "Sarcodom","Sarcoscyphus","Scleroderma","Stropharia","Tylopilus","Typhula", "Calyptella", "Chrysopsora", "Lacrymaria", "Dermoloma", 
    "Agaricus","Alnicola", "Amanitina", "Bovista", "Cheilymenia","Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe",
   "Laccaria", "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius", "Panaeolus", "Psathyrella", "Psilocybe", 
-  "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria", 
-  "Amphoroblasia", "Amphoroblastia", "Agrocybe")
+  "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria","Amphoroblasia", "Amphoroblastia", "Agrocybe", 
+  "Flammulaster", "Phaeocollybia", "Cortinarius", "Lepiota", "Cystoderma", 
+  "Armillaria", "Athelia", "Ceraceomyces", "Chlorociboria", "Clavariaceae", 
+  "Cystoderma", "Dacrymyces","Dendrographa","Dirina", "Flammulaster","Fomes","Gyrophora",         
+  "Kirschsteiniothelia", "Lasallia","Lepiota","Llimoniella","Mazosia","Microthelia","Mollisia",     
+  "Multiclavula","Phaeocollybia","Phellinus","Plectocarpon","Pleospora","Ramariopsis","Reinkella",
+  "Roccella","Roccellina","Sigridea","Stereum","Tremella","Tulostoma","Umbilicaria","Unguiculariopsis" ,
+  "Xanthoconium")
 ```
 
 
@@ -1200,7 +1206,7 @@ tpl.submit <- tnrs.res.certain %>% filter(is.na(Accepted_name)) %>% dplyr::selec
 nrow(tpl.submit)
 write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
 
-#dividve in 99 batches
+#divide in 99 batches
 indices <- 1:nrow(tpl.submit)
 chunks <- split(indices, sort(indices%%99))
 
@@ -1248,11 +1254,17 @@ save(tpl.ncbi.certain, tpl.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.it
 # Merge the resolved species lists into a Backbone
 ## Read files
 
-```{r, eval = T}
+```{r, eval = T, warning=F}
 load("../_derived/TNRS_submit/tnrs.iter1.RData")
 load("../_derived/TNRS_submit/tnrs.iter2.RData")
 load("../_derived/TNRS_submit/tnrs.iter3.RData")
 load("../_derived/TNRS_submit/tnrs.iter4.RData")
+
+# Double check of taxa that came out wrong from TNRS: the names listed in
+# `finalcheck` are passed to TPL() and the result is appended to the certain set
+finalcheck <- c("Salix repens subsp. repens var. repens", "Hieracium lachenalii")
+tpl.ncbi.certain <- bind_rows(tpl.ncbi.certain, TPL(finalcheck))
+
 ```
 
 Combine the `certain` data sets:
@@ -1300,11 +1312,11 @@ Backbone <- spec.list.TRY.sPlot %>%
                                              levels=c("Accepted","Synonym", "No opinion","Invalid",
                                                       "Illegitimate","Misapplied","Rejected name",
                                                       "Unresolved"))) %>%
+                arrange(Taxonomic_status) %>% 
+                slice(1) %>% 
                 #delete empty spaces at end of names
                 mutate(Accepted_name=gsub(pattern=" $", replacement="", x=Accepted_name)) %>% 
-                mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)) %>% 
-                arrange(Taxonomic_status) %>% 
-                slice(1),
+                mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)),
               by="Name_submitted")
 #Double check
 nrow(Backbone) == nrow(spec.list.TRY.sPlot)
@@ -1318,7 +1330,7 @@ If names were neither resolved at the accepted or synonym level, set `Status_cor
 ```{r, eval = T}
 Backbone <- Backbone %>%
   mutate(Status_correct=fct_collapse(Taxonomic_status, 
-                                     Other=c("No opinion","Invalid",
+                                     Other=c("No opinion","Invalid", "Unresolved", 
                                          "Illegitimate","Misapplied","Rejected name"))) %>% 
   mutate(Status_correct=fct_explicit_na(Status_correct, "No suitable matches found.")) %>% 
   #Create Name_correct field. Use Accepted names, if any. Otherwise matched names.
@@ -1338,13 +1350,12 @@ Backbone <- Backbone %>%
 summary(Backbone$Status_correct)
 summary(Backbone$Rank_correct)
 ```
+
 There are `r sum(is.na(Backbone$Name_correct))` species names for which we found no match in any of the taxonomic resources we used. Yet, for as many as `r sum(Backbone$Rank_correct %in% c("higher", "family", "genus"))` taxa, the matching did not properly resolve the species name, and we only found a match at genus or higher level.
 
 
 ## Complete list of families
-There are `r sum(is.na(Backbone$Accepted_name_family))` records with missing family information.
-### Derive info from other species of the same Genera in the Backbone itself
-Copy family info for taxa resolved at family level. 
+There are `r sum(is.na(Backbone$Accepted_name_family))` records with missing family information. Create field `Family_correct`.
 ```{r}
 Backbone <- Backbone %>% 
   mutate(family.lev=str_extract(word(Name_correct,1), pattern='([^\\s]+aceae)')) %>%
@@ -1356,38 +1367,6 @@ Backbone <- Backbone %>%
 # Remaining records with missing family info
 sum((is.na(Backbone$Family_correct)))
 ```
-
-Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info.
-```{r}
-genera_families <- Backbone %>% 
-  filter(Taxonomic_status=="Accepted") %>% 
-  dplyr::select(Genus_correct, Family_correct) %>% 
-  rename(family=Family_correct) %>% 
-  distinct() %>% 
-  na.omit() %>% 
-  #for some genera there are multiple families assigned 
-  # (e.g. in case of unresolved species names )
-  # Extract the family names that occurs most often across each genus
-  group_by(Genus_correct, family) %>% 
-  summarize(n=n()) %>% 
-  arrange(desc(n)) %>% 
-  slice(1) %>% 
-  ungroup() %>% 
-  dplyr::select(-n)
-
-# Assign family derived from backbone to other records
-Backbone <- Backbone %>% 
-  left_join(genera_families, by="Genus_correct") %>% 
-  mutate(Family_correct=ifelse( (is.na(Family_correct) & !is.na(family)),
-                                       family, 
-                                       Family_correct)) %>% 
-  dplyr::select(-family)
-
-#Records with missing family info
-sum(is.na(Backbone$Family_correct))
-```
-
-
 ### Resolve genera with missing family info with `TNRS`
 ```{r, eval=F}
 
@@ -1443,11 +1422,27 @@ unzip("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/CatLife2019.
 
 ```
 
-
-
 ```{r, message=F, warning=F}
 cat.life <- read_delim("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/taxa.txt", 
-                       delim="\t")
+                       delim="\t", 
+                       col_types =  cols(
+                          .default = col_character(),
+                          taxonID = col_double(),
+                          datasetID = col_double(),
+                          acceptedNameUsageID = col_double(),
+                          parentNameUsageID = col_double(),
+                          superfamily = col_logical(),
+                          subgenus = col_logical(),
+                          source = col_logical(),
+                          namePublishedIn = col_logical(),
+                          modified = col_character(),
+                          taxonConceptID = col_double(),
+                          isExtinct = col_logical()
+                        )) %>% 
+  #correct family names to match to the standards in TPL
+  mutate(family=ifelse(family=="Fabaceae", "Leguminosae", family)) %>% 
+  mutate(family=ifelse(family=="Asteraceae", "Compositae", family))
+
 Genera_missing <- Backbone %>%
   filter(is.na(Family_correct) & !is.na(Genus_correct)) %>%
   dplyr::select(Genus_correct) %>% 
@@ -1471,39 +1466,116 @@ sum(is.na(Backbone$Family_correct))
 ```
 After matching the remaining genera with the Catalogue of life there are still `r nrow(Backbone %>% filter(is.na(Family_correct)))` records without Family affiliation, for a total of `r nrow(Backbone %>% filter(is.na(Family_correct)) %>% dplyr::select(Genus_correct) %>% distinct())` genera.  
   
-### Manually fix residual, known issues
+### Manually fix some known issues
 ```{r}
 Backbone <- Backbone %>%
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Coptidium",
+                                list=Genus_correct=="Coptidium",
                                 values="Ranunculaceae")) %>% 
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Balanocarpus",
+                                list=Genus_correct=="Balanocarpus",
                                 values="Dipterocarpaceae" )) %>% 
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Cardaminopsis",
+                                list=Genus_correct=="Cardaminopsis",
                                 values="Brassicaceae" )) %>% 
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Carpolepis",
+                                list=Genus_correct=="Carpolepis",
                                 values="Myrtaceae" )) %>% 
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Cathartolinum",
+                                list=Genus_correct=="Cathartolinum",
                                 values="Linaceae" )) %>% 
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Didiscus",
+                                list=Genus_correct=="Didiscus",
                                 values="Araliaceae" )) %>% 
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Grammadenia",
+                                list=Genus_correct=="Grammadenia",
                                 values="Primulaceae" )) %>% 
   mutate(Family_correct=replace(Family_correct, 
-                                list=word(Accepted_name_species, 1)=="Antholoma",
-                                values="Elaeocarpaceae" )) 
+                                list=Genus_correct=="Antholoma",
+                                values="Elaeocarpaceae" )) %>% 
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Odontarrhena",
+                                values="Brassicaceae" )) %>% 
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Trichinium",
+                                values="Amaranthaceae" )) %>% 
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Limonium",
+                                values="Plumbaginaceae" )) %>% 
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Eunanus",
+                                values="Phrymaceae" )) %>% 
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Sunaptea",
+                                values="Dipterocarpaceae" )) %>% 
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Aconogonon",
+                                values="Polygonaceae" )) %>%   
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Cajophora",
+                                    values="Loasaceae" )) %>%   
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Calobota",
+                                values="Leguminosae" )) %>% 
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Alsine",
+                                values="Caryophyllaceae" )) %>%   
+    mutate(Family_correct=replace(Family_correct, 
+                                list=Genus_correct=="Cyanococcus",
+                                values="Ericaceae" ))  %>% 
+    mutate(Family_correct=ifelse(Family_correct %in% c("Papilionaceae", "Fabaceae"), 
+                               "Leguminosae", Family_correct)) %>% 
+    mutate(Family_correct=ifelse(Family_correct=="Asteraceae", "Compositae", Family_correct)) %>% 
+    mutate(Family_correct=ifelse(Family_correct=="Unknown", NA, Family_correct))
   
 #Records with missing family info
 sum(is.na(Backbone$Family_correct))
 ```
+### Derive info from other species of the same Genera in the Backbone itself
+Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info.
+```{r}
+genera_families <- Backbone %>% 
+  filter(Taxonomic_status == "Accepted") %>% 
+  dplyr::select(Genus_correct, Family_correct) %>% 
+  rename(family = Family_correct) %>% 
+  na.omit() %>% 
+  # Some genera have several families assigned (e.g. unresolved species names):
+  # keep the family that occurs most often within each genus. Records are not
+  # de-duplicated before counting, otherwise every count would be 1 and the
+  # choice among candidate families would be arbitrary.
+  group_by(Genus_correct, family) %>% 
+  summarize(n = n()) %>% 
+  arrange(desc(n)) %>% 
+  slice(1) %>% 
+  ungroup() %>% 
+  dplyr::select(-n)
 
-### Create field `Name_short`
+# Complete Family_correct with the genus-level family from the lookup table.
+# An existing family is never overwritten; the helper column is dropped again.
+Backbone <- left_join(Backbone, genera_families, by = "Genus_correct") %>% 
+  mutate(Family_correct = ifelse(!is.na(Family_correct) | is.na(family),
+                                 Family_correct,
+                                 family)) %>% 
+  dplyr::select(-family)
+
+#Records with missing family info
+sum(is.na(Backbone$Family_correct))
+```
+
+### Delete records assigned to mushroom families, if any
+```{r}
+mushroom.families <- c(
+  "Physalacriaceae", "Clavariaceae", "Agaricaceae", "Roccellaceae", "Atheliaceae",
+  "Meruliaceae", "Helotiaceae", "Dacrymycetaceae", "Boletaceae", "Cortinariaceae",
+  "Polyporaceae", "Umbilicariaceae", "Pleosporaceae", "Leotiaceae", "Dermateaceae",
+  "Hymenochaetaceae", "Stereaceae", "Tremellaceae")
+# Keep only records whose genus and family are both absent from the mushroom lists
+Backbone <- filter(Backbone, !(Genus_correct %in% mushroom | Family_correct %in% mushroom.families))
+```
+
+
+
+## Create field `Name_short`
 Shorten names that have more than two words and where the second word is a x. If there is no species name available, fill in with either genus or family info
 ```{r}
 Backbone <- Backbone %>% 
@@ -1523,8 +1595,8 @@ sum(is.na(Backbone$Name_correct))
 ```
 
 
-## Create Field `is_vascular_plant`
-Assign all families that belong to `Tracheophyta` to category `is_vascular_species`, based on `The Catalogue of Life`
+## Create fields `is_vascular_species` and `Taxon group`
+Attach phylum information from `The Catalogue of Life`.
 ```{r}
 Backbone <- Backbone %>% 
   left_join(cat.life %>% 
@@ -1532,11 +1604,77 @@ Backbone <- Backbone %>%
               distinct() %>% 
               na.omit() %>% 
               rename(Family_correct=family), 
-            by="Family_correct") %>% 
-  mutate(is_vascular_species=ifelse(phylum=="Tracheophyta", T, F))
+            by="Family_correct") 
+```
+Create fields `is_vascular_species` and `Taxon group` based on a list of manually classified families, and on phyla from `The Catalogue of Life`.  
+Assign all families that belong to `Tracheophyta` to category `is_vascular_species`, based on `The Catalogue of Life`.
+```{r}
+vascular <- c("Leguminosae" ,  "Alliaceae", "Bombacaceae" ,"Taxodiaceae",
+              "Aceraceae", "Centrolepidaceae","Callitrichaceae" ,"Flacourtiaceae",
+              "Compositae", "Asclepiadaceae", "Papilionaceae","Tiliaceae",
+              "Mimosaceae" , "Xanthorrhoeaceae","Arthropteridaceae",
+              "Valerianaceae",  "Grammitidaceae" ,"Anarthriaceae", "Caesalpiniaceae",
+              "Chenopodiaceae", "Corylaceae", "Diervillaceae", "Dipsacaceae","Guttiferae",
+              "Haptanthaceae", "Hymenophyllopsidaceae", "Isoëtaceae","Labiatae",
+              "Lactoridaceae","Lemnaceae","Selaginaceae","Sterculiaceae","Myoporaceae",
+              "Myrsinaceae" ,"Pyrolaceae", "Rhoipteleaceae" ,"Xanthoceraceae")
+lichens <- c("Acarosporaceae" , "Parmeliaceae", "Physciaceae", "Lichinaceae",
+             "Caliciaceae", "Lecanoraceae", "Venturiaceae" ,"Sphaerophoraceae" ,
+             "Verrucariaceae", "Tricholomataceae","Baeomycetaceae",
+             "Catillariaceae" ,"Megasporaceae","Ramalinaceae","Pilocarpaceae" ,
+             "Teloschistaceae","Candelariaceae","Rhizocarpaceae","Lecideaceae",
+             "Icmadophilaceae","Cladoniaceae","Collemataceae","Pannariaceae" ,
+             "Lobariaceae", "Ophioparmaceae" ,"Psoraceae","Stereocaulaceae",
+             "Massalongiaceae","Peltigeraceae","Nephromataceae")
+lichen.genera <- c("Amygdalaria", "Anamylospora", "Arthonia", "Pertusaria", "Pyrenula","Opegrapha", 
+                   "Ochrolechia", "Graphis", "Micarea", "Porpidia", "Arthopyrenia", "Graphina", "Anisomeridium",
+                   "Mycobilimbia","Peltula", "Thelotrema", "Arthothelium", "Diploschistes", "Strigula",
+                   "Trichothelium", "Melaspilea", "Phaeographis", "Thelenella", "Chaenothecopsis","Fuscidea",
+                   "Dactylospora", "Gyalecta", "Myriotrema", "Placynthium")
+mosses <- c("Pilotrichaceae", "Chonecoleaceae", "Hypopterygiaceae", "Scorpidiaceae",
+            "Balantiopsaceae", "Mesoptychiaceae","Octoblepharaceae" ,"Takakiaceae")
+algae_diatoms <- c("Sargassaceae", "Chordaceae", "Cocconeidaceae", "Desmarestiaceae",
+                   "Chordariaceae", "Dinobryaceae", "Diploneidaceae", "Ectocarpaceae",
+                   "Fragilariaceae","Sphacelariaceae","Vaucheriaceae" ,
+                   "Amphipleuraceae",  "Fucaceae", "Gomphonemataceae", "Melosiraceae",
+                   "Laminariaceae","Acinetosporaceae" ,"Botryochloridaceae",
+                   #diatoms below
+                   "Thalassiosiraceae", "Cymbellaceae", "Naviculaceae","Bacillariaceae")
 
+# Flag vascular plants and assign `Taxon group`. Steps run in order, so a later
+# assignment (Lichen, then Alga, then Moss) overwrites an earlier one.
+Backbone <- Backbone %>% 
+  mutate(is_vascular_species = phylum == "Tracheophyta") %>% 
+  mutate(is_vascular_species = replace(is_vascular_species,
+                                       list = Family_correct %in% vascular, 
+                                       values = TRUE)) %>% 
+  mutate(`Taxon group` = "Unknown") %>% 
+  mutate(`Taxon group` = ifelse(!is.na(is_vascular_species) & is_vascular_species, 
+                                "Vascular Plant", `Taxon group`)) %>% 
+  mutate(`Taxon group` = replace(`Taxon group`, 
+                                 list = Family_correct %in% lichens | Genus_correct %in% lichen.genera, 
+                                 values = "Lichen")) %>% 
+  mutate(`Taxon group` = replace(`Taxon group`, 
+                                 list = Family_correct %in% algae_diatoms, 
+                                 values = "Alga")) %>% 
+  mutate(`Taxon group` = replace(`Taxon group`, 
+                                 list = phylum %in% c("Glaucophyta", "Rhodophyta", "Charophyta", "Chlorophyta"), 
+                                 values = "Alga")) %>% 
+  mutate(`Taxon group` = replace(`Taxon group`, 
+                                 list = Family_correct %in% mosses, 
+                                 values = "Moss")) %>% 
+  # "Moss" covers mosses, liverworts (Marchantiophyta) and hornworts
+  mutate(`Taxon group` = replace(`Taxon group`, 
+                                 list = phylum %in% c("Bryophyta", "Marchantiophyta", "Anthocerotophyta"), 
+                                 values = "Moss")) %>% 
+  mutate(is_vascular_species = ifelse(`Taxon group` %in% c("Moss", "Alga", "Lichen"),
+                                      FALSE, is_vascular_species))
+table(Backbone$`Taxon group`, exclude=NULL)
 table(Backbone$is_vascular_species, exclude=NULL)
 ```
+
+
+
 ## Export Backbone
 ```{r echo=F}
 knitr::kable(Backbone %>% 
@@ -1596,6 +1734,7 @@ knitr::kable(Backbone %>%
 *Name_short* - First two words of `Name_correct`  
 *phylum* - As derived from `The Catalogue of Life`  
 *is_vascular_species* -  As derived based on selection of `phylum` from `The Catalogue of Life`  
+*`Taxon group`* -  Taxon group, as in Turboveg: 'Vascular Plant', 'Moss' (includes liverworts), 'Lichen', 'Alga', 'Unknown'
 
 ```{r}
 save(Backbone, file="../_output/Backbone3.0.RData")
-- 
GitLab