diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd index 27e83ce7ce871c8e650af4cf457555ddeb51a1c5..2611f129af473a02a7b33287e38d77d18e424e8e 100644 --- a/code/03_TaxonomicBackbone.Rmd +++ b/code/03_TaxonomicBackbone.Rmd @@ -58,8 +58,14 @@ mushroom <- c("Mycena", "Boletus", "Russula","Calocybe","Collybia","Amanita","Am "Sarcodom","Sarcoscyphus","Scleroderma","Stropharia","Tylopilus","Typhula", "Calyptella", "Chrysopsora", "Lacrymaria", "Dermoloma", "Agaricus","Alnicola", "Amanitina", "Bovista", "Cheilymenia","Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe", "Laccaria", "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius", "Panaeolus", "Psathyrella", "Psilocybe", - "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria", - "Amphoroblasia", "Amphoroblastia", "Agrocybe") + "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria","Amphoroblasia", "Amphoroblastia", "Agrocybe", + "Flammulaster", "Phaeocollybia", "Cortinarius", "Lepiota", "Cystoderma", + "Armillaria", "Athelia", "Ceraceomyces", "Chlorociboria", "Clavariaceae", + "Cystoderma", "Dacrymyces","Dendrographa","Dirina", "Flammulaster","Fomes","Gyrophora", + "Kirschsteiniothelia", "Lasallia","Lepiota","Llimoniella","Mazosia","Microthelia","Mollisia", + "Multiclavula","Phaeocollybia","Phellinus","Plectocarpon","Pleospora","Ramariopsis","Reinkella", + "Roccella","Roccellina","Sigridea","Stereum","Tremella","Tulostoma","Umbilicaria","Unguiculariopsis" , + "Xanthoconium") ``` @@ -1200,7 +1206,7 @@ tpl.submit <- tnrs.res.certain %>% filter(is.na(Accepted_name)) %>% dplyr::selec nrow(tpl.submit) write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv") -#dividve in 99 batches +#divide in 99 batches indices <- 1:nrow(tpl.submit) chunks <- split(indices, sort(indices%%99)) @@ -1248,11 +1254,17 @@ save(tpl.ncbi.certain, tpl.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.it # Merge the resolved species lists into a Backbone ## Read files -```{r, eval = T} +```{r, eval = T, 
warning=F} load("../_derived/TNRS_submit/tnrs.iter1.RData") load("../_derived/TNRS_submit/tnrs.iter2.RData") load("../_derived/TNRS_submit/tnrs.iter3.RData") load("../_derived/TNRS_submit/tnrs.iter4.RData") + +#Double check of wrong taxa from TNRS +finalcheck <- c("Salix repens subsp. repens var. repens","Hieracium lachenalii") +tpl.ncbi.certain <- tpl.ncbi.certain %>% + bind_rows(TPL(finalcheck)) + ``` Combine the `certain` data sets: @@ -1300,11 +1312,11 @@ Backbone <- spec.list.TRY.sPlot %>% levels=c("Accepted","Synonym", "No opinion","Invalid", "Illegitimate","Misapplied","Rejected name", "Unresolved"))) %>% + arrange(Taxonomic_status) %>% + slice(1) %>% #delete empty spaces at end of names mutate(Accepted_name=gsub(pattern=" $", replacement="", x=Accepted_name)) %>% - mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)) %>% - arrange(Taxonomic_status) %>% - slice(1), + mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)), by="Name_submitted") #Double check nrow(Backbone) == nrow(spec.list.TRY.sPlot) @@ -1318,7 +1330,7 @@ If names were neither resolved at the accepted or synonym level, set `Status_cor ```{r, eval = T} Backbone <- Backbone %>% mutate(Status_correct=fct_collapse(Taxonomic_status, - Other=c("No opinion","Invalid", + Other=c("No opinion","Invalid", "Unresolved", "Illegitimate","Misapplied","Rejected name"))) %>% mutate(Status_correct=fct_explicit_na(Status_correct, "No suitable matches found.")) %>% #Create Name_correct field. Use Accepted names, if any. Otherwise matched names. @@ -1338,13 +1350,12 @@ Backbone <- Backbone %>% summary(Backbone$Status_correct) summary(Backbone$Rank_correct) ``` + There are `r sum(is.na(Backbone$Name_correct))` species names for which we found no match in any of the taxonomic resources we used. 
Yet, for as many as `r sum(Backbone$Rank_correct %in% c("higher", "family", "genus"))` taxa, the matching did not properly resolve the species name, and we only found a match at genus or higher level. ## Complete list of families -There are `r sum(is.na(Backbone$Accepted_name_family))` records with missing family information. -### Derive info from other species of the same Genera in the Backbone itself -Copy family info for taxa resolved at family level. +There are `r sum(is.na(Backbone$Accepted_name_family))` records with missing family information. Create field `Family_correct`. ```{r} Backbone <- Backbone %>% mutate(family.lev=str_extract(word(Name_correct,1), pattern='([^\\s]+aceae)')) %>% @@ -1356,38 +1367,6 @@ Backbone <- Backbone %>% # Remaining records with missing family info sum((is.na(Backbone$Family_correct))) ``` - -Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info. -```{r} -genera_families <- Backbone %>% - filter(Taxonomic_status=="Accepted") %>% - dplyr::select(Genus_correct, Family_correct) %>% - rename(family=Family_correct) %>% - distinct() %>% - na.omit() %>% - #for some genera there are multiple families assigned - # (e.g. 
in case of unresolved species names ) - # Extract the family names that occurs most often across each genus - group_by(Genus_correct, family) %>% - summarize(n=n()) %>% - arrange(desc(n)) %>% - slice(1) %>% - ungroup() %>% - dplyr::select(-n) - -# Assign family derived from backbone to other records -Backbone <- Backbone %>% - left_join(genera_families, by="Genus_correct") %>% - mutate(Family_correct=ifelse( (is.na(Family_correct) & !is.na(family)), - family, - Family_correct)) %>% - dplyr::select(-family) - -#Records with missing family info -sum(is.na(Backbone$Family_correct)) -``` - - ### Resolve genera with missing family info with `TNRS` ```{r, eval=F} @@ -1443,11 +1422,27 @@ unzip("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/CatLife2019. ``` - - ```{r, message=F, warning=F} cat.life <- read_delim("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/taxa.txt", - delim="\t") + delim="\t", + col_types = cols( + .default = col_character(), + taxonID = col_double(), + datasetID = col_double(), + acceptedNameUsageID = col_double(), + parentNameUsageID = col_double(), + superfamily = col_logical(), + subgenus = col_logical(), + source = col_logical(), + namePublishedIn = col_logical(), + modified = col_character(), + taxonConceptID = col_double(), + isExtinct = col_logical() + )) %>% + #correct family names to match to the standards in TPL + mutate(family=ifelse(family=="Fabaceae", "Leguminosae", family)) %>% + mutate(family=ifelse(family=="Asteraceae", "Compositae", family)) + Genera_missing <- Backbone %>% filter(is.na(Family_correct) & !is.na(Genus_correct)) %>% dplyr::select(Genus_correct) %>% @@ -1471,39 +1466,116 @@ sum(is.na(Backbone$Family_correct)) ``` After matching the remaining genera with the Catalogue of life there are still `r nrow(Backbone %>% filter(is.na(Family_correct)))` records without Family affiliation, for a total of `r nrow(Backbone %>% filter(is.na(Family_correct)) %>% dplyr::select(Genus_correct) %>% distinct())` 
genera. -### Manually fix residual, known issues +### Manually fix some known issues ```{r} Backbone <- Backbone %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Coptidium", + list=Genus_correct=="Coptidium", values="Ranunculaceae")) %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Balanocarpus", + list=Genus_correct=="Balanocarpus", values="Dipterocarpaceae" )) %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Cardaminopsis", + list=Genus_correct=="Cardaminopsis", values="Brassicaceae" )) %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Carpolepis", + list=Genus_correct=="Carpolepis", values="Myrtaceae" )) %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Cathartolinum", + list=Genus_correct=="Cathartolinum", values="Linaceae" )) %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Didiscus", + list=Genus_correct=="Didiscus", values="Araliaceae" )) %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Grammadenia", + list=Genus_correct=="Grammadenia", values="Primulaceae" )) %>% mutate(Family_correct=replace(Family_correct, - list=word(Accepted_name_species, 1)=="Antholoma", - values="Elaeocarpaceae" )) + list=Genus_correct=="Antholoma", + values="Elaeocarpaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Odontarrhena", + values="Brassicaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Trichinium", + values="Amaranthaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Limonium", + values="Plumbaginaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Eunanus", + values="Phrymaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + 
list=Genus_correct=="Sunaptea", + values="Dipterocarpaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Aconogonon", + values="Polygonaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Cajophora", + values="Loasaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Calobota", + values="Leguminosae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Alsine", + values="Caryophyllaceae" )) %>% + mutate(Family_correct=replace(Family_correct, + list=Genus_correct=="Cyanococcus", + values="Ericaceae" )) %>% + mutate(Family_correct=ifelse(Family_correct %in% c("Papilionaceae", "Fabaceae"), + "Leguminosae", Family_correct)) %>% + mutate(Family_correct=ifelse(Family_correct=="Asteraceae", "Compositae", Family_correct)) %>% + mutate(Family_correct=ifelse(Family_correct=="Unknown", NA, Family_correct)) #Records with missing family info sum(is.na(Backbone$Family_correct)) ``` +### Derive info from other species of the same Genera in the Backbone itself +Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info. +```{r} +genera_families <- Backbone %>% + filter(Taxonomic_status=="Accepted") %>% + dplyr::select(Genus_correct, Family_correct) %>% + rename(family=Family_correct) %>% + distinct() %>% + na.omit() %>% + #for some genera there are multiple families assigned + # (e.g. 
in case of unresolved species names ) + # Extract the family names that occurs most often across each genus + group_by(Genus_correct, family) %>% + summarize(n=n()) %>% + arrange(desc(n)) %>% + slice(1) %>% + ungroup() %>% + dplyr::select(-n) -### Create field `Name_short` +# Assign family derived from backbone to other records +Backbone <- Backbone %>% + left_join(genera_families, by="Genus_correct") %>% + mutate(Family_correct=ifelse( (is.na(Family_correct) & !is.na(family)), + family, + Family_correct)) %>% + dplyr::select(-family) + +#Records with missing family info +sum(is.na(Backbone$Family_correct)) +``` + +### Delete records assigned to mushroom families, if any +```{r} +mushroom.families <- c("Physalacriaceae", "Clavariaceae","Agaricaceae","Roccellaceae", + "Atheliaceae","Meruliaceae","Helotiaceae", "Dacrymycetaceae", "Boletaceae", + "Cortinariaceae", "Polyporaceae", "Umbilicariaceae" , "Pleosporaceae", + "Leotiaceae","Dermateaceae", "Hymenochaetaceae","Stereaceae","Tremellaceae") +Backbone <- Backbone %>% + filter(!Genus_correct %in% mushroom) %>% + filter(!Family_correct %in% mushroom.families) +``` + + + +## Create field `Name_short` Shorten names that have more than two words and where the second word is a x. If there is no species name available, fill in with either genus or family info ```{r} Backbone <- Backbone %>% @@ -1523,8 +1595,8 @@ sum(is.na(Backbone$Name_correct)) ``` -## Create Field `is_vascular_plant` -Assign all families that belong to `Tracheophyta` to category `is_vascular_species`, based on `The Catalogue of Life` +## Create field `is_vascular_plant` and `Taxon group` +Attach phylum information from `The Catalogue of Life`. 
```{r} Backbone <- Backbone %>% left_join(cat.life %>% @@ -1532,11 +1604,77 @@ Backbone <- Backbone %>% distinct() %>% na.omit() %>% rename(Family_correct=family), - by="Family_correct") %>% - mutate(is_vascular_species=ifelse(phylum=="Tracheophyta", T, F)) + by="Family_correct") +``` +Create fields `is_vascular_species` and `Taxon group` based on manually classified lists of families and genera, and on phyla from `The Catalogue of Life`. +Assign all families that belong to `Tracheophyta` to category `is_vascular_species`, based on `The Catalogue of Life`; liverworts (`Marchantiophyta`) and hornworts (`Anthocerotophyta`) are grouped with mosses. +```{r} +vascular <- c("Leguminosae" , "Alliaceae", "Bombacaceae" ,"Taxodiaceae", + "Aceraceae", "Centrolepidaceae","Callitrichaceae" ,"Flacourtiaceae", + "Compositae", "Asclepiadaceae", "Papilionaceae","Tiliaceae", + "Mimosaceae" , "Xanthorrhoeaceae","Arthropteridaceae", + "Valerianaceae", "Grammitidaceae" ,"Anarthriaceae", "Caesalpiniaceae", + "Chenopodiaceae", "Corylaceae", "Diervillaceae", "Dipsacaceae","Guttiferae", + "Haptanthaceae", "Hymenophyllopsidaceae", "Isoëtaceae","Labiatae", + "Lactoridaceae","Lemnaceae","Selaginaceae","Sterculiaceae","Myoporaceae", + "Myrsinaceae" ,"Pyrolaceae", "Rhoipteleaceae" ,"Xanthoceraceae") +lichens <- c("Acarosporaceae" , "Parmeliaceae", "Physciaceae", "Lichinaceae", + "Caliciaceae", "Lecanoraceae", "Venturiaceae" ,"Sphaerophoraceae" , + "Verrucariaceae", "Tricholomataceae","Baeomycetaceae", + "Catillariaceae" ,"Megasporaceae","Ramalinaceae","Pilocarpaceae" , + "Teloschistaceae","Candelariaceae","Rhizocarpaceae","Lecideaceae", + "Icmadophilaceae","Cladoniaceae","Collemataceae","Pannariaceae" , + "Lobariaceae", "Ophioparmaceae" ,"Psoraceae","Stereocaulaceae", + "Massalongiaceae","Peltigeraceae","Nephromataceae") +lichen.genera <- c("Amygdalaria", "Anamylospora", "Arthonia", "Pertusaria", "Pyrenula","Opegrapha", + "Ochrolechia", "Graphis", "Micarea", "Porpidia", "Arthopyrenia", "Graphina", "Anisomeridium", + "Mycobilimbia","Peltula", "Thelotrema", "Arthothelium", "Diploschistes", "Strigula", + "Trichothelium", 
"Melaspilea", "Phaeographis", "Thelenella", "Chaenothecopsis","Fuscidea", + "Dactylospora", "Gyalecta", "Myriotrema", "Placynthium") +mosses <- c("Pilotrichaceae", "Chonecoleaceae", "Hypopterygiaceae", "Scorpidiaceae", + "Balantiopsaceae", "Mesoptychiaceae","Octoblepharaceae" ,"Takakiaceae") +algae_diatoms <- c("Sargassaceae", "Chordaceae", "Cocconeidaceae", "Desmarestiaceae", + "Chordariaceae", "Dinobryaceae", "Diploneidaceae", "Ectocarpaceae", + "Fragilariaceae","Sphacelariaceae","Vaucheriaceae" , + "Amphipleuraceae", "Fucaceae", "Gomphonemataceae", "Melosiraceae", + "Laminariaceae","Acinetosporaceae" ,"Botryochloridaceae", + #diatoms below + "Thalassiosiraceae", "Cymbellaceae", "Naviculaceae","Bacillariaceae") +Backbone <- Backbone %>% + mutate(is_vascular_species=ifelse(phylum=="Tracheophyta", TRUE, FALSE)) %>% + mutate(is_vascular_species=replace(is_vascular_species, + list=Family_correct %in% vascular, + values=TRUE)) %>% + mutate(`Taxon group`="Unknown") %>% + mutate(`Taxon group`=ifelse((!is.na(is_vascular_species) & is_vascular_species), + "Vascular Plant", `Taxon group`)) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Family_correct %in% lichens, + values="Lichen")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus_correct %in% lichen.genera, + values="Lichen")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Family_correct %in% algae_diatoms, + values="Alga")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=phylum %in% c("Glaucophyta", "Rhodophyta", "Charophyta", "Chlorophyta"), + values="Alga")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Family_correct %in% mosses, + values="Moss")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=phylum %in% c("Bryophyta", "Marchantiophyta", "Anthocerotophyta" ), #mosses, liverworts, hornworts + values="Moss")) %>% + mutate(is_vascular_species=ifelse(`Taxon group` %in% c("Moss", "Alga", "Lichen"), #non-vascular groups override the phylum-based flag + FALSE, is_vascular_species)) +table(Backbone$`Taxon group`, exclude=NULL) 
table(Backbone$is_vascular_species, exclude=NULL) ``` + + + ## Export Backbone ```{r echo=F} knitr::kable(Backbone %>% @@ -1596,6 +1734,7 @@ knitr::kable(Backbone %>% *Name_short* - First two words of `Name_correct` *phylum* - As derived from `The Catalogue of Life` *is_vascular_species* - As derived based on selection of `phylum` from `The Catalogue of Life` +*`Taxon group`* - Taxon group, as in Turboveg: 'Vascular Plant', 'Moss' (includes liverworts), 'Lichen', 'Alga', 'Unknown' ```{r} save(Backbone, file="../_output/Backbone3.0.RData")