Abstract
This document describes the workflow (with contributions from Oliver Purschke, Jürgen Dengler and Florian Jansen) that was used to generate the taxonomic backbone that standardizes taxon names across (i) the global vegetation plot database sPlot version 3.0 and (ii) the global plant trait database TRY version 5.
Timestamp: Fri Mar 13 15:01:06 2020
Drafted: Francesco Maria Sabatini
Revised: Helge Bruelheide, Borja Jimenez-Alfaro
Version: 1.3
Changes to Version 1.1: Additional manual cleaning of species names from BJA, UJ and HB.
Changes to Version 1.2: Changed the order of ranking of TNRS databases when a name is matched across more than one DB; using the cleaned version of the DT table (after stripping non-closed quotation marks); additionally checking with TPL those species that, even if resolved in TNRS, did not return an accepted name.
Changes to Version 1.3: Manual check of names BEFORE matching with TNRS.
library(reshape2)
library(tidyverse)
library(readr)
library(data.table)
library(knitr)
library(kableExtra)
library(stringr)
library(taxize)
library(Taxonstand)
library(vegdata)
# Import the sPlot species-level export (DT table).
# The file is tab-delimited; every column type is declared explicitly so that
# readr does not have to guess them. The argument is spelled `col_types` in
# full rather than relying on partial argument matching.
DT0 <- readr::read_delim(
  "../sPlot_data_export/sPlot_3_0_2_species_test.csv",
  delim = "\t",
  col_types = cols(
    PlotObservationID = col_double(),
    Taxonomy = col_character(),
    `Taxon group` = col_character(),
    `Taxon group ID` = col_double(),
    `Turboveg2 concept` = col_character(),
    `Matched concept` = col_character(),
    Match = col_double(),
    Layer = col_double(),
    `Cover %` = col_double(),
    `Cover code` = col_character(),
    x_ = col_double()
  )
)
These objects are defined in the appendix
# Load the manually curated objects stored in taxa_manual.RData.
# NOTE(review): `mushroom`, used below, is not defined in the visible code and
# is presumably one of the objects loaded here - confirm.
load("../_derived/taxa_manual.RData")

# Build the list of distinct (original name, matched concept) pairs in sPlot,
# excluding fungi in two ways: by the `Taxon group` label and by checking the
# first word of the matched concept against `mushroom`.
# NOTE(review): filter() keeps only rows where the condition is TRUE, so rows
# with a missing `Taxon group` are dropped by the first filter as well.
splot.species <- DT0 %>%
  rename(
    Species.original = `Turboveg2 concept`,
    Matched.concept = `Matched concept`
  ) %>%
  filter(`Taxon group` != "Mushroom") %>%
  dplyr::select(Species.original, Matched.concept) %>%
  distinct() %>%
  # `%in%` is vectorised and never returns NA, so no grouping or helper
  # column is needed to drop the fungal genera.
  filter(!(word(Matched.concept, 1) %in% mushroom))

# The output file is passed positionally: the argument is called `path` in
# older readr releases and `file` in newer ones.
write_csv(splot.species, "../_derived/splot3.0.2.species.csv")
!!! I used the column from TRY with the full species name, not the column with only two-word name strings
# Re-read the sPlot name list from the derived CSV file.
splot.species <- read_csv("../_derived/splot3.0.2.species.csv")

# TRY 5 species table: read without a header row and as Latin-1 text.
# Columns X6 and X7 are dropped; the remaining positional columns receive
# descriptive names.
try.species <- readr::read_csv(
  "../_input/AccSpecies_TRY5.csv",
  col_names = FALSE,
  locale = locale(encoding = "Latin1")
) %>%
  dplyr::select(-X6, -X7) %>%
  rename(
    try.ID = X1,
    FullSpecies = X2,
    Species = X3,
    Genus = X4,
    Family = X5,
    GrowthForm = X8
  )

# Sneak in species from the Alpine database (Borja & Riccardo), as a courtesy
# to Project #18. Tab-delimited, no header row, Latin-1 encoded.
alpine.species <- read_delim(
  "../_input/new_alpine_species.txt",
  col_names = FALSE,
  delim = "\t",
  locale = locale(encoding = "Latin1")
) %>%
  rename(Species = X1)
Use the `Matched.concept` column, as it already contains some standardization by Stephan Hennekens according to SynBioSys.
sPlot 3.0.1 contains 107473 different species names.
TRY 5 contains 302729.
To this I add a list of 2961 alpine species delivered by Riccardo Testolin, within sPlot Project #18.
# Combine the name lists from sPlot ("S"), TRY ("T") and the Alpine database
# ("A") into one table with a single row per name and a `Source` string that
# concatenates the flags of every database the name occurs in (e.g. "ST").
spec.list.TRY.sPlot <- splot.species %>%
  dplyr::select(Species = Matched.concept) %>%
  mutate(Source = "S") %>%
  bind_rows(
    try.species %>%
      dplyr::select(Species = FullSpecies) %>% # using the full name from TRY
      mutate(Source = "T")
  ) %>%
  bind_rows(
    alpine.species %>%
      mutate(Source = "A")
  ) %>%
  # Count occurrences of each name per source. The aggregation function and
  # value column are stated explicitly: left implicit, dcast() only falls back
  # to counting when duplicated Species/Source pairs are present, and would
  # otherwise fill the cells with the flag strings and NA.
  reshape2::dcast(
    Species ~ Source,
    fun.aggregate = length,
    value.var = "Source"
  ) %>%
  # Turn the counts into presence flags
  mutate(A = ifelse(A >= 1, "A", "")) %>%
  mutate(S = ifelse(S >= 1, "S", "")) %>%
  mutate(T = ifelse(T >= 1, "T", "")) %>%
  mutate(Source = paste0(S, T, A)) %>%
  dplyr::select(-A, -S, -T)
#Number of species unique and in common across databases
The total number of species in the backbone is 346438.
Source | Num.taxa |
---|---|
sPlot only | 43716 |
TRY only | 238137 |
Alpine only | 365 |
sPlot + TRY | 61624 |
sPlot + Alpine | 423 |
TRY + Alpine | 463 |
sPlot + TRY + Alpine | 1710 |
Strip unwanted characters as well as abbreviations (such as hybrid markers), which would prevent name matching:
# Ancillary function: capitalise the first character of each string.
# All other characters are returned unchanged.
#   x: character vector
#   returns: `x` with the first character of every element in upper case
firstup <- function(x) {
  initial <- substr(x, 1, 1)
  substr(x, 1, 1) <- toupper(initial)
  x
}
# Normalise the raw name strings before matching.
# The untouched input is kept in `OriginalNames`; `Species` is converted to
# lower case with a capitalised first letter, and then unwanted characters and
# qualifiers (cf., aff., hybrid markers, brackets, ...) are stripped.
# The replacements are order-dependent and are therefore applied one by one.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(OriginalNames = Species) %>%
  mutate(Species = firstup(tolower(Species))) %>%
  dplyr::select(OriginalNames, Species, Source) %>%
  mutate(Species = gsub("*", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("cf. ", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("Cf. ", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("[", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("]", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub(" x ", " ", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("×", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("aff ", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("(", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub(")", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub(" cf ", " ", Species, fixed = TRUE)) %>%
  mutate(Species = gsub(" aff. ", " ", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("c‚e", "ceae", Species, fixed = TRUE)) %>%
  # Collapse runs of spaces into a single space. This replaces three
  # consecutive passes that substituted a single space with a single space
  # and therefore had no effect.
  mutate(Species = gsub(" +", " ", Species)) %>%
  mutate(Species = gsub("x-", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("X-", "", Species, fixed = TRUE)) %>%
  # (the multiplication sign was already removed above; no second pass needed)
  mutate(Species = gsub("like ", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub(",", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("#", "", Species, fixed = TRUE)) %>%
  mutate(Species = gsub("_", " ", Species))
For all names that have a number in their first word and consist of more than one word, remove that first word:
# Drop the first word of a name when that word contains a digit and the name
# has more than one word (e.g. a leading plot or collection code).
# Two helper columns, `firstWordWithNumbers` and `numberOfWords`, are added.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(firstWordWithNumbers = grepl("[0-9]", word(Species, 1))) %>%
  # Number of words = number of non-word separators + 1. regmatches() yields
  # an empty vector when there is no separator, so one-word names count as 1
  # (taking length() of the raw gregexpr() result counted its "-1" no-match
  # marker as a separator).
  mutate(
    numberOfWords = lengths(regmatches(Species, gregexpr("\\W+", Species))) + 1
  ) %>%
  # Remove everything up to and including the first space; names without a
  # space are left unchanged.
  mutate(Species = ifelse(firstWordWithNumbers & numberOfWords > 1,
                          sub("^[^ ]* ", "", Species),
                          Species))
Correct some name abbreviations using `taxname.abbr` in `vegdata`:
# Correct name abbreviations with vegdata's taxname.abbr(), applied to the
# piped `Species` column. Then keep only the three working columns and remove
# duplicated rows.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(Species = taxname.abbr(Species)) %>%
  dplyr::select(OriginalNames, Species, Source) %>%
  distinct()
Fix known issues in some species names
#Manual cleaning
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
mutate(Species=tolower(Species)) %>%
mutate(Species=firstup(Species)) %>%
mutate(Species=gsub("^Str ", "", Species)) %>%
mutate(Species=gsub("^Unknown ", "", Species)) %>%
mutate(Species=firstup(Species)) %>%
mutate(Species=gsub(" [0-9]*$", "", Species)) %>% #delete digits at end of object
mutate(Species=gsub("^\\d+|\\d+$", "", Species)) %>% #delete digits at the beginning or end of a string
mutate(Species=gsub(" sp.$", "", Species)) %>%
mutate(Species=gsub(" sp$", "", Species)) %>%
mutate(Species=gsub(" species$", "", Species)) %>%
mutate(Species=gsub(" *$", "", Species)) %>%
mutate(Species=gsub(" #$", "", Species)) %>%
mutate(Species=gsub(" m$", "", Species)) %>%
mutate(Species=gsub("acea ", "aceae ", Species)) %>%
mutate(Species=gsub('^Agropyrum', 'Agropyron', Species)) %>%
mutate(Species=gsub('^Anno ', 'Annona ', Species)) %>%
mutate(Species=gsub('Adpdytes dimidiata', 'Apodytes dimidiata', Species)) %>%
mutate(Species=gsub('Adenostorna fasciculaturn', 'Adenostoma fasciculatum', Species)) %>%
mutate(Species=gsub('Arctostapliylos glallca', 'Arctostaphylos glauca', Species)) %>%
mutate(Species=gsub('Bituminosa bituminosa', 'Bituminaria bituminosa', Species)) %>%
mutate(Species=gsub('Causurina equisitifolia', 'Causuarina equisetifolia', Species)) %>%
mutate(Species=gsub('Convulvus arvensis', 'Convolvulus arvensis', Species)) %>%
mutate(Species=gsub('Diospyrus dygina', 'Diospyros dygina', Species)) %>%
mutate(Species=gsub('^Dodoea', 'Dodonaea', Species)) %>%
mutate(Species=gsub('^Boheravia', 'Boerhavia', Species)) %>%
mutate(Species=gsub('Centaria maculosa', 'Centaurea maculosa', Species)) %>%
mutate(Species=gsub('Chamrenerium angustifolium', 'Chamaenerion angustifolium', Species)) %>%
mutate(Species=gsub('^Chicorium', 'Cichorium', Species)) %>%
mutate(Species=gsub('^Cirsiumum', 'Cirsium', Species)) %>%
mutate(Species=gsub('^Colubrium', 'Colubrina', Species)) %>%
mutate(Species=gsub('^Corymbium', 'Corymbia', Species)) %>%
mutate(Species=gsub('Cosmos bipinnata', 'Cosmos bipinnatus', Species)) %>%
mutate(Species=gsub('Diospyrus dygina', 'Diospyros digyna', Species)) %>%
mutate(Species=gsub('Diospyros egbert', 'Diospyros egbert-walkeri', Species)) %>%
mutate(Species=gsub('Dispyrus halesioides', 'Diospyros halesioides', Species)) %>%
mutate(Species=gsub('^Drymis', 'Drimys', Species)) %>%
mutate(Species=gsub('^Dysoxylon', 'Dysoxylum', Species)) %>%
mutate(Species=gsub('^Eleaegnus', 'Elaeagnus', Species)) %>%
mutate(Species=gsub('^Eleutherant', 'Eleutherantera', Species)) %>%
mutate(Species=gsub('^Echicea', 'Echinacea', Species)) %>%
mutate(Species=gsub('Gauteria foliolata', 'Gaultheria foliolosa', Species)) %>%
mutate(Species=gsub('^Geophylla', 'Geophyla', Species)) %>%
mutate(Species=gsub('Gloichidion insignis', 'Glochidion insigne', Species)) %>%
mutate(Species=gsub('^Glycium', 'Glycine', Species)) %>%
mutate(Species=gsub('^Hammalis', 'Hamamelis', Species)) %>%
mutate(Species=gsub('^Hippochoeris', 'Hypochaeris', Species)) %>%
mutate(Species=gsub('Ilix tephrohylla', 'Ilex tephrophylla', Species)) %>%
mutate(Species=gsub('^Jasininum', 'Jasminum', Species)) %>%
mutate(Species=gsub('Jenipa conjuta', 'Jenipa conjunta', Species)) %>%
mutate(Species=gsub('^Lechytis', 'Lecythis', Species)) %>%
mutate(Species=gsub('Lespedeza juncus', 'Lespedeza juncea', Species)) %>%
mutate(Species=gsub('Licania apelata', 'Licania apetala', Species)) %>%
mutate(Species=gsub('Limeum arenicola', 'Limeum arenicolum', Species)) %>%
mutate(Species=gsub('^Maniota', 'Manihot', Species)) %>%
mutate(Species=gsub('^Menta', 'Mentha', Species)) %>%
mutate(Species=gsub('Metophyum brownei', 'Metopium brownei', Species)) %>%
mutate(Species=gsub('Miliusa tomentosum', 'Miliusa tomentosa', Species)) %>%
mutate(Species=gsub('Mimululus ringens', 'Mimulus ringens', Species)) %>%
mutate(Species=gsub('Nardus strictus', 'Nardus stricta', Species)) %>%
mutate(Species=gsub('Neea glomeratha', 'Neea glomerata', Species)) %>%
mutate(Species=gsub('^Onopordon', 'Onopordum', Species)) %>%
mutate(Species=gsub('^Orbigynia', 'Orbignya', Species)) %>%
mutate(Species=gsub('Orites excelsa', 'Orites excelsus', Species)) %>%
mutate(Species=gsub('Paedorata lutea', 'Paederota lutea', Species)) %>%
mutate(Species=gsub('Palaquin ellipticum', 'Palaquium ellipticum', Species)) %>%
mutate(Species=gsub('Palmeria arfakensis', 'Palmeria arfakiana', Species)) %>%
mutate(Species=gsub('Petalostcmum purpureum', 'Petalostemum purpureum', Species)) %>%
mutate(Species=gsub('Petalostimum purpureum', 'Petalostemum purpureum', Species)) %>%
mutate(Species=gsub('^Petrosileum', 'Petroselinum', Species)) %>%
mutate(Species=gsub('Phlomis herba', 'Phlomis herba-venti', Species)) %>%
mutate(Species=gsub('^Phyllirea', 'Phillyrea', Species)) %>%
mutate(Species=gsub('Physilus pumula', 'Physalus pumila', Species)) %>%
mutate(Species=gsub('Picea maria', 'Picea mariana', Species)) %>%
mutate(Species=gsub('Picea retroXexa', 'Picea retroflexa', Species)) %>%
mutate(Species=gsub('Pilayella litoralis', 'Pilayella littoralis', Species)) %>%
mutate(Species=gsub('Placocarpus schaereri', 'Platecarpus schaerer', Species)) %>%
mutate(Species=gsub('Placocarpus schraereri', 'Platecarpus schaerer', Species)) %>%
mutate(Species=gsub('^Pulteea', 'Pultenaea', Species)) %>%
mutate(Species=gsub('Quercus rubrum', 'Quercus rubra', Species)) %>%
mutate(Species=gsub('Rubus fruticosa', 'Rubus fruticosus', Species)) %>%
mutate(Species=gsub('Rubus saxatile', 'Rubus saxatilis', Species)) %>%
mutate(Species=gsub('Rubus sylvatici', 'Rubus sylvaticus', Species)) %>%
mutate(Species=gsub('^Sanguiria', 'Sanguinaria', Species)) %>%
mutate(Species=gsub('Sarauja nepaulensis', 'Sarauja nepalensis', Species)) %>%
mutate(Species=gsub('^Sateria', 'Setaria', Species)) %>%
mutate(Species=gsub('Sauraiea nepulensis', 'Saurauia nepalensis', Species)) %>%
mutate(Species=gsub('Schneckia australis', 'Schenckia australis', Species)) %>%
mutate(Species=gsub('Smirnium oleastrum', 'Smyrnium olusatrum', Species)) %>%
mutate(Species=gsub('Solms laubachia', 'Solms-laubachia himalayensis', Species)) %>%
mutate(Species=gsub('Stellaria chamaejasme', 'Stellera chamaejasme', Species)) %>%
mutate(Species=gsub('Steraria parviflora', 'Setaria parviflora', Species)) %>%
mutate(Species=gsub('^Stuartia', 'Stewartia', Species)) %>%
mutate(Species=gsub('Sycops sinensis', 'Sycopsis sinensis', Species)) %>%
mutate(Species=gsub('Tacetum vulgare', 'Tanacetum vulgare', Species)) %>%
mutate(Species=gsub('Talinurn angustissimun', 'Talinun angustissimun', Species)) %>%
mutate(Species=gsub('Talloma hodgsoni', 'Talauma hodgsonii', Species)) %>%
mutate(Species=gsub('Taraxacum albo', 'Taraxacum album', Species)) %>%
mutate(Species=gsub('Tetragonia falcata', 'Tetragona falcata', Species)) %>%
mutate(Species=gsub('Trapogogon', 'Tragopogon', Species)) %>%
mutate(Species=gsub('Zyzyphus saeri', 'Zizyphus saeri', Species)) %>%
mutate(Species=gsub('^Helicrysum', 'Helichrysum', Species)) %>%
mutate(Species=gsub('^Diceropappus rhinocerotis', 'Elytropappus rhinocerotis', Species)) %>%
mutate(Species=gsub('^Euphorbiace ', 'Euphorbiacaea ', Species)) %>%
mutate(Species=gsub('^Gloecapsa', 'Gloeocapsa', Species)) %>%
mutate(Species=gsub('Glycirhiza', 'Glycyrrhiza', Species)) %>%
mutate(Species=gsub('Abiesnordmannia', 'Abies nordmannia', Species)) %>%
mutate(Species=gsub('Alnus inca', 'Alnus incana', Species)) %>%
mutate(Species=gsub('Amalencier alnifolia', 'Amalenchier alnifolia', Species)) %>%
mutate(Species=gsub('Antylis barba-jovis', 'Anthyllis barba-jovis', Species)) %>%
mutate(Species=gsub('^Albizzia "', 'Albizia ', Species)) %>%
mutate(Species=gsub('^Ipomoena ', 'Ipomoea ', Species)) %>%
mutate(Species=gsub('^Ipomea ', 'Ipomoea ', Species)) %>%
mutate(Species=gsub('Ipomo wolco', 'Ipomoea wolcottiana', Species)) %>%
## additional manual cleaning from UJ, BJA, HB
mutate(Species=gsub('Abacaba palm', 'Oenocarpus balickii', Species)) %>%
mutate(Species=gsub('Acerkuomeii', 'Acer kuomeii', Species)) %>%
mutate(Species=gsub('Alder$', 'Alnus', Species)) %>%
mutate(Species=gsub('Amapa$', 'Tabebuia', Species)) %>%
mutate(Species=gsub('Amapa amargoso', 'Parahancornia amapa', Species)) %>%
mutate(Species=gsub('Amapa doce$', 'Tabebuia', Species)) %>%
mutate(Species=gsub('Amapai$', 'Tabebuia', Species)) %>%
mutate(Species=gsub('Amapaí$', 'Tabebuia', Species)) %>%
mutate(Species=gsub('Amapa m1', 'Tabebuia', Species)) %>%
mutate(Species=gsub('Amaranth$', 'Amaranthus', Species)) %>%
mutate(Species=gsub('Amophora fruticosa', 'Amorpha fruticosa', Species)) %>%
mutate(Species=gsub('Anacardiace ', 'Anacardiaceae ', Species)) %>%
mutate(Species=gsub('Anagallisarvensis', 'Anagallis arvensis', Species)) %>%
mutate(Species=gsub('Anemonenarcissiflora var.', 'Anemone narcissiflora', Species)) %>%
mutate(Species=gsub('Anenome ', 'Anemone', Species)) %>%
mutate(Species=gsub('Anona ', 'Annona ', Species)) %>%
mutate(Species=gsub('Antylis ', 'Anthyllis', Species)) %>%
mutate(Species=gsub('Apocyncadea gelbblueh$', 'Apocynaceae', Species)) %>%
mutate(Species=gsub('Aracium', 'Crepis', Species)) %>%
mutate(Species=gsub('Ardis mexic', 'Ardisia mexicana subsp. siltepecana', Species)) %>%
mutate(Species=gsub('Ardis verap', 'Ardisia verapazensis', Species)) %>%
mutate(Species=gsub('Argenomne hummemannii', 'Argemone hunnemanni', Species)) %>%
mutate(Species=gsub('Artabotus', 'Artabotrys', Species)) %>%
mutate(Species=gsub('Artemisiaintegrifolia', 'Artemisia integrifolia', Species)) %>%
mutate(Species=gsub('Asclepiacea$', 'Asclepiadaceae', Species)) %>%
mutate(Species=gsub('Asclep. klimmer', 'Asclepiadaceae', Species)) %>%
mutate(Species=gsub('Astartoseris triquetra', 'Lactuca triquetra', Species)) %>%
mutate(Species=gsub('Asteracee ', 'Asteraceae ', Species)) %>%
mutate(Species=gsub('Avenula glauc$', 'Avenula', Species)) %>%
mutate(Species=gsub('Baikea plurijuga', 'Baikiaea plurijuga', Species)) %>%
mutate(Species=gsub('Binse rundbl', 'Juncaceae', Species)) %>%
mutate(Species=gsub('Blättrige fabaceae th', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Bonel macro$', 'Bonellia macrocarpa subsp. macrocarpa', Species)) %>%
mutate(Species=gsub('Boraginacee samtig', 'Boraginaceae', Species)) %>%
mutate(Species=gsub('Bri¢fitos', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Bryophyte$', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Bryopsida', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Carallia macrophylla', 'Carallia', Species)) %>%
mutate(Species=gsub('Carexectabilis', 'Carex spectabilis', Species)) %>%
mutate(Species=gsub('Carex fein', 'Carex', Species)) %>%
mutate(Species=gsub('Cerania vermicularis', 'Thamnolia vermicularis', Species)) %>%
mutate(Species=gsub('Chamelauci merredin', 'Chamelaucium', Species)) %>%
mutate(Species=gsub('Chamelau drummon', 'Chamelaucium', Species)) %>%
mutate(Species=gsub('Charophyta', 'Characeae', Species)) %>%
mutate(Species=gsub('Cheiridopsis-keimlinge', 'Cheiridopsis', Species)) %>%
mutate(Species=gsub('Chenopodiacee$', 'Chenopodiaceae', Species)) %>%
mutate(Species=gsub('Chiangioden mexicanum', 'Chiangiodendron mexicanum', Species)) %>%
mutate(Species=gsub('Chiranthode pentadactylon', 'Chiranthodendron pentadactylon', Species)) %>%
mutate(Species=gsub('Chrysobalan ', 'Chrysobalanus ', Species)) %>%
mutate(Species=gsub('Cladapodiella', 'Cladopodiella', Species)) %>%
mutate(Species=gsub('Cleidium ', 'Cleidion ', Species)) %>%
mutate(Species=gsub('Collema/leptogium lichenoides', 'Collemataceae', Species)) %>%
mutate(Species=gsub('Comarostaph discolor', 'Comarostaphylis discolor', Species)) %>%
mutate(Species=gsub('Combretdodendrum africana', 'Combretodendrum africanum', Species)) %>%
mutate(Species=gsub('Commelinacaea floscopa', 'Floscopa glomerata', Species)) %>%
mutate(Species=gsub('Coyncia setigera', 'Coincya setigera', Species)) %>%
mutate(Species=gsub('Crataeva', 'Crateva', Species)) %>%
mutate(Species=gsub('Craterosperma', 'Rubiaceae', Species)) %>%
mutate(Species=gsub('Crespicium', 'Burseraceae', Species)) %>%
mutate(Species=gsub('Critoniadel nubigenus', 'Critoniadelphus nubigenus', Species)) %>%
mutate(Species=gsub('Crotalaria/vigna?', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Croto billb', 'Croton billbergianus subsp. pyramidalis', Species)) %>%
mutate(Species=gsub('Dana„ racemosa', 'Danae racemosa', Species)) %>%
mutate(Species=gsub('Deehasia', 'Dehaasia', Species)) %>%
mutate(Species=gsub('Dichapetala', 'Dichapetalum', Species)) %>%
mutate(Species=gsub('Distel bractea', 'Asteracaea', Species)) %>%
mutate(Species=gsub('Distelig asteraceae', 'Asteracaea', Species)) %>%
mutate(Species=gsub('Dodon visco', 'Dodonaea viscosa', Species)) %>%
mutate(Species=gsub('Doldenbluetler', 'Apiaceae', Species)) %>%
mutate(Species=gsub('Echinosurus capitatus', 'Poaceae', Species)) %>%
mutate(Species=gsub('Einähriges gras$', 'Poaceae', Species)) %>%
mutate(Species=gsub('Einähriges gras von gestern$', 'Poaceae', Species)) %>%
mutate(Species=gsub('Einblütiges rispengras', 'Poaceae', Species)) %>%
mutate(Species=gsub('Eiovaltrichtergrundblatt orchidee', 'Orchidaceae', Species)) %>%
mutate(Species=gsub('Elongata subsp.', 'Pohlia elongata', Species)) %>%
mutate(Species=gsub('Enriquebelt ', 'Enriquebeltrania ', Species)) %>%
mutate(Species=gsub('Entermorpha ', 'Enteromorpha ', Species)) %>%
mutate(Species=gsub('Erodiurn$', 'Erodium', Species)) %>%
mutate(Species=gsub('Euc. chloroclada x camaldulensis', 'Eucalyptus', Species)) %>%
mutate(Species=gsub('Euphorbiacée ipatouduluga gouduatché', 'Euphorbiaceae', Species)) %>%
mutate(Species=gsub('Fabacee kleeblatt stengel schwarzdrüsi', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Fabaceenstrauch wie 132446 f', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Fabaceenstr kleinbltrg', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Fabacee wie lotus f', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Farn', 'Pteridophyta', Species)) %>%
mutate(Species=gsub('Farn cystopteris', 'Cystopteris', Species)) %>%
mutate(Species=gsub('Fern', 'Pteridophyta', Species)) %>%
mutate(Species=replace(Species, list=word(Species, 1)=="Fingergras", values="Digitaria")) %>%
mutate(Species=replace(Species, list=word(Species, 1)=="Fingerhirse", values="Digitaria")) %>%
mutate(Species=gsub('Gelbe onagraceae', 'Onagraceae', Species)) %>%
mutate(Species=gsub('Gramine', 'Poaceae', Species)) %>%
mutate(Species=gsub('Graminea', 'Poaceae', Species)) %>%
mutate(Species=gsub('Graminia', 'Poaceae', Species)) %>%
mutate(Species=gsub('Grannenquecke', 'Poaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Gras",
values="Poaceae")) %>%
mutate(Species=gsub('Gynostachi dicanthus', 'Gymnostachium diacanthus', Species)) %>%
mutate(Species=gsub('Hafer haarkranz', 'Poaceae', Species)) %>%
mutate(Species=gsub('Hapolosiphon', 'Hapalosiphon', Species)) %>%
mutate(Species=gsub('Heliocrysum', 'Helichrysum', Species)) %>%
mutate(Species=replace(Species, list=word(Species, 1)=="Hepaticae", values="Bryophyta")) %>%
mutate(Species=gsub('Hepaticas', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Hepatophyta', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Hermerocalis', 'Hemerocallis', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Hirse",
values="Poaceae")) %>%
mutate(Species=gsub('Hirte trian', 'Hirtella triandra subsp. media', Species)) %>%
mutate(Species=replace(Species, list=word(Species, 1)=="Hohlzahn", values="Lamiaceae")) %>%
mutate(Species=gsub('Hondurodend urceolatum', 'Hondurodendron urceolatum', Species)) %>%
mutate(Species=gsub('Hornklee gelb', 'Fabaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Horstgras",
values="Poaceae")) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Huehnerhirse",
values="Digitaria")) %>%
mutate(Species=gsub('Hydrocoleus lyngbyaceus', 'Hydrocoleum lyngbyaceum', Species)) %>%
mutate(Species=gsub('Hyernima nipensis', 'Hieronyma nipensis', Species)) %>%
mutate(Species=gsub('Hyeronima', 'Hieronyma', Species)) %>%
mutate(Species=gsub('Hypocal angusti', 'Hypocalymma angustifolium', Species)) %>%
mutate(Species=gsub('Hypocalym nambung', 'Hypocalymma', Species)) %>%
mutate(Species=gsub('Hyprium', 'Hypericum', Species)) %>%
mutate(Species=gsub('Igelkolben', 'Sparganium', Species)) %>%
mutate(Species=gsub('Ilexã‚â paraguariensis', 'Ilex', Species)) %>%
mutate(Species=gsub('Ipomea', 'Ipomoea', Species)) %>%
mutate(Species=gsub('Ipomoena', 'Ipomoea', Species)) %>%
mutate(Species=gsub('Jm kürbis stark behaart', 'Cucurbitaceae', Species)) %>%
mutate(Species=gsub('Juncaginacee/triglochin', 'Triglochin', Species)) %>%
mutate(Species=gsub('Juncas', 'Juncus', Species)) %>%
mutate(Species=gsub('Keilblatt cyperus', 'Cyperus', Species)) %>%
mutate(Species=gsub('Khh 3010 polygalacee', 'Polygalaceae', Species)) %>%
mutate(Species=gsub(' Khh 3014 liliacee 3f„ch. kapsel schwarze samen', 'Liliaceae', Species)) %>%
mutate(Species=gsub('Khh 3024 brachiaria', 'Brachiaria', Species)) %>%
mutate(Species=gsub('Khh 3025 liliaceae gelbe blten breite bl„tter', 'Liliaceae', Species)) %>%
mutate(Species=gsub('Khh 3037 ficus', 'Ficus', Species)) %>%
mutate(Species=gsub('Khh 3054 ficus iteophylla miq.', 'Ficus', Species)) %>%
mutate(Species=gsub('Kl. borstgras', 'Poaceae', Species)) %>%
mutate(Species=gsub('Kleine malvaceae', 'Malvaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Kletter",
values="Asteraceae")) %>%
mutate(Species=gsub('Klimmer asclepiadaceae', 'Asclepiadaceae', Species)) %>%
mutate(Species=gsub('Klimmer curcuvitaceae', 'Cucurbitaceae', Species)) %>%
mutate(Species=gsub('Kl. sauergras', 'Cyperaceae', Species)) %>%
mutate(Species=gsub('Knabenkraut gefleckt', 'Orchis', Species)) %>%
mutate(Species=gsub('Knubbelblüt. gras haarkranz vgl f', 'Poaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Koenigskerze",
values="Verbascum")) %>%
mutate(Species=gsub('Kriechgras zynodon', 'Poaceae', Species)) %>%
mutate(Species=gsub('Kürbis', 'Cucurbitaceae', Species)) %>%
mutate(Species=gsub('Lamiaceen strauch', 'Lamiaceae', Species)) %>%
mutate(Species=gsub('Lamiacee orange', 'Lamiaceae', Species)) %>%
mutate(Species=gsub('Lamiales orobanchaceae + phrymaceae + plantaginaceae + scrophulariaceae', 'Orobanchaceae', Species)) %>%
mutate(Species=gsub('Lantanacamara wandelrösschen', 'Lantana camara', Species)) %>%
mutate(Species=gsub('Lasiopeta watheroo k. shepherd & c. wilkins ks', 'Lasiopetalum', Species)) %>%
mutate(Species=gsub('Leg-inderteminada', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Legu 1fiedrig groá schlank', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Legume$', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Leguminosae spgm', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Leguminosea', 'Fabaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Leguminose",
values="Fabaceae")) %>%
mutate(Species=gsub('Leheelo grass', 'Poaceae', Species)) %>%
mutate(Species=gsub('Lepid carra', 'Lepiderema', Species)) %>%
mutate(Species=gsub('Lich caloplaca', 'Caloplaca', Species)) %>%
mutate(Species=gsub('Liliacee', 'Liliaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Lilie",
values="Liliaceae")) %>%
mutate(Species=gsub('Liliengewächs', 'Liliaceae', Species)) %>%
mutate(Species=gsub('Lisea', 'Litsea', Species)) %>%
mutate(Species=gsub('Lisymachia', 'Lysimachia', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Liverwort",
values="Bryophyta")) %>%
mutate(Species=gsub('Livwort', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Lonicerachrysantha', 'Lonicera chrysantha', Species)) %>%
mutate(Species=gsub('Lycoctamnus barbatus', 'Aconitum barbatum', Species)) %>%
mutate(Species=gsub('Lygopus', 'Lycopus', Species)) %>%
mutate(Species=gsub('Maitenus', 'Maytenus', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Malpighiace",
values="Malpighiaceae")) %>%
mutate(Species=gsub('Malpighiales chrysobalanaceae + humiriaceae', 'Malpighiaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Malve",
values="Malvaceae")) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Mammutgras",
values="Poaceae")) %>%
mutate(Species=gsub('Mammutgrass', 'Poaceae', Species)) %>%
mutate(Species=gsub('Maqui guian', 'Maquira guianensis subsp. costaricana', Species)) %>%
mutate(Species=gsub('Marchantiophyta', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Mariana aphylla', 'Maireana aphylla', Species)) %>%
mutate(Species=gsub('Mehrfingeriges ährengras', 'Poaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Melastomata",
values="Melastomataceae")) %>%
mutate(Species=gsub('Mesembr minibl', 'Mesembryanthemum', Species)) %>%
mutate(Species=gsub('Mesostomma kotschyanum', 'Mesostemma kotschyana', Species)) %>%
mutate(Species=gsub('Microhepatics', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Micromeria micrantha', 'Micromeria graeca subsp. micrantha', Species)) %>%
mutate(Species=gsub('Mimose minifiedrig f', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Miniepilobium', 'Epilobium', Species)) %>%
mutate(Species=gsub('Minimargerite', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Miniochna', 'Ochna', Species)) %>%
mutate(Species=gsub('Minischilf 132466 f', 'Poaceae', Species)) %>%
mutate(Species=gsub('Mistletoe', 'Viscum', Species)) %>%
mutate(Species=gsub('Mniaecia', 'Mniaceae', Species)) %>%
mutate(Species=gsub('Molemo', 'Turraea', Species)) %>%
mutate(Species=gsub('Molses', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Momisa pigra', 'Mimosa pigra', Species)) %>%
mutate(Species=gsub('Monandrus squarrosus', 'Cyperus squarrosus', Species)) %>%
mutate(Species=gsub('Monchema debile', 'Monechma debile', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Monochna",
values="Polygalaceae")) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Moos",
values="Bryophyta")) %>%
mutate(Species=gsub('Moospolster grau-grün', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Mortonioden ', 'Mortoniodendron ', Species)) %>%
mutate(Species=gsub('Mos onbekend', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Mossen overige', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Mougetia', 'Mougeotia', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Musci",
values="Bryophyta")) %>%
mutate(Species=gsub('Myciantes', 'Myrcianthes', Species)) %>%
mutate(Species=gsub('Myrciaã‚â pulchra', 'Myrcia pulchra', Species)) %>%
mutate(Species=gsub('Myrcianov.', 'Myrcia', Species, fixed = T)) %>%
mutate(Species=gsub('Myrsi coria', 'Myrsine coriacea', Species)) %>%
mutate(Species=gsub('Myrtaceenstrauch', 'Myrtaceae', Species)) %>%
mutate(Species=gsub('Nachtkerze fru dreispaltig', 'Onagracaee', Species)) %>%
mutate(Species=gsub('Neobartsia crenoloba', 'Bartsia crenoloba', Species)) %>%
mutate(Species=gsub('None$', 'Nonea', Species)) %>%
mutate(Species=gsub('Ocos adenophylla', 'Symplocos adenophylla', Species)) %>%
mutate(Species=gsub('Officinale subsp. group', 'Taraxacum officinale s.l.', Species)) %>%
mutate(Species=gsub('Orch$', 'Orchidaceae', Species)) %>%
mutate(Species=gsub('Orchid', 'Orchidaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Orchidee",
values="Orchidaceae")) %>%
mutate(Species=replace(Species,
list=word(Species, 1) %in% c("Papilonacea", "Papilionacea"),
values="Fabaceae")) %>%
mutate(Species=gsub('Pasania dodoniifolia', 'Lithocarpus dodonaeifolius', Species)) %>%
mutate(Species=gsub('Phoebengmoensis', 'Phoebe hungmoensis', Species)) %>%
mutate(Species=gsub('Picra antid$', 'Picramnia antidesma subsp. fessonia', Species)) %>%
mutate(Species=gsub('Pinopsida', 'Coniferae', Species)) %>%
mutate(Species=gsub('Pisonianov.', 'Pisonia', Species, fixed=T)) %>%
mutate(Species=gsub('Pithecellob ', 'Pithecellobium ', Species)) %>%
mutate(Species=gsub('Pithecocten', 'Pithecoctenium', Species)) %>%
mutate(Species=gsub('Pleradenoph longicuspis', 'Pleradenophora longicuspis', Species)) %>%
mutate(Species=gsub('Pleuranthod ', 'Pleuranthodendron ', Species)) %>%
mutate(Species=gsub('Poales', 'Poaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1) %in% c("Polygalacea", "Polygalacee"),
values="Polygalaceae")) %>%
mutate(Species=replace(Species,
list=word(Species, 1) %in% c("Polygonaceae", "Polygonacee"),
values="Polygonaceae")) %>%
mutate(Species=gsub('Polygonumlongisetum', 'Polygonum longisetum', Species)) %>%
mutate(Species=gsub('Posoq coria subsp. maxima', 'Posoqueria coriacea subsp. maxima', Species)) %>%
mutate(Species=gsub('Prosthecidi ', 'Prosthecidiscus ', Species)) %>%
mutate(Species=gsub('Pseudo bidens', '', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1) %in%
c("Pseudobriza", "Pseudofingergras",
"Pseudogerste", "Puschelgras", "Quecke",
"Queckenblatt", "Queckengras",
"Roggen/hafer", "Ruchgras", "Silbergras",
"Suessgras"),
values="Poaceae")) %>%
mutate(Species=gsub('Ptarmica', 'Achillea', Species)) %>%
mutate(Species=gsub('Pterost cauline leaves n. gibson & m.n. lyons', 'Pterostegia', Species)) %>%
mutate(Species=gsub('Quararibeaã‚â guianensis', 'Quararibea guianensis', Species)) %>%
mutate(Species=gsub('Rainfarn f', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Ranke ipomoea', 'Ipomoea', Species)) %>%
mutate(Species=gsub('Ranke rubiaceae', 'Rubiaceae', Species)) %>%
mutate(Species=gsub('Rauwolfia', 'Rauvolfia', Species)) %>%
mutate(Species=gsub('Rheinfarn', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Rhodostemon kunthiana', 'Rhodostemonodaphne kunthiana', Species)) %>%
mutate(Species=gsub('Riccardia/aneura', 'Bryophyta', Species)) %>%
mutate(Species=gsub('Rietgras steril 134051a', 'Poaceae', Species)) %>%
mutate(Species=gsub('Rosenbergio formosum', 'Rosenbergiodendron formosum', Species)) %>%
mutate(Species=gsub('Rotes puschelgras', 'Poaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Rubiacea",
values="Rubiaceae")) %>%
mutate(Species=gsub('Rytidospe goomallin a.g. gunness et al. oakp 10/', 'Rytidosperma', Species)) %>%
mutate(Species=gsub('Salacia idoensis', 'Salacia', Species)) %>%
mutate(Species=gsub('Samphire', 'Amaranthaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1) %in%
c("Sauergras", "Schlanksegge", "Sedge",
"Segge", "Simse"),
values="Cyperaceae")) %>%
mutate(Species=gsub('Scaev repen subsp. north sandp r.j. cranf & p.j. spenc', 'Scaevola repens', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Schachtelhalm",
values="Equisetaceae")) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Schnittlauch",
values="Amaryllidaceae")) %>%
mutate(Species=gsub('Schwertlilie trocken', 'Iridaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1) %in% c("Scropholacea", "Scrophulariacea", "Scroph."),
values="Scrophulariacea")) %>%
mutate(Species=gsub('Sitzende onagraceae', 'Onagraceae', Species)) %>%
mutate(Species=gsub('Sonnenblume', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Stachelgurke', 'Cucurbitaceae', Species)) %>%
mutate(Species=gsub('Stark behaarte malve', 'Malvaceae', Species)) %>%
mutate(Species=gsub('Staude asteraceae bl watteweich f', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Staude crotalaria unterseite silber', 'Crotalaria', Species)) %>%
mutate(Species=gsub('Staude solanum', 'Solanaceae', Species)) %>%
mutate(Species=gsub('Staude tephrosia', 'Tephrosia', Species)) %>%
mutate(Species=gsub('Stipagrosist panicle gross', 'Stipagrostis', Species)) %>%
mutate(Species=gsub('Asteraceae u silber', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Stratonostoc communeá', 'Stratonostoc commune', Species)) %>%
mutate(Species=gsub('Strauch asteraceae nadelblätt.', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Strauch blatt wie salix reticulata astera', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Strauch blatt wie salix reticulata astera 132534b', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Strauch fabaceae gerieft schote', 'Fabaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Strauch" &
word(Species,2)=="Rubiaceae",
values="Rubiaceae")) %>%
mutate(Species=gsub('Fabaceae samtig bl lanzettlich', 'Fabaceae', Species)) %>%
mutate(Species=gsub('Ochna mini', 'Ochna', Species)) %>%
mutate(Species=gsub('Stryphnoden microstachyum', 'Stryphnodendron microstachyum', Species)) %>%
mutate(Species=gsub('Sumpfgladiole haarig', 'Gladiolus', Species)) %>%
mutate(Species=gsub('Sygnum ramphicarpa', 'Scrophulariaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1)=="Symplococar",
values="Symplococarpon")) %>%
mutate(Species=gsub('Sysirinchium', 'Sisyrinchium', Species)) %>%
mutate(Species=gsub('Syzigium accuminatisima', 'Syzygium acuminatissimum', Species)) %>%
mutate(Species=gsub('Tabernaemon ', 'Tabernaemontana ', Species)) %>%
mutate(Species=gsub('Thalassodend', 'Thalassodendron', Species)) %>%
mutate(Species=gsub('Thinouia canescens', 'Thinouia', Species)) %>%
mutate(Species=gsub('Thistle', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Trisetumicatum', 'Trisetum spicatum', Species)) %>%
mutate(Species=gsub('Undetermined sedge', 'Cyperaceae', Species)) %>%
mutate(Species=replace(Species,
list=word(Species, 1) %in%
c("Liverwort", "Liverworts", "Moss"),
values="Bryophyta")) %>%
mutate(Species=gsub('Vismi bacci', 'Vismia baccifera subsp. ferruginea', Species)) %>%
mutate(Species=gsub('Weidenr”schen', 'Onagraceae', Species)) %>%
mutate(Species=gsub('Weißpelziger brauner Spross Asteracea', 'Asteraceae', Species)) %>%
mutate(Species=gsub('Wie stipagrostis', 'Poaceae', Species)) %>%
mutate(Species=gsub('Wincassia', 'Fabaceae', Species)) %>%
mutate(Species=gsub('xDactyloden st-quintini', 'Dactylodenia st-quintinii', Species)) %>%
mutate(Species=gsub('Zizyphus sp1 IUCN1', 'Zizyphus', Species)) %>%
mutate(Species=gsub('Zwiebel Lilaceae steril', 'Lilaceae', Species)) %>%
mutate(Species=gsub('Zwstr faurea', 'Faurea', Species)) %>%
mutate(Species=gsub('Quercus crispla', 'Quercus crispula', Species)) %>%
mutate(Species=gsub('Corallorrhiza', 'Corallorhiza', Species)) %>%
mutate(Species=gsub('Brunella vulgaris', 'Prunella vulgaris', Species))
A total of 23286 species names were modified. Although substantially improved, the species list still contains quite a lot of inconsistencies. The total list submitted to TNRS contains 333137 species names.
Export species name list
# Export the unique cleaned species names (column Species) as the csv file
# used for the first TNRS submission.
spec.list.TRY.sPlot %>%
  dplyr::select(Species) %>%
  distinct() %>%
  write_csv(path = "../_derived/TNRS_submit/tnrs_submit_iter1.csv")
The csv-file of species names was submitted to the Taxonomic Name Resolution Service (TNRS) web application (Boyle et al. 2013; iPlant Collaborative 2015). TNRS version 4.0 was used, which became available in August 2015 (this version also included The Plant List version 1.1). TNRS was queried on 24/02/2020.
The following settings were used for resolving names on TNRS.
The initial TNRS name resolution run was based on the five standard sources that were ranked according to preference in the following order (default of TNRS):
Resolved names were assigned to families based on the APGIII classification [@Chase2009], the same classification system used by Tropicos.
Once the matching process was finished, results were retrieved from TNRS using the Detailed Download
option that included the full name information (parsed components, warnings, links to sources, etc.). We retrieved all the matches for each species, constrained by source (TNRS default): the name in the first source was selected as best match; if no suitable match was found
in that source, the match from the next lower-ranked source was selected, until all sources were exhausted.
Manually inspect the TNRS-results table in a spreadsheet application (e.g. LibreOffice or Excel), starting with the highest taxonomic rank considered (i.e. Family). For instance, if manual checking of the TNRS output reveals that all accepted names or synonyms that have accuracy scores >0.9 are correct taxon names, use the following selection procedure:
Continue this selection procedure for entries that were matched at lower taxonomic ranks, i.e. genus, species, etc..
Read the files downloaded from TNRS into R
.
# Import the detailed TNRS download of iteration 1: tab-delimited, UTF-8,
# quoting disabled so that stray quotation marks in names are read literally.
# All columns are read as character except the numeric scores, the running
# Name_number and the logical Selected flag.
# The argument is spelled `col_types` in full instead of relying on partial
# argument matching of `col_type`.
tnrs.res0 <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter1.txt",
  delim = "\t",
  locale = locale(encoding = 'UTF-8'),
  quote = "",
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  ))
Best matches are selected in successive steps, depending on the taxonomic level at which each record was matched. Records were sorted based on decreasing match scores. Matches at low taxonomic level (variety, subspecies) were favoured over matches at high taxonomic levels (family, sections). When having exactly the same ranks, the records were ranked based on their source, as explained above.
For each name submitted, only the record having the highest rank was retained.
#reorder priorities
# Preference order of TNRS source combinations, most preferred first.
# It is used as the level order of the `Source` factor, so that sorting by
# `Source` breaks ties between otherwise equally scored matches.
TNRS.priorities <- c(
  # combinations that include tpl
  "tpl;gcc;tropicos;usda", "tpl;gcc;tropicos", "tpl;gcc;usda",
  "tpl;ildis;tropicos", "tpl;ildis;usda", "tpl;tropicos;usda",
  "tpl;gcc", "tpl;ildis", "tpl;tropicos", "tpl;usda",
  # gcc-based combinations
  "gcc;tropicos;usda", "gcc;tropicos", "tropicos;gcc", "gcc;usda", "gcc",
  # ildis-based combinations
  "ildis;tropicos;usda", "ildis;tropicos", "ildis;usda", "ildis",
  # tpl alone is moved down the list, because for legumes and composites
  # tpl relies on gcc or ILDIS
  "tpl",
  "tropicos;usda", "tropicos", "usda"
)
# Reduce the raw TNRS output to one record per submitted name.
# Rank, source and taxonomic status are turned into factors whose level order
# encodes preference (values not listed in `levels` become NA). Records are
# then sorted by Name_number, by decreasing match scores and finally by
# Source and Taxonomic_status, and the first record of each Name_submitted
# is kept.
# NOTE(review): Name_matched_rank is converted to a factor but is not one of
# the sort keys below - confirm whether ranking by rank was intended.
# The result stays grouped by Name_submitted.
tnrs.res <- tnrs.res0 %>%
  mutate(
    Name_matched_rank = factor(
      Name_matched_rank,
      levels = c("variety", "subspecies", "species", "genus",
                 "family", "section", "supersection",
                 "infraspecies", "forma", "race",
                 "nothosubspecies", "proles", "monstr",
                 "series")),
    Source = factor(Source, levels = TNRS.priorities),
    Taxonomic_status = factor(
      Taxonomic_status,
      levels = c("Accepted","Synonym", "No opinion","Invalid",
                 "Illegitimate","Misapplied","Rejected name"))
  ) %>%
  #filter(Taxonomic_status %in% c("Accepted", "Synonym")) %>%
  arrange(
    Name_number,
    desc(Genus_score),
    desc(Specific_epithet_score),
    desc(Infraspecific_epithet_2_score),
    desc(Infraspecific_epithet_score),
    desc(Family_score),
    desc(Name_score),
    desc(Overall_score),
    Source,
    Taxonomic_status
  ) %>%
  group_by(Name_submitted) %>%
  slice(1)
After this first step, there are 1709 records for which no match was found. Another 15578 were unreliably matched (overall match score <0.9).
Manually inspect sorted table and select all entries at the highest hierarchical level (family). Manually identify the family accuracy score threshold value above which a name can be considered a correct name. In the following case, this corresponds to a score $>$0.88.
# Row positions of family-level matches that are accepted names or synonyms
# and have a family score above the manually chosen threshold (0.88).
index.family <- which(
  tnrs.res$Name_matched_rank %in% "family" &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.res$Family_score > 0.88)
length(index.family)
## [1] 741
# Row positions of genus-level matches considered reliable:
#   - accepted names or synonyms with genus score > 0.83, or
#   - "No opinion" records with a (near-)perfect genus score (>= 0.99).
# Fix: `&` binds tighter than `|` in R, so the previous expression applied the
# rank == "genus" restriction only to the first alternative, and "No opinion"
# records of any rank were selected. The two alternatives are now grouped
# explicitly, as in the genus filters of the later iterations.
# NOTE(review): this changes which records are selected, so the counts
# printed below this chunk will differ after re-running.
index.genus <- which(
  tnrs.res$Name_matched_rank == "genus" &
    ((tnrs.res$Taxonomic_status %in% c("Synonym", "Accepted") &
        tnrs.res$Genus_score > 0.83) |
       (tnrs.res$Taxonomic_status == "No opinion" &
          tnrs.res$Genus_score >= 0.99)))
length(index.genus)
## [1] 45771
# Row positions of matches considered reliable at species level.
# The grouping below spells out how R parses the original expression
# (`&` binds tighter than `|`):
#   condition 1: species-rank match, accepted name or synonym,
#                genus score > 0.78 and name score > 0.90
#   condition 2: genus score > 0.90 and specific epithet score > 0.90
#                (effective for records with subspecies information)
# NOTE(review): condition 2 is therefore NOT restricted to species-rank
# records - confirm that this is intended.
index.species <- which(
  (tnrs.res$Name_matched_rank == "species" &
     tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym") &
     tnrs.res$Genus_score > 0.78 &
     tnrs.res$Name_score > 0.90) |
    (tnrs.res$Genus_score > 0.90 &
       tnrs.res$Specific_epithet_score > 0.90))
length(index.species)
## [1] 311135
# Accepted names / synonyms matched at infraspecies or subspecies rank.
# Records whose rank is NA are included as well: there are a few records at
# sub-species level which are not categorized.
index.subspec <- which(
  (is.na(tnrs.res$Name_matched_rank) |
     tnrs.res$Name_matched_rank %in% c("infraspecies", "subspecies")) &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym"))
length(index.subspec)
## [1] 8499
# Accepted names / synonyms matched at variety rank.
index.variety <- which(
  tnrs.res$Name_matched_rank %in% "variety" &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym"))
length(index.variety)
## [1] 7179
# All records matched at infraspecies rank, whatever their status.
index.infraspec <- which(tnrs.res$Name_matched_rank %in% "infraspecies")
length(index.infraspec)
## [1] 92
# All records matched at forma rank, whatever their status.
index.forma <- which(tnrs.res$Name_matched_rank %in% "forma")
length(index.forma)
## [1] 173
# Unmatched records whose submitted name starts with "Spermatophyta".
index.spermatophyt <- which(
  tnrs.res$Name_matched %in% "No suitable matches found." &
    word(tnrs.res$Name_submitted, 1) %in% "Spermatophyta")
length(index.spermatophyt)
## [1] 47
certain or uncertain names
Select names that do not fulfill the search criteria, i.e. that were not selected as certain species, for further name matching.
# Pool the row positions selected by the rank-specific filters; these rows
# form the "certain" set of iteration 1, all other rows the "uncertain" set.
index.tnrs <- unique(c(index.family, index.forma, index.genus, index.species,
                       index.subspec, index.variety, index.spermatophyt))
tnrs.res.certain <- tnrs.res[index.tnrs,]
dim(tnrs.res.certain)
## [1] 329815 36
write.csv(tnrs.res.certain, file = "../_derived/TNRS_submit/tnrs.res.iter1.certain.csv")
# Take the complement via setdiff() rather than a negative index:
# `x[-integer(0), ]` selects zero rows, so an empty index.tnrs would wrongly
# produce an empty "uncertain" table instead of the full one.
tnrs.res.uncertain <- tnrs.res[setdiff(seq_len(nrow(tnrs.res)), index.tnrs),]
dim(tnrs.res.uncertain)
## [1] 3319 36
write.csv(tnrs.res.uncertain, file = "../_derived/TNRS_submit/tnrs.res.iter1.uncertain.csv")
save(tnrs.res.certain, tnrs.res.uncertain, file="../_derived/TNRS_submit/tnrs.iter1.RData")
Many unmatched records do contain subspecies information which could not be retrieved in TNRS, although genus and species seem to be spelled correctly. Also, sometimes the mismatch derives from having the word ‘species’ or ‘sp’ at the end of the name.
# Build the submission table for iteration 2 from the uncertain names:
#   old           - name as submitted in iteration 1
#   new           - working copy, replaced by the family name when the first
#                   word contains a string ending in "aceae"
#   Name_binomial - first two words of `new` (one word if there is no second)
tnrs.submit.iter2 <- data.frame(old=tnrs.res.uncertain$Name_submitted) %>%
  mutate(old=as.character(old),
         new=old) %>%
  # delete remaining records of mushroom species
  filter(!word(new,1) %in% mushroom) %>%
  na.omit() %>%
  # one group per submitted name, so word(new, c(1,2)) below returns the
  # first and second word of that single name
  group_by(old) %>%
  # Extract family name for unidentified species
  mutate(new=coalesce(str_extract(word(new,1), pattern='([^\\s]+aceae)'),
                      new)) %>%
  #Cut to the first 2 words in the name string
  mutate(Name_binomial=paste(word(new, c(1,2)), collapse=" ")) %>%
  ungroup() %>%
  # a missing second word was pasted in as the literal " NA"; strip it
  mutate(Name_binomial=gsub(' NA$', '', Name_binomial))
tnrs.submit.iter2 %>%
  dplyr::select(Name_binomial) %>%
  #After cleaning some names now match to those already resolved in iteration 1. Take them out
  filter(!Name_binomial %in% tnrs.res.certain$Name_submitted) %>%
  distinct() %>%
  write_csv(path="../_derived/TNRS_submit/tnrs_submit_iter2.csv")
# Import the detailed TNRS download of iteration 2 (tab-delimited, UTF-8,
# quoting disabled). Scores and Name_number are numeric, Selected is logical,
# everything else is read as character.
# The argument is spelled `col_types` in full instead of relying on partial
# argument matching of `col_type`.
tnrs.res.iter2.raw <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter2.txt",
  delim = "\t",
  locale = locale(encoding = 'UTF-8'),
  quote = "",
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  ))
# Keep one record per submitted name for iteration 2: convert rank, source
# and status to preference-ordered factors, sort by Name_number, decreasing
# scores, Source and Taxonomic_status, then take the first record of every
# Name_submitted. The result stays grouped by Name_submitted.
tnrs.res.iter2 <- tnrs.res.iter2.raw %>%
  mutate(
    Name_matched_rank = factor(
      Name_matched_rank,
      levels = c("variety", "subspecies", "species",
                 "genus", "family", "section",
                 "supersection", "infraspecies", "forma",
                 "race", "nothosubspecies", "proles",
                 "monstr", "series")),
    Source = factor(Source, levels = TNRS.priorities),
    Taxonomic_status = factor(
      Taxonomic_status,
      levels = c("Accepted","Synonym", "No opinion",
                 "Invalid","Illegitimate","Misapplied",
                 "Rejected name"))
  ) %>%
  arrange(
    Name_number,
    desc(Genus_score),
    desc(Specific_epithet_score),
    desc(Infraspecific_epithet_2_score),
    desc(Infraspecific_epithet_score),
    desc(Family_score),
    desc(Name_score),
    desc(Overall_score),
    Source,
    Taxonomic_status
  ) %>%
  group_by(Name_submitted) %>%
  slice(1)
# Rank-specific selection of reliable matches for iteration 2. Each index
# holds row positions in tnrs.res.iter2 and overwrites the object of the same
# name from iteration 1.
# Family level: accepted name or synonym, family score > 0.88.
index.family <- which(
  tnrs.res.iter2$Name_matched_rank %in% "family" &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.res.iter2$Family_score > 0.88)
length(index.family)
## [1] 2
# Genus level: accepted name or synonym, genus score >= 0.90, name score > 0.49.
index.genus <- which(
  tnrs.res.iter2$Name_matched_rank %in% "genus" &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted","Synonym") &
    tnrs.res.iter2$Genus_score >= 0.90 &
    tnrs.res.iter2$Name_score > 0.49)
length(index.genus)
## [1] 17
# Species level: any taxonomic status (the status restriction is disabled),
# genus score >= 0.80 and specific epithet score > 0.90.
index.species <- which(
  tnrs.res.iter2$Name_matched_rank %in% "species" &
    #(tnrs.res.iter2$Taxonomic_status == "Accepted" |
    # tnrs.res.iter2$Taxonomic_status == "Synonym") &
    tnrs.res.iter2$Genus_score >= 0.80 &
    tnrs.res.iter2$Specific_epithet_score > 0.90)
length(index.species)
## [1] 32
# All infraspecies-rank records, whatever their status.
index.infraspec <- which(tnrs.res.iter2$Name_matched_rank %in% "infraspecies")
length(index.infraspec)
## [1] 0
# Accepted names / synonyms at infraspecies or subspecies rank; NA ranks are
# included because there are a few records at sub-species level which are
# not categorized.
index.subspec <- which(
  (is.na(tnrs.res.iter2$Name_matched_rank) |
     tnrs.res.iter2$Name_matched_rank %in% c("infraspecies", "subspecies")) &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted", "Synonym"))
length(index.subspec)
## [1] 0
# Accepted names / synonyms at variety rank.
index.variety <- which(
  tnrs.res.iter2$Name_matched_rank %in% "variety" &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted", "Synonym"))
length(index.variety)
## [1] 0
# All forma-rank records, whatever their status.
index.forma <- which(tnrs.res.iter2$Name_matched_rank %in% "forma")
length(index.forma)
## [1] 0
# Unmatched records whose submitted name starts with "Spermatophyta".
index.spermatophyt <- which(
  tnrs.res.iter2$Name_matched %in% "No suitable matches found." &
    word(tnrs.res.iter2$Name_submitted, 1) %in% "Spermatophyta")
length(index.spermatophyt)
## [1] 0
# Split the iteration-2 results into "certain" rows (selected by any of the
# rank-specific filters) and "uncertain" rows (everything else).
index.tnrs.iter2 <- unique(c(index.family, index.forma, index.genus,
                             index.species, index.subspec,
                             index.variety, index.spermatophyt))
tnrs.res.iter2.certain <- tnrs.res.iter2[index.tnrs.iter2,]
dim(tnrs.res.iter2.certain)
## [1] 51 36
write.csv(tnrs.res.iter2.certain, file = "../_derived/TNRS_submit/tnrs.res.iter2.certain.csv")
# Take the complement via setdiff() rather than a negative index:
# `x[-integer(0), ]` selects zero rows, so if no record had been selected the
# "uncertain" table would wrongly come out empty.
tnrs.res.iter2.uncertain <- tnrs.res.iter2[setdiff(seq_len(nrow(tnrs.res.iter2)),
                                                   index.tnrs.iter2),]
dim(tnrs.res.iter2.uncertain)
## [1] 2745 36
write.csv(tnrs.res.iter2.uncertain, file = "../_derived/TNRS_submit/tnrs.res.iter2.uncertain.csv")
save(tnrs.res.iter2.certain, tnrs.res.iter2.uncertain,
     tnrs.submit.iter2, file="../_derived/TNRS_submit/tnrs.iter2.RData")
# Export the names for iteration 3.
# NOTE(review): column 2 is selected by position - presumably Name_submitted;
# confirm against the TNRS column order.
write_csv(tnrs.res.iter2.uncertain[,2], path = "../_derived/TNRS_submit/tnrs_submit_iter3.csv")
This list was submitted to TNRS, but only selecting the NCBI database.
TNRS_NCBI
# Import the detailed TNRS download of iteration 3 (tab-delimited, UTF-8,
# quoting disabled). Scores and Name_number are numeric, Selected is logical,
# everything else is read as character.
# The argument is spelled `col_types` in full instead of relying on partial
# argument matching of `col_type`.
tnrs.res.iter3.raw <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter3.txt",
  delim = "\t",
  locale = locale(encoding = 'UTF-8'),
  quote = "",
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  ))
# Keep one record per submitted name for iteration 3. Rank and status become
# preference-ordered factors; Source is left as read (it is not converted to
# a factor here). Records are sorted by Name_number, decreasing scores,
# Source and Taxonomic_status, and the first record of every Name_submitted
# is kept. The result stays grouped by Name_submitted.
tnrs.ncbi <- tnrs.res.iter3.raw %>%
  mutate(
    Name_matched_rank = factor(
      Name_matched_rank,
      levels = c("variety", "subspecies", "species",
                 "genus", "family", "section", "supersection",
                 "infraspecies", "forma", "race",
                 "nothosubspecies", "proles", "monstr",
                 "series")),
    Taxonomic_status = factor(
      Taxonomic_status,
      levels = c("Accepted","Synonym", "No opinion","Invalid",
                 "Illegitimate","Misapplied","Rejected name"))
  ) %>%
  arrange(
    Name_number,
    desc(Genus_score),
    desc(Specific_epithet_score),
    desc(Infraspecific_epithet_2_score),
    desc(Infraspecific_epithet_score),
    desc(Family_score),
    desc(Name_score),
    desc(Overall_score),
    Source,
    Taxonomic_status
  ) %>%
  group_by(Name_submitted) %>%
  slice(1)
# Rank-specific selection of reliable matches for iteration 3. Each index
# holds row positions in tnrs.ncbi.
# Family level: accepted name or synonym, family score > 0.85.
index.family <- which(
  tnrs.ncbi$Name_matched_rank %in% "family" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.ncbi$Family_score > 0.85)
length(index.family)
## [1] 8
# Genus level: accepted / synonym / no opinion, with either a good genus
# score and a moderate name score, or a near-perfect genus score and a low
# name score.
index.genus <- which(
  tnrs.ncbi$Name_matched_rank %in% "genus" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym", "No opinion") &
    ((tnrs.ncbi$Genus_score > 0.89 & tnrs.ncbi$Name_score > 0.49) |
       (tnrs.ncbi$Genus_score > 0.99 & tnrs.ncbi$Name_score > 0.2)))
length(index.genus)
## [1] 286
# Species level, rule 1: accepted / synonym with a very high name score.
index.species.1 <- which(
  tnrs.ncbi$Name_matched_rank %in% "species" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.ncbi$Name_score > 0.94 &
    tnrs.ncbi$Specific_epithet_score>=0.67)
length(index.species.1)
## [1] 180
# Species level, rule 2: accepted / synonym with a good genus score and a
# moderate name score.
index.species.2 <- which(
  tnrs.ncbi$Name_matched_rank %in% "species" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.ncbi$Genus_score > 0.81 &
    tnrs.ncbi$Name_score > 0.51 &
    tnrs.ncbi$Specific_epithet_score>=0.67)
length(index.species.2)
## [1] 196
# Species level, rule 3: "No opinion" records with acceptable genus and
# epithet scores.
index.species.3 <- which(
  tnrs.ncbi$Name_matched_rank %in% "species" &
    tnrs.ncbi$Taxonomic_status %in% "No opinion" &
    tnrs.ncbi$Genus_score > 0.7 &
    tnrs.ncbi$Specific_epithet_score > 0.75)
length(index.species.3)
## [1] 0
# Union of the three species-level rules.
index.species <- unique(c(index.species.1, index.species.2, index.species.3))
length(index.species)
## [1] 212
# Records matched below species level (subspecies / variety) or with rank
# "unknown", whose status is accepted, no opinion or synonym.
# NOTE(review): "unknown" is not among the factor levels assigned to
# Name_matched_rank when tnrs.ncbi is built, so such values are NA by now and
# this comparison can never match - confirm whether is.na() was intended.
index.var <- which(
  tnrs.ncbi$Name_matched_rank %in% c("subspecies", "unknown", "variety") &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "No opinion", "Synonym"))
length(index.var)
## [1] 0
certain or uncertain names
index.ncbi <- unique(c(index.family, index.genus, index.species, index.var))
# Split the iteration-3 (NCBI) results into "certain" rows (selected by any
# rank-specific filter) and "uncertain" rows (everything else).
tnrs.ncbi.certain <- tnrs.ncbi[index.ncbi,]
nrow(tnrs.ncbi.certain)
## [1] 506
write_csv(tnrs.ncbi.certain, path = "../_derived/TNRS_submit/tnrs.ncbi.certain.csv")
# Take the complement via setdiff() rather than a negative index:
# `x[-integer(0), ]` selects zero rows, so an empty index.ncbi would wrongly
# produce an empty "uncertain" table.
tnrs.ncbi.uncertain <- tnrs.ncbi[setdiff(seq_len(nrow(tnrs.ncbi)), index.ncbi),]
nrow(tnrs.ncbi.uncertain)
## [1] 2239
write_csv(tnrs.ncbi.uncertain, path = "../_derived/TNRS_submit/tnrs.ncbi.uncertain.csv")
save(tnrs.ncbi.certain, tnrs.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.iter3.RData")
After iteration 3, there are still 2239 unresolved taxa.
The Plant List matching tools for unresolved names
Generate the names list from tnrs.ncbi.uncertain to be matched against The Plant List, using Taxonstand::TPL. Also add to this list all those species that did not return an accepted name in the first iterations.
# Names to be checked against The Plant List: every "certain" name of
# iterations 1-3 that came back without an accepted name, plus all names
# still uncertain after iteration 3. Duplicates are removed.
tpl.submit <- bind_rows(
  tnrs.res.certain %>%
    filter(is.na(Accepted_name)) %>%
    dplyr::select(Name_submitted),
  tnrs.res.iter2.certain %>%
    filter(is.na(Accepted_name)) %>%
    dplyr::select(Name_submitted),
  tnrs.ncbi.certain %>%
    filter(is.na(Accepted_name)) %>%
    dplyr::select(Name_submitted),
  tnrs.ncbi.uncertain %>%
    dplyr::select(Name_submitted)
) %>%
  distinct()
nrow(tpl.submit)
write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
# Divide the names into (at most) 99 batches: `indices %% 99` assigns each
# row one of 99 residues, and sorting them makes every batch a contiguous
# run of rows.
# seq_len()/seq_along() replace 1:n so that an empty input yields no batches
# instead of the sequence c(1, 0).
indices <- seq_len(nrow(tpl.submit))
chunks <- split(indices, sort(indices%%99))
library(doParallel)
library(parallel)
# Query TPL batch-wise on a fork cluster of 5 workers. Each worker also saves
# its own batch to disk, so partial results survive a failed run.
cl <- makeForkCluster(5, outfile="")
registerDoParallel(cl)
# `finally` guarantees that the workers are shut down even if a batch errors;
# previously an error would leave the cluster running.
tpl.ncbi <- tryCatch(
  foreach(i=seq_along(chunks), .combine=rbind) %dopar% {
    tmp <- (TPL(tpl.submit$Name_submitted[chunks[[i]]]))
    save(tmp, file=paste0("../_derived/TNRS_submit/TPL_foreach/tpl.ncbi", i,".RData"))
    tmp
  },
  finally = stopCluster(cl))
save(tpl.ncbi, file = "../_derived/TPL/tpl_results_iter4.RData")
# Reload the TPL results and split them:
#   certain   - matched in the plant name index or at a higher level
#   uncertain - matched in neither (only the submitted Taxon is kept)
# TRUE/FALSE are written out in full; the shorthands T/F are ordinary
# variables that can be reassigned.
load("../_derived/TPL/tpl_results_iter4.RData")
tpl.ncbi.certain <- tpl.ncbi %>%
  filter(Plant.Name.Index==TRUE | Higher.level==TRUE)
nrow(tpl.ncbi.certain)
## [1] 27338
write_csv(tpl.ncbi.certain, path = "../_derived/TPL/tpl.ncbi.certain.csv")
tpl.ncbi.uncertain <- tpl.ncbi %>%
  filter(Plant.Name.Index==FALSE & Higher.level==FALSE) %>%
  dplyr::select(Taxon)
nrow(tpl.ncbi.uncertain)
## [1] 5771
write_csv(tpl.ncbi.uncertain, path = "../_derived/TPL/tpl.ncbi.uncertain.csv")
save(tpl.ncbi.certain, tpl.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.iter4.RData")
# Restore the certain / uncertain tables saved by the four iterations.
load("../_derived/TNRS_submit/tnrs.iter1.RData")
load("../_derived/TNRS_submit/tnrs.iter2.RData")
load("../_derived/TNRS_submit/tnrs.iter3.RData")
load("../_derived/TNRS_submit/tnrs.iter4.RData")
#Double check of wrong taxa from TNRS
# These two names are resolved again with TPL() and the result is appended
# to the certain TPL matches.
finalcheck <- c("Salix repens subsp. repens var. repens","Hieracium lachenalii")
tpl.ncbi.certain <- bind_rows(tpl.ncbi.certain, TPL(finalcheck))
Combine the certain
data sets:
# Assemble the backbone: one row per original sPlot/TRY name, with the name
# strings used at each cleaning step and the best resolved match attached.
Backbone <- spec.list.TRY.sPlot %>%
as.tbl() %>%
rename(Name_sPlot_TRY=OriginalNames,
Name_string_corr1=Species) %>%
# attach the shortened name (Name_binomial) created for iteration 2, if any
left_join(tnrs.submit.iter2 %>%
dplyr::select(-new) %>%
rename(Name_string_corr1=old, Name_string_corr2=Name_binomial),
by="Name_string_corr1") %>%
# the name actually submitted is the iteration-2 string when it exists,
# otherwise the manually cleaned string
mutate(Name_submitted=ifelse(!is.na(Name_string_corr2), Name_string_corr2, Name_string_corr1)) %>%
dplyr::select(Name_sPlot_TRY, Name_string_corr1, Name_string_corr2, Source, Name_submitted) %>%
# Source of spec.list.TRY.sPlot is renamed so that it does not clash with the
# Source column of the resolved tables joined below
rename(sPlot_TRY=Source) %>%
# right-hand side of the join: all "certain" matches stacked together
left_join(tnrs.res.certain %>%
#filter(!is.na(Accepted_name)) %>%
bind_rows(tnrs.res.iter2.certain) %>%
bind_rows(tnrs.ncbi.certain) %>%
#reformat TPL output to tnrs output
bind_rows(tpl.ncbi.certain %>%
rename(Name_submitted=Taxon,
Name_matched_url=ID,
Taxonomic_status=Taxonomic.status,
Accepted_name_author=New.Authority,
Accepted_name_url=New.ID,
Accepted_name_family=Family,
Selected=Plant.Name.Index) %>%
# empty strings instead of NA, so that paste() below does not insert "NA"
mutate_at(.vars=vars(New.Hybrid.marker, New.Infraspecific.rank, New.Infraspecific),
.fun=~ifelse(is.na(.), "", .)) %>%
mutate(Accepted_name=paste(New.Genus, New.Hybrid.marker,
New.Species, New.Infraspecific.rank,
New.Infraspecific)) %>%
# collapse the runs of spaces left by empty name components
mutate(Accepted_name=gsub(pattern="\\s+", " ", Accepted_name)) %>%
mutate(Accepted_name_species=paste(New.Genus, New.Hybrid.marker, New.Species)) %>%
mutate(Accepted_name_species=gsub(pattern="\\s+", " ", Accepted_name_species)) %>%
mutate(Accepted_name_rank=ifelse(Higher.level==F, "species", NA)) %>%
mutate(Source=paste("tpl", TPL.version)) %>%
# keep only the columns that also exist in tnrs.ncbi, in tnrs.ncbi's order
dplyr::select( (data.frame(colmatch=match(colnames(tnrs.ncbi),
names(.))) %>%
filter(!is.na(colmatch)))$colmatch)
) %>%
group_by(Name_submitted) %>% #Some double matches. Prioritize best taxonomic status
mutate(Taxonomic_status=factor(Taxonomic_status,
levels=c("Accepted","Synonym", "No opinion","Invalid",
"Illegitimate","Misapplied","Rejected name",
"Unresolved"))) %>%
# sort by status preference, then keep the first record per submitted name
arrange(Taxonomic_status) %>%
slice(1) %>%
#delete empty spaces at end of names
mutate(Accepted_name=gsub(pattern=" $", replacement="", x=Accepted_name)) %>%
mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)),
by="Name_submitted")
#Double check
nrow(Backbone) == nrow(spec.list.TRY.sPlot)
## [1] TRUE
Add four additional columns. If names were resolved at neither the accepted nor the synonym level, set Status_correct == "Other", and assign "No suitable matches found." to the remaining species.
# Add the four summary columns:
#   Status_correct - Accepted / Synonym / Other (all remaining statuses),
#                    or "No suitable matches found." when the status is NA
#   Name_correct   - accepted name if any, otherwise the matched name
#   Genus_correct  - first word of Name_correct, unless the accepted name is
#                    at family rank or there is no name at all
#   Rank_correct   - matched rank, "higher" when the rank is missing
Backbone <- Backbone %>%
  mutate(
    Status_correct = fct_explicit_na(
      fct_collapse(Taxonomic_status,
                   Other=c("No opinion","Invalid", "Unresolved",
                           "Illegitimate","Misapplied","Rejected name")),
      "No suitable matches found."),
    #Create Name_correct field. Use Accepted names, if any. Otherwise matched names.
    Name_correct = coalesce(Accepted_name, Name_matched),
    Genus_correct = ifelse(is.na(Name_correct) | Accepted_name_rank %in% c("family"),
                           NA,
                           word(Name_correct,1)),
    Rank_correct = factor(
      coalesce(as.character(Name_matched_rank), "higher"),
      levels=c("higher", "family", "genus", "species",
               "subspecies", "variety", "infraspecies",
               "race", "forma"))
  )
summary(Backbone$Status_correct)
## Accepted Synonym
## 284561 28617
## Other No suitable matches found.
## 30085 3175
summary(Backbone$Rank_correct)
## higher family genus species subspecies variety
## 6390 1889 27104 294425 8948 7443
## infraspecies race forma
## 92 0 147
There are 3070 species names for which we found no match in any of the taxonomic resources we used. Yet, for as many as 35383 taxa, the matching did not properly resolve the species name, and we only found a match at genus or higher level.
There are 35195 records with missing family information. Create field Family_correct
.
# Create Family_correct: the accepted family where available; otherwise a
# family name extracted from the first word of Name_correct, if that word
# contains a string ending in "aceae".
Backbone <- Backbone %>%
  mutate(Family_correct=coalesce(
    Accepted_name_family,
    str_extract(word(Name_correct,1), pattern='([^\\s]+aceae)')))
# Remaining records with missing family info
sum((is.na(Backbone$Family_correct)))
## [1] 33398
TNRS
# Unique genera of all records that still lack a family, exported for a
# genus-level TNRS query.
Genera_submit <- Backbone %>%
  filter(is.na(Family_correct)) %>%
  distinct(Genus_correct)
write_csv(Genera_submit, "../_derived/TNRS_submit/Genera_submit.csv")
Import results from TNRS. Best match only and simple download
# Column specification of the TNRS simple download (best match only).
import.profile <- cols(
  Name_submitted = col_character(),
  Name_matched = col_character(),
  Author_matched = col_logical(),
  Overall_score = col_double(),
  Taxonomic_status = col_character(),
  Accepted_name = col_character(),
  Accepted_author = col_character(),
  Accepted_family = col_character(),
  Source = col_character(),
  Warnings = col_character(),
  Accepted_name_lsid = col_character()
)
# Read the genus-level TNRS results (tab-delimited, UTF-8, quoting disabled).
# The argument is spelled `col_types` in full instead of relying on partial
# argument matching of `col_type`.
tnrs.genera <- read_delim(
  "../_derived/TNRS_submit/tnrs_genera.txt",
  delim = "\t",
  locale = locale(encoding = 'UTF-8'),
  quote = "",
  col_types = import.profile)
Attach resolved families to backbone
# Fill missing families with the family TNRS returned for the record's genus.
# Families that are already known are never overwritten.
Backbone <- Backbone %>%
  left_join(tnrs.genera %>%
              dplyr::select(Genus_correct=Name_submitted,
                            Family_import=Accepted_family),
            by="Genus_correct") %>%
  mutate(Family_correct=coalesce(Family_correct, Family_import)) %>%
  dplyr::select(-Family_import)
#Records with missing family info
sum(is.na(Backbone$Family_correct))
## [1] 10067
TRY 5.0
Data from try were received by Jens Kattge on Jan 21, 2020.
# Species, Genus, Family from try
try.species <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/hierarchy.info.csv",
  locale = locale(encoding = "latin1"))
# Fill families that are still missing with the family TRY lists for the
# record's genus.
# Fix: the lookup table was grouped by genus but never reduced to one row per
# genus, so a genus listed under more than one family would have duplicated
# Backbone rows in the join. As in the Catalogue of Life step, the first
# family per genus is now kept and the table is ungrouped before joining.
Backbone <- Backbone %>%
  left_join(try.species %>%
              dplyr::select(Genus_correct=Genus, family=Family) %>%
              distinct() %>%
              filter(family != "") %>%
              group_by(Genus_correct) %>%
              slice(1) %>%
              ungroup(),
            by="Genus_correct") %>%
  mutate(Family_correct=coalesce(Family_correct, family)) %>%
  dplyr::select(-family)
# Remaining records with missing family info
sum((is.na(Backbone$Family_correct)))
## [1] 7954
The Catalogue of Life
#Download data from Catalogue of Life - 2019
# The Plantae archive is downloaded, taxa.txt is extracted from it and read
# with explicit column types.
download.file(
  "http://www.catalogueoflife.org/DCA_Export/zip/archive-kingdom-plantae-bl3.zip",
  destfile="/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/CatLife2019.zip")
unzip("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/CatLife2019.zip",
      files="taxa.txt",
      exdir = "/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/")
cat.life <- read_delim(
  "/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/taxa.txt",
  delim="\t",
  col_types = cols(
    .default = col_character(),
    taxonID = col_double(),
    datasetID = col_double(),
    acceptedNameUsageID = col_double(),
    parentNameUsageID = col_double(),
    superfamily = col_logical(),
    subgenus = col_logical(),
    source = col_logical(),
    namePublishedIn = col_logical(),
    modified = col_character(),
    taxonConceptID = col_double(),
    isExtinct = col_logical()
  )) %>%
  #correct family names to match to the standards in TPL
  mutate(family=recode(family,
                       Fabaceae="Leguminosae",
                       Asteraceae="Compositae"))
# Genera that have a name but still no family.
Genera_missing <- Backbone %>%
  filter(is.na(Family_correct), !is.na(Genus_correct)) %>%
  distinct(Genus_correct)
# Fill the missing families from the Catalogue of Life genus-family pairs.
Backbone <- Backbone %>%
  left_join(cat.life %>%
              distinct(genus, family) %>%
              filter(family != "") %>%
              #There are two genera with multiple attribution to families:
              # keep the first family listed for each genus
              group_by(genus) %>%
              slice(1) %>%
              # only genera that actually need a family
              filter(genus %in% Genera_missing$Genus_correct) %>%
              rename(Genus_correct=genus),
            by="Genus_correct") %>%
  mutate(Family_correct=coalesce(Family_correct, family)) %>%
  dplyr::select(-family)
#Records with missing family info
sum(is.na(Backbone$Family_correct))
## [1] 6914
After matching the remaining genera with the Catalogue of life there are still 6914 records without Family affiliation, for a total of 1281 genera.
Backbone <- Backbone %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Coptidium",
values="Ranunculaceae")) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Balanocarpus",
values="Dipterocarpaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Cardaminopsis",
values="Brassicaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Carpolepis",
values="Myrtaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Cathartolinum",
values="Linaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Didiscus",
values="Araliaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Grammadenia",
values="Primulaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Antholoma",
values="Elaeocarpaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Odontarrhena",
values="Brassicaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Trichinium",
values="Amaranthaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Limonium",
values="Plumbaginaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Eunanus",
values="Phrymaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Sunaptea",
values="Dipterocarpaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Aconogonon",
values="Polygonaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Cajophora",
values="Loasaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Calobota",
values="Leguminosae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Alsine",
values="Caryophyllaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Cyanococcus",
values="Ericaceae" )) %>%
mutate(Family_correct=ifelse(Family_correct %in% c("Papilionaceae", "Fabaceae"),
"Leguminosae", Family_correct)) %>%
mutate(Family_correct=ifelse(Family_correct=="Asteraceae", "Compositae", Family_correct)) %>%
mutate(Family_correct=ifelse(Family_correct=="Unknown", NA, Family_correct))
# Count the backbone entries that still lack a family assignment
sum(is.na(Backbone[["Family_correct"]]))
## [1] 6830
Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info.
# Build a genus -> family lookup from the accepted names in the backbone.
# A genus can be linked to more than one family (e.g. in case of unresolved
# species names), so keep, for each genus, the family that occurs most often.
# The occurrences have to be counted BEFORE the genus/family pairs are
# de-duplicated: counting after distinct() gives n = 1 for every pair, so the
# "most frequent" family would just be the first one listed for the genus.
genera_families <- Backbone %>%
  filter(Taxonomic_status == "Accepted") %>%
  dplyr::select(Genus_correct, family = Family_correct) %>%
  na.omit() %>%
  # one row per genus/family pair, n = number of accepted entries
  dplyr::count(Genus_correct, family) %>%
  group_by(Genus_correct) %>%
  # most frequent family first; ties are broken alphabetically
  arrange(desc(n), family, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select(-n)
# Complement missing families with the genus -> family lookup built above
Backbone <- Backbone %>%
  left_join(genera_families, by = "Genus_correct") %>%
  # keep an existing family; otherwise take the genus-derived one (may be NA)
  mutate(Family_correct = dplyr::coalesce(Family_correct, family)) %>%
  dplyr::select(-family)
# Number of records that still have no family after the fill-in
sum(is.na(Backbone[["Family_correct"]]))
## [1] 6366
# Check that the backbone has as many rows as spec.list.TRY.sPlot
identical(nrow(Backbone), nrow(spec.list.TRY.sPlot))
## [1] TRUE
# Families whose entries are removed from the backbone together with the
# genera listed in `mushroom`
mushroom.families <- c(
  "Physalacriaceae", "Clavariaceae", "Agaricaceae", "Roccellaceae",
  "Atheliaceae", "Meruliaceae", "Helotiaceae", "Dacrymycetaceae",
  "Boletaceae", "Cortinariaceae", "Polyporaceae", "Pleosporaceae",
  "Leotiaceae", "Dermateaceae", "Hymenochaetaceae", "Stereaceae",
  "Tremellaceae"
)
# Drop every entry matching either the genus list or the family list
Backbone <- Backbone %>%
  filter(
    !(Genus_correct %in% mushroom),
    !(Family_correct %in% mushroom.families)
  )
Name_short
Shorten names that have more than two words and where the second word is an x. If there is no species name available, fill in with either genus or family info.
Backbone <- Backbone %>%
  mutate(
    # remove the " x " separator, then keep the first two words of the name
    Name_short = word(
      gsub(" x ", " ", Name_correct, fixed = TRUE),
      start = 1L, end = 2L
    ),
    # where Name_short is NA, fall back on the genus, then on the family
    Name_short = dplyr::coalesce(Name_short, Genus_correct, Family_correct)
  )
# Entries left without a short name, and entries without a corrected name
sum(is.na(Backbone[["Name_short"]]))
## [1] 3076
sum(is.na(Backbone[["Name_correct"]]))
## [1] 3070
is_vascular_species
and Taxon group
Attach phylum information from The Catalogue of Life.
# Attach the phylum of each family, taken from the cat.life table
Backbone <- Backbone %>%
  left_join(
    cat.life %>%
      # rename on selection so that the join key matches the backbone
      dplyr::select(phylum, Family_correct = family) %>%
      distinct() %>%
      na.omit(),
    by = "Family_correct"
  )
Create fields is_vascular_species
and Taxon group
based on a list of manually classified families, and on phyla from The Catalogue of Life.
Assign all families that belong to Tracheophyta
to category is_vascular_species
, based on the phylum attached from The Catalogue of Life.
# Flag vascular plants and assign every entry to a taxon group.
# Later replace() calls overwrite earlier ones, so the effective priority is
# Moss > Alga > Lichen > Vascular plant > Unknown.
Backbone <- Backbone %>%
  # TRUE/FALSE from the phylum; stays NA where no phylum was attached
  mutate(is_vascular_species = ifelse(phylum == "Tracheophyta", TRUE, FALSE)) %>%
  # families manually classified as vascular override the phylum information
  mutate(is_vascular_species = replace(is_vascular_species,
                                       list = Family_correct %in% vascular,
                                       values = TRUE)) %>%
  mutate(`Taxon group` = "Unknown") %>%
  mutate(`Taxon group` = ifelse((!is.na(is_vascular_species) &
                                   is_vascular_species == TRUE),
                                "Vascular plant", `Taxon group`)) %>%
  mutate(`Taxon group` = replace(`Taxon group`,
                                 list = Family_correct %in% lichens,
                                 values = "Lichen")) %>%
  mutate(`Taxon group` = replace(`Taxon group`,
                                 list = Genus_correct %in% lichen.genera,
                                 values = "Lichen")) %>%
  mutate(`Taxon group` = replace(`Taxon group`,
                                 list = Family_correct %in% algae_diatoms,
                                 values = "Alga")) %>%
  mutate(`Taxon group` = replace(`Taxon group`,
                                 list = phylum %in% c("Glaucophyta", "Rhodophyta",
                                                      "Charophyta", "Chlorophyta"),
                                 values = "Alga")) %>%
  mutate(`Taxon group` = replace(`Taxon group`,
                                 list = Family_correct %in% mosses,
                                 values = "Moss")) %>%
  # "Moss" covers mosses, liverworts and hornworts; the original vector listed
  # "Bryophyta" twice and left out the liverwort phylum
  mutate(`Taxon group` = replace(`Taxon group`,
                                 list = phylum %in% c("Bryophyta", "Marchantiophyta",
                                                      "Anthocerotophyta"),
                                 values = "Moss")) %>%
  # anything grouped as moss, alga or lichen cannot be a vascular plant
  mutate(is_vascular_species = ifelse(`Taxon group` %in% c("Moss", "Alga", "Lichen"),
                                      FALSE, is_vascular_species))
# Frequency of each taxon group and of the vascular flag (NA shown as well)
table(Backbone[["Taxon group"]], exclude = NULL)
##
## Alga Lichen Moss Unknown Vascular plant
## 211 4470 2961 6802 331948
table(Backbone[["is_vascular_species"]], exclude = NULL)
##
## FALSE TRUE <NA>
## 8819 331948 5625
Name_sPlot_TRY | Name_string_corr1 | Name_string_corr2 | sPlot_TRY | Name_submitted | Name_number | Overall_score | Name_matched | Name_matched_rank | Name_score | Name_matched_author | Name_matched_url | Author_matched | Author_score | Family_matched | Family_score | Name_matched_accepted_family | Genus_matched | Genus_score | Specific_epithet_matched | Specific_epithet_score | Infraspecific_rank | Infraspecific_epithet_matched | Infraspecific_epithet_score | Infraspecific_rank_2 | Infraspecific_epithet_2_matched | Infraspecific_epithet_2_score | Annotations | Unmatched_terms | Taxonomic_status | Accepted_name | Accepted_name_author | Accepted_name_rank | Accepted_name_url | Accepted_name_species | Accepted_name_family | Selected | Source | Warnings | Accepted_name_lsid | Status_correct | Name_correct | Genus_correct | Rank_correct | Family_correct | Name_short | phylum | is_vascular_species | Taxon group |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Nuphar variegata | Nuphar variegata | NA | T | Nuphar variegata | 212752 | 1.0 | Nuphar variegata | species | 1.0 | Durand | http://www.theplantlist.org/tpl1.1/record/tro-22600101;http://www.tropicos.org/Name/22600101 | NA | NA | NA | NA | Nymphaeaceae | Nuphar | 1 | variegata | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Nuphar variegata | Durand | species | http://www.theplantlist.org/tpl1.1/record/tro-22600101;http://www.tropicos.org/Name/22600101 | Nuphar variegata | Nymphaeaceae | TRUE | tpl;tropicos | NA | NA | Accepted | Nuphar variegata | Nuphar | species | Nymphaeaceae | Nuphar variegata | Tracheophyta | TRUE | Vascular plant |
Euphorbia bougheyi | Euphorbia bougheyi | NA | T | Euphorbia bougheyi | 124646 | 1.0 | Euphorbia bougheyi | species | 1.0 | L.C.Leach | http://www.theplantlist.org/tpl1.1/record/kew-78692 | NA | NA | NA | NA | Euphorbiaceae | Euphorbia | 1 | bougheyi | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Euphorbia bougheyi | L.C.Leach | species | http://www.theplantlist.org/tpl1.1/record/kew-78692 | Euphorbia bougheyi | Euphorbiaceae | TRUE | tpl | NA | NA | Accepted | Euphorbia bougheyi | Euphorbia | species | Euphorbiaceae | Euphorbia bougheyi | Tracheophyta | TRUE | Vascular plant |
Astragalus squalidus | Astragalus squalidus | NA | S | Astragalus squalidus | 32307 | 1.0 | Astragalus squalidus | species | 1.0 | Boiss. | http://www.theplantlist.org/tpl1.1/record/tro-13026903;http://www.tropicos.org/Name/13026903 | NA | NA | NA | NA | Fabaceae | Astragalus | 1 | squalidus | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Synonym | Astragalus amoenus subsp. squalidus | (Boiss.) Ponert | subspecies | http://www.theplantlist.org/tpl1.1/record/tro-13074560;http://www.tropicos.org/Name/13074560 | Astragalus amoenus | Fabaceae | TRUE | tpl;tropicos | NA | NA | Synonym | Astragalus amoenus subsp. squalidus | Astragalus | species | Leguminosae | Astragalus amoenus | Tracheophyta | TRUE | Vascular plant |
Cryptantha johnstonii | Cryptantha johnstonii | NA | T | Cryptantha johnstonii | 86515 | 1.0 | Cryptantha johnstonii | species | 1.0 | L.C.Higgins | http://www.theplantlist.org/tpl1.1/record/kew-2745602 | NA | NA | NA | NA | Boraginaceae | Cryptantha | 1 | johnstonii | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Cryptantha johnstonii | L.C.Higgins | species | http://www.theplantlist.org/tpl1.1/record/kew-2745602 | Cryptantha johnstonii | Boraginaceae | TRUE | tpl | NA | NA | Accepted | Cryptantha johnstonii | Cryptantha | species | Boraginaceae | Cryptantha johnstonii | Tracheophyta | TRUE | Vascular plant |
Eugenia feijoi | Eugenia feijoi | NA | ST | Eugenia feijoi | 122930 | 1.0 | Eugenia feijoi | species | 1.0 | O.Berg | http://www.theplantlist.org/tpl1.1/record/kew-75185 | NA | NA | NA | NA | Myrtaceae | Eugenia | 1 | feijoi | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Synonym | Eugenia moschata | (Aubl.) Nied. ex T.Durand & B.D.Jacks. | species | http://www.theplantlist.org/tpl1.1/record/kew-76030 | Eugenia moschata | Myrtaceae | TRUE | tpl | NA | NA | Synonym | Eugenia moschata | Eugenia | species | Myrtaceae | Eugenia moschata | Tracheophyta | TRUE | Vascular plant |
Racosperma forrestianum | Racosperma forrestianum | NA | T | Racosperma forrestianum | 261472 | 1.0 | Racosperma forrestianum | species | 1.0 | (Pritz.) Pedley | http://www.theplantlist.org/tpl1.1/record/tro-50247318;http://www.tropicos.org/Name/50247318 | NA | NA | NA | NA | Fabaceae | Racosperma | 1 | forrestianum | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Racosperma forrestianum | (Pritz.) Pedley | species | http://www.theplantlist.org/tpl1.1/record/tro-50247318;http://www.tropicos.org/Name/50247318 | Racosperma forrestianum | Fabaceae | TRUE | tpl;tropicos | NA | NA | Accepted | Racosperma forrestianum | Racosperma | species | Leguminosae | Racosperma forrestianum | Tracheophyta | TRUE | Vascular plant |
Sebastiania klotzschiana | Sebastiania klotzschiana | NA | ST | Sebastiania klotzschiana | 282511 | 1.0 | Sebastiania klotzschiana | species | 1.0 | Müll.Arg. | http://www.theplantlist.org/tpl1.1/record/kew-190173 | NA | NA | NA | NA | Euphorbiaceae | Sebastiania | 1 | klotzschiana | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Synonym | Sebastiania commersoniana | (Baill.) L.B.Sm. & Downs | species | http://www.theplantlist.org/tpl1.1/record/kew-189993 | Sebastiania commersoniana | Euphorbiaceae | TRUE | tpl | NA | NA | Synonym | Sebastiania commersoniana | Sebastiania | species | Euphorbiaceae | Sebastiania commersoniana | Tracheophyta | TRUE | Vascular plant |
Symplocos pedicellata | Symplocos pedicellata | NA | T | Symplocos pedicellata | 302945 | 1.0 | Symplocos pedicellata | species | 1.0 | Kurz | http://www.theplantlist.org/tpl1.1/record/kew-2578473 | NA | NA | NA | NA | Symplocaceae | Symplocos | 1 | pedicellata | 1 | NA | NA | NA | NA | NA | NA | NA | NA | No opinion | NA | NA | NA | NA | NA | NA | TRUE | tpl | NA | NA | Other | Symplocos pedicellata | Symplocos | species | Symplocaceae | Symplocos pedicellata | Tracheophyta | TRUE | Vascular plant |
Mocquerysia distans | Mocquerysia distans | NA | T | Mocquerysia distans | 203260 | 0.5 | Mocquerysia | genus | 0.5 | NA | http://www.theplantlist.org/tpl1.1/search?q=Mocquerysia | NA | NA | NA | NA | Salicaceae | Mocquerysia | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | distans | Accepted | Mocquerysia | NA | genus | http://www.theplantlist.org/tpl1.1/search?q=Mocquerysia | NA | Salicaceae | TRUE | tpl | [Partial match] | NA | Accepted | Mocquerysia | Mocquerysia | genus | Salicaceae | Mocquerysia | Tracheophyta | TRUE | Vascular plant |
Litsea alba | Litsea alba | NA | T | Litsea alba | 183072 | 1.0 | Litsea alba | species | 1.0 | Kosterm. | http://www.theplantlist.org/tpl1.1/record/kew-2351565;http://www.tropicos.org/Name/17803701 | NA | NA | NA | NA | Lauraceae | Litsea | 1 | alba | 1 | NA | NA | NA | NA | NA | NA | NA | NA | No opinion | NA | NA | NA | ; | NA | NA | TRUE | tpl;tropicos | NA | NA | Other | Litsea alba | Litsea | species | Lauraceae | Litsea alba | Tracheophyta | TRUE | Vascular plant |
Agonandra peruviana | Agonandra peruviana | NA | ST | Agonandra peruviana | 9034 | 1.0 | Agonandra peruviana | species | 1.0 | Hiepko | http://www.theplantlist.org/tpl1.1/record/kew-2626673;http://www.tropicos.org/Name/50065155 | NA | NA | NA | NA | Opiliaceae | Agonandra | 1 | peruviana | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Agonandra peruviana | Hiepko | species | http://www.theplantlist.org/tpl1.1/record/kew-2626673;http://www.tropicos.org/Name/50065155 | Agonandra peruviana | Opiliaceae | TRUE | tpl;tropicos | NA | NA | Accepted | Agonandra peruviana | Agonandra | species | Opiliaceae | Agonandra peruviana | Tracheophyta | TRUE | Vascular plant |
Isoetes gymnocarpa | Isoetes gymnocarpa | NA | ST | Isoetes gymnocarpa | 165656 | 0.5 | Isoetes | genus | 0.5 | NA | http://www.theplantlist.org/tpl1.1/search?q=Isoetes;http://plants.usda.gov/java/profile?symbol=ISOET | NA | NA | NA | NA | Isoetaceae | Isoetes | 1 | NA | NA | NA | NA | NA | NA | NA | NA | NA | gymnocarpa | Accepted | Isoetes | NA | genus | http://www.theplantlist.org/tpl1.1/search?q=Isoetes;http://plants.usda.gov/java/profile?symbol=ISOET | NA | Isoetaceae | TRUE | tpl;usda | [Partial match] | NA | Accepted | Isoetes | Isoetes | genus | Isoetaceae | Isoetes | Tracheophyta | TRUE | Vascular plant |
Uvaria muricata | Uvaria muricata | NA | T | Uvaria muricata | 321204 | 1.0 | Uvaria muricata | species | 1.0 | Pierre ex Engl. & Diels | http://www.theplantlist.org/tpl1.1/record/kew-2448270;http://www.tropicos.org/Name/1601230 | NA | NA | NA | NA | Annonaceae | Uvaria | 1 | muricata | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Uvaria muricata | Pierre ex Engl. & Diels | species | http://www.theplantlist.org/tpl1.1/record/kew-2448270;http://www.tropicos.org/Name/1601230 | Uvaria muricata | Annonaceae | TRUE | tpl;tropicos | NA | NA | Accepted | Uvaria muricata | Uvaria | species | Annonaceae | Uvaria muricata | Tracheophyta | TRUE | Vascular plant |
Pittosporum glabrum | Pittosporum glabrum | NA | ST | Pittosporum glabrum | 242395 | 1.0 | Pittosporum glabrum | species | 1.0 | Hook. & Arn. | http://www.theplantlist.org/tpl1.1/record/tro-25100060;http://www.tropicos.org/Name/25100060;http://plants.usda.gov/java/profile?symbol=PIGL4 | NA | NA | NA | NA | Pittosporaceae | Pittosporum | 1 | glabrum | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Pittosporum glabrum | Hook. & Arn. | species | http://www.theplantlist.org/tpl1.1/record/tro-25100060;http://www.tropicos.org/Name/25100060;http://plants.usda.gov/java/profile?symbol=PIGL4 | Pittosporum glabrum | Pittosporaceae | TRUE | tpl;tropicos;usda | NA | NA | Accepted | Pittosporum glabrum | Pittosporum | species | Pittosporaceae | Pittosporum glabrum | Tracheophyta | TRUE | Vascular plant |
Festuca mathewsii | Festuca mathewsii | NA | T | Festuca mathewsii | 129846 | 1.0 | Festuca mathewsii | species | 1.0 | NA | http://www.theplantlist.org/tpl1.1/search?q=Festuca+mathewsii | NA | NA | NA | NA | Poaceae | Festuca | 1 | mathewsii | 1 | NA | NA | NA | NA | NA | NA | NA | NA | No opinion | NA | NA | NA | NA | NA | NA | TRUE | tpl | NA | NA | Other | Festuca mathewsii | Festuca | species | Poaceae | Festuca mathewsii | Tracheophyta | TRUE | Vascular plant |
Lambertia echinata | Lambertia echinata | NA | ST | Lambertia echinata | 172369 | 1.0 | Lambertia echinata | species | 1.0 | R.Br. | http://www.theplantlist.org/tpl1.1/record/kew-2492037 | NA | NA | NA | NA | Proteaceae | Lambertia | 1 | echinata | 1 | NA | NA | NA | NA | NA | NA | NA | NA | No opinion | NA | NA | NA | NA | NA | NA | TRUE | tpl | NA | NA | Other | Lambertia echinata | Lambertia | species | Proteaceae | Lambertia echinata | Tracheophyta | TRUE | Vascular plant |
Chresta sphaerocephala | Chresta sphaerocephala | NA | T | Chresta sphaerocephala | 68751 | 1.0 | Chresta sphaerocephala | species | 1.0 |
|
http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=3BB8E819-6557-4EDE-82A5-A0CE373674D5;http://www.tropicos.org/Name/2738517 | NA | NA | NA | NA | Asteraceae | Chresta | 1 | sphaerocephala | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Chresta sphaerocephala |
|
species | http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=3BB8E819-6557-4EDE-82A5-A0CE373674D5;http://www.tropicos.org/Name/2738517 | Chresta sphaerocephala | Asteraceae | TRUE | gcc;tropicos | NA | urn:lsid:compositae.org:names:3BB8E819-6557-4EDE-82A5-A0CE373674D5 | Accepted | Chresta sphaerocephala | Chresta | species | Compositae | Chresta sphaerocephala | Tracheophyta | TRUE | Vascular plant |
Eleocharis carniolica | Eleocharis carniolica | NA | ST | Eleocharis carniolica | 111109 | 1.0 | Eleocharis carniolica | species | 1.0 | W.D.J.Koch | http://www.theplantlist.org/tpl1.1/record/kew-242386 | NA | NA | NA | NA | Cyperaceae | Eleocharis | 1 | carniolica | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Eleocharis carniolica | W.D.J.Koch | species | http://www.theplantlist.org/tpl1.1/record/kew-242386 | Eleocharis carniolica | Cyperaceae | TRUE | tpl | NA | NA | Accepted | Eleocharis carniolica | Eleocharis | species | Cyperaceae | Eleocharis carniolica | Tracheophyta | TRUE | Vascular plant |
Lyonia squamulosa | Lyonia squamulosa | NA | ST | Lyonia squamulosa | 187845 | 1.0 | Lyonia squamulosa | species | 1.0 | M. Martens & Galeotti | http://www.theplantlist.org/tpl1.1/record/tro-12303062;http://www.tropicos.org/Name/12303062 | NA | NA | NA | NA | Ericaceae | Lyonia | 1 | squamulosa | 1 | NA | NA | NA | NA | NA | NA | NA | NA | Accepted | Lyonia squamulosa | M. Martens & Galeotti | species | http://www.theplantlist.org/tpl1.1/record/tro-12303062;http://www.tropicos.org/Name/12303062 | Lyonia squamulosa | Ericaceae | TRUE | tpl;tropicos | NA | NA | Accepted | Lyonia squamulosa | Lyonia | species | Ericaceae | Lyonia squamulosa | Tracheophyta | TRUE | Vascular plant |
Gerardia homalantha | Gerardia homalantha | NA | T | Gerardia homalantha | 139112 | 1.0 | Gerardia homalantha | species | 1.0 | (Pennell) Pennell | http://www.theplantlist.org/tpl1.1/record/kew-2824401 | NA | NA | NA | NA | Orobanchaceae | Gerardia | 1 | homalantha | 1 | NA | NA | NA | NA | NA | NA | NA | NA | No opinion | NA | NA | NA | NA | NA | NA | TRUE | tpl | NA | NA | Other | Gerardia homalantha | Gerardia | species | Orobanchaceae | Gerardia homalantha | Tracheophyta | TRUE | Vascular plant |
Accepted_name_rank
at species level of lower)Taxonomic_status
Name_matched
. This field represent the union of accepted + matched nameName_correct
, but only when Accepted_name_rank
is lower than familyAccepted_name_rank
Name_correct
. Complements Accepted_name_family
with multiple sourcesName_correct
The Catalogue of Life
phylum
from The Catalogue of Life
Taxon group
- Taxon group, as in Turboveg: ‘Vascular plant’, ‘Moss’ (includes liverworts), ‘Lichen’, ‘Alga’, ‘Unknown’
# Store the backbone together with the lookup vectors used to build it
save(
  list = c("Backbone", "mushroom", "mushroom.families", "lichen.genera"),
  file = "../_output/Backbone3.0.RData"
)
# Names whose affiliation code contains both "S" and "T"
ToSubmit1 <- Backbone %>%
  filter(grepl("S", sPlot_TRY), grepl("T", sPlot_TRY)) %>%
  dplyr::select(Name_submit = Name_sPlot_TRY)
## add names from T, which match names from S only after standardization
ToSubmit2 <- Backbone %>%
  filter(grepl("T", sPlot_TRY), !grepl("S", sPlot_TRY)) %>%
  # keep the T-only names that equal the corrected name of an S-only entry
  filter(Name_sPlot_TRY %in% (Backbone %>%
    filter(grepl("S", sPlot_TRY), !grepl("T", sPlot_TRY)) %>%
    pull(Name_correct))) %>%
  dplyr::select(Name_submit = Name_sPlot_TRY)
# Stack both sets and write the list to disk
ToSubmit <- bind_rows(ToSubmit1, ToSubmit2)
write_csv(ToSubmit, "../_output/Submit_TRY.csv")
Submitting 70417 species names to TRY
.
TRY5.0
# Entries with "S" in their affiliation code whose original or corrected name
# is among the submitted names
Matched_names <- Backbone %>%
  filter(grepl("S", sPlot_TRY)) %>%
  {
    bind_rows(
      filter(., Name_sPlot_TRY %in% ToSubmit$Name_submit),
      filter(., Name_correct %in% ToSubmit$Name_submit)
    )
  } %>%
  # rows matching on both names would otherwise appear twice
  distinct()
Of the species names submitted to TRY
there are 89827 species names that match sPlot’s (+ Alpine dataset) species names, before or after taxonomic resolution. These correspond to 67803 species names, AFTER taxonomic resolution.
sPlot3.0
and TRY5.0
# Restore the objects stored in the backbone file
load(file = "../_output/Backbone3.0.RData")
How many new entries are in the backbone 3.0 compared to the backbone 2.1? How many entries are in common?
The new backbone contains 346392. The backbone 2.1 contained 130602. The two backbones have 116309 records in common.
Database affiliations (sPlot 3.0, TRY 5.0, and Alpine).
Var1 | Freq |
---|---|
A | 365 |
S | 43715 |
SA | 423 |
ST | 61624 |
STA | 1710 |
T | 238092 |
TA | 463 |
107472 of the total number of entries belong to sPlot. 301889 name entries belong to TRY.
Taxonomic ranks:
Var1 | Freq |
---|---|
higher | 6390 |
family | 1889 |
genus | 27104 |
species | 294379 |
subspecies | 8948 |
variety | 7443 |
infraspecies | 92 |
race | 0 |
forma | 147 |
Var1 | Freq |
---|---|
Accepted | 284516 |
Synonym | 28616 |
No opinion | 29313 |
Invalid | 369 |
Illegitimate | 386 |
Misapplied | 13 |
Rejected name | 1 |
Unresolved | 3 |
NA | 3175 |
Total number of unique standardized taxon names and families:
# Count the distinct non-missing values. Dropping the NAs explicitly is safer
# than subtracting 1, which is only right when at least one NA is present.
length(unique(na.omit(Backbone$Name_short)))
## [1] 271883
length(unique(na.omit(Backbone$Family_correct)))
## [1] 733
Number of entries corresponding to vascular plant species:
# Frequency of the vascular flag, with NA shown as its own class
table(Backbone[["is_vascular_species"]], exclude = NULL)
##
## FALSE TRUE <NA>
## 8819 331948 5625
Number of duplicated entries after taxonomic standardization: Frequency of original (non-standardized) species names per resolved (standardized) name (excluding non-vascular and non-matched species).
# Number of backbone entries per corrected name, most frequent first
df.count <- Backbone %>%
  dplyr::filter(is_vascular_species == TRUE, !is.na(Name_correct)) %>%
  dplyr::count(Name_correct, sort = TRUE)
Name_correct | n |
---|---|
Poaceae | 222 |
Lauraceae | 177 |
Fabaceae | 149 |
Asteraceae | 148 |
Miconia | 144 |
Carex | 139 |
Psychotria | 131 |
Eugenia | 117 |
Cyperus | 104 |
Piper | 84 |
Myrcia | 83 |
Ocotea | 83 |
Taraxacum | 83 |
Rubiaceae | 78 |
Ficus | 75 |
Inga | 75 |
Sloanea | 69 |
Nectandra | 68 |
Myrtaceae | 67 |
Lamiaceae | 62 |
unique
standardized names
Generate a version of the backbone that only includes the unique resolved names in Name_short
, and for the non-unique names, the first row of each duplicated name:
# One row per short name: the first occurrence is kept, NA names are dropped
Backbone.uni <- Backbone %>%
  filter(!is.na(Name_short)) %>%
  distinct(Name_short, .keep_all = TRUE)
There are 271883 unique taxon names in the backbone.
Exclude the non-vascular plant and non-matching taxon names:
# Keep only the entries flagged as vascular (FALSE and NA are dropped)
Backbone.uni.vasc <- dplyr::filter(Backbone.uni, is_vascular_species == TRUE)
Now, run the stats for unique resolved names (excluding non-vascular and non-matching taxa):
# Number of entries in the unique vascular backbone
nrow(Backbone.uni.vasc)
## [1] 263299
There are 263299 unique (vascular plant) taxon names:
Var1 | Freq |
---|---|
A | 179 |
S | 12931 |
SA | 253 |
ST | 49305 |
STA | 1337 |
T | 198911 |
TA | 383 |
Var1 | Freq |
---|---|
higher | 719 |
family | 163 |
genus | 6879 |
species | 254529 |
subspecies | 555 |
variety | 435 |
infraspecies | 3 |
race | 0 |
forma | 16 |
Var1 | Freq |
---|---|
Accepted | 228356 |
Synonym | 10370 |
Other | 24573 |
No suitable matches found. | 0 |
Total number of unique standardized taxon names and families:
# Count the distinct non-missing values. Backbone.uni was filtered on
# !is.na(Name_short), so there is no NA left to discount here: subtracting 1
# under-reported the number of names (263298 instead of the 263299 rows).
length(unique(na.omit(Backbone.uni.vasc$Name_short)))
## [1] 263299
# Same NA-safe count for the families.
# NOTE(review): the value printed below predates this change; if no vascular
# entry has an NA family it presumably increases by one - confirm on re-knit.
length(unique(na.omit(Backbone.uni.vasc$Family_correct)))
## [1] 508
sPlot only:
Backbone.uni.sPlot <- Backbone.uni.vasc %>%
filter(sPlot_TRY %in% c("S", "ST", "SA", "STA"))
There are 63826 unique, corrected names of vascular plants for sPlot species
Database affiliations
Var1 | Freq |
---|---|
S | 12931 |
SA | 253 |
ST | 49305 |
STA | 1337 |
Var1 | Freq |
---|---|
higher | 186 |
family | 106 |
genus | 961 |
species | 61887 |
subspecies | 433 |
variety | 246 |
infraspecies | 0 |
race | 0 |
forma | 7 |
Var1 | Freq |
---|---|
Accepted | 53539 |
Synonym | 4537 |
Other | 5750 |
No suitable matches found. | 0 |
Number of families in sPlot:
# Count the families among the unique vascular sPlot names, NA excluded.
# The original call counted over the whole backbone (all affiliations) and
# included NA as a family, which does not answer "families in sPlot".
# NOTE(review): the value printed below predates this change - re-knit.
length(unique(na.omit(Backbone.uni.sPlot$Family_correct)))
## [1] 734
Done!
R-settings
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] vegdata_0.9.7 foreign_0.8-76 Taxonstand_2.1 pbapply_1.4-2
## [5] taxize_0.9.92 kableExtra_1.1.0 knitr_1.28 data.table_1.12.8
## [9] forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.3
## [13] readr_1.3.1 tidyr_1.0.2 tibble_2.1.3 ggplot2_3.3.0
## [17] tidyverse_1.3.0 reshape2_1.4.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.3 lubridate_1.7.4 ape_5.3 lattice_0.20-40
## [5] zoo_1.8-7 foreach_1.4.8 assertthat_0.2.1 digest_0.6.23
## [9] R6_2.4.1 cellranger_1.1.0 plyr_1.8.6 backports_1.1.5
## [13] reprex_0.3.0 evaluate_0.14 highr_0.8 httr_1.4.1
## [17] pillar_1.4.2 rlang_0.4.4 curl_4.3 readxl_1.3.1
## [21] rstudioapi_0.11 rmarkdown_2.1 webshot_0.5.2 munsell_0.5.0
## [25] broom_0.5.5 compiler_3.6.3 modelr_0.1.6 xfun_0.12
## [29] pkgconfig_2.0.3 htmltools_0.4.0 tidyselect_1.0.0 httpcode_0.2.0
## [33] codetools_0.2-16 reshape_0.8.8 fansi_0.4.1 viridisLite_0.3.0
## [37] crayon_1.3.4 dbplyr_1.4.2 withr_2.1.2 crul_0.9.0
## [41] grid_3.6.3 nlme_3.1-145 jsonlite_1.6.1 gtable_0.3.0
## [45] lifecycle_0.2.0 DBI_1.1.0 magrittr_1.5 scales_1.1.0
## [49] cli_2.0.2 stringi_1.4.6 fs_1.3.2 xml2_1.2.2
## [53] generics_0.0.2 vctrs_0.2.3 iterators_1.0.12 tools_3.6.3
## [57] bold_0.9.0 glue_1.3.1 hms_0.5.3 parallel_3.6.3
## [61] yaml_2.2.1 colorspace_1.4-1 rvest_0.3.5 haven_2.2.0