Timestamp: Fri Nov 27 18:19:43 2020
Drafted: Francesco Maria Sabatini
Revised: Helge Bruelheide, Borja Jimenez-Alfaro
Version: 1.3

Changes to Version 1.1 Additional manual cleaning of species names from BJA, UJ and HB.
Changes to Version 1.2 Changed order of ranking TNRS databases, when a name is matched across more than 1 DB; Using cleaned version of DT table (after stripping non-closed quotation marks). Additionally check with TPL those species that, even if resolved in TNRS, did not return an accepted name.
Changes to Version 1.3 Manual check of names BEFORE matching with TNRS

1 Data preparation

1.1 Load packages

library(reshape2)
library(tidyverse)
library(readr)
library(data.table)
library(knitr)
library(kableExtra)
library(stringr)
library(taxize)
library(Taxonstand)
library(vegdata)

1.2 Read in taxon names from sPlot and TRY

# Import the sPlot species table (DT). The file is read as tab-delimited
# (despite its .csv extension) and every column type is declared explicitly
# instead of being guessed by readr.
DT0 <- readr::read_delim(
  "../sPlot_data_export/sPlot_3_0_2_species_test.csv",
  delim = "\t",
  # `col_types` is spelled out in full; the former `col_type` only worked
  # through R's partial argument matching.
  col_types = cols(
    PlotObservationID = col_double(),
    Taxonomy = col_character(),
    `Taxon group` = col_character(),
    `Taxon group ID` = col_double(),
    `Turboveg2 concept` = col_character(),
    `Matched concept` = col_character(),
    Match = col_double(),
    Layer = col_double(),
    `Cover %` = col_double(),
    `Cover code` = col_character(),
    x_ = col_double()
  )
)

1.3 Import lists of species classified into groups

These objects are defined in the appendix

# Load the manually compiled taxon lists. `mushroom`, used below, is not
# defined in the visible code and is presumably provided by this file.
load("../_derived/taxa_manual.RData")

# Build the list of distinct (original name, matched concept) pairs from the
# sPlot table, excluding fungi in two ways: by the `Taxon group` label and by
# checking the first word of the matched concept against `mushroom`.
splot.species <- DT0 %>%
  rename(Species.original = `Turboveg2 concept`,
         Matched.concept = `Matched concept`) %>%
  # NOTE(review): `!=` also drops rows whose `Taxon group` is NA - confirm
  # that this is intended.
  filter(`Taxon group` != "Mushroom") %>%
  dplyr::select(Species.original, Matched.concept) %>%
  distinct() %>%
  # `%in%` is evaluated element-wise, so no grouping by Matched.concept is
  # needed to flag the names whose first word is listed in `mushroom`.
  mutate(fungi = word(Matched.concept, 1) %in% mushroom) %>%
  # `%in%` never returns NA, so plain negation replaces `fungi==F`
  # (`F` is an ordinary, reassignable variable).
  filter(!fungi) %>%
  dplyr::select(Species.original, Matched.concept)

# The output path is passed positionally: the second argument of write_csv()
# is called `path` in older readr releases and `file` in newer ones, where
# `path` is deprecated.
write_csv(splot.species, "../_derived/splot3.0.2.species.csv")

!!! I used the column from TRY with the full species name, not the column with only two-word name strings

# Re-read the sPlot species list written to disk in the previous step.
splot.species <- read_csv("../_derived/splot3.0.2.species.csv")

# TRY species list. The file is read without a header row (columns are
# auto-named X1, X2, ...) and decoded as Latin1; X6 and X7 are dropped and
# the remaining columns get descriptive names.
try.species <- readr::read_csv("../_input/AccSpecies_TRY5.csv",
                               col_names = FALSE,
                               locale = locale(encoding = 'Latin1')) %>%
  dplyr::select(-X6, -X7) %>%
  rename(try.ID = X1, FullSpecies = X2, Species = X3, Genus = X4,
         Family = X5, GrowthForm = X8)

# Sneak in species from the Alpine database (Borja & Riccardo), as a courtesy
# to Project #18. Tab-delimited, no header row, Latin1-encoded.
alpine.species <- read_delim("../_input/new_alpine_species.txt",
                             col_names = FALSE,
                             delim = "\t",
                             locale = locale(encoding = 'Latin1')) %>%
  rename(Species = X1)

Use the Matched.concept column, as it already contains some standardization by Stephan Hennekens according to SynBioSys.

sPlot 3.0.1 contains 107473 different species names.
TRY 5. contains 302729.
I add to this a list of 2961 alpine species delivered from Riccardo Testolin, within sPlot Project #18.

1.4 Combine species lists

# Stack the three name lists (sPlot = "S", TRY = "T", Alpine = "A") and
# collapse them to one row per name, with `Source` holding the concatenated
# codes of all lists in which the name occurs (e.g. "ST", "SA", "STA").
spec.list.TRY.sPlot <- splot.species %>%
  dplyr::select(Matched.concept) %>%
  rename(Species = Matched.concept) %>%
  mutate(Source = "S") %>%
  bind_rows(try.species %>%
              dplyr::select(FullSpecies) %>% ## using the full name from TRY
              rename(Species = FullSpecies) %>%
              mutate(Source = "T")) %>%
  bind_rows(alpine.species %>%
              mutate(Source = "A")) %>%
  # Count the rows per name and source. `value.var` and `fun.aggregate` are
  # given explicitly: the `>= 1` tests below need counts, and dcast() only
  # falls back to `length` (with a message) when it detects duplicated
  # name/source combinations.
  reshape2::dcast(Species ~ Source,
                  value.var = "Source",
                  fun.aggregate = length) %>%
  # Turn each count into the source code itself, or "" when absent.
  mutate(A = ifelse(A >= 1, "A", ""),
         S = ifelse(S >= 1, "S", ""),
         T = ifelse(T >= 1, "T", "")) %>%
  mutate(Source = paste0(S, T, A)) %>%
  dplyr::select(-A, -S, -T)
            
 #Number of species unique and in common across databases

The total number of species in the backbone is 346438.

## `summarise()` ungrouping output (override with `.groups` argument)
Number of taxa per database
Source Num.taxa
sPlot only 43716
TRY only 238137
Alpine only 365
sPlot + TRY 61624
sPlot + Alpine 423
TRY + Alpine 463
sPlot + TRY + Alpine 1710

1.5 A-priori cleaning of names

Stripping unwanted characters as well as abbreviations (such as hybrid markers) which would prevent name matching:

# Ancillary function: capitalise the first character of each string in `x`,
# leaving all remaining characters untouched. Returns the modified vector.
firstup <- function(x) {
  `substr<-`(x, 1, 1, value = toupper(substr(x, 1, 1)))
}

# Keep the untouched names in `OriginalNames`, normalise `Species` to
# "Capitalised lower case", then apply an ordered list of literal
# (fixed-string) substitutions. Each rule is c(pattern, replacement); the
# rules are applied one after the other, so their order matters.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(OriginalNames = Species,
         Species = firstup(tolower(Species))) %>%
  dplyr::select(OriginalNames, Species, Source) %>%
  mutate(Species = Reduce(
    function(taxa, rule) gsub(rule[1], rule[2], taxa, fixed = TRUE),
    list(
      c('*', ''),
      c('cf. ', ''),
      c('Cf. ', ''),
      c('[', ''),
      c(']', ''),
      c(' x ', ' '),
      c('×', ''),
      c('aff ', ''),
      c('(', ''),
      c(')', ''),
      c(' cf ', ' '),
      c(' aff. ', ' '),
      c('c‚e', 'ceae'),
      # collapse runs of blanks, longest run first
      c('    ', ' '),
      c('   ', ' '),
      c('  ', ' '),
      c('x-', ''),
      c('X-', ''),
      c('×', ''),
      c('like ', ''),
      c(',', ''),
      c('#', '')
    ),
    Species
  )) %>%
  # the final rule is a (regex) replacement of underscores by blanks
  mutate(Species = gsub('_', ' ', Species))

For all names that consist of more than one word and have a number in their first word, remove that first word:

# Drop the first word of a name when that word contains a digit and the name
# has more than one word. Two helper columns are added along the way:
#   firstWordWithNumbers - TRUE when the first (space-delimited) word
#                          contains a digit
#   numberOfWords        - number of non-word separators + 1
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(firstWordWithNumbers = grepl('[0-9]', word(Species, 1))) %>%
  # str_count() returns 0 when there is no separator, so single-word names
  # are counted as 1. The former sapply(gregexpr(...), length) + 1 gave 2 in
  # that case, because gregexpr() returns a single -1 when nothing matches.
  mutate(numberOfWords = str_count(Species, "\\W+") + 1) %>%
  # Vectorised removal of everything up to and including the first blank;
  # names without a blank are left unchanged.
  mutate(Species = ifelse(firstWordWithNumbers & numberOfWords > 1,
                          sub("^[^ ]* ", "", Species),
                          Species))

Correct some name abbreviations using taxname.abbr in vegdata:

# Run the names through vegdata's taxname.abbr(), then keep only the three
# working columns and remove duplicated rows.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(Species = taxname.abbr(Species)) %>%
  dplyr::select(OriginalNames, Species, Source) %>%
  distinct()

1.6 Manual cleaning

Fix known issues in some species names

#Manual cleaning
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(Species=tolower(Species)) %>%
  mutate(Species=firstup(Species)) %>%
  mutate(Species=gsub("^Str ", "", Species)) %>%
  mutate(Species=gsub("^Unknown ", "", Species)) %>%
  mutate(Species=firstup(Species)) %>%
  mutate(Species=gsub(" [0-9]*$", "", Species)) %>%  #delete digits at end of object
  mutate(Species=gsub("^\\d+|\\d+$", "", Species)) %>% #delete digits at the beginning or end of a string
  mutate(Species=gsub(" sp.$", "", Species)) %>%
  mutate(Species=gsub(" sp$", "", Species)) %>%
  mutate(Species=gsub(" species$", "", Species)) %>%
  mutate(Species=gsub(" *$", "", Species)) %>%
  mutate(Species=gsub(" #$", "", Species)) %>%
  mutate(Species=gsub(" m$", "", Species)) %>%
  mutate(Species=gsub("acea ", "aceae ", Species)) %>%
  mutate(Species=gsub('^Agropyrum', 'Agropyron', Species)) %>%
  mutate(Species=gsub('^Anno ', 'Annona ', Species)) %>%
  mutate(Species=gsub('Adpdytes dimidiata', 'Apodytes dimidiata', Species)) %>%
  mutate(Species=gsub('Adenostorna fasciculaturn', 'Adenostoma fasciculatum', Species)) %>%
  mutate(Species=gsub('Arctostapliylos glallca', 'Arctostaphylos glauca', Species)) %>%
  mutate(Species=gsub('Bituminosa bituminosa', 'Bituminaria bituminosa', Species)) %>%
  mutate(Species=gsub('Causurina equisitifolia', 'Causuarina equisetifolia', Species)) %>%
  mutate(Species=gsub('Convulvus arvensis', 'Convolvulus arvensis', Species)) %>%
  mutate(Species=gsub('Diospyrus dygina', 'Diospyros dygina', Species)) %>%
  mutate(Species=gsub('^Dodoea', 'Dodonaea', Species)) %>%
  mutate(Species=gsub('^Boheravia', 'Boerhavia', Species)) %>%
  mutate(Species=gsub('Centaria maculosa', 'Centaurea maculosa', Species)) %>%
  mutate(Species=gsub('Chamrenerium angustifolium', 'Chamaenerion angustifolium', Species)) %>%
  mutate(Species=gsub('^Chicorium', 'Cichorium', Species)) %>%
  mutate(Species=gsub('^Cirsiumum', 'Cirsium', Species)) %>%
  mutate(Species=gsub('^Colubrium', 'Colubrina', Species)) %>%
  mutate(Species=gsub('^Corymbium', 'Corymbia', Species)) %>%
  mutate(Species=gsub('Cosmos bipinnata', 'Cosmos bipinnatus', Species)) %>%
  mutate(Species=gsub('Diospyrus dygina', 'Diospyros digyna', Species)) %>%
  mutate(Species=gsub('Diospyros egbert', 'Diospyros egbert-walkeri', Species)) %>%
  mutate(Species=gsub('Dispyrus halesioides', 'Diospyros halesioides', Species)) %>%
  mutate(Species=gsub('^Drymis', 'Drimys', Species)) %>%
  mutate(Species=gsub('^Dysoxylon', 'Dysoxylum', Species)) %>%
  mutate(Species=gsub('^Eleaegnus', 'Elaeagnus', Species)) %>%
  mutate(Species=gsub('^Eleutherant', 'Eleutherantera', Species)) %>%
  mutate(Species=gsub('^Echicea', 'Echinacea', Species)) %>%
  mutate(Species=gsub('Gauteria foliolata', 'Gaultheria foliolosa', Species)) %>%
  mutate(Species=gsub('^Geophylla', 'Geophyla', Species)) %>%
  mutate(Species=gsub('Gloichidion insignis', 'Glochidion insigne', Species)) %>%
  mutate(Species=gsub('^Glycium', 'Glycine', Species)) %>%
  mutate(Species=gsub('^Hammalis', 'Hamamelis', Species)) %>%
  mutate(Species=gsub('^Hippochoeris', 'Hypochaeris', Species)) %>%
  mutate(Species=gsub('Ilix tephrohylla', 'Ilex tephrophylla', Species)) %>%
  mutate(Species=gsub('^Jasininum', 'Jasminum', Species)) %>%
  mutate(Species=gsub('Jenipa conjuta', 'Jenipa conjunta', Species)) %>%
  mutate(Species=gsub('^Lechytis', 'Lecythis', Species)) %>%
  mutate(Species=gsub('Lespedeza juncus', 'Lespedeza juncea', Species)) %>%
  mutate(Species=gsub('Licania apelata', 'Licania apetala', Species)) %>%
  mutate(Species=gsub('Limeum arenicola', 'Limeum arenicolum', Species)) %>%
  mutate(Species=gsub('^Maniota', 'Manihot', Species)) %>%
  mutate(Species=gsub('^Menta', 'Mentha', Species)) %>%
  mutate(Species=gsub('Metophyum brownei', 'Metopium brownei', Species)) %>%
  mutate(Species=gsub('Miliusa tomentosum', 'Miliusa tomentosa', Species)) %>%
  mutate(Species=gsub('Mimululus ringens', 'Mimulus ringens', Species)) %>%
  mutate(Species=gsub('Nardus strictus', 'Nardus stricta', Species)) %>%
  mutate(Species=gsub('Neea glomeratha', 'Neea glomerata', Species)) %>%
  mutate(Species=gsub('^Onopordon', 'Onopordum', Species)) %>%
  mutate(Species=gsub('^Orbigynia', 'Orbignya', Species)) %>%
  mutate(Species=gsub('Orites excelsa', 'Orites excelsus', Species)) %>%
  mutate(Species=gsub('Paedorata lutea', 'Paederota lutea', Species)) %>%
  mutate(Species=gsub('Palaquin ellipticum', 'Palaquium ellipticum', Species)) %>%
  mutate(Species=gsub('Palmeria arfakensis', 'Palmeria arfakiana', Species)) %>%
  mutate(Species=gsub('Petalostcmum purpureum', 'Petalostemum purpureum', Species)) %>%
  mutate(Species=gsub('Petalostimum purpureum', 'Petalostemum purpureum', Species)) %>%
  mutate(Species=gsub('^Petrosileum', 'Petroselinum', Species)) %>%
  mutate(Species=gsub('Phlomis herba', 'Phlomis herba-venti', Species)) %>%
  mutate(Species=gsub('^Phyllirea', 'Phillyrea', Species)) %>%
  mutate(Species=gsub('Physilus pumula', 'Physalus pumila', Species)) %>%
  mutate(Species=gsub('Picea maria', 'Picea mariana', Species)) %>%
  mutate(Species=gsub('Picea retroXexa', 'Picea retroflexa', Species)) %>%
  mutate(Species=gsub('Pilayella litoralis', 'Pilayella littoralis', Species)) %>%
  mutate(Species=gsub('Placocarpus schaereri', 'Platecarpus schaerer', Species)) %>%
  mutate(Species=gsub('Placocarpus schraereri', 'Platecarpus schaerer', Species)) %>%
  mutate(Species=gsub('^Pulteea', 'Pultenaea', Species)) %>%
  mutate(Species=gsub('Quercus rubrum', 'Quercus rubra', Species)) %>%
  mutate(Species=gsub('Rubus fruticosa', 'Rubus fruticosus', Species)) %>%
  mutate(Species=gsub('Rubus saxatile', 'Rubus saxatilis', Species)) %>%
  mutate(Species=gsub('Rubus sylvatici', 'Rubus sylvaticus', Species)) %>%
  mutate(Species=gsub('^Sanguiria', 'Sanguinaria', Species)) %>%
  mutate(Species=gsub('Sarauja nepaulensis', 'Sarauja nepalensis', Species)) %>%
  mutate(Species=gsub('^Sateria', 'Setaria', Species)) %>%
  mutate(Species=gsub('Sauraiea nepulensis', 'Saurauia nepalensis', Species)) %>%
  mutate(Species=gsub('Schneckia australis', 'Schenckia australis', Species)) %>%
  mutate(Species=gsub('Smirnium oleastrum', 'Smyrnium olusatrum', Species)) %>%
  mutate(Species=gsub('Solms laubachia', 'Solms-laubachia himalayensis', Species)) %>%
  mutate(Species=gsub('Stellaria chamaejasme', 'Stellera chamaejasme', Species)) %>%
  mutate(Species=gsub('Steraria parviflora', 'Setaria parviflora', Species)) %>%
  mutate(Species=gsub('^Stuartia', 'Stewartia', Species)) %>%
  mutate(Species=gsub('Sycops sinensis', 'Sycopsis sinensis', Species)) %>%
  mutate(Species=gsub('Tacetum vulgare', 'Tanacetum vulgare', Species)) %>%
  mutate(Species=gsub('Talinurn angustissimun', 'Talinun angustissimun', Species)) %>%
  mutate(Species=gsub('Talloma hodgsoni', 'Talauma hodgsonii', Species)) %>%
  mutate(Species=gsub('Taraxacum albo', 'Taraxacum album', Species)) %>%
  mutate(Species=gsub('Tetragonia falcata', 'Tetragona falcata', Species)) %>%
  mutate(Species=gsub('Trapogogon', 'Tragopogon', Species)) %>%
  mutate(Species=gsub('Zyzyphus saeri', 'Zizyphus saeri', Species)) %>%
  mutate(Species=gsub('^Helicrysum', 'Helichrysum', Species)) %>%
  mutate(Species=gsub('^Diceropappus rhinocerotis', 'Elytropappus rhinocerotis', Species)) %>%
  mutate(Species=gsub('^Euphorbiace ', 'Euphorbiacaea ', Species)) %>%
  mutate(Species=gsub('^Gloecapsa', 'Gloeocapsa', Species)) %>%
  mutate(Species=gsub('Glycirhiza', 'Glycyrrhiza', Species)) %>%
  mutate(Species=gsub('Abiesnordmannia', 'Abies nordmannia', Species)) %>%
  mutate(Species=gsub('Alnus inca', 'Alnus incana', Species)) %>%
  mutate(Species=gsub('Amalencier alnifolia', 'Amalenchier alnifolia', Species)) %>% 
  mutate(Species=gsub('Antylis barba-jovis', 'Anthyllis barba-jovis', Species)) %>% 
  mutate(Species=gsub('^Albizzia "', 'Albizia ', Species)) %>% 
  mutate(Species=gsub('^Ipomoena ', 'Ipomoea ', Species)) %>% 
  mutate(Species=gsub('^Ipomea ', 'Ipomoea ', Species)) %>% 
  mutate(Species=gsub('Ipomo wolco', 'Ipomoea wolcottiana', Species)) %>% 
  ## additional manual cleaning from UJ, BJA, HB
  mutate(Species=gsub('Abacaba palm', 'Oenocarpus balickii', Species)) %>% 
  mutate(Species=gsub('Acerkuomeii', 'Acer kuomeii', Species)) %>% 
  mutate(Species=gsub('Alder$', 'Alnus', Species)) %>% 
  mutate(Species=gsub('Amapa$', 'Tabebuia', Species)) %>% 
  mutate(Species=gsub('Amapa amargoso', 'Parahancornia amapa', Species)) %>% 
  mutate(Species=gsub('Amapa doce$', 'Tabebuia', Species)) %>% 
  mutate(Species=gsub('Amapai$', 'Tabebuia', Species)) %>% 
  mutate(Species=gsub('Amapaí$', 'Tabebuia', Species)) %>% 
  mutate(Species=gsub('Amapa m1', 'Tabebuia', Species)) %>% 
  mutate(Species=gsub('Amaranth$', 'Amaranthus', Species)) %>% 
  mutate(Species=gsub('Amophora fruticosa', 'Amorpha fruticosa', Species)) %>% 
  mutate(Species=gsub('Anacardiace ', 'Anacardiaceae ', Species)) %>% 
  mutate(Species=gsub('Anagallisarvensis', 'Anagallis arvensis', Species)) %>% 
  mutate(Species=gsub('Anemonenarcissiflora var.', 'Anemone narcissiflora', Species)) %>% 
  mutate(Species=gsub('Anenome ', 'Anemone', Species)) %>% 
  mutate(Species=gsub('Anona ', 'Annona ', Species)) %>% 
  mutate(Species=gsub('Antylis ', 'Anthyllis', Species)) %>% 
  mutate(Species=gsub('Apocyncadea gelbblueh$', 'Apocynaceae', Species)) %>% 
  mutate(Species=gsub('Aracium', 'Crepis', Species)) %>% 
  mutate(Species=gsub('Ardis mexic', 'Ardisia mexicana subsp. siltepecana', Species)) %>% 
  mutate(Species=gsub('Ardis verap', 'Ardisia verapazensis', Species)) %>% 
  mutate(Species=gsub('Argenomne hummemannii', 'Argemone hunnemanni', Species)) %>% 
  mutate(Species=gsub('Artabotus', 'Artabotrys', Species)) %>% 
  mutate(Species=gsub('Artemisiaintegrifolia', 'Artemisia integrifolia', Species)) %>% 
  mutate(Species=gsub('Asclepiacea$', 'Asclepiadaceae', Species)) %>% 
  mutate(Species=gsub('Asclep. klimmer', 'Asclepiadaceae', Species)) %>% 
  mutate(Species=gsub('Astartoseris triquetra', 'Lactuca triquetra', Species)) %>% 
  mutate(Species=gsub('Asteracee ', 'Asteraceae ', Species)) %>% 
  mutate(Species=gsub('Avenula glauc$', 'Avenula', Species)) %>% 
  mutate(Species=gsub('Baikea plurijuga', 'Baikiaea plurijuga', Species)) %>% 
  mutate(Species=gsub('Binse rundbl', 'Juncaceae', Species)) %>% 
  mutate(Species=gsub('Blättrige fabaceae th', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Bonel macro$', 'Bonellia macrocarpa subsp. macrocarpa', Species)) %>% 
  mutate(Species=gsub('Boraginacee samtig', 'Boraginaceae', Species)) %>% 
  mutate(Species=gsub('Bri¢fitos', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Bryophyte$', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Bryopsida', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Carallia macrophylla', 'Carallia', Species)) %>% 
  mutate(Species=gsub('Carexectabilis', 'Carex spectabilis', Species)) %>% 
  mutate(Species=gsub('Carex fein', 'Carex', Species)) %>% 
  mutate(Species=gsub('Cerania vermicularis', 'Thamnolia vermicularis', Species)) %>% 
  mutate(Species=gsub('Chamelauci merredin', 'Chamelaucium', Species)) %>% 
  mutate(Species=gsub('Chamelau drummon', 'Chamelaucium', Species)) %>% 
  mutate(Species=gsub('Charophyta', 'Characeae', Species)) %>% 
  mutate(Species=gsub('Cheiridopsis-keimlinge', 'Cheiridopsis', Species)) %>% 
  mutate(Species=gsub('Chenopodiacee$', 'Chenopodiaceae', Species)) %>% 
  mutate(Species=gsub('Chiangioden mexicanum', 'Chiangiodendron mexicanum', Species)) %>% 
  mutate(Species=gsub('Chiranthode pentadactylon', 'Chiranthodendron pentadactylon', Species)) %>% 
  mutate(Species=gsub('Chrysobalan ', 'Chrysobalanus ', Species)) %>% 
  mutate(Species=gsub('Cladapodiella', 'Cladopodiella', Species)) %>% 
  mutate(Species=gsub('Cleidium ', 'Cleidion ', Species)) %>% 
  mutate(Species=gsub('Collema/leptogium lichenoides', 'Collemataceae', Species)) %>% 
  mutate(Species=gsub('Comarostaph discolor', 'Comarostaphylis discolor', Species)) %>% 
  mutate(Species=gsub('Combretdodendrum africana', 'Combretodendrum africanum', Species)) %>% 
  mutate(Species=gsub('Commelinacaea floscopa', 'Floscopa glomerata', Species)) %>% 
  mutate(Species=gsub('Coyncia setigera', 'Coincya setigera', Species)) %>% 
  mutate(Species=gsub('Crataeva', 'Crateva', Species)) %>% 
  mutate(Species=gsub('Craterosperma', 'Rubiaceae', Species)) %>% 
  mutate(Species=gsub('Crespicium', 'Burseraceae', Species)) %>% 
  mutate(Species=gsub('Critoniadel nubigenus', 'Critoniadelphus nubigenus', Species)) %>% 
  mutate(Species=gsub('Crotalaria/vigna?', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Croto billb', 'Croton billbergianus subsp. pyramidalis', Species)) %>% 
  mutate(Species=gsub('Dana„ racemosa', 'Danae racemosa', Species)) %>% 
  mutate(Species=gsub('Deehasia', 'Dehaasia', Species)) %>% 
  mutate(Species=gsub('Dichapetala', 'Dichapetalum', Species)) %>% 
  mutate(Species=gsub('Distel bractea', 'Asteracaea', Species)) %>% 
  mutate(Species=gsub('Distelig asteraceae', 'Asteracaea', Species)) %>% 
  mutate(Species=gsub('Dodon visco', 'Dodonaea viscosa', Species)) %>% 
  mutate(Species=gsub('Doldenbluetler', 'Apiaceae', Species)) %>% 
  mutate(Species=gsub('Echinosurus capitatus', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Einähriges gras$', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Einähriges gras von gestern$', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Einblütiges rispengras', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Eiovaltrichtergrundblatt orchidee', 'Orchidaceae', Species)) %>% 
  mutate(Species=gsub('Elongata subsp.', 'Pohlia elongata', Species)) %>% 
  mutate(Species=gsub('Enriquebelt ', 'Enriquebeltrania ', Species)) %>% 
  mutate(Species=gsub('Entermorpha ', 'Enteromorpha ', Species)) %>% 
  mutate(Species=gsub('Erodiurn$', 'Erodium', Species)) %>% 
  mutate(Species=gsub('Euc. chloroclada x camaldulensis', 'Eucalyptus', Species)) %>% 
  mutate(Species=gsub('Euphorbiacée ipatouduluga gouduatché', 'Euphorbiaceae', Species)) %>% 
  mutate(Species=gsub('Fabacee kleeblatt stengel schwarzdrüsi', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Fabaceenstrauch wie 132446 f', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Fabaceenstr kleinbltrg', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Fabacee wie lotus f', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Farn', 'Pteridophyta', Species)) %>% 
  mutate(Species=gsub('Farn cystopteris', 'Cystopteris', Species)) %>% 
  mutate(Species=gsub('Fern', 'Pteridophyta', Species)) %>% 
  mutate(Species=replace(Species, list=word(Species, 1)=="Fingergras", values="Digitaria")) %>% 
    mutate(Species=replace(Species, list=word(Species, 1)=="Fingerhirse", values="Digitaria")) %>% 
  mutate(Species=gsub('Gelbe onagraceae', 'Onagraceae', Species)) %>% 
  mutate(Species=gsub('Gramine', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Graminea', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Graminia', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Grannenquecke', 'Poaceae', Species)) %>% 
  mutate(Species=replace(Species, 
                     list=word(Species, 1)=="Gras", 
                     values="Poaceae")) %>% 
  mutate(Species=gsub('Gynostachi dicanthus', 'Gymnostachium diacanthus', Species)) %>% 
  mutate(Species=gsub('Hafer haarkranz', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Hapolosiphon', 'Hapalosiphon', Species)) %>% 
  mutate(Species=gsub('Heliocrysum', 'Helichrysum', Species)) %>% 
  mutate(Species=replace(Species, list=word(Species, 1)=="Hepaticae", values="Bryophyta")) %>% 
  mutate(Species=gsub('Hepaticas', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Hepatophyta', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Hermerocalis', 'Hemerocallis', Species)) %>% 
  mutate(Species=replace(Species, 
                     list=word(Species, 1)=="Hirse", 
                     values="Poaceae")) %>% 
  mutate(Species=gsub('Hirte trian', 'Hirtella triandra subsp. media', Species)) %>% 
    mutate(Species=replace(Species, list=word(Species, 1)=="Hohlzahn", values="Lamiaceae")) %>%
  mutate(Species=gsub('Hondurodend urceolatum', 'Hondurodendron urceolatum', Species)) %>% 
  mutate(Species=gsub('Hornklee gelb', 'Fabaceae', Species)) %>% 
  mutate(Species=replace(Species, 
                     list=word(Species, 1)=="Horstgras",
                     values="Poaceae")) %>% 
  mutate(Species=replace(Species, 
                     list=word(Species, 1)=="Huehnerhirse",
                     values="Digitaria")) %>% 
  mutate(Species=gsub('Hydrocoleus lyngbyaceus', 'Hydrocoleum lyngbyaceum', Species)) %>% 
  mutate(Species=gsub('Hyernima nipensis', 'Hieronyma nipensis', Species)) %>% 
  mutate(Species=gsub('Hyeronima', 'Hieronyma', Species)) %>% 
  mutate(Species=gsub('Hypocal angusti', 'Hypocalymma angustifolium', Species)) %>% 
  mutate(Species=gsub('Hypocalym nambung', 'Hypocalymma', Species)) %>% 
  mutate(Species=gsub('Hyprium', 'Hypericum', Species)) %>% 
  mutate(Species=gsub('Igelkolben', 'Sparganium', Species)) %>% 
  mutate(Species=gsub('Ilexã‚â paraguariensis', 'Ilex', Species)) %>% 
  mutate(Species=gsub('Ipomea', 'Ipomoea', Species)) %>% 
  mutate(Species=gsub('Ipomoena', 'Ipomoea', Species)) %>% 
  mutate(Species=gsub('Jm kürbis stark behaart', 'Cucurbitaceae', Species)) %>% 
  mutate(Species=gsub('Juncaginacee/triglochin', 'Triglochin', Species)) %>% 
  mutate(Species=gsub('Juncas', 'Juncus', Species)) %>% 
  mutate(Species=gsub('Keilblatt cyperus', 'Cyperus', Species)) %>% 
  mutate(Species=gsub('Khh 3010 polygalacee', 'Polygalaceae', Species)) %>% 
  mutate(Species=gsub(' Khh 3014 liliacee 3f„ch. kapsel schwarze samen', 'Liliaceae', Species)) %>% 
  mutate(Species=gsub('Khh 3024 brachiaria', 'Brachiaria', Species)) %>% 
  mutate(Species=gsub('Khh 3025 liliaceae gelbe blten breite bl„tter', 'Liliaceae', Species)) %>% 
  mutate(Species=gsub('Khh 3037 ficus', 'Ficus', Species)) %>% 
  mutate(Species=gsub('Khh 3054 ficus iteophylla miq.', 'Ficus', Species)) %>% 
  mutate(Species=gsub('Kl. borstgras', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Kleine malvaceae', 'Malvaceae', Species)) %>% 
  mutate(Species=replace(Species, 
                     list=word(Species, 1)=="Kletter", 
                     values="Asteraceae")) %>% 
  mutate(Species=gsub('Klimmer asclepiadaceae', 'Asclepiadaceae', Species)) %>% 
  mutate(Species=gsub('Klimmer curcuvitaceae', 'Cucurbitaceae', Species)) %>% 
  mutate(Species=gsub('Kl. sauergras', 'Cyperaceae', Species)) %>% 
  mutate(Species=gsub('Knabenkraut gefleckt', 'Orchis', Species)) %>% 
  mutate(Species=gsub('Knubbelblüt. gras haarkranz vgl f', 'Poaceae', Species)) %>% 
  mutate(Species=replace(Species, 
                   list=word(Species, 1)=="Koenigskerze", 
                   values="Verbascum")) %>% 
  mutate(Species=gsub('Kriechgras zynodon', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Kürbis', 'Cucurbitaceae', Species)) %>% 
  mutate(Species=gsub('Lamiaceen strauch', 'Lamiaceae', Species)) %>% 
  mutate(Species=gsub('Lamiacee orange', 'Lamiaceae', Species)) %>% 
  mutate(Species=gsub('Lamiales orobanchaceae + phrymaceae + plantaginaceae + scrophulariaceae', 'Orobanchaceae', Species)) %>% 
  mutate(Species=gsub('Lantanacamara wandelrösschen', 'Lantana camara', Species)) %>% 
  mutate(Species=gsub('Lasiopeta watheroo k. shepherd & c. wilkins ks', 'Lasiopetalum', Species)) %>% 
  mutate(Species=gsub('Leg-inderteminada', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Legu 1fiedrig groá schlank', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Legume$', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Leguminosae spgm', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Leguminosea', 'Fabaceae', Species)) %>% 
  mutate(Species=replace(Species, 
                 list=word(Species, 1)=="Leguminose", 
                 values="Fabaceae")) %>% 
  mutate(Species=gsub('Leheelo grass', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Lepid carra', 'Lepiderema', Species)) %>% 
  mutate(Species=gsub('Lich caloplaca', 'Caloplaca', Species)) %>% 
  mutate(Species=gsub('Liliacee', 'Liliaceae', Species)) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Lilie", 
               values="Liliaceae")) %>% 
  mutate(Species=gsub('Liliengewächs', 'Liliaceae', Species)) %>% 
  mutate(Species=gsub('Lisea', 'Litsea', Species)) %>% 
  mutate(Species=gsub('Lisymachia', 'Lysimachia', Species)) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Liverwort", 
               values="Bryophyta")) %>% 
  mutate(Species=gsub('Livwort', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Lonicerachrysantha', 'Lonicera chrysantha', Species)) %>% 
  mutate(Species=gsub('Lycoctamnus barbatus', 'Aconitum barbatum', Species)) %>% 
  mutate(Species=gsub('Lygopus', 'Lycopus', Species)) %>% 
  mutate(Species=gsub('Maitenus', 'Maytenus', Species)) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Malpighiace", 
               values="Malpighiaceae")) %>% 
  mutate(Species=gsub('Malpighiales chrysobalanaceae + humiriaceae', 'Malpighiaceae', Species)) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Malve", 
               values="Malvaceae")) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Mammutgras", 
               values="Poaceae")) %>% 
  mutate(Species=gsub('Mammutgrass', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Maqui guian', 'Maquira guianensis subsp. costaricana', Species)) %>% 
  mutate(Species=gsub('Marchantiophyta', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Mariana aphylla', 'Maireana aphylla', Species)) %>% 
  mutate(Species=gsub('Mehrfingeriges ährengras', 'Poaceae', Species)) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Melastomata", 
               values="Melastomataceae")) %>% 
  mutate(Species=gsub('Mesembr minibl', 'Mesembryanthemum', Species)) %>% 
  mutate(Species=gsub('Mesostomma kotschyanum', 'Mesostemma kotschyana', Species)) %>% 
  mutate(Species=gsub('Microhepatics', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Micromeria micrantha', 'Micromeria graeca subsp. micrantha', Species)) %>% 
  mutate(Species=gsub('Mimose minifiedrig f', 'Fabaceae', Species)) %>% 
  mutate(Species=gsub('Miniepilobium', 'Epilobium', Species)) %>% 
  mutate(Species=gsub('Minimargerite', 'Asteraceae', Species)) %>% 
  mutate(Species=gsub('Miniochna', 'Ochna', Species)) %>% 
  mutate(Species=gsub('Minischilf 132466 f', 'Poaceae', Species)) %>% 
  mutate(Species=gsub('Mistletoe', 'Viscum', Species)) %>% 
  mutate(Species=gsub('Mniaecia', 'Mniaceae', Species)) %>% 
  mutate(Species=gsub('Molemo', 'Turraea', Species)) %>% 
  mutate(Species=gsub('Molses', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Momisa pigra', 'Mimosa pigra', Species)) %>% 
  mutate(Species=gsub('Monandrus squarrosus', 'Cyperus squarrosus', Species)) %>% 
  mutate(Species=gsub('Monchema debile', 'Monechma debile', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Monochna", 
               values="Polygalaceae")) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Moos", 
               values="Bryophyta")) %>% 
  mutate(Species=gsub('Moospolster grau-grün', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Mortonioden ', 'Mortoniodendron ', Species)) %>% 
  mutate(Species=gsub('Mos onbekend', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Mossen overige', 'Bryophyta', Species)) %>% 
  mutate(Species=gsub('Mougetia', 'Mougeotia', Species)) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Musci", 
               values="Bryophyta")) %>% 
  mutate(Species=gsub('Myciantes', 'Myrcianthes', Species)) %>% 
  mutate(Species=gsub('Myrciaã‚â pulchra', 'Myrcia pulchra', Species)) %>% 
  mutate(Species=gsub('Myrcianov.', 'Myrcia', Species, fixed = T)) %>% 
  mutate(Species=gsub('Myrsi coria', 'Myrsine coriacea', Species)) %>% 
  mutate(Species=gsub('Myrtaceenstrauch', 'Myrtaceae', Species)) %>% 
  mutate(Species=gsub('Nachtkerze fru dreispaltig', 'Onagracaee', Species)) %>% 
  mutate(Species=gsub('Neobartsia crenoloba', 'Bartsia crenoloba', Species)) %>% 
  mutate(Species=gsub('None$', 'Nonea', Species)) %>%
  mutate(Species=gsub('Ocos adenophylla', 'Symplocos adenophylla', Species)) %>%
  mutate(Species=gsub('Officinale subsp. group', 'Taraxacum officinale s.l.', Species)) %>%
  mutate(Species=gsub('Orch$', 'Orchidaceae', Species)) %>%
  mutate(Species=gsub('Orchid', 'Orchidaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Orchidee", 
               values="Orchidaceae")) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1) %in% c("Papilonacea", "Papilionacea"),
               values="Fabaceae")) %>% 
  mutate(Species=gsub('Pasania dodoniifolia', 'Lithocarpus dodonaeifolius', Species)) %>%
  mutate(Species=gsub('Phoebengmoensis', 'Phoebe hungmoensis', Species)) %>%
  mutate(Species=gsub('Picra antid$', 'Picramnia antidesma subsp. fessonia', Species)) %>%
  mutate(Species=gsub('Pinopsida', 'Coniferae', Species)) %>%
  mutate(Species=gsub('Pisonianov.', 'Pisonia', Species, fixed=T)) %>%
  mutate(Species=gsub('Pithecellob ', 'Pithecellobium ', Species)) %>%
  mutate(Species=gsub('Pithecocten', 'Pithecoctenium', Species)) %>%
  mutate(Species=gsub('Pleradenoph longicuspis', 'Pleradenophora longicuspis', Species)) %>%
  mutate(Species=gsub('Pleuranthod ', 'Pleuranthodendron ', Species)) %>%
  mutate(Species=gsub('Poales', 'Poaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1) %in% c("Polygalacea", "Polygalacee"),
               values="Polygalaceae")) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1) %in% c("Polygonaceae", "Polygonacee"),
               values="Polygonaceae")) %>% 
  mutate(Species=gsub('Polygonumlongisetum', 'Polygonum longisetum', Species)) %>%
  mutate(Species=gsub('Posoq coria subsp. maxima', 'Posoqueria coriacea subsp. maxima', Species)) %>%
  mutate(Species=gsub('Prosthecidi ', 'Prosthecidiscus ', Species)) %>%
  mutate(Species=gsub('Pseudo bidens', '', Species)) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1) %in% 
                 c("Pseudobriza", "Pseudofingergras", 
                   "Pseudogerste", "Puschelgras", "Quecke",
                   "Queckenblatt", "Queckengras", 
                   "Roggen/hafer", "Ruchgras", "Silbergras", 
                   "Suessgras"), 
               values="Poaceae")) %>% 
  mutate(Species=gsub('Ptarmica', 'Achillea', Species)) %>%
  mutate(Species=gsub('Pterost cauline leaves n. gibson & m.n. lyons', 'Pterostegia', Species)) %>%
  mutate(Species=gsub('Quararibeaã‚â guianensis', 'Quararibea guianensis', Species)) %>%
  mutate(Species=gsub('Rainfarn f', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Ranke ipomoea', 'Ipomoea', Species)) %>%
  mutate(Species=gsub('Ranke rubiaceae', 'Rubiaceae', Species)) %>%
  mutate(Species=gsub('Rauwolfia', 'Rauvolfia', Species)) %>%
  mutate(Species=gsub('Rheinfarn', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Rhodostemon kunthiana', 'Rhodostemonodaphne kunthiana', Species)) %>%
  mutate(Species=gsub('Riccardia/aneura', 'Bryophyta', Species)) %>%
  mutate(Species=gsub('Rietgras steril 134051a', 'Poaceae', Species)) %>%
  mutate(Species=gsub('Rosenbergio formosum', 'Rosenbergiodendron formosum', Species)) %>%
  mutate(Species=gsub('Rotes puschelgras', 'Poaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Rubiacea", 
               values="Rubiaceae")) %>% 
  mutate(Species=gsub('Rytidospe goomallin a.g. gunness et al. oakp 10/', 'Rytidosperma', Species)) %>%
  mutate(Species=gsub('Salacia idoensis', 'Salacia', Species)) %>%
  mutate(Species=gsub('Samphire', 'Amaranthaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1) %in% 
                 c("Sauergras", "Schlanksegge", "Sedge", 
                   "Segge", "Simse"),
               values="Cyperaceae")) %>% 
  mutate(Species=gsub('Scaev repen subsp. north sandp r.j. cranf & p.j. spenc', 'Scaevola repens', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Schachtelhalm", 
               values="Equisetaceae")) %>% 
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Schnittlauch", 
               values="Amaryllidaceae")) %>% 
  mutate(Species=gsub('Schwertlilie trocken', 'Iridaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1) %in% c("Scropholacea", "Scrophulariacea", "Scroph."),
               values="Scrophulariacea")) %>% 
  mutate(Species=gsub('Sitzende onagraceae', 'Onagraceae', Species)) %>%
  mutate(Species=gsub('Sonnenblume', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Stachelgurke', 'Cucurbitaceae', Species)) %>%
  mutate(Species=gsub('Stark behaarte malve', 'Malvaceae', Species)) %>%
  mutate(Species=gsub('Staude asteraceae bl watteweich f', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Staude crotalaria unterseite silber', 'Crotalaria', Species)) %>%
  mutate(Species=gsub('Staude solanum', 'Solanaceae', Species)) %>%
  mutate(Species=gsub('Staude tephrosia', 'Tephrosia', Species)) %>%
  mutate(Species=gsub('Stipagrosist panicle gross', 'Stipagrostis', Species)) %>%
  mutate(Species=gsub('Asteraceae u silber', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Stratonostoc communeá', 'Stratonostoc commune', Species)) %>%
  mutate(Species=gsub('Strauch asteraceae nadelblätt.', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Strauch blatt wie salix reticulata astera', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Strauch blatt wie salix reticulata astera 132534b', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Strauch fabaceae gerieft schote', 'Fabaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Strauch" &
                 word(Species,2)=="Rubiaceae", 
               values="Rubiaceae")) %>% 
  mutate(Species=gsub('Fabaceae samtig bl lanzettlich', 'Fabaceae', Species)) %>%
  mutate(Species=gsub('Ochna mini', 'Ochna', Species)) %>%
  mutate(Species=gsub('Stryphnoden microstachyum', 'Stryphnodendron microstachyum', Species)) %>%
  mutate(Species=gsub('Sumpfgladiole haarig', 'Gladiolus', Species)) %>%
  mutate(Species=gsub('Sygnum ramphicarpa', 'Scrophulariaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1)=="Symplococar", 
               values="Symplococarpon")) %>% 
  mutate(Species=gsub('Sysirinchium', 'Sisyrinchium', Species)) %>%
  mutate(Species=gsub('Syzigium accuminatisima', 'Syzygium acuminatissimum', Species)) %>%
  mutate(Species=gsub('Tabernaemon ', 'Tabernaemontana ', Species)) %>%
  mutate(Species=gsub('Thalassodend', 'Thalassodendron', Species)) %>%
  mutate(Species=gsub('Thinouia canescens', 'Thinouia', Species)) %>%
  mutate(Species=gsub('Thistle', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Trisetumicatum', 'Trisetum spicatum', Species)) %>%
  mutate(Species=gsub('Undetermined sedge', 'Cyperaceae', Species)) %>%
  mutate(Species=replace(Species, 
               list=word(Species, 1) %in% 
                 c("Liverwort", "Liverworts", "Moss"), 
               values="Bryophyta")) %>% 
  mutate(Species=gsub('Vismi bacci', 'Vismia baccifera subsp. ferruginea', Species)) %>%
  mutate(Species=gsub('Weidenr”schen', 'Onagraceae', Species)) %>%
  mutate(Species=gsub('Weißpelziger brauner Spross Asteracea', 'Asteraceae', Species)) %>%
  mutate(Species=gsub('Wie stipagrostis', 'Poaceae', Species)) %>%
  mutate(Species=gsub('Wincassia', 'Fabaceae', Species)) %>%
  mutate(Species=gsub('xDactyloden st-quintini', 'Dactylodenia st-quintinii', Species)) %>%
  mutate(Species=gsub('Zizyphus sp1 IUCN1', 'Zizyphus', Species)) %>%
  mutate(Species=gsub('Zwiebel Lilaceae steril', 'Lilaceae', Species)) %>%
  mutate(Species=gsub('Zwstr faurea', 'Faurea', Species)) %>% 
  mutate(Species=gsub('Quercus crispla', 'Quercus crispula', Species)) %>% 
  mutate(Species=gsub('Corallorrhiza', 'Corallorhiza', Species)) %>% 
  mutate(Species=gsub('Brunella vulgaris', 'Prunella vulgaris', Species)) %>% 
  mutate(Species=gsub('Lamprothamnum', 'Lamprothamnium', Species))

A total of 23287 species names were modified. Although substantially improved, the species list still contains quite a few inconsistencies. The total list submitted to TNRS contains 333136 species names.

2 Match names against Taxonomic Name Resolution Service (TNRS)

Export species name list

# Export the distinct cleaned species names for submission to TNRS (iteration 1).
# `file` is used instead of `path`, which is deprecated as of readr 1.4.0
# and triggered a deprecation warning on every fresh session.
write_csv(spec.list.TRY.sPlot %>% dplyr::select(Species) %>% distinct(),
          file = "../_derived/TNRS_submit/tnrs_submit_iter1.csv")

The csv-file of species names was submitted to the Taxonomic Name Resolution Service web application (Boyle et al. 2013; iPlant Collaborative 2015). TNRS version 4.0 was used, which became available in August 2015 (this version also included The Plant List version 1.1). TNRS was queried on 24/02/2020.

2.1 TNRS settings

The following settings were used for resolving names on TNRS.

2.1.1 Sources for name resolution

The initial TNRS name resolution run was based on the five standard sources that were ranked according to preference in the following order (default of TNRS):

  1. The Plant List (TPL)[@TPL2013]
  2. The Global Compositae Checklist (GCC)[@Flann2009]
  3. The International Legume Database and Information Service (ILDIS)[@ILDIS2006]
  4. Tropicos [@TROPICOS2013]
  5. PLANTS Database (USDA)[@USDA2012]

2.1.2 Family Classification

Resolved names were assigned to families based on the APGIII classification [@Chase2009], the same classification system used by Tropicos.

2.1.3 Retrieve results

Once the matching process was finished, results were retrieved from TNRS using the Detailed Download option that included the full name information (parsed components, warnings, links to sources, etc.). We retrieved all the matches for each species, constrained by source (TNRS default): the name in the first source was selected as best match; if no suitable match was found in that source, the match from the next lower-ranked source was selected, and so on until all sources were exhausted.

2.1.4 General procedure

Manually inspect the TNRS results table in a spreadsheet application (e.g. LibreOffice or Excel), starting with the highest taxonomic rank considered (i.e. family). For instance, if manual checking of the TNRS output reveals that all accepted names or synonyms that have accuracy scores >0.9 are correct taxon names, use the following selection procedure:

  • Name_matched_rank (==Family)
  • Taxonomic_status (==Accepted, Synonym)
  • Family_score (>0.9)

Continue this selection procedure for entries that were matched at lower taxonomic ranks, i.e. genus, species, etc..

2.2 Iteration 1 - Read and combine TNRS result files

Read the files downloaded from TNRS into R.

# Read the detailed TNRS download for iteration 1 (tab-delimited, UTF-8).
# quote = "" switches off quote handling, so quotation marks inside names
# are read as ordinary characters.
# Score columns are parsed as numbers, `Selected` as logical and every other
# column as character.
tnrs.res0 <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter1.txt",
  delim = "\t",
  locale = locale(encoding = "UTF-8"),
  quote = "",
  # argument name spelled out in full; `col_type` only worked through
  # partial argument matching
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  )
)

2.2.1 Select best match for each submitted name

Best matches are selected in successive steps, depending at which taxonomic level each record was matched. Records were sorted based on decreasing match scores. Matches at low taxonomic level (variety, subspecies) were favoured over matches at high taxonomic levels (family, sections). When having exactly the same ranks, the records were ranked based on their source, as explained above.
For each name submitted, only the record having the highest rank was retained.

#reorder priorities
# Preference order of the TNRS `Source` strings. The vector is used as the
# level order of the `Source` factor, so entries listed first sort first
# when records are arranged by source.
TNRS.priorities <- c(
  # TPL together with at least one other database
  "tpl;gcc;tropicos;usda",
  "tpl;gcc;tropicos",
  "tpl;gcc;usda",
  "tpl;ildis;tropicos",
  "tpl;ildis;usda",
  "tpl;tropicos;usda",
  "tpl;gcc",
  "tpl;ildis",
  "tpl;tropicos",
  "tpl;usda",
  # Global Compositae Checklist, without TPL
  "gcc;tropicos;usda",
  "gcc;tropicos",
  "tropicos;gcc",
  "gcc;usda",
  "gcc",
  # ILDIS, without TPL
  "ildis;tropicos;usda",
  "ildis;tropicos",
  "ildis;usda",
  "ildis",
  # TPL on its own is moved down the list, because for legumes and
  # composites TPL relies on GCC or ILDIS
  "tpl",
  # remaining sources
  "tropicos;usda",
  "tropicos",
  "usda"
)
# Keep one TNRS record per submitted name (iteration 1).
# Rank, source and status are turned into factors with explicit level order;
# values outside the listed levels become NA. Records are then sorted by the
# match scores (highest first), with Source and Taxonomic_status as final
# tie-breakers, and the first record of each Name_submitted group is kept.
# Name_matched_rank is converted here but is not one of the sort keys.
# The result stays grouped by Name_submitted.
tnrs.res <- tnrs.res0 %>%
  mutate(
    Name_matched_rank = factor(
      Name_matched_rank,
      levels = c("variety", "subspecies", "species", "genus",
                 "family", "section", "supersection",
                 "infraspecies", "forma", "race",
                 "nothosubspecies", "proles", "monstr",
                 "series")
    ),
    Source = factor(Source, levels = TNRS.priorities),
    Taxonomic_status = factor(
      Taxonomic_status,
      levels = c("Accepted", "Synonym", "No opinion", "Invalid",
                 "Illegitimate", "Misapplied", "Rejected name")
    )
  ) %>%
  # optional restriction, currently disabled:
  # filter(Taxonomic_status %in% c("Accepted", "Synonym")) %>%
  arrange(
    Name_number,
    desc(Genus_score),
    desc(Specific_epithet_score),
    desc(Infraspecific_epithet_2_score),
    desc(Infraspecific_epithet_score),
    desc(Family_score),
    desc(Name_score),
    desc(Overall_score),
    Source,
    Taxonomic_status
  ) %>%
  group_by(Name_submitted) %>%
  slice(1)

After this first step, there are 1709 records for which no match was found. Another 15578 were unreliably matched (overall match score <0.9).

2.2.2 Family level

Manually inspect sorted table and select all entries at the highest hierarchical level (family). Manually identify the family accuracy score threshold value above which a name can be considered a correct name. In the following case, this corresponds to a score $>$0.88.

# Rows matched at family rank that are accepted names or synonyms and whose
# family score exceeds the manually chosen threshold of 0.88.
index.family <- which(
  tnrs.res$Name_matched_rank == "family" &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.res$Family_score > 0.88
)
length(index.family)
## [1] 741

2.2.3 Genus level

# Rows matched at genus rank that are considered certain. A genus-rank record
# qualifies when either
#   (a) its status is Accepted or Synonym and Genus_score > 0.83, or
#   (b) its status is "No opinion" and Genus_score >= 0.99.
# Both alternatives sit inside one pair of parentheses: `&` binds tighter
# than `|`, so without them alternative (b) was applied to records of any
# rank, not only to genus-rank records.
# NOTE: the count printed after this chunk was produced by the earlier,
# unparenthesised expression and may change when the document is re-run.
index.genus <- which(
  tnrs.res$Name_matched_rank == "genus" &
    ((tnrs.res$Taxonomic_status %in% c("Synonym", "Accepted") &
        tnrs.res$Genus_score > 0.83) |
       (tnrs.res$Taxonomic_status == "No opinion" &
          tnrs.res$Genus_score >= 0.99))
)
length(index.genus)
## [1] 45771

2.2.4 Species level

# Rows considered certain at species level.
#   condition 1: Accepted/Synonym status, Genus_score > 0.78 and Name_score > 0.90
#   condition 2: Genus_score > 0.90 and Specific_epithet_score > 0.90
# NOTE(review): `&` binds tighter than `|`, so the expression evaluates as
#   (rank == "species" & condition 1) | condition 2
# i.e. condition 2 is applied to records of every rank and status, not only
# to species-rank records. Confirm whether this is intended.
index.species  <- which(tnrs.res$Name_matched_rank == "species" &
                            (     (tnrs.res$Taxonomic_status == "Accepted" |  #condition 1
                                  tnrs.res$Taxonomic_status == "Synonym") &
                                  tnrs.res$Genus_score > 0.78 &
                                  tnrs.res$Name_score > 0.90) 
                        |
                            (     tnrs.res$Genus_score > 0.90 &         # condition 2 - effective for records with subspecies information
                                  (tnrs.res$Specific_epithet_score > 0.90) 
                        ))
length(index.species)
## [1] 311135

2.2.5 Subspecies level

# Accepted names / synonyms matched at subspecies or infraspecies rank.
# Records with a missing rank are included as well: a few sub-species level
# records are not categorized.
index.subspec <- which(
  (is.na(tnrs.res$Name_matched_rank) |
     tnrs.res$Name_matched_rank %in% c("infraspecies", "subspecies")) &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym")
)
length(index.subspec)
## [1] 8499
# Accepted names / synonyms matched at variety rank
index.variety <- which(
  tnrs.res$Name_matched_rank == "variety" &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym")
)
length(index.variety)
## [1] 7179
# All infraspecies-rank records, whatever their taxonomic status
index.infraspec <- which(tnrs.res$Name_matched_rank == "infraspecies")
length(index.infraspec)
## [1] 92
# All forma-rank records, whatever their taxonomic status
index.forma <- which(tnrs.res$Name_matched_rank == "forma")
length(index.forma)
## [1] 173

2.2.6 Identifying “non-matched” species that are spermatophyta

# Unmatched records whose submitted name starts with the word "Spermatophyta"
index.spermatophyt <- which(
  word(tnrs.res$Name_submitted, 1) == "Spermatophyta" &
    tnrs.res$Name_matched == "No suitable matches found."
)
length(index.spermatophyt)
## [1] 47

2.2.7 Select certain or uncertain names

Select names that do not fulfill the search criteria, i.e. that were not selected as certain species, for further name matching.

# Row positions of all records accepted as certain in iteration 1
index.tnrs <- unique(c(index.family, index.forma, index.genus, index.species, index.subspec,
               index.variety, index.spermatophyt))

tnrs.res.certain <- tnrs.res[index.tnrs,]
dim(tnrs.res.certain)
## [1] 329815     36
write.csv(tnrs.res.certain, file = "../_derived/TNRS_submit/tnrs.res.iter1.certain.csv")

# Everything not selected above goes on to further name matching.
# setdiff() is used instead of tnrs.res[-index.tnrs, ]: a zero-length negative
# subscript selects no rows at all, so an empty index.tnrs would have emptied
# the uncertain set instead of keeping every record. Row order is unchanged.
tnrs.res.uncertain <- tnrs.res[setdiff(seq_len(nrow(tnrs.res)), index.tnrs),]
dim(tnrs.res.uncertain)
## [1] 3319   36
write.csv(tnrs.res.uncertain, file = "../_derived/TNRS_submit/tnrs.res.iter1.uncertain.csv")

save(tnrs.res.certain, tnrs.res.uncertain, file="../_derived/TNRS_submit/tnrs.iter1.RData")

2.2.8 Delete subspecies information and rerun match in TNRS

Many unmatched records do contain subspecies information which could not be retrieved in TNRS, although genus and species seem to be spelled correctly. Also, sometimes the mismatch derives from having the word ‘species’ or ‘sp’ at the end of the name.

# Build the name list for the second TNRS run from the uncertain records.
#   old           - name as submitted in iteration 1
#   new           - working copy, replaced by a family name where one is found
#   Name_binomial - first two words of `new`
tnrs.submit.iter2 <- data.frame(old = tnrs.res.uncertain$Name_submitted) %>%
  mutate(old = as.character(old),
         new = old) %>%
  # delete remaining records of mushroom species
  filter(!word(new, 1) %in% mushroom) %>%
  na.omit() %>%
  group_by(old) %>%
  # Extract family name for unidentified species: if the first word contains
  # a token ending in "aceae", that token replaces the whole name
  mutate(family.lev = str_extract(word(new, 1), pattern = '([^\\s]+aceae)')) %>%
  mutate(new = ifelse(is.na(family.lev), new, family.lev)) %>%
  dplyr::select(-family.lev) %>%
  # Cut to the first 2 words in the name string (still grouped by `old`)
  mutate(Name_binomial = paste(word(new, c(1, 2)), collapse = " ")) %>%
  ungroup() %>%
  # one-word names yield "<word> NA" above; strip the trailing " NA"
  mutate(Name_binomial = gsub(' NA$', '', Name_binomial))

2.2.9 Save species list to submit to TNRS for iteration 2

# Export the shortened names for the second TNRS run.
# `file` replaces the `path` argument, which is deprecated as of readr 1.4.0.
write_csv(tnrs.submit.iter2 %>% 
            dplyr::select(Name_binomial) %>% 
            #After cleaning some names now match to those already resolved in iteration 1. Take them out
            filter(!Name_binomial %in% tnrs.res.certain$Name_submitted) %>% 
            distinct(), file = "../_derived/TNRS_submit/tnrs_submit_iter2.csv")

2.3 Iteration 2 - Reimport resolved species names from TNRS and mark solved

# Read the detailed TNRS download for iteration 2 (tab-delimited, UTF-8).
# quote = "" switches off quote handling; score columns are parsed as
# numbers, `Selected` as logical and every other column as character.
tnrs.res.iter2.raw <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter2.txt",
  delim = "\t",
  locale = locale(encoding = "UTF-8"),
  quote = "",
  # argument name spelled out in full; `col_type` only worked through
  # partial argument matching
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  )
)

# Keep one TNRS record per submitted name (iteration 2), using the same
# factor levels and sort keys as in iteration 1: match scores in descending
# order, then Source and Taxonomic_status as tie-breakers.
# The result stays grouped by Name_submitted.
tnrs.res.iter2 <- tnrs.res.iter2.raw %>%
  mutate(
    Name_matched_rank = factor(
      Name_matched_rank,
      levels = c("variety", "subspecies", "species",
                 "genus", "family", "section",
                 "supersection", "infraspecies", "forma",
                 "race", "nothosubspecies", "proles",
                 "monstr", "series")
    ),
    Source = factor(Source, levels = TNRS.priorities),
    Taxonomic_status = factor(
      Taxonomic_status,
      levels = c("Accepted", "Synonym", "No opinion",
                 "Invalid", "Illegitimate", "Misapplied",
                 "Rejected name")
    )
  ) %>%
  arrange(
    Name_number,
    desc(Genus_score),
    desc(Specific_epithet_score),
    desc(Infraspecific_epithet_2_score),
    desc(Infraspecific_epithet_score),
    desc(Family_score),
    desc(Name_score),
    desc(Overall_score),
    Source,
    Taxonomic_status
  ) %>%
  group_by(Name_submitted) %>%
  slice(1)

2.3.1 Family level

# Iteration 2: family-rank matches (Accepted/Synonym) with Family_score > 0.88
index.family <- with(
  tnrs.res.iter2,
  which(Name_matched_rank == "family" &
          Taxonomic_status %in% c("Accepted", "Synonym") &
          Family_score > 0.88)
)
length(index.family)
## [1] 2

2.3.2 Genus level

# Iteration 2: genus-rank matches (Accepted/Synonym) with
# Genus_score >= 0.90 and Name_score > 0.49
index.genus <- with(
  tnrs.res.iter2,
  which(Name_matched_rank == "genus" &
          Taxonomic_status %in% c("Accepted", "Synonym") &
          Genus_score >= 0.90 &
          Name_score > 0.49)
)
length(index.genus)
## [1] 17

2.3.3 Species level

# Iteration 2: species-rank matches with Genus_score >= 0.80 and
# Specific_epithet_score > 0.90. Taxonomic status is deliberately not
# checked here (the Accepted/Synonym restriction is disabled).
index.species <- with(
  tnrs.res.iter2,
  which(Name_matched_rank == "species" &
          Genus_score >= 0.80 &
          Specific_epithet_score > 0.90)
)
length(index.species)
## [1] 32

2.3.4 Subspecies level

# Iteration 2: all infraspecies-rank records, whatever their status
index.infraspec <- with(tnrs.res.iter2,
                        which(Name_matched_rank == "infraspecies"))
length(index.infraspec)
## [1] 0
# Accepted names / synonyms at subspecies or infraspecies rank; records with
# a missing rank are included (a few sub-species level records are not
# categorized)
index.subspec <- with(
  tnrs.res.iter2,
  which((is.na(Name_matched_rank) |
           Name_matched_rank %in% c("infraspecies", "subspecies")) &
          Taxonomic_status %in% c("Accepted", "Synonym"))
)
length(index.subspec)
## [1] 0
# Accepted names / synonyms at variety rank
index.variety <- with(
  tnrs.res.iter2,
  which(Name_matched_rank == "variety" &
          Taxonomic_status %in% c("Accepted", "Synonym"))
)
length(index.variety)
## [1] 0
# All forma-rank records
index.forma <- with(tnrs.res.iter2, which(Name_matched_rank == "forma"))
length(index.forma)
## [1] 0
# Unmatched records whose submitted name starts with "Spermatophyta"
index.spermatophyt <- with(
  tnrs.res.iter2,
  which(word(Name_submitted, 1) == "Spermatophyta" &
          Name_matched == "No suitable matches found.")
)
length(index.spermatophyt)
## [1] 0
# Union of all row positions accepted as certain in iteration 2
index.tnrs.iter2 <- unique(c(index.family, index.forma, index.genus,
                             index.species, index.subspec,
                             index.variety, index.spermatophyt))

# Records accepted as certain in iteration 2
tnrs.res.iter2.certain <- tnrs.res.iter2[index.tnrs.iter2,]
dim(tnrs.res.iter2.certain)
## [1] 51 36
write.csv(tnrs.res.iter2.certain, file = "../_derived/TNRS_submit/tnrs.res.iter2.certain.csv")

# Remaining records. setdiff() is used instead of a negative subscript:
# tnrs.res.iter2[-integer(0), ] would return zero rows, so an empty
# index.tnrs.iter2 would have dropped every record instead of keeping all.
tnrs.res.iter2.uncertain <- tnrs.res.iter2[setdiff(seq_len(nrow(tnrs.res.iter2)),
                                                   index.tnrs.iter2),]
dim(tnrs.res.iter2.uncertain)
## [1] 2745   36
write.csv(tnrs.res.iter2.uncertain, file = "../_derived/TNRS_submit/tnrs.res.iter2.uncertain.csv")

save(tnrs.res.iter2.certain, tnrs.res.iter2.uncertain, 
     tnrs.submit.iter2, file="../_derived/TNRS_submit/tnrs.iter2.RData")

2.3.5 Save species list to submit to TNRS for iteration 3

# Export the still-unresolved names for the third TNRS run.
# The column is selected by position (2) - presumably Name_submitted; TODO confirm.
# `file` replaces the `path` argument, which is deprecated as of readr 1.4.0.
write_csv(tnrs.res.iter2.uncertain[,2], file = "../_derived/TNRS_submit/tnrs_submit_iter3.csv")

This list was submitted to TNRS, but only selecting the NCBI database.

2.4 Iteration 3 - Reimport resolved species names from TNRS_NCBI

# Read the detailed TNRS download for iteration 3 (tab-delimited, UTF-8).
# quote = "" switches off quote handling; score columns are parsed as
# numbers, `Selected` as logical and every other column as character.
tnrs.res.iter3.raw <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter3.txt",
  delim = "\t",
  locale = locale(encoding = "UTF-8"),
  quote = "",
  # argument name spelled out in full; `col_type` only worked through
  # partial argument matching
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  )
)

# Keep one TNRS record per submitted name (iteration 3).
# Rank and status become factors with explicit level order. Source is not
# converted to a factor in this iteration, so it sorts as read from the file.
# The result stays grouped by Name_submitted.
tnrs.ncbi <- tnrs.res.iter3.raw %>%
  mutate(
    Name_matched_rank = factor(
      Name_matched_rank,
      levels = c("variety", "subspecies", "species",
                 "genus", "family", "section", "supersection",
                 "infraspecies", "forma", "race",
                 "nothosubspecies", "proles", "monstr",
                 "series")
    ),
    Taxonomic_status = factor(
      Taxonomic_status,
      levels = c("Accepted", "Synonym", "No opinion", "Invalid",
                 "Illegitimate", "Misapplied", "Rejected name")
    )
  ) %>%
  arrange(
    Name_number,
    desc(Genus_score),
    desc(Specific_epithet_score),
    desc(Infraspecific_epithet_2_score),
    desc(Infraspecific_epithet_score),
    desc(Family_score),
    desc(Name_score),
    desc(Overall_score),
    Source,
    Taxonomic_status
  ) %>%
  group_by(Name_submitted) %>%
  slice(1)

2.4.1 Family level

# Iteration 3: family-rank matches (Accepted/Synonym) with Family_score > 0.85
index.family <- with(
  tnrs.ncbi,
  which(Name_matched_rank == "family" &
          Taxonomic_status %in% c("Accepted", "Synonym") &
          Family_score > 0.85)
)
length(index.family)
## [1] 8

2.4.2 Genus level

# Iteration 3: genus-rank matches with status Accepted, Synonym or
# "No opinion" that pass one of two score combinations:
#   Genus_score > 0.89 with Name_score > 0.49, or
#   Genus_score > 0.99 with Name_score > 0.2
index.genus <- with(
  tnrs.ncbi,
  which(Name_matched_rank == "genus" &
          Taxonomic_status %in% c("Accepted", "Synonym", "No opinion") &
          ((Genus_score > 0.89 & Name_score > 0.49) |
             (Genus_score > 0.99 & Name_score > 0.2)))
)
length(index.genus)
## [1] 286

2.4.3 Species level

# Iteration 3, species rank. Three alternative criteria are evaluated
# separately and then merged.
# 1) Accepted/Synonym, Name_score > 0.94, Specific_epithet_score >= 0.67
index.species.1 <- with(
  tnrs.ncbi,
  which(Name_matched_rank == "species" &
          Taxonomic_status %in% c("Accepted", "Synonym") &
          Name_score > 0.94 &
          Specific_epithet_score >= 0.67)
)
length(index.species.1)
## [1] 180
# 2) Accepted/Synonym, Genus_score > 0.81, Name_score > 0.51,
#    Specific_epithet_score >= 0.67
index.species.2 <- with(
  tnrs.ncbi,
  which(Name_matched_rank == "species" &
          Taxonomic_status %in% c("Accepted", "Synonym") &
          Genus_score > 0.81 &
          Name_score > 0.51 &
          Specific_epithet_score >= 0.67)
)
length(index.species.2)
## [1] 196
# 3) "No opinion", Genus_score > 0.7, Specific_epithet_score > 0.75
index.species.3 <- with(
  tnrs.ncbi,
  which(Name_matched_rank == "species" &
          Taxonomic_status == "No opinion" &
          Genus_score > 0.7 &
          Specific_epithet_score > 0.75)
)
length(index.species.3)
## [1] 0
# Union of the three criteria
index.species <- unique(c(index.species.1, index.species.2, index.species.3))
length(index.species)
## [1] 212

2.4.4 Variety level

# Iteration 3: records at subspecies, "unknown" or variety rank whose status
# is Accepted, "No opinion" or Synonym
index.var <- with(
  tnrs.ncbi,
  which(Name_matched_rank %in% c("subspecies", "unknown", "variety") &
          Taxonomic_status %in% c("Accepted", "No opinion", "Synonym"))
)
length(index.var)
## [1] 0

2.4.5 Select certain or uncertain names

# Row positions of all records accepted as certain in iteration 3
index.ncbi <- unique(c(index.family, index.genus, index.species, index.var))

tnrs.ncbi.certain <- tnrs.ncbi[index.ncbi,]
nrow(tnrs.ncbi.certain)
## [1] 506
# `file` replaces the `path` argument, which is deprecated as of readr 1.4.0
write_csv(tnrs.ncbi.certain, file = "../_derived/TNRS_submit/tnrs.ncbi.certain.csv")

# Remaining records. setdiff() is used instead of tnrs.ncbi[-index.ncbi, ]:
# a zero-length negative subscript selects no rows, so an empty index.ncbi
# would have dropped every record instead of keeping all of them.
tnrs.ncbi.uncertain <- tnrs.ncbi[setdiff(seq_len(nrow(tnrs.ncbi)), index.ncbi),]
nrow(tnrs.ncbi.uncertain)
## [1] 2239
write_csv(tnrs.ncbi.uncertain, file = "../_derived/TNRS_submit/tnrs.ncbi.uncertain.csv")
save(tnrs.ncbi.certain, tnrs.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.iter3.RData")

After iteration 3, there are still 2239 unresolved taxa.

2.5 Iteration 4 - Using The Plant List matching tools for unresolved names

Generate names list from tnrs.ncbi.uncertain to be matched against The Plant List, using Taxonstand::TPL. Add to this list, also all those species that in the first iterations did not return an accepted name.

# Names to be matched against The Plant List:
#   - names marked certain in iterations 1-3 that have no Accepted_name, and
#   - all names still uncertain after iteration 3.
# Duplicates are removed at the end.
tpl.submit <- tnrs.res.certain %>% 
  filter(is.na(Accepted_name)) %>% 
  dplyr::select(Name_submitted) %>% 
  bind_rows(tnrs.res.iter2.certain %>% 
              filter(is.na(Accepted_name)) %>% 
              dplyr::select(Name_submitted)) %>% 
  bind_rows(tnrs.ncbi.certain %>% 
              filter(is.na(Accepted_name)) %>% 
              dplyr::select(Name_submitted)) %>% 
  bind_rows(tnrs.ncbi.uncertain %>% 
              dplyr::select(Name_submitted)) %>% 
  distinct()
nrow(tpl.submit)
# `file` replaces the `path` argument, which is deprecated as of readr 1.4.0
write_csv(tpl.submit, file = "../_derived/TPL/tpl.submit.csv")

#divide in 99 batches
# Split the row positions of tpl.submit into batches (grouping key:
# sorted remainders modulo 99). seq_len() is used instead of 1:nrow(),
# which would yield c(1, 0) for an empty table.
indices <- seq_len(nrow(tpl.submit))
chunks <- split(indices, sort(indices %% 99))

library(doParallel)
library(parallel)
# Five forked workers; fork clusters are not available on Windows.
cl <- makeForkCluster(5, outfile = "")
registerDoParallel(cl)

# Resolve each batch with Taxonstand::TPL in parallel. Every batch result is
# also saved to its own file, and the batches are row-bound into tpl.ncbi.
# tryCatch(..., finally = ) guarantees the cluster is stopped even when a
# worker fails, so no worker processes are left behind.
tpl.ncbi <- tryCatch(
  foreach(i = seq_along(chunks), .combine = rbind) %dopar% {
    tmp <- TPL(tpl.submit$Name_submitted[chunks[[i]]])
    save(tmp, file = paste0("../_derived/TNRS_submit/TPL_foreach/tpl.ncbi", i, ".RData"))
    tmp
  },
  finally = stopCluster(cl)
)
save(tpl.ncbi, file = "../_derived/TPL/tpl_results_iter4.RData")
# Reload the stored TPL results and split them into certain / uncertain.
# TRUE/FALSE are written out in full: T and F are ordinary variables that can
# be reassigned, whereas TRUE and FALSE are reserved words.
# `file` replaces the `path` argument of write_csv(), deprecated as of readr 1.4.0.
load("../_derived/TPL/tpl_results_iter4.RData")
# certain: found in the Plant Name Index, or resolved at a higher level
tpl.ncbi.certain <- tpl.ncbi %>%
  filter(Plant.Name.Index == TRUE | Higher.level == TRUE)
nrow(tpl.ncbi.certain)
## [1] 27338
write_csv(tpl.ncbi.certain, file = "../_derived/TPL/tpl.ncbi.certain.csv")

# uncertain: neither flag is set; only the submitted taxon string is kept
tpl.ncbi.uncertain <- tpl.ncbi %>%
  filter(Plant.Name.Index == FALSE & Higher.level == FALSE) %>%
  dplyr::select(Taxon)
nrow(tpl.ncbi.uncertain)
## [1] 5771
write_csv(tpl.ncbi.uncertain, file = "../_derived/TPL/tpl.ncbi.uncertain.csv")

save(tpl.ncbi.certain, tpl.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.iter4.RData")

3 Merge the resolved species lists into a Backbone

3.1 Read files

# Restore the certain / uncertain tables stored by iterations 1 to 4
load("../_derived/TNRS_submit/tnrs.iter1.RData")
load("../_derived/TNRS_submit/tnrs.iter2.RData")
load("../_derived/TNRS_submit/tnrs.iter3.RData")
load("../_derived/TNRS_submit/tnrs.iter4.RData")

# Taxa flagged as wrongly resolved by TNRS: resolve them once more with
# Taxonstand::TPL and append the result to the certain TPL matches
finalcheck <- c(
  "Salix repens subsp. repens var. repens",
  "Hieracium lachenalii",
  "Lamprothamnium papulosum"
)
tpl.ncbi.certain <- bind_rows(tpl.ncbi.certain, TPL(finalcheck))

Combine the certain data sets:

# Build the Backbone: one row per original sPlot/TRY name, with the
# successive string corrections and the best available taxonomic match.
Backbone <- spec.list.TRY.sPlot %>%
  as.tbl() %>%
  rename(Name_sPlot_TRY=OriginalNames, 
         Name_string_corr1=Species) %>%
  # attach the second round of string corrections (old -> Name_binomial)
  left_join(tnrs.submit.iter2 %>%
              dplyr::select(-new) %>%
              rename(Name_string_corr1=old, Name_string_corr2=Name_binomial),
            by="Name_string_corr1") %>%
  # the name actually submitted is the second correction, when there is one,
  # otherwise the first correction
  mutate(Name_submitted=ifelse(!is.na(Name_string_corr2), Name_string_corr2, Name_string_corr1)) %>%
  dplyr::select(Name_sPlot_TRY, Name_string_corr1, Name_string_corr2, Source, Name_submitted) %>%
  rename(sPlot_TRY=Source) %>%
  # right-hand side of the join: all 'certain' results stacked together
  left_join(tnrs.res.certain %>%
              #filter out wrongly matched species (only from the first result set;
              # these names were resolved again with TPL)
              filter(!Name_submitted %in% finalcheck) %>% 
              #filter(!is.na(Accepted_name)) %>% 
              bind_rows(tnrs.res.iter2.certain) %>%
              bind_rows(tnrs.ncbi.certain) %>%
  #reformat TPL output to tnrs output
              bind_rows(tpl.ncbi.certain %>%
                    rename(Name_submitted=Taxon,
                    Name_matched_url=ID,
                    Taxonomic_status=Taxonomic.status,
                    Accepted_name_author=New.Authority,
                    Accepted_name_url=New.ID,
                    Accepted_name_family=Family, 
                    Selected=Plant.Name.Index) %>%
                    # NA name parts become "" so that paste() below does not
                    # insert the literal string "NA"
                    mutate_at(.vars=vars(New.Hybrid.marker, New.Infraspecific.rank, New.Infraspecific),
                              .fun=~ifelse(is.na(.), "", .)) %>%
                    # NOTE(review): New.Genus and New.Species are not NA-guarded;
                    # an NA there would be pasted as "NA" - confirm they are
                    # always filled for the 'certain' TPL rows
                    mutate(Accepted_name=paste(New.Genus, New.Hybrid.marker, 
                                               New.Species, New.Infraspecific.rank, 
                                               New.Infraspecific)) %>%
                    # collapse the runs of blanks left by the empty name parts
                    mutate(Accepted_name=gsub(pattern="\\s+", " ", Accepted_name)) %>%
                    mutate(Accepted_name_species=paste(New.Genus, New.Hybrid.marker, New.Species)) %>%
                    mutate(Accepted_name_species=gsub(pattern="\\s+", " ", Accepted_name_species)) %>%
                    # rank is "species" unless TPL flagged a higher-level match (then NA)
                    mutate(Accepted_name_rank=ifelse(Higher.level==F, "species", NA)) %>%
                    mutate(Source=paste("tpl", TPL.version)) %>%
                    # keep only the columns whose names also occur in tnrs.ncbi,
                    # in the column order of tnrs.ncbi
                    dplyr::select( (data.frame(colmatch=match(colnames(tnrs.ncbi), 
                                                        names(.))) %>%
                                filter(!is.na(colmatch)))$colmatch)
                    ) %>%
                group_by(Name_submitted) %>%  #Some double matches. Prioritize best taxonomic status
                # factor levels define the priority order; a status not listed
                # here becomes NA and is sorted last by arrange()
                mutate(Taxonomic_status=factor(Taxonomic_status, 
                                             levels=c("Accepted","Synonym", "No opinion","Invalid",
                                                      "Illegitimate","Misapplied","Rejected name",
                                                      "Unresolved"))) %>%
                arrange(Taxonomic_status) %>% 
                # first row of each Name_submitted group = best status
                slice(1) %>% 
                #delete empty spaces at end of names
                mutate(Accepted_name=gsub(pattern=" $", replacement="", x=Accepted_name)) %>% 
                mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)),
              by="Name_submitted")
#Double check: the join must not have added or dropped rows
nrow(Backbone) == nrow(spec.list.TRY.sPlot)
## [1] TRUE

3.2 Tag unresolved names and create output columns

Add four additional columns. If a name was resolved at neither the accepted nor the synonym level, set Status_correct to "Other"; assign "No suitable matches found." to the names without any match.

# Derive the four output columns in one step:
#  Status_correct - Accepted / Synonym / Other (all remaining statuses),
#                   with missing statuses labelled explicitly;
#  Name_correct   - the accepted name if there is one, else the matched name;
#  Genus_correct  - first word of Name_correct, unless the name is missing or
#                   the accepted name is at family rank;
#  Rank_correct   - rank of the matched name ("higher" when missing), as a
#                   factor running from coarse to fine ranks.
Backbone <- Backbone %>%
  mutate(
    Status_correct=fct_explicit_na(
      fct_collapse(Taxonomic_status,
                   Other=c("No opinion","Invalid", "Unresolved",
                           "Illegitimate","Misapplied","Rejected name")),
      "No suitable matches found."),
    Name_correct=ifelse(is.na(Accepted_name), Name_matched, Accepted_name),
    Genus_correct=ifelse(is.na(Name_correct) | (Accepted_name_rank %in% c("family")),
                         NA,
                         word(Name_correct,1)),
    Rank_correct=ifelse(is.na(Name_matched_rank),
                        "higher",
                        as.character(Name_matched_rank)),
    Rank_correct=factor(Rank_correct,
                        levels=c("higher", "family", "genus", "species",
                                 "subspecies", "variety", "infraspecies",
                                 "race", "forma"))
  )

summary(Backbone$Status_correct)
##                   Accepted                    Synonym 
##                     284559                      28617 
##                      Other No suitable matches found. 
##                      30085                       3177
summary(Backbone$Rank_correct)
##       higher       family        genus      species   subspecies      variety 
##         6392         1889        27102       294425         8948         7443 
## infraspecies         race        forma 
##           92            0          147

There are 3070 species names for which we found no match in any of the taxonomic resources we used. Yet, for as many as 35383 taxa, the matching did not properly resolve the species name, and we only found a match at genus or higher level.

3.3 Complete list of families

There are 35195 records with missing family information. Create field Family_correct.

# Family_correct: the accepted family when available; otherwise, if the first
# word of Name_correct contains a chunk ending in "aceae", use that chunk.
Backbone <- Backbone %>% 
  mutate(Family_correct=coalesce(
    Accepted_name_family,
    str_extract(word(Name_correct,1), pattern='([^\\s]+aceae)')))

# Remaining records with missing family info
sum(is.na(Backbone$Family_correct))
## [1] 33398

3.3.1 Resolve genera with missing family info with TNRS

# Unique genera of the records that still lack a family, to be submitted to TNRS
Genera_submit <- Backbone %>% 
  filter(is.na(Family_correct)) %>% 
  distinct(Genus_correct)

write_csv(Genera_submit, "../_derived/TNRS_submit/Genera_submit.csv")

Import results from TNRS. Best match only and simple download

# Column specification of the TNRS 'simple download' for the genus query.
# NOTE(review): Author_matched is parsed as logical, which only works while
# that column is empty in the export (no author was submitted) - confirm.
import.profile <- cols(
  Name_submitted = col_character(),
  Name_matched = col_character(),
  Author_matched = col_logical(),
  Overall_score = col_double(),
  Taxonomic_status = col_character(),
  Accepted_name = col_character(),
  Accepted_author = col_character(),
  Accepted_family = col_character(),
  Source = col_character(),
  Warnings = col_character(),
  Accepted_name_lsid = col_character()
)

# Read the tab-delimited TNRS output. Quoting is disabled (quote="") so that
# quotation marks in the file are read literally.
# The argument is spelled out as `col_types`; the former `col_type` only
# worked through partial argument matching.
tnrs.genera <- read_delim("../_derived/TNRS_submit/tnrs_genera.txt", delim="\t",  
         locale = locale(encoding = 'UTF-8'),quote="",col_types = import.profile)

Attach resolved families to backbone

# Genus -> family lookup taken from the TNRS genus results
genus.family.tnrs <- tnrs.genera %>%
  dplyr::select(Genus_correct=Name_submitted, Family_import=Accepted_family)

# Fill in the family only where it is still missing
Backbone <- Backbone %>% 
  left_join(genus.family.tnrs, by="Genus_correct") %>% 
  mutate(Family_correct=coalesce(Family_correct, Family_import)) %>%
  dplyr::select(-Family_import)
  
#Records with missing family info
sum(is.na(Backbone$Family_correct))
## [1] 10067

3.3.2 Complement with data from TRY 5.0

Data from try were received by Jens Kattge on Jan 21, 2020.

# Species, Genus, Family from try
try.species <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/hierarchy.info.csv",
  locale = locale(encoding = "latin1"))

Backbone <- Backbone %>% 
  left_join(try.species %>%
      dplyr::select(Genus_correct=Genus, family=Family) %>% 
      distinct() %>% 
      filter(family != "") %>% 
      group_by(Genus_correct), 
    by="Genus_correct") %>% 
  mutate(Family_correct=coalesce(Family_correct, family)) %>%
  dplyr::select(-family)

# Remaining records with missing family info
sum((is.na(Backbone$Family_correct)))
## [1] 7954

3.3.3 Complement with data from The Catalogue of Life

#Download data from Catalogue of Life - 2019
download.file("http://www.catalogueoflife.org/DCA_Export/zip/archive-kingdom-plantae-bl3.zip",
              destfile="/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/CatLife2019.zip")
unzip("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/CatLife2019.zip", files="taxa.txt", exdir = "/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/")
cat.life <- read_delim("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/taxa.txt", 
                       delim="\t", 
                       col_types =  cols(
                          .default = col_character(),
                          taxonID = col_double(),
                          datasetID = col_double(),
                          acceptedNameUsageID = col_double(),
                          parentNameUsageID = col_double(),
                          superfamily = col_logical(),
                          subgenus = col_logical(),
                          source = col_logical(),
                          namePublishedIn = col_logical(),
                          modified = col_character(),
                          taxonConceptID = col_double(),
                          isExtinct = col_logical()
                        )) %>% 
  #correct family names to match to the standards in TPL
  mutate(family=ifelse(family=="Fabaceae", "Leguminosae", family)) %>% 
  mutate(family=ifelse(family=="Asteraceae", "Compositae", family))

Genera_missing <- Backbone %>%
  filter(is.na(Family_correct) & !is.na(Genus_correct)) %>%
  dplyr::select(Genus_correct) %>% 
  distinct()

Backbone <- Backbone %>% 
  left_join(cat.life %>%
      dplyr::select(genus, family) %>% 
      distinct() %>% 
      filter(family != "") %>% 
      group_by(genus) %>%  #There are two genera with multiple attribution to families
      slice(1) %>% 
      filter(genus %in% Genera_missing$Genus_correct) %>% 
      rename(Genus_correct=genus), 
    by="Genus_correct") %>% 
  mutate(Family_correct=coalesce(Family_correct, family)) %>%
  dplyr::select(-family)

#Records with missing family info
sum(is.na(Backbone$Family_correct))
## [1] 6914

After matching the remaining genera with the Catalogue of life there are still 6914 records without Family affiliation, for a total of 1281 genera.

3.3.4 Manually fix some known issues

# Manually assigned families, keyed by genus. These override whatever family
# was assigned so far to records of the listed genera.
manual.families <- c(
  Coptidium     = "Ranunculaceae",
  Balanocarpus  = "Dipterocarpaceae",
  Cardaminopsis = "Brassicaceae",
  Carpolepis    = "Myrtaceae",
  Cathartolinum = "Linaceae",
  Didiscus      = "Araliaceae",
  Grammadenia   = "Primulaceae",
  Antholoma     = "Elaeocarpaceae",
  Odontarrhena  = "Brassicaceae",
  Trichinium    = "Amaranthaceae",
  Limonium      = "Plumbaginaceae",
  Eunanus       = "Phrymaceae",
  Sunaptea      = "Dipterocarpaceae",
  Aconogonon    = "Polygonaceae",
  Cajophora     = "Loasaceae",
  Calobota      = "Leguminosae",
  Alsine        = "Caryophyllaceae",
  Cyanococcus   = "Ericaceae"
)

Backbone <- Backbone %>%
  mutate(
    # genera not in the lookup (or NA) index to NA and keep their family
    Family_correct=coalesce(unname(manual.families[Genus_correct]), Family_correct),
    # harmonize family synonyms
    Family_correct=ifelse(Family_correct %in% c("Papilionaceae", "Fabaceae"), 
                          "Leguminosae", Family_correct),
    Family_correct=ifelse(Family_correct=="Asteraceae", "Compositae", Family_correct),
    # "Unknown" is treated as missing
    Family_correct=na_if(Family_correct, "Unknown")
  )
  
#Records with missing family info
sum(is.na(Backbone$Family_correct))
## [1] 6830

3.3.5 Derive info from other species of the same Genera in the Backbone itself

Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info.

# For each genus, find the family most often assigned to its accepted records.
# The records are counted BEFORE being reduced to unique genus-family pairs;
# counting after distinct() always gives n == 1, so the first family in sort
# order would be picked instead of the most frequent one.
genera_families <- Backbone %>% 
  filter(Taxonomic_status=="Accepted") %>% 
  dplyr::select(Genus_correct, family=Family_correct) %>% 
  na.omit() %>% 
  #for some genera there are multiple families assigned 
  # (e.g. in case of unresolved species names )
  # Extract the family name that occurs most often across each genus
  count(Genus_correct, family, name="n") %>% 
  group_by(Genus_correct) %>% 
  # ties keep the order produced by count(), i.e. sorted by family
  arrange(desc(n), .by_group=TRUE) %>% 
  slice(1) %>% 
  ungroup() %>% 
  dplyr::select(-n)

# Assign family derived from backbone to other records
# (only where the family is still missing)
Backbone <- Backbone %>% 
  left_join(genera_families, by="Genus_correct") %>% 
  mutate(Family_correct=ifelse( (is.na(Family_correct) & !is.na(family)),
                                       family, 
                                       Family_correct)) %>% 
  dplyr::select(-family)

#Records with missing family info
sum(is.na(Backbone$Family_correct))
## [1] 6366
nrow(Backbone) == nrow(spec.list.TRY.sPlot)
## [1] TRUE

3.3.6 Delete records assigned to mushroom families, if any

# Fungal families whose records must not stay in the Backbone
mushroom.families <- c(
  "Physalacriaceae", "Clavariaceae", "Agaricaceae", "Roccellaceae",
  "Atheliaceae", "Meruliaceae", "Helotiaceae", "Dacrymycetaceae",
  "Boletaceae", "Cortinariaceae", "Polyporaceae", "Pleosporaceae",
  "Leotiaceae", "Dermateaceae", "Hymenochaetaceae", "Stereaceae",
  "Tremellaceae"
)
# Drop every record whose genus is listed in `mushroom` (defined outside
# this chunk) or whose family is one of the fungal families above
Backbone <- Backbone %>% 
  filter(!(Genus_correct %in% mushroom | 
             Family_correct %in% mushroom.families))

3.4 Create field Name_short

Shorten names that have more than two words to their first two words, after removing the hybrid marker “x” when it appears as a separate word. If no such name is available, fill in with the genus or, failing that, the family.

# Name_short = first two words of Name_correct, after removing the " x "
# hybrid marker. Where that yields NA, fall back to the genus and, if the
# genus is missing too, to the family (NA when all three are missing).
Backbone <- Backbone %>% 
  mutate(
    Name_short = gsub(" x ", " ", Name_correct, fixed = TRUE),
    Name_short = word(Name_short, start = 1L, end = 2L),
    Name_short = coalesce(Name_short, Genus_correct, Family_correct)
  )

sum(is.na(Backbone$Name_short))
## [1] 3076
sum(is.na(Backbone$Name_correct))
## [1] 3070

3.5 Create field is_vascular_plant and Taxon group

Attach phylum information from The Catalogue of Life.

# Look up the phylum of each family in The Catalogue of Life (cat.life) and
# append it to the Backbone as column `phylum`.
# NOTE(review): if cat.life lists a family under more than one phylum, this
# join duplicates Backbone rows - confirm the row count is unchanged.
Backbone <- left_join(
  Backbone,
  cat.life %>% 
    dplyr::select(Family_correct = family, phylum) %>% 
    distinct() %>% 
    na.omit(),
  by = "Family_correct"
)

Create the fields is_vascular_species and Taxon group, based on the lists of manually classified families and on the phyla from The Catalogue of Life.
All families that belong to the phylum Tracheophyta, or that are in the manually compiled list of vascular families, are flagged as is_vascular_species.

# Flag vascular plants and assign each record to a Taxon group, combining
# the manually compiled lists (vascular, lichens, lichen.genera,
# algae_diatoms, mosses) with the phylum from The Catalogue of Life.
Backbone <- Backbone %>% 
  # TRUE when the phylum is Tracheophyta or the family is in `vascular`;
  # stays NA when the phylum is unknown and the family is not listed.
  mutate(is_vascular_species = (phylum == "Tracheophyta") | 
           Family_correct %in% vascular) %>% 
  # Conditions are checked top-down, so the non-vascular groups take
  # precedence over "Vascular plant": Moss > Alga > Lichen > Vascular plant.
  # "Moss" also covers liverworts (Marchantiophyta) and hornworts
  # (Anthocerotophyta); the liverwort phylum was previously missing because
  # "Bryophyta" was listed twice.
  mutate(`Taxon group` = case_when(
    Family_correct %in% mosses | 
      phylum %in% c("Bryophyta", "Marchantiophyta", "Anthocerotophyta") ~ "Moss",
    Family_correct %in% algae_diatoms | 
      phylum %in% c("Glaucophyta", "Rhodophyta", "Charophyta", "Chlorophyta") ~ "Alga",
    Family_correct %in% lichens | 
      Genus_correct %in% lichen.genera ~ "Lichen",
    is_vascular_species ~ "Vascular plant",
    TRUE ~ "Unknown"
  )) %>% 
  # Records classified as moss, alga or lichen are never vascular
  mutate(is_vascular_species = if_else(
    `Taxon group` %in% c("Moss", "Alga", "Lichen"),
    FALSE, 
    is_vascular_species
  ))
table(Backbone$`Taxon group`, exclude=NULL)
## 
##           Alga         Lichen           Moss        Unknown Vascular plant 
##            211           4470           2961           6804         331946
table(Backbone$is_vascular_species, exclude=NULL)
## 
##  FALSE   TRUE   <NA> 
##   8819 331946   5627

3.6 Export Backbone

Example of Backbone (only 20 randomly selected taxa shown)
Name_sPlot_TRY Name_string_corr1 Name_string_corr2 sPlot_TRY Name_submitted Name_number Overall_score Name_matched Name_matched_rank Name_score Name_matched_author Name_matched_url Author_matched Author_score Family_matched Family_score Name_matched_accepted_family Genus_matched Genus_score Specific_epithet_matched Specific_epithet_score Infraspecific_rank Infraspecific_epithet_matched Infraspecific_epithet_score Infraspecific_rank_2 Infraspecific_epithet_2_matched Infraspecific_epithet_2_score Annotations Unmatched_terms Taxonomic_status Accepted_name Accepted_name_author Accepted_name_rank Accepted_name_url Accepted_name_species Accepted_name_family Selected Source Warnings Accepted_name_lsid Status_correct Name_correct Genus_correct Rank_correct Family_correct Name_short phylum is_vascular_species Taxon group
Antidesma orthogyne Antidesma orthogyne NA T Antidesma orthogyne 21638 1.0 Antidesma orthogyne species 1.0 (Hook.f.) Airy Shaw http://www.theplantlist.org/tpl1.1/record/kew-12209 NA NA NA NA Phyllanthaceae Antidesma 1 orthogyne 1 NA NA NA NA NA NA NA NA Accepted Antidesma orthogyne (Hook.f.) Airy Shaw species http://www.theplantlist.org/tpl1.1/record/kew-12209 Antidesma orthogyne Phyllanthaceae TRUE tpl NA NA Accepted Antidesma orthogyne Antidesma species Phyllanthaceae Antidesma orthogyne Tracheophyta TRUE Vascular plant
Croton cupreatus Croton cupreatus NA T Croton cupreatus 85161 1.0 Croton cupreatus species 1.0 Croizat http://www.theplantlist.org/tpl1.1/record/kew-49729 NA NA NA NA Euphorbiaceae Croton 1 cupreatus 1 NA NA NA NA NA NA NA NA Accepted Croton cupreatus Croizat species http://www.theplantlist.org/tpl1.1/record/kew-49729 Croton cupreatus Euphorbiaceae TRUE tpl NA NA Accepted Croton cupreatus Croton species Euphorbiaceae Croton cupreatus Tracheophyta TRUE Vascular plant
Cullumia selago Cullumia selago NA T Cullumia selago 87616 1.0 Cullumia selago species 1.0 Roessler http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=206FD9BD-5196-4E13-96F0-4E30730B22E7;http://www.tropicos.org/Name/2722363 NA NA NA NA Asteraceae Cullumia 1 selago 1 NA NA NA NA NA NA NA NA Accepted Cullumia selago Roessler species http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=206FD9BD-5196-4E13-96F0-4E30730B22E7;http://www.tropicos.org/Name/2722363 Cullumia selago Asteraceae TRUE gcc;tropicos NA urn:lsid:compositae.org:names:206FD9BD-5196-4E13-96F0-4E30730B22E7 Accepted Cullumia selago Cullumia species Compositae Cullumia selago Tracheophyta TRUE Vascular plant
Stenocereus queretaroensis Stenocereus queretaroensis NA T Stenocereus queretaroensis 297772 1.0 Stenocereus queretaroensis species 1.0 (F.A.C.Weber ex Mathes.) Buxb. http://www.theplantlist.org/tpl1.1/record/kew-2486758 NA NA NA NA Cactaceae Stenocereus 1 queretaroensis 1 NA NA NA NA NA NA NA NA Accepted Stenocereus queretaroensis (F.A.C.Weber ex Mathes.) Buxb. species http://www.theplantlist.org/tpl1.1/record/kew-2486758 Stenocereus queretaroensis Cactaceae TRUE tpl NA NA Accepted Stenocereus queretaroensis Stenocereus species Cactaceae Stenocereus queretaroensis Tracheophyta TRUE Vascular plant
Bidens ghedoensis Bidens ghedoensis NA T Bidens ghedoensis 41540 1.0 Bidens ghedoensis species 1.0 Mesfin http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=92C4E45F-9192-47B9-87E8-A2D0D7B831F5 NA NA NA NA Asteraceae Bidens 1 ghedoensis 1 NA NA NA NA NA NA NA NA Accepted Bidens ghedoensis Mesfin species http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=92C4E45F-9192-47B9-87E8-A2D0D7B831F5 Bidens ghedoensis Asteraceae TRUE gcc NA urn:lsid:compositae.org:names:92C4E45F-9192-47B9-87E8-A2D0D7B831F5 Accepted Bidens ghedoensis Bidens species Compositae Bidens ghedoensis Tracheophyta TRUE Vascular plant
Fimbristylis ratnagirica Fimbristylis ratnagirica NA T Fimbristylis ratnagirica 131775 1.0 Fimbristylis ratnagirica species 1.0 V.P.Prasad & N.P.Singh http://www.theplantlist.org/tpl1.1/record/kew-245872 NA NA NA NA Cyperaceae Fimbristylis 1 ratnagirica 1 NA NA NA NA NA NA NA NA Accepted Fimbristylis ratnagirica V.P.Prasad & N.P.Singh species http://www.theplantlist.org/tpl1.1/record/kew-245872 Fimbristylis ratnagirica Cyperaceae TRUE tpl NA NA Accepted Fimbristylis ratnagirica Fimbristylis species Cyperaceae Fimbristylis ratnagirica Tracheophyta TRUE Vascular plant
Solanum tapojense Solanum tapojense NA T Solanum tapojense 292134 1.0 Solanum tapojense species 1.0 Ochoa http://www.theplantlist.org/tpl1.1/record/tro-29605478;http://www.tropicos.org/Name/29605478 NA NA NA NA Solanaceae Solanum 1 tapojense 1 NA NA NA NA NA NA NA NA Accepted Solanum tapojense Ochoa species http://www.theplantlist.org/tpl1.1/record/tro-29605478;http://www.tropicos.org/Name/29605478 Solanum tapojense Solanaceae TRUE tpl;tropicos NA NA Accepted Solanum tapojense Solanum species Solanaceae Solanum tapojense Tracheophyta TRUE Vascular plant
Baccharoides tenoreana Baccharoides tenoreana NA T Baccharoides tenoreana 35312 1.0 Baccharoides tenoreana species 1.0 (Oliv.) Isawumi http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=E41A5830-E56B-480C-AA80-C91E3478FDA5 NA NA NA NA Asteraceae Baccharoides 1 tenoreana 1 NA NA NA NA NA NA NA NA Accepted Baccharoides tenoreana (Oliv.) Isawumi species http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=E41A5830-E56B-480C-AA80-C91E3478FDA5 Baccharoides tenoreana Asteraceae TRUE gcc NA urn:lsid:compositae.org:names:E41A5830-E56B-480C-AA80-C91E3478FDA5 Accepted Baccharoides tenoreana Baccharoides species Compositae Baccharoides tenoreana Tracheophyta TRUE Vascular plant
Avicennia balanophora Avicennia balanophora NA T Avicennia balanophora 34314 1.0 Avicennia balanophora species 1.0 Stapf & Moldenke http://www.theplantlist.org/tpl1.1/record/kew-18436 NA NA NA NA Acanthaceae Avicennia 1 balanophora 1 NA NA NA NA NA NA NA NA Accepted Avicennia balanophora Stapf & Moldenke species http://www.theplantlist.org/tpl1.1/record/kew-18436 Avicennia balanophora Acanthaceae TRUE tpl NA NA Accepted Avicennia balanophora Avicennia species Acanthaceae Avicennia balanophora Tracheophyta TRUE Vascular plant
Discocalyx pachyphylla Discocalyx pachyphylla NA T Discocalyx pachyphylla 103267 1.0 Discocalyx pachyphylla species 1.0 Merr. http://www.theplantlist.org/tpl1.1/record/kew-2772907 NA NA NA NA Primulaceae Discocalyx 1 pachyphylla 1 NA NA NA NA NA NA NA NA Accepted Discocalyx pachyphylla Merr. species http://www.theplantlist.org/tpl1.1/record/kew-2772907 Discocalyx pachyphylla Primulaceae TRUE tpl NA NA Accepted Discocalyx pachyphylla Discocalyx species Primulaceae Discocalyx pachyphylla Tracheophyta TRUE Vascular plant
Neea popular Neea popular NA T Neea popular 209266 0.5 Neea genus 0.5 NA http://www.theplantlist.org/tpl1.1/search?q=Neea NA NA NA NA Nyctaginaceae Neea 1 NA NA NA NA NA NA NA NA NA popular Accepted Neea NA genus http://www.theplantlist.org/tpl1.1/search?q=Neea NA Nyctaginaceae TRUE tpl [Partial match] NA Accepted Neea Neea genus Nyctaginaceae Neea Tracheophyta TRUE Vascular plant
Eupatorium subhastatum Eupatorium subhastatum NA S Eupatorium subhastatum 124382 1.0 Eupatorium subhastatum species 1.0 Hook. & Arn. http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=5B4A0E16-77C5-4782-B008-3D89D62157B5 NA NA NA NA Asteraceae Eupatorium 1 subhastatum 1 NA NA NA NA NA NA NA NA Synonym Badilloa steetzii (B.L.Rob.) R.M.King & H.Rob. species http://dixon.iplantcollaborative.org/CompositaeWeb/default.aspx?Page=NameDetails&TabNum=0&NameId=7EF48DA1-8F27-4E4B-83F0-5C0E17E96EFB Badilloa steetzii Asteraceae TRUE gcc NA urn:lsid:compositae.org:names:7EF48DA1-8F27-4E4B-83F0-5C0E17E96EFB Synonym Badilloa steetzii Badilloa species Compositae Badilloa steetzii Tracheophyta TRUE Vascular plant
Polygala gracilipes Polygala gracilipes NA T Polygala gracilipes 247694 1.0 Polygala gracilipes species 1.0 Harv. http://www.theplantlist.org/tpl1.1/record/tro-25900620;http://www.tropicos.org/Name/25900620 NA NA NA NA Polygalaceae Polygala 1 gracilipes 1 NA NA NA NA NA NA NA NA Accepted Polygala gracilipes Harv. species http://www.theplantlist.org/tpl1.1/record/tro-25900620;http://www.tropicos.org/Name/25900620 Polygala gracilipes Polygalaceae TRUE tpl;tropicos NA NA Accepted Polygala gracilipes Polygala species Polygalaceae Polygala gracilipes Tracheophyta TRUE Vascular plant
Sobralia ecuadorana Sobralia ecuadorana NA T Sobralia ecuadorana 290653 1.0 Sobralia ecuadorana species 1.0 Dodson http://www.theplantlist.org/tpl1.1/record/kew-192650;http://www.tropicos.org/Name/50119531 NA NA NA NA Orchidaceae Sobralia 1 ecuadorana 1 NA NA NA NA NA NA NA NA Accepted Sobralia ecuadorana Dodson species http://www.theplantlist.org/tpl1.1/record/kew-192650;http://www.tropicos.org/Name/50119531 Sobralia ecuadorana Orchidaceae TRUE tpl;tropicos NA NA Accepted Sobralia ecuadorana Sobralia species Orchidaceae Sobralia ecuadorana Tracheophyta TRUE Vascular plant
Sphaeradenia lemaensis Sphaeradenia lemaensis NA T Sphaeradenia lemaensis 294392 1.0 Sphaeradenia lemaensis species 1.0 Harling http://www.theplantlist.org/tpl1.1/record/kew-287652;http://www.tropicos.org/Name/9700331 NA NA NA NA Cyclanthaceae Sphaeradenia 1 lemaensis 1 NA NA NA NA NA NA NA NA Accepted Sphaeradenia lemaensis Harling species http://www.theplantlist.org/tpl1.1/record/kew-287652;http://www.tropicos.org/Name/9700331 Sphaeradenia lemaensis Cyclanthaceae TRUE tpl;tropicos NA NA Accepted Sphaeradenia lemaensis Sphaeradenia species Cyclanthaceae Sphaeradenia lemaensis Tracheophyta TRUE Vascular plant
Satyria bracteolosa Satyria bracteolosa NA T Satyria bracteolosa 276746 1.0 Satyria bracteolosa species 1.0 A.C. Sm. http://www.theplantlist.org/tpl1.1/record/tro-50217845;http://www.tropicos.org/Name/50217845 NA NA NA NA Ericaceae Satyria 1 bracteolosa 1 NA NA NA NA NA NA NA NA Accepted Satyria bracteolosa A.C. Sm. species http://www.theplantlist.org/tpl1.1/record/tro-50217845;http://www.tropicos.org/Name/50217845 Satyria bracteolosa Ericaceae TRUE tpl;tropicos NA NA Accepted Satyria bracteolosa Satyria species Ericaceae Satyria bracteolosa Tracheophyta TRUE Vascular plant
Axinaea quitensis Axinaea quitensis NA T Axinaea quitensis 34361 1.0 Axinaea quitensis species 1.0 Benoist http://www.theplantlist.org/tpl1.1/record/tro-20303909;http://www.tropicos.org/Name/20303909 NA NA NA NA Melastomataceae Axinaea 1 quitensis 1 NA NA NA NA NA NA NA NA Accepted Axinaea quitensis Benoist species http://www.theplantlist.org/tpl1.1/record/tro-20303909;http://www.tropicos.org/Name/20303909 Axinaea quitensis Melastomataceae TRUE tpl;tropicos NA NA Accepted Axinaea quitensis Axinaea species Melastomataceae Axinaea quitensis Tracheophyta TRUE Vascular plant
Xanthosoma trilobum Xanthosoma trilobum NA T Xanthosoma trilobum 330208 1.0 Xanthosoma trilobum species 1.0 G.S.Bunting http://www.theplantlist.org/tpl1.1/record/kew-215156 NA NA NA NA Araceae Xanthosoma 1 trilobum 1 NA NA NA NA NA NA NA NA Accepted Xanthosoma trilobum G.S.Bunting species http://www.theplantlist.org/tpl1.1/record/kew-215156 Xanthosoma trilobum Araceae TRUE tpl NA NA Accepted Xanthosoma trilobum Xanthosoma species Araceae Xanthosoma trilobum Tracheophyta TRUE Vascular plant
Epimedium trifoliatobinatum Epimedium trifoliatobinatum NA S Epimedium trifoliatobinatum 115095 1.0 Epimedium trifoliatobinatum species 1.0 Koidz. http://www.theplantlist.org/tpl1.1/record/tro-3500297;http://www.tropicos.org/Name/3500297 NA NA NA NA Berberidaceae Epimedium 1 trifoliatobinatum 1 NA NA NA NA NA NA NA NA No opinion NA NA NA ; NA NA TRUE tpl;tropicos NA NA Other Epimedium trifoliatobinatum Epimedium species Berberidaceae Epimedium trifoliatobinatum Tracheophyta TRUE Vascular plant
Lippia schlimii Lippia schlimii NA T Lippia schlimii 182386 1.0 Lippia schlimii species 1.0 Turcz. http://www.theplantlist.org/tpl1.1/record/kew-113749;http://www.tropicos.org/Name/33702547 NA NA NA NA Verbenaceae Lippia 1 schlimii 1 NA NA NA NA NA NA NA NA Accepted Lippia schlimii Turcz. species http://www.theplantlist.org/tpl1.1/record/kew-113749;http://www.tropicos.org/Name/33702547 Lippia schlimii Verbenaceae TRUE tpl;tropicos NA NA Accepted Lippia schlimii Lippia species Verbenaceae Lippia schlimii Tracheophyta TRUE Vascular plant

3.6.1 Description of fields in the Backbone

Name_sPlot_TRY - Name as retrieved in sPlot or TRY
Name_string_corr1 - Name after first round of string cleaning
Name_string_corr2 - Name after second round of string cleaning
sPlot_TRY - Origin of species name (S - sPlot, T - TRY, A - Alpine dataset & combinations)
Name_submitted - Name as submitted to TNRS\TPL
Name_number - Number of species names when submitted to TNRS\TPL
Overall_score - Matching score from TNRS
Name_matched - Name matched in TNRS\TPL
Name_matched_rank - Taxonomic rank of name matched (e.g., species, family…)
Name_score - Matching score of name matched
Name_matched_author - Author names of matched names
Name_matched_url - URL from TNRS\TPL of matched name
Author_matched - Authors as matched from query (Empty since we only submitted species names)
Author_score - Score of author matching (Empty since we only submitted species names)
Family_matched - Family of the matched name
Family_score - Score of matched family name
Name_matched_accepted_family - Accepted family of the matched name (if available)
Genus_matched - Genus of the matched name
Genus_score - Score of matched genus name
Specific_epithet_matched - Specific epithet of the matched name
Specific_epithet_score - Score Specific epithet of the matched name
Infraspecific_rank - Rank of matched name, if below species
Infraspecific_epithet_matched - Infraspecific epithet of matched name
Infraspecific_epithet_score- Score of infraspecific epithet of matched name
Infraspecific_rank_2 - Rank of matched name (2nd level), if below species
Infraspecific_epithet_2_matched - Infraspecific epithet (2nd level) of matched name
Infraspecific_epithet_2_score - Score of infraspecific epithet (2nd level) of matched name
Annotations -
Unmatched_terms -
Taxonomic_status - Status of matched name (Accepted, Synonym, Unresolved…)
Accepted_name - Accepted name
Accepted_name_author - Author of accepted name
Accepted_name_rank - Rank of accepted name (family, genus, species, infraspecific…)
Accepted_name_url - url of accepted name
Accepted_name_species - Accepted species name (if Accepted_name_rank is at species level or lower)
Accepted_name_family - Family of accepted name
Selected - ignore
Source - Database where the info comes from
Warnings -
Accepted_name_lsid -
Below columns added specifically for this backbone
Status_correct - Simplification of Taxonomic_status
Name_correct - Accepted_name if it is non-null, otherwise Name_matched. This field represents the union of accepted + matched names
Genus_correct - Genus derived from Name_correct, but only when Accepted_name_rank is lower than family
Rank_correct - Simplification of Accepted_name_rank
Family_correct - Family of Name_correct. Complements Accepted_name_family with multiple sources
Name_short - First two words of Name_correct
phylum - As derived from The Catalogue of Life
is_vascular_species - As derived based on selection of phylum from The Catalogue of Life
Taxon group - Taxon group, as in Turboveg: ‘Vascular plant’, ‘Moss’ (includes liverworts), ‘Lichen’, ‘Alga’, ‘Unknown’

# Store the Backbone together with the helper vectors used to build it
save(
  list = c("Backbone", "mushroom", "mushroom.families", "lichen.genera"),
  file = "../_output/Backbone3.0.RData"
)

3.7 Export species list to request in TRY

# Original names whose sPlot_TRY code contains both "S" (sPlot) and "T" (TRY)
ToSubmit1 <- Backbone %>% 
  filter(grepl("S", sPlot_TRY) & grepl("T", sPlot_TRY)) %>% 
  dplyr::select(Name_submit = Name_sPlot_TRY)

## Add names whose code contains "T" but not "S" and whose original spelling
## equals the standardized name (Name_correct) of a record whose code
## contains "S" but not "T"
ToSubmit2 <- Backbone %>% 
  filter(grepl("T", sPlot_TRY) & !grepl("S", sPlot_TRY)) %>% 
  filter(Name_sPlot_TRY %in% (Backbone %>% 
                                filter(grepl("S", sPlot_TRY) & 
                                         !grepl("T", sPlot_TRY)) %>% 
                                pull(Name_correct) %>% 
                                unique())) %>% 
  dplyr::select(Name_submit = Name_sPlot_TRY)

# Stack both sets of names and write them to the list requested from TRY
ToSubmit <- bind_rows(ToSubmit1, ToSubmit2)

readr::write_csv(ToSubmit, "../_output/Submit_TRY.csv")

Submitting 70417 species names to TRY.

3.7.1 Check how many species from sPlot were submitted to TRY5.0

# sPlot records (code contains "S") whose original name or whose
# standardized name is among the names submitted to TRY. Matches on the
# original name come first; rows matching both ways are kept once.
Matched_names <- Backbone %>% 
  filter(grepl("S", sPlot_TRY)) %>% 
  {
    bind_rows(
      filter(., Name_sPlot_TRY %in% ToSubmit$Name_submit),
      filter(., Name_correct %in% ToSubmit$Name_submit)
    )
  } %>% 
  distinct()

Of the species names submitted to TRY there are 89827 species names that match sPlot’s (+ Alpine dataset) species names, before or after taxonomic resolution. These correspond to 67803 species names, AFTER taxonomic resolution.

4 Statistics

4.1 Statistics for backbone combining names in sPlot3.0 and TRY5.0

4.1.1 All taxon name entries

load("../_output/Backbone3.0.RData")

How many new entries are in the backbone 3.0 compared to the backbone 2.1? How many entries are in common?

The new backbone contains 346392. The backbone 2.1 contained 130602. The two backbones have 116309 records in common.

Database affiliations (sPlot 3.0, TRY 5.0, and Alpine).

Number of (standardized) name entries unique to, or shared between sPlot (S), TRY (T) and Alpine (A).
Var1 Freq
A 365
S 43715
SA 423
ST 61624
STA 1710
T 238092
TA 463

107472 of the total number of entries belong to sPlot. 301889 name entries belong to TRY.

Taxonomic ranks:
Number of (standardized) name entries per taxonomic rank.
Var1 Freq
higher 6392
family 1889
genus 27102
species 294379
subspecies 8948
variety 7443
infraspecies 92
race 0
forma 147
Taxonomic status:
Number of (standardized) name entries for taxonomic status
Var1 Freq
Accepted 284514
Synonym 28616
No opinion 29313
Invalid 369
Illegitimate 386
Misapplied 13
Rejected name 1
Unresolved 3
NA 3177

Total number of unique standardized taxon names and families:

# Count distinct names, ignoring NA explicitly instead of subtracting 1
# (the subtraction is only right when at least one NA is present)
dplyr::n_distinct(Backbone$Name_short, na.rm = TRUE)
## [1] 271883
# Count distinct families, ignoring NA explicitly instead of subtracting 1
# (the subtraction is only right when at least one NA is present)
dplyr::n_distinct(Backbone$Family_correct, na.rm = TRUE)
## [1] 734

Number of entries corresponding to vascular plant species:

table(Backbone$is_vascular_species, exclude=NULL)
## 
##  FALSE   TRUE   <NA> 
##   8819 331946   5627

Number of duplicated entries after taxonomic standardization: frequency of original (non-standardized) species names per resolved (standardized) name (excluding non-vascular and non-matched species).

# Number of original name entries per resolved name (vascular plants with a
# resolved name only), most frequent resolved names first
df.count <- Backbone %>%
    dplyr::filter(is_vascular_species == TRUE, !is.na(Name_correct)) %>%
    dplyr::count(Name_correct, name = "n", sort = TRUE)
## `summarise()` ungrouping output (override with `.groups` argument)
Number of unresolved, original name entries per resolved name (only the first 20 are shown).
Name_correct n
Poaceae 222
Lauraceae 177
Fabaceae 149
Asteraceae 148
Miconia 144
Carex 139
Psychotria 131
Eugenia 117
Cyperus 104
Piper 84
Myrcia 83
Ocotea 83
Taraxacum 83
Rubiaceae 78
Ficus 75
Inga 75
Sloanea 69
Nectandra 68
Myrtaceae 67
Lamiaceae 62

4.1.2 Based on unique standardized names

Generate a version of the backbone that only includes the unique resolved names in Name_short; for names that occur more than once, only the first row is kept:

# One row per non-missing Name_short: the first row of each name is kept
Backbone.uni <- Backbone %>% 
  filter(!is.na(Name_short)) %>% 
  distinct(Name_short, .keep_all = TRUE)

There are 271883 unique taxon names in the backbone.

Exclude the non-vascular plant and non-matching taxon names:

# Keep only names flagged as vascular plants (FALSE and NA are dropped)
Backbone.uni.vasc <- dplyr::filter(Backbone.uni, is_vascular_species %in% TRUE)

Now, run the stats for unique resolved names (excluding non-vascular and non-matching taxa):

# Number of unique vascular plant names (one row per name)
nrow(Backbone.uni.vasc)
## [1] 263298

There are 263298 unique (vascular plant) taxon names:

Number of (standardized) vascular plant taxon names unique to, and shared between, sPlot (S), TRY (T) and the Alpine (A) dataset.
Var1 Freq
A 179
S 12930
SA 253
ST 49305
STA 1337
T 198911
TA 383
Taxonomic ranks:
Number of (standardized) name entries per taxonomic rank.
Var1 Freq
higher 719
family 163
genus 6878
species 254529
subspecies 555
variety 435
infraspecies 3
race 0
forma 16
Taxonomic status:
Number of (standardized) name entries per taxonomic status
Var1 Freq
Accepted 228355
Synonym 10370
Other 24573
No suitable matches found. 0

Total number of unique standardized taxon names and families:

# Backbone.uni.vasc no longer contains NA names (they are filtered out when
# Backbone.uni is built), so subtracting 1 "for NA" undercounted by one.
dplyr::n_distinct(Backbone.uni.vasc$Name_short, na.rm = TRUE)
## [1] 263297
# Ignore NA explicitly; an unconditional "-1" undercounts by one whenever
# no family is missing among the vascular plant names.
dplyr::n_distinct(Backbone.uni.vasc$Family_correct, na.rm = TRUE)
## [1] 508

4.2 Stats for the corrected names in sPlot only:

# Unique vascular plant names whose sPlot_TRY code is one of the four
# codes that include sPlot ("S")
Backbone.uni.sPlot <- dplyr::filter(
  Backbone.uni.vasc,
  sPlot_TRY %in% c("S", "SA", "ST", "STA")
)

There are 63825 unique, corrected names of vascular plants for sPlot species

Database affiliations
Number of (standardized) vascular plant taxon names per unique to sPlot (S), and shared with TRY (ST), the Alpine dataset (SA) or both (STA).
Var1 Freq
S 12930
SA 253
ST 49305
STA 1337
Taxonomic ranks:
Number of (standardized) vascular plant taxon names per taxonomic rank.
Var1 Freq
higher 186
family 106
genus 960
species 61887
subspecies 433
variety 246
infraspecies 0
race 0
forma 7
Taxonomic status:
Number of (standardized) vascular plant taxon names that correspond to Accepted, Synonym or Other (e.g. unresolved) names, respectively.
Var1 Freq
Accepted 53538
Synonym 4537
Other 5750
No suitable matches found. 0

Number of families in sPlot:

# Families among the sPlot vascular plant names only. The previous call used
# the whole Backbone (sPlot + TRY + Alpine) and counted NA as a family.
dplyr::n_distinct(Backbone.uni.sPlot$Family_correct, na.rm = TRUE)
## [1] 735

Done!


5 Appendix

5.1 Create lists of genera manually classified into taxonomic groups

6 R-settings

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.7 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] vegdata_0.9.7     foreign_0.8-76    Taxonstand_2.1    pbapply_1.4-3    
##  [5] taxize_0.9.99     kableExtra_1.3.1  knitr_1.30        data.table_1.13.2
##  [9] forcats_0.5.0     stringr_1.4.0     dplyr_1.0.2       purrr_0.3.4      
## [13] readr_1.4.0       tidyr_1.1.2       tibble_3.0.1      ggplot2_3.3.0    
## [17] tidyverse_1.3.0   reshape2_1.4.4   
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2        jsonlite_1.7.1    viridisLite_0.3.0 foreach_1.5.1    
##  [5] bold_1.1.0        modelr_0.1.6      assertthat_0.2.1  highr_0.8        
##  [9] cellranger_1.1.0  yaml_2.2.1        pillar_1.4.3      backports_1.2.0  
## [13] lattice_0.20-41   glue_1.4.2        uuid_0.1-4        digest_0.6.25    
## [17] rvest_0.3.6       colorspace_2.0-0  htmltools_0.5.0   plyr_1.8.6       
## [21] pkgconfig_2.0.3   httpcode_0.3.0    broom_0.7.0       haven_2.3.1      
## [25] scales_1.1.1      webshot_0.5.2     generics_0.1.0    ellipsis_0.3.1   
## [29] withr_2.3.0       cli_2.2.0         magrittr_2.0.1    crayon_1.3.4     
## [33] readxl_1.3.1      evaluate_0.14     fs_1.5.0          fansi_0.4.1      
## [37] nlme_3.1-150      xml2_1.3.2        tools_3.6.3       hms_0.5.3        
## [41] lifecycle_0.2.0   munsell_0.5.0     reprex_0.3.0      compiler_3.6.3   
## [45] rlang_0.4.8       grid_3.6.3        conditionz_0.1.0  iterators_1.0.13 
## [49] rstudioapi_0.13   rmarkdown_2.5     gtable_0.3.0      codetools_0.2-18 
## [53] DBI_1.1.0         reshape_0.8.8     curl_4.3          R6_2.5.0         
## [57] zoo_1.8-8         lubridate_1.7.9.2 ape_5.4-1         stringi_1.5.3    
## [61] parallel_3.6.3    crul_1.0.0        Rcpp_1.0.5        vctrs_0.3.5      
## [65] dbplyr_2.0.0      tidyselect_1.1.0  xfun_0.19