-
Francesco Sabatini authoredFrancesco Sabatini authored
title: "Taxonomic Backbone - sPlot 3.0"
author: "Francesco Maria Sabatini"
date: "`r format(Sys.time(), '%d %B, %Y')`"
tags:
- database
- big data
- traits
- taxonomy
output:
html_document:
number_sections: true
toc: true
toc_depth: 2
abstract: "This document describes the workflow (with contributions from Oliver Purschke, Jürgen Dengler and Florian Jansen) that was used to generate the taxonomic backbone that standardizes taxon names across the (i) global vegetation plot database sPlot version 3.0 and (ii) the global plant trait data base TRY version 5."
urlcolor: blue
Timestamp: r date()
Drafted: Francesco Maria Sabatini
Revised:
Version: 1.0
Data preparation
Load packages
library(reshape2)
library(tidyverse)
library(readr)
library(data.table)
library(knitr)
library(kableExtra)
library(stringr)
library(taxize)
library(Taxonstand)
library(vegdata)
sPlot and TRY
Read in taxon names from## fungi genera #NOT COMPLETE LIST
# Genera treated as fungi when screening taxon names (list is not exhaustive).
# Element order is kept as in the original definition.
mushroom <- c(
  "Mycena", "Boletus", "Russula", "Calocybe", "Collybia", "Amanita",
  "Amanitopsis", "Coprinus", "Galerina", "Geoglossum", "Hebeloma", "Hydnum",
  "Lactarius", "Leucocarpia", "Naucoria", "Otidea", "Polyporus",
  "Involucrothele", "Sarcodom", "Sarcoscyphus", "Scleroderma", "Stropharia",
  "Tylopilus", "Typhula", "Calyptella", "Chrysopsora", "Lacrymaria",
  "Dermoloma", "Agaricus", "Alnicola", "Amanitina", "Bovista", "Cheilymenia",
  "Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe", "Laccaria",
  "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius",
  "Panaeolus", "Psathyrella", "Psilocybe", "Rickenella", "Sarcoscypha",
  "Vascellum", "Ramaria"
)
# Import the sPlot species names from the tab-delimited DT export.
# Column types are declared explicitly instead of being guessed by readr.
DT0 <- readr::read_delim(
  "../sPlot_data_export/sPlot_3_0_2_species.csv",
  delim = "\t",
  # argument name spelled out in full: `col_type` only worked through
  # partial argument matching
  col_types = cols(
    PlotObservationID = col_double(),
    Taxonomy = col_character(),
    `Taxon group` = col_character(),
    `Taxon group ID` = col_double(),
    `Turboveg2 concept` = col_character(),
    `Matched concept` = col_character(),
    Match = col_double(),
    Layer = col_double(),
    `Cover %` = col_double(),
    `Cover code` = col_character(),
    x_ = col_double()
  )
)
## Exclude fungi
# A record is dropped when its taxon group is "Mushroom" or when the first
# word of the matched concept is one of the genera listed in `mushroom`.
# NOTE(review): `!=` also removes rows whose `Taxon group` is NA - confirm
# that this is intended.
splot.species <- DT0 %>%
  rename(Species.original = `Turboveg2 concept`,
         Matched.concept = `Matched concept`) %>%
  filter(`Taxon group` != "Mushroom") %>%
  dplyr::select(Species.original, Matched.concept) %>%
  distinct() %>%
  # word() is vectorised, so no per-concept grouping or helper column is needed
  filter(!(word(Matched.concept, 1) %in% mushroom))
# The output file is passed positionally: the `path` argument name is
# deprecated in recent readr versions.
write_csv(splot.species, "../_derived/splot3.0.2.species.csv")
!!! I used the column from TRY with the full species name, not the column with only the two-word name string.
# Re-read the sPlot names saved above, then import the TRY and Alpine lists.
splot.species <- read_csv("../_derived/splot3.0.2.species.csv")
# TRY file has no header row: columns arrive as X1..X8 and are renamed below.
try.species <- readr::read_csv("../_input/AccSpecies_TRY5.csv",
                               col_names = FALSE,
                               locale = locale(encoding = 'Latin1')) %>%
  dplyr::select(-X6, -X7) %>%
  rename(try.ID = X1, FullSpecies = X2, Species = X3, Genus = X4, Family = X5, GrowthForm = X8)
# Sneak in species from the Alpine database (Borja & Riccardo), as a courtesy to Project #18
alpine.species <- read_delim("../_input/new_alpine_species.txt",
                             col_names = FALSE,
                             delim = "\t",
                             locale = locale(encoding = 'Latin1')) %>%
  rename(Species = X1)
Use the Matched.concept
column, as it already contains some standardization by Stephan Hennekkens according to synbiosys.
sPlot 3.0.1 contains r nrow(unique(splot.species[,2]))
different species names.
TRY 5. contains r nrow(try.species)
.
I add to this a list of r nrow(alpine.species)
alpine species delivered from Riccardo Testolin, within sPlot Project #18.
Combine species lists
# Stack the three name lists, tagging each row with its origin
# (S = sPlot, T = TRY, A = Alpine), then collapse to one row per name with a
# combined source code such as "S", "ST" or "STA".
spec.list.TRY.sPlot <- splot.species %>%
  dplyr::select(Species = Matched.concept) %>%
  mutate(Source = "S") %>%
  bind_rows(try.species %>%
              dplyr::select(Species = FullSpecies) %>% ## using the full name from TRY
              mutate(Source = "T")) %>%
  bind_rows(alpine.species %>%
              mutate(Source = "A")) %>%
  # Count occurrences per name and source. The aggregation function and value
  # column are stated explicitly so the result does not depend on dcast()
  # guessing them (and on duplicates being present to trigger aggregation).
  reshape2::dcast(Species ~ Source, fun.aggregate = length, value.var = "Source") %>%
  mutate(A = ifelse(A >= 1, "A", ""),
         S = ifelse(S >= 1, "S", ""),
         T = ifelse(T >= 1, "T", "")) %>%
  mutate(Source = paste0(S, T, A)) %>%
  dplyr::select(-A, -S, -T)
#Number of species unique and in common across databases
The total number of species in the backbone is r nrow(spec.list.TRY.sPlot)
.
# Tabulate the number of taxa for each combination of source databases.
spec.list.TRY.sPlot %>%
  mutate(Source = factor(
    Source,
    levels = c("S", "T", "A", "ST", "SA", "TA", "STA"),
    labels = c("sPlot only", "TRY only", "Alpine only",
               "sPlot + TRY", "sPlot + Alpine", "TRY + Alpine",
               "sPlot + TRY + Alpine")
  )) %>%
  group_by(Source) %>%
  summarise(Num.taxa = n()) %>%
  knitr::kable(caption = "Number of taxa per database") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                latex_options = "basic",
                full_width = FALSE, position = "center")
A-priori cleaning of names
Stripping unwanted characters as well as abbreviations (such as hybrid markers) which would prevent name matching:
# Keep the untouched name in OriginalNames and clean a working copy in Species.
# Each step removes or replaces one literal substring (fixed = TRUE, i.e. no
# regular expressions) except the last one; the steps run in the order written.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
mutate(OriginalNames=Species) %>%
dplyr::select(OriginalNames, Species, Source) %>%
# asterisks, "cf."/"Cf." qualifiers and square brackets
mutate(Species=gsub('*', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('cf. ', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('Cf. ', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('[', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub(']', '', Species, fixed=TRUE)) %>%
# hybrid markers, "aff" qualifiers and round brackets
mutate(Species=gsub(' x ', ' ', Species, fixed=TRUE)) %>%
mutate(Species=gsub('×', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('aff ', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('(', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub(')', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub(' cf ', ' ', Species, fixed=TRUE)) %>%
mutate(Species=gsub(' aff. ', ' ', Species, fixed=TRUE)) %>%
# NOTE(review): 'c‚e' looks like a mis-encoded family ending that is mapped
# to 'ceae' - verify the pattern against the encoding of the input data.
mutate(Species=gsub('c‚e', 'ceae', Species, fixed=TRUE)) %>%
# NOTE(review): as written, the next three calls replace a single space with a
# single space (no effect); presumably they were meant to collapse double or
# non-breaking spaces - confirm against the original file.
mutate(Species=gsub(' ', ' ', Species, fixed=TRUE)) %>%
mutate(Species=gsub(' ', ' ', Species, fixed=TRUE)) %>%
mutate(Species=gsub(' ', ' ', Species, fixed=TRUE)) %>%
# remaining hybrid prefixes, "like" qualifiers and commas
mutate(Species=gsub('x-', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('X-', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('×', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('like ', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub(',', '', Species, fixed=TRUE)) %>%
# underscores become spaces (pattern is a regular expression here, but '_'
# has no special meaning)
mutate(Species=gsub('_', ' ', Species))
For all names, that have a number in their first word, and consist of > 1 words, remove that word:
# Drop the first word of a name when that word contains a digit and the name
# consists of more than one word.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  mutate(firstWordWithNumbers = grepl("[0-9]", word(Species, 1))) %>%
  # Count whitespace-delimited words. The former gregexpr()-based count was
  # always >= 2, because gregexpr() returns a length-one result (-1) even when
  # nothing matches, so the "more than one word" condition was never effective.
  mutate(numberOfWords = str_count(Species, "\\S+")) %>%
  # Vectorised: remove everything up to and including the first space.
  mutate(Species = ifelse(firstWordWithNumbers & numberOfWords > 1,
                          sub("^[^ ]* ", "", Species),
                          Species))
Correct some name abbreviations using `taxname.abbr` in `vegdata`:
# Standardise abbreviations in the cleaned names with vegdata::taxname.abbr(),
# keep only the three working columns and drop duplicated rows.
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
  transmute(OriginalNames,
            Species = taxname.abbr(Species),
            Source) %>%
  distinct()
A total of r nrow(spec.list.TRY.sPlot %>% filter(OriginalNames != Species))
species names were modified. Although substantially improved, the species list has still quite a lot of inconsistencies.
The total list submitted to TNRS contains `r length(unique(spec.list.TRY.sPlot$Species))`
species names.
Match names against Taxonomic Name Resolution Service (TNRS)
Export species name list
# Export the distinct cleaned names as the list submitted to TNRS (iteration 1).
# The output file is passed positionally because the `path` argument name is
# deprecated in recent readr versions.
write_csv(spec.list.TRY.sPlot %>% distinct(Species),
          "../_derived/TNRS_submit/tnrs_submit_iter1.csv")
The csv-file of species names was submitted to Taxonomic Name Resolution Service web application (Boyle et al. 2013, iPlant Collaborative (2015)). TNRS version 4.0 was used, which became available in August 2015 (this version also included The Plant List version 1.1). TNRS was queried on 27/07/2019.
TNRS settings {#ID}
The following settings were used for resolving names on TNRS.
Sources for name resolution {#ID}
The initial TNRS name resolution run was based on the five standard sources that were ranked according to preference in the following order (default of TNRS):
- The Plant List (TPL)[@TPL2013]
- The Global Compositae Checklist (GCC)[@Flann2009]
- The International Legume Database and Information Service (ILDIS)[@ILDIS2006]
- Tropicos [@TROPICOS2013]
- PLANTS Database (USDA)[@USDA2012]
Family Classification
Resolved names were assigned to families based on the APGIII classification [@Chase2009], the same classification system used by Tropicos.
Retrieve results
Once the matching process was finished, results were retrieved from TNRS using the `Detailed Download` option that included the full name information (parsed components, warnings, links to sources, etc.). We retrieved all the matches for each species, constrained by source (TNRS default): the name in the first source was selected as best match; if no suitable match was found in that source, the match from the next lower-ranked source was selected, until all sources were exhausted.
General procedure {#ID}
Manually inspect the TNRS-results table in a spreadsheet application (e.g. LibreOffice or Excel), starting with the highest taxonomic rank considered (i.e. Family). For instance, if manual checking of the TNRS output reveals that all accepted names or synonyms that have accuracy scores >0.9 are correct taxon names, use the following selection procedure:
- Name_matched_rank (==Family)
- Taxonomic_status (==Accepted, Synonym)
- Family_score (>0.9)
Continue this selection procedure for entries that were matched at lower taxonomic ranks, i.e. genus, species, etc..
Iteration 1 - Read and combine TNRS result files
Read the files downloaded from TNRS into `R`.
# Read the TNRS result file of iteration 1 (tab-delimited, UTF-8, quoting
# disabled). All columns are read as character except the scores, the name
# number and the `Selected` flag.
tnrs.res0 <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter1.txt",
  delim = "\t",
  locale = locale(encoding = 'UTF-8'),
  quote = "",
  # argument name spelled out in full: `col_type` only worked through
  # partial argument matching
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  )
)
Select best match for each submitted name
Best matches are selected in successive steps, depending at which taxonomic level each record was matched. Records were sorted based on decreasing match scores. Matches at low taxonomic level (variety, subspecies) were favoured over matches at high taxonomic levels (family, sections). When having exactly the same ranks, the records were ranked based on their source, as explained above.
For each name submitted, only the record having the highest rank was retained.
# Keep one TNRS record per submitted name.
# Rank, source and status are converted to factors with an explicit level
# order; values that are not among the listed levels become NA.
# Records are sorted by name number, then by decreasing match scores (from the
# most specific epithet score to the overall score), then by the level order of
# Source; the first record of each submitted name is retained.
# The result remains grouped by Name_submitted (there is no ungroup()).
tnrs.res <- tnrs.res0 %>%
mutate(Name_matched_rank=factor(Name_matched_rank,
levels=c("variety", "subspecies", "species", "genus",
"family", "section", "supersection",
"infraspecies", "forma", "race",
"nothosubspecies", "proles", "monstr",
"series"))) %>%
mutate(Source=factor(Source, levels=c("tpl", #reorder priorities
"tpl;gcc",
"tpl;gcc;tropicos",
"tpl;gcc;tropicos;usda",
"tpl;gcc;usda",
"tpl;ildis",
"tpl;ildis;tropicos",
"tpl;ildis;usda",
"tpl;tropicos",
"tpl;tropicos;usda",
"tpl;usda",
"gcc",
"gcc;tropicos",
"gcc;tropicos;usda",
"gcc;usda",
"ildis",
"ildis;tropicos",
"ildis;tropicos;usda",
"ildis;usda",
"tropicos",
"tropicos;gcc",
"tropicos;usda",
"usda" ))) %>%
mutate(Taxonomic_status=factor(Taxonomic_status,
levels=c("Accepted","Synonym", "No opinion","Invalid","Illegitimate","Misapplied","Rejected name"))) %>%
# the status filter and the status sort key below are deliberately disabled
#filter(Taxonomic_status %in% c("Accepted", "Synonym")) %>%
arrange(Name_number,
desc(Infraspecific_epithet_2_score),
desc(Infraspecific_epithet_score),
desc(Specific_epithet_score),
desc(Genus_score),
desc(Family_score),
desc(Name_score),
desc(Overall_score),
#Taxonomic_status,
Source) %>%
group_by(Name_submitted) %>%
slice(1)
After this first step, there are `r sum(tnrs.res$Name_matched=="No suitable matches found.")` records for which no match was found. Another `r sum(tnrs.res$Overall_score<0.9)` were unreliably matched (overall match score <0.9).
Family level {#ID}
Manually inspect sorted table and select all entries at the highest hierarchical level (family). Manually identify the family accuracy score threshold value above which a name can be considered a correct name. In the following case, this corresponds to a score $>$0.88.
# Rows matched at family rank, with status Accepted or Synonym and a family
# score above 0.88.
index.family <- which(
  tnrs.res$Name_matched_rank == "family" &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.res$Family_score > 0.88
)
length(index.family)
Genus level
# Rows matched at genus rank that are either
#   (a) Accepted/Synonym with a genus score above 0.83, or
#   (b) "No opinion" with a genus score of at least 0.99.
# The two alternatives are wrapped in one pair of parentheses: `&` binds more
# tightly than `|`, so without them alternative (b) was not restricted to
# genus-rank matches and selected "No opinion" records of any rank.
index.genus <- which(
  tnrs.res$Name_matched_rank == "genus" &
    ((tnrs.res$Taxonomic_status %in% c("Synonym", "Accepted") &
        tnrs.res$Genus_score > 0.83) |
       (tnrs.res$Taxonomic_status == "No opinion" &
          tnrs.res$Genus_score >= 0.99))
)
length(index.genus)
Species level
# Rows accepted at species level. Two alternatives are combined with `|`:
#   condition 1 - matched at species rank, status Accepted or Synonym,
#                 genus score > 0.78 and name score > 0.90;
#   condition 2 - genus score > 0.90 and specific epithet score > 0.90
#                 (effective for records with subspecies information).
# The grouping below makes the existing operator precedence explicit:
# condition 2 is NOT restricted to species-rank matches.
# NOTE(review): confirm that condition 2 is meant to apply to every rank.
index.species <- which(
  (tnrs.res$Name_matched_rank == "species" &
     tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym") &
     tnrs.res$Genus_score > 0.78 &
     tnrs.res$Name_score > 0.90) |
    (tnrs.res$Genus_score > 0.90 &
       tnrs.res$Specific_epithet_score > 0.90)
)
length(index.species)
Subspecies level
# Accepted names or synonyms matched at infraspecies/subspecies rank, or with
# a missing rank (a few sub-species level records are not categorized).
index.subspec <- which(
  (tnrs.res$Name_matched_rank %in% c("infraspecies", "subspecies") |
     is.na(tnrs.res$Name_matched_rank)) &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym")
)
length(index.subspec)
# Accepted names or synonyms matched at variety rank.
index.variety <- which(
  tnrs.res$Name_matched_rank == "variety" &
    tnrs.res$Taxonomic_status %in% c("Accepted", "Synonym")
)
length(index.variety)
# All records matched at infraspecies rank, regardless of status.
index.infraspec <- which(tnrs.res$Name_matched_rank == "infraspecies")
length(index.infraspec)
# All records matched at forma rank, regardless of status.
index.forma <- which(tnrs.res$Name_matched_rank == "forma")
length(index.forma)
Identifying "non-matched" species that are spermatophyta
# Unmatched records whose submitted name starts with the word "Spermatophyta".
index.spermatophyt <- which(
  word(tnrs.res$Name_submitted, 1) == "Spermatophyta" &
    tnrs.res$Name_matched == "No suitable matches found."
)
length(index.spermatophyt)
Select `certain` or `uncertain` names
Select names that do not fulfill the search criteria, i.e. that were not selected as certain species, for further name matching.
# Combine the row indices selected above and split the TNRS results into
# "certain" (selected) and "uncertain" (all remaining) records.
index.tnrs <- unique(c(index.family, index.forma, index.genus, index.species, index.subspec,
                       index.variety, index.spermatophyt))
tnrs.res.certain <- tnrs.res[index.tnrs, ]
dim(tnrs.res.certain)
write.csv(tnrs.res.certain, file = "../_derived/TNRS_submit/tnrs.res.iter1.certain.csv")
# The complement is taken by position with setdiff(): negative indexing with an
# empty index (x[-integer(0), ]) would return zero rows instead of all rows.
tnrs.res.uncertain <- tnrs.res[setdiff(seq_len(nrow(tnrs.res)), index.tnrs), ]
dim(tnrs.res.uncertain)
write.csv(tnrs.res.uncertain, file = "../_derived/TNRS_submit/tnrs.res.iter1.uncertain.csv")
save(tnrs.res.certain, tnrs.res.uncertain, file = "../_derived/TNRS_submit/tnrs.iter1.RData")
Manual cleaning, delete subspecies information and rerun match in TNRS
Many unmatched records do contain subspecies information which could not be retrieved in TNRS, although genus and species seem to be spelled correctly. Also, sometimes the mismatch derives from having the word 'species' or 'sp' at the end of the name.
# Ancillary function: convert the first character of each string to upper
# case, leaving the rest of the string unchanged.
# x: character vector. Returns a character vector of the same length.
firstup <- function(x) {
  initial <- toupper(substr(x, 1, 1))
  substr(x, 1, 1) <- initial
  x
}
#Manual cleaning
# Build a lookup of the uncertain names: `old` is the name as submitted in
# iteration 1, `new` is the manually cleaned version. Patterns in this chain
# are regular expressions (gsub() without fixed = TRUE) and are applied in the
# order written.
tnrs.submit.iter2 <- data.frame(old=tnrs.res.uncertain$Name_submitted) %>%
mutate(new=old) %>%
# normalise case: everything lower case, then capitalise the first letter
mutate(new=tolower(new)) %>%
mutate(new=firstup(new)) %>%
# strip a trailing number, trailing "sp."/"sp"/"species" and trailing spaces
# NOTE(review): in " sp.$" the dot is an unescaped regex wildcard, so any
# single character after " sp" at the end of the name is matched.
mutate(new=gsub(" [0-9]*$", "", new)) %>%
mutate(new=gsub(" sp.$", "", new)) %>%
mutate(new=gsub(" sp$", "", new)) %>%
mutate(new=gsub(" species$", "", new)) %>%
# manual spelling corrections of individual genera and binomials follow
mutate(new=gsub(" *$", "", new)) %>%mutate(new=gsub('^Agropyrum', 'Agropyron', new)) %>%
mutate(new=gsub('^Anno ', 'Annona ', new)) %>%
mutate(new=gsub('Adpdytes dimidiata', 'Apodytes dimidiata', new)) %>%
mutate(new=gsub('Adenostorna fasciculaturn', 'Adenostoma fasciculaturn', new)) %>%
mutate(new=gsub('Arctostapliylos glallca', 'Arctostaphylos glauca', new)) %>%
mutate(new=gsub('Bituminosa bituminosa', 'Bituminaria bituminosa', new)) %>%
mutate(new=gsub('Causurina equisitifolia', 'Causuarina equisetifolia', new)) %>%
mutate(new=gsub('Convulvus arvensis', 'Convolvulus arvensis', new)) %>%
mutate(new=gsub('Diospyrus dygina', 'Diospyros dygina', new)) %>%
mutate(new=gsub('^Dodoea', 'Dodonaea', new)) %>%
mutate(new=gsub('^Boheravia', 'Boerhavia', new)) %>%
mutate(new=gsub('Centaria maculosa', 'Centaurea maculosa', new)) %>%
mutate(new=gsub('Chamrenerium angustifolium', 'Chamaenerion angustifolium', new)) %>%
mutate(new=gsub('^Chicorium', 'Cichorium', new)) %>%
mutate(new=gsub('^Cirsiumum', 'Cirsium', new)) %>%
mutate(new=gsub('^Colubrium', 'Colubrina', new)) %>%
mutate(new=gsub('^Corymbium', 'Corymbia', new)) %>%
mutate(new=gsub('Cosmos bipinnata', 'Cosmos bipinnatus', new)) %>%
# NOTE(review): 'Diospyrus dygina' was already rewritten by an earlier rule in
# this chain, so the next line presumably never matches - confirm which
# replacement spelling is intended.
mutate(new=gsub('Diospyrus dygina', 'Diospyros digyna', new)) %>%
mutate(new=gsub('Diospyros egbert', 'Diospyros egbert-walkeri', new)) %>%
mutate(new=gsub('Dispyrus halesioides', 'Diospyros halesioides', new)) %>%
mutate(new=gsub('^Drymis', 'Drimys', new)) %>%
mutate(new=gsub('^Dysoxylon', 'Dysoxylum', new)) %>%
mutate(new=gsub('^Eleaegnus', 'Elaeagnus', new)) %>%
mutate(new=gsub('^Eleutherant', 'Eleutherantera', new)) %>%
mutate(new=gsub('^Echicea', 'Echinacea', new)) %>%
mutate(new=gsub('Gauteria foliolata', 'Gaultheria foliolosa', new)) %>%
mutate(new=gsub('^Geophylla', 'Geophyla', new)) %>%
mutate(new=gsub('Gloichidion insignis', 'Glochidion insigne', new)) %>%
mutate(new=gsub('^Glycium', 'Glycine', new)) %>%
mutate(new=gsub('^Hammalis', 'Hamamelis', new)) %>%
mutate(new=gsub('^Hippochoeris', 'Hypochaeris', new)) %>%
mutate(new=gsub('Ilix tephrohylla', 'Ilex tephrophylla', new)) %>%
mutate(new=gsub('^Jasininum', 'Jasminum', new)) %>%
mutate(new=gsub('Jenipa conjuta', 'Jenipa conjunta', new)) %>%
mutate(new=gsub('^Lechytis', 'Lecythis', new)) %>%
mutate(new=gsub('Lespedeza juncus', 'Lespedeza juncea', new)) %>%
mutate(new=gsub('Licania apelata', 'Licania apetala', new)) %>%
mutate(new=gsub('Limeum arenicola', 'Limeum arenicolum', new)) %>%
mutate(new=gsub('^Maniota', 'Manihot', new)) %>%
mutate(new=gsub('^Menta', 'Mentha', new)) %>%
mutate(new=gsub('Metophyum brownei', 'Metopium brownei', new)) %>%
mutate(new=gsub('Miliusa tomentosum', 'Miliusa tomentosa', new)) %>%
mutate(new=gsub('Mimululus ringens', 'Mimulus ringens', new)) %>%
mutate(new=gsub('Nardus strictus', 'Nardus stricta', new)) %>%
mutate(new=gsub('Neea glomeratha', 'Neea glomerata', new)) %>%
mutate(new=gsub('^Onopordon', 'Onopordum', new)) %>%
mutate(new=gsub('^Orbigynia', 'Orbignya', new)) %>%
mutate(new=gsub('Orites excelsa', 'Orites excelsus', new)) %>%
mutate(new=gsub('Paedorata lutea', 'Paederota lutea', new)) %>%
mutate(new=gsub('Palaquin ellipticum', 'Palaquium ellipticum', new)) %>%
mutate(new=gsub('Palmeria arfakensis', 'Palmeria arfakiana', new)) %>%
mutate(new=gsub('Petalostcmum purpureum', 'Petalostemum purpureum', new)) %>%
mutate(new=gsub('Petalostimum purpureum', 'Petalostemum purpureum', new)) %>%
mutate(new=gsub('^Petrosileum', 'Petroselinum', new)) %>%
mutate(new=gsub('Phlomis herba', 'Phlomis herba-venti', new)) %>%
mutate(new=gsub('^Phyllirea', 'Phillyrea', new)) %>%
mutate(new=gsub('Physilus pumula', 'Physalus pumila', new)) %>%
mutate(new=gsub('Picea maria', 'Picea mariana', new)) %>%
mutate(new=gsub('Picea retroXexa', 'Picea retroflexa', new)) %>%
mutate(new=gsub('Pilayella litoralis', 'Pilayella littoralis', new)) %>%
mutate(new=gsub('Placocarpus schaereri', 'Platecarpus schaerer', new)) %>%
mutate(new=gsub('Placocarpus schraereri', 'Platecarpus schaerer', new)) %>%
mutate(new=gsub('^Pulteea', 'Pultenaea', new)) %>%
mutate(new=gsub('Quercus rubrum', 'Quercus rubra', new)) %>%
mutate(new=gsub('Rubus fruticosa', 'Rubus fruticosus', new)) %>%
mutate(new=gsub('Rubus saxatile', 'Rubus saxatilis', new)) %>%
mutate(new=gsub('Rubus sylvatici', 'Rubus sylvaticus', new)) %>%
mutate(new=gsub('^Sanguiria', 'Sanguinaria', new)) %>%
mutate(new=gsub('Sarauja nepaulensis', 'Sarauja nepalensis', new)) %>%
mutate(new=gsub('^Sateria', 'Setaria', new)) %>%
mutate(new=gsub('Sauraiea nepulensis', 'Saurauia nepalensis', new)) %>%
mutate(new=gsub('Schneckia australis', 'Schenckia australis', new)) %>%
mutate(new=gsub('Smirnium oleastrum', 'Smyrnium olusatrum', new)) %>%
mutate(new=gsub('Solms laubachia', 'Solms-laubachia himalayensis', new)) %>%
mutate(new=gsub('Stellaria chamaejasme', 'Stellera chamaejasme', new)) %>%
mutate(new=gsub('Steraria parviflora', 'Setaria parviflora', new)) %>%
mutate(new=gsub('^Stuartia', 'Stewartia', new)) %>%
mutate(new=gsub('Sycops sinensis', 'Sycopsis sinensis', new)) %>%
mutate(new=gsub('Tacetum vulgare', 'Tanacetum vulgare', new)) %>%
mutate(new=gsub('Talinurn angustissimun', 'Talinun angustissimun', new)) %>%
mutate(new=gsub('Talloma hodgsoni', 'Talauma hodgsonii', new)) %>%
mutate(new=gsub('Taraxacum albo', 'Taraxacum album', new)) %>%
mutate(new=gsub('Tetragonia falcata', 'Tetragona falcata', new)) %>%
mutate(new=gsub('Trapogogon', 'Tragopogon', new)) %>%
mutate(new=gsub('Zyzyphus saeri', 'Zizyphus saeri', new)) %>%
mutate(new=gsub('^Helicrysum', 'Helichrysum', new)) %>%
mutate(new=gsub('^Diceropappus rhinocerotis', 'Elytropappus rhinocerotis', new)) %>%
mutate(new=gsub('^Euphorbiace ', 'Euphorbiacaea ', new)) %>%
mutate(new=gsub('^Gloecapsa', 'Gloeocapsa', new)) %>%
mutate(new=gsub('Glycirhiza', 'Glycyrrhiza', new)) %>%
mutate(new=gsub('Abiesnordmannia', 'Abies nordmannia', new)) %>%
mutate(new=gsub('Alnus inca', 'Alnus incana', new)) %>%
mutate(new=gsub('Amalencier alnifolia', 'Amalenchier alnifolia', new)) %>%
mutate(new=gsub('Antylis barba-jovis', 'Anthyllis barba-jovis', new)) %>%
# NOTE(review): the next pattern contains a literal double quote after the
# space - confirm that this is intended.
mutate(new=gsub('^Albizzia "', 'Albizia ', new)) %>%
mutate(new=gsub('^Ipomoena ', 'Ipomoea ', new)) %>%
mutate(new=gsub('^Ipomea ', 'Ipomoea ', new)) %>%
mutate(new=gsub('Ipomo wolco', 'Ipomoea wolcottiana', new))
# delete remaining records of mushroom species
# (rows whose first word of `new` is one of the genera in `mushroom`)
tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
filter(!word(new,1) %in% mushroom)
# Extract family name for unidentified species
# When the first word of `new` contains a run of non-space characters ending
# in "acea", `new` is replaced by that captured text.
# NOTE(review): the pattern ends in 'acea', so for a first word such as
# 'Poaceae' it presumably captures 'Poacea', and it would also match genus
# names containing 'acea' (e.g. 'Echinacea'), dropping the rest of the name -
# confirm this is intended.
tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
na.omit() %>%
group_by(old) %>%
mutate(family.lev=str_extract(word(new,1), pattern='([^\\s]+acea)')) %>%
mutate(new=ifelse(is.na(family.lev), new, family.lev)) %>%
dplyr::select(-family.lev) %>%
ungroup()
#Cut to the first 2 words in the name string
# word(new, c(1,2)) is evaluated per `old` group and pasted together; for a
# one-word name the second word is NA, and the resulting " NA" suffix is
# removed afterwards.
# NOTE(review): this presumably relies on each `old` value occurring in exactly
# one row - verify.
tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
group_by(old) %>%
mutate(Name_binomial=paste(word(new, c(1,2)), collapse=" ")) %>%
ungroup() %>%
mutate(Name_binomial=gsub(' NA$', '', Name_binomial))
Save species list to submit to TNRS for iteration 2
# Export the distinct cleaned binomials for the second TNRS run.
# The output file is passed positionally because the `path` argument name is
# deprecated in recent readr versions.
write_csv(tnrs.submit.iter2 %>%
            dplyr::select(Name_binomial) %>%
            # After cleaning, some names now match those already resolved in
            # iteration 1. Take them out
            filter(!Name_binomial %in% tnrs.res.certain$Name_submitted) %>%
            distinct(),
          "../_derived/TNRS_submit/tnrs_submit_iter2.csv")
Iteration 2 - Reimport resolved species names from TNRS and mark solved
# Read the TNRS result file of iteration 2 (tab-delimited, UTF-8, quoting
# disabled). All columns are read as character except the scores, the name
# number and the `Selected` flag.
tnrs.res.iter2.raw <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter2.txt",
  delim = "\t",
  locale = locale(encoding = 'UTF-8'),
  quote = "",
  # argument name spelled out in full: `col_type` only worked through
  # partial argument matching
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  )
)
# Keep one TNRS record per submitted name for iteration 2.
# Rank, source and status become factors with an explicit level order (values
# outside the listed levels become NA); records are sorted by name number,
# decreasing match scores and the level order of Source, and the first record
# of each submitted name is retained.
# The result remains grouped by Name_submitted (there is no ungroup()).
tnrs.res.iter2 <- tnrs.res.iter2.raw %>%
mutate(Name_matched_rank=factor(Name_matched_rank,
levels=c("variety", "subspecies", "species",
"genus", "family", "section",
"supersection", "infraspecies", "forma",
"race", "nothosubspecies", "proles",
"monstr", "series"))) %>%
mutate(Source=factor(Source, levels=c("tpl", #reorder priorities
"tpl;gcc", "tpl;gcc;tropicos", "tpl;gcc;tropicos;usda",
"tpl;gcc;usda","tpl;ildis","tpl;ildis;tropicos",
"tpl;ildis;usda","tpl;tropicos","tpl;tropicos;usda",
"tpl;usda","gcc","gcc;tropicos",
"gcc;tropicos;usda","gcc;usda","ildis",
"ildis;tropicos","ildis;tropicos;usda","ildis;usda",
"tropicos","tropicos;gcc","tropicos;usda","usda" ))) %>%
mutate(Taxonomic_status=factor(Taxonomic_status,
levels=c("Accepted","Synonym", "No opinion",
"Invalid","Illegitimate","Misapplied",
"Rejected name"))) %>%
arrange(Name_number,
desc(Infraspecific_epithet_2_score),
desc(Infraspecific_epithet_score),
desc(Specific_epithet_score),
desc(Genus_score),
desc(Family_score),
desc(Name_score),
desc(Overall_score),
Source) %>%
group_by(Name_submitted) %>%
slice(1)
Family level
# Iteration 2: rows matched at family rank, with status Accepted or Synonym
# and a family score above 0.88.
index.family <- which(
  tnrs.res.iter2$Name_matched_rank == "family" &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.res.iter2$Family_score > 0.88
)
length(index.family)
Genus level
# Iteration 2: rows matched at genus rank, with status Accepted or Synonym,
# a genus score of at least 0.90 and a name score above 0.49.
index.genus <- which(
  tnrs.res.iter2$Name_matched_rank == "genus" &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.res.iter2$Genus_score >= 0.90 &
    tnrs.res.iter2$Name_score > 0.49
)
length(index.genus)
Species level
# Iteration 2: rows matched at species rank with a genus score of at least
# 0.80 and a specific epithet score above 0.90. The taxonomic status is not
# used here (that filter is left disabled, as in the lines commented out below).
index.species <- which(
  tnrs.res.iter2$Name_matched_rank == "species" &
    #(tnrs.res.iter2$Taxonomic_status == "Accepted" |
    #   tnrs.res.iter2$Taxonomic_status == "Synonym") &
    tnrs.res.iter2$Genus_score >= 0.80 &
    tnrs.res.iter2$Specific_epithet_score > 0.90
)
length(index.species)
Subspecies level
# Iteration 2: all records matched at infraspecies rank, regardless of status.
index.infraspec <- which(tnrs.res.iter2$Name_matched_rank == "infraspecies")
length(index.infraspec)
# Accepted names or synonyms matched at infraspecies/subspecies rank, or with
# a missing rank (a few sub-species level records are not categorized).
index.subspec <- which(
  (tnrs.res.iter2$Name_matched_rank %in% c("infraspecies", "subspecies") |
     is.na(tnrs.res.iter2$Name_matched_rank)) &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted", "Synonym")
)
length(index.subspec)
# Accepted names or synonyms matched at variety rank.
index.variety <- which(
  tnrs.res.iter2$Name_matched_rank == "variety" &
    tnrs.res.iter2$Taxonomic_status %in% c("Accepted", "Synonym")
)
length(index.variety)
# All records matched at forma rank, regardless of status.
index.forma <- which(tnrs.res.iter2$Name_matched_rank == "forma")
length(index.forma)
# Unmatched records whose submitted name starts with "Spermatophyta".
index.spermatophyt <- which(
  word(tnrs.res.iter2$Name_submitted, 1) == "Spermatophyta" &
    tnrs.res.iter2$Name_matched == "No suitable matches found."
)
length(index.spermatophyt)
# Combine the row indices selected above and split the iteration 2 results
# into "certain" (selected) and "uncertain" (all remaining) records.
index.tnrs.iter2 <- unique(c(index.family, index.forma, index.genus, index.species, index.subspec,
                             index.variety, index.spermatophyt))
tnrs.res.iter2.certain <- tnrs.res.iter2[index.tnrs.iter2, ]
dim(tnrs.res.iter2.certain)
write.csv(tnrs.res.iter2.certain, file = "../_derived/TNRS_submit/tnrs.res.iter2.certain.csv")
# The complement is taken by position with setdiff(): negative indexing with an
# empty index (x[-integer(0), ]) would return zero rows instead of all rows.
tnrs.res.iter2.uncertain <- tnrs.res.iter2[setdiff(seq_len(nrow(tnrs.res.iter2)),
                                                   index.tnrs.iter2), ]
dim(tnrs.res.iter2.uncertain)
write.csv(tnrs.res.iter2.uncertain, file = "../_derived/TNRS_submit/tnrs.res.iter2.uncertain.csv")
save(tnrs.res.iter2.certain, tnrs.res.iter2.uncertain,
     tnrs.submit.iter2, file = "../_derived/TNRS_submit/tnrs.iter2.RData")
Save species list to submit to TNRS for iteration 3
# Export the second column of the uncertain records as the iteration 3 list.
# NOTE(review): column 2 is presumably Name_submitted - selecting it by name
# would be safer; confirm against the TNRS file layout.
# The output file is passed positionally because the `path` argument name is
# deprecated in recent readr versions.
write_csv(tnrs.res.iter2.uncertain[, 2], "../_derived/TNRS_submit/tnrs_submit_iter3.csv")
This list was submitted to `TNRS`, but only selecting the `NCBI` database.
TNRS_NCBI
Iteration 3 - Reimport resolved species names from
# Read the TNRS result file of iteration 3 (tab-delimited, UTF-8, quoting
# disabled). All columns are read as character except the scores, the name
# number and the `Selected` flag.
tnrs.res.iter3.raw <- readr::read_delim(
  "../_derived/TNRS_submit/tnrs_results_iter3.txt",
  delim = "\t",
  locale = locale(encoding = 'UTF-8'),
  quote = "",
  # argument name spelled out in full: `col_type` only worked through
  # partial argument matching
  col_types = cols(
    .default = col_character(),
    Name_number = col_double(),
    Overall_score = col_double(),
    Name_score = col_double(),
    Author_score = col_double(),
    Family_score = col_double(),
    Genus_score = col_double(),
    Specific_epithet_score = col_double(),
    Infraspecific_epithet_score = col_double(),
    Infraspecific_epithet_2_score = col_double(),
    Selected = col_logical()
  )
)
# Keep one TNRS record per submitted name for iteration 3.
# Rank, source and status become factors with an explicit level order; values
# outside the listed levels become NA (note that "unknown" is not among the
# rank levels). Records are sorted by name number, decreasing match scores and
# the level order of Source, and the first record of each submitted name is
# retained.
# The result remains grouped by Name_submitted (there is no ungroup()).
tnrs.ncbi <- tnrs.res.iter3.raw %>%
mutate(Name_matched_rank=factor(Name_matched_rank,
levels=c("variety", "subspecies", "species",
"genus", "family", "section", "supersection",
"infraspecies", "forma", "race",
"nothosubspecies", "proles", "monstr",
"series"))) %>%
mutate(Source=factor(Source, levels=c("tpl", #reorder priorities
"tpl;gcc", "tpl;gcc;tropicos", "tpl;gcc;tropicos;usda",
"tpl;gcc;usda","tpl;ildis","tpl;ildis;tropicos",
"tpl;ildis;usda","tpl;tropicos","tpl;tropicos;usda",
"tpl;usda","gcc","gcc;tropicos","gcc;tropicos;usda",
"gcc;usda", "ildis","ildis;tropicos","ildis;tropicos;usda",
"ildis;usda","tropicos","tropicos;gcc","tropicos;usda","usda" ))) %>%
mutate(Taxonomic_status=factor(Taxonomic_status,
levels=c("Accepted","Synonym", "No opinion","Invalid","Illegitimate","Misapplied","Rejected name"))) %>%
arrange(Name_number,
desc(Infraspecific_epithet_2_score),
desc(Infraspecific_epithet_score),
desc(Specific_epithet_score),
desc(Genus_score),
desc(Family_score),
desc(Name_score),
desc(Overall_score),
Source) %>%
group_by(Name_submitted) %>%
slice(1)
Family level
# Iteration 3: rows matched at family rank, with status Accepted or Synonym
# and a family score above 0.85.
index.family <- which(
  tnrs.ncbi$Name_matched_rank == "family" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.ncbi$Family_score > 0.85
)
length(index.family)
Genus level
# Iteration 3: rows matched at genus rank, with status Accepted, Synonym or
# No opinion, that satisfy one of two score combinations:
#   genus score > 0.89 with name score > 0.49, or
#   genus score > 0.99 with name score > 0.2.
index.genus <- which(
  tnrs.ncbi$Name_matched_rank == "genus" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym", "No opinion") &
    ((tnrs.ncbi$Genus_score > 0.89 & tnrs.ncbi$Name_score > 0.49) |
       (tnrs.ncbi$Genus_score > 0.99 & tnrs.ncbi$Name_score > 0.2))
)
length(index.genus)
Species level
# Iteration 3, species rank. Three selections are computed and then merged.
# 1: Accepted/Synonym, name score > 0.94, specific epithet score >= 0.67.
index.species.1 <- which(
  tnrs.ncbi$Name_matched_rank == "species" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.ncbi$Name_score > 0.94 &
    tnrs.ncbi$Specific_epithet_score >= 0.67
)
length(index.species.1)
# 2: Accepted/Synonym, genus score > 0.81, name score > 0.51,
#    specific epithet score >= 0.67.
index.species.2 <- which(
  tnrs.ncbi$Name_matched_rank == "species" &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "Synonym") &
    tnrs.ncbi$Genus_score > 0.81 &
    tnrs.ncbi$Name_score > 0.51 &
    tnrs.ncbi$Specific_epithet_score >= 0.67
)
length(index.species.2)
# 3: No opinion, genus score > 0.7, specific epithet score > 0.75.
index.species.3 <- which(
  tnrs.ncbi$Name_matched_rank == "species" &
    tnrs.ncbi$Taxonomic_status == "No opinion" &
    tnrs.ncbi$Genus_score > 0.7 &
    tnrs.ncbi$Specific_epithet_score > 0.75
)
length(index.species.3)
# Union of the three selections.
index.species <- unique(c(index.species.1, index.species.2, index.species.3))
length(index.species)
Variety level
# Iteration 3: rows matched at subspecies or variety rank, or with an
# unknown rank, whose status is Accepted, No opinion or Synonym.
# Where Name_matched_rank has been converted to a factor whose levels do not
# include "unknown", such records are NA and can never equal "unknown";
# is.na() is therefore tested as well so that they are actually selected.
# NOTE(review): is.na() also selects any other rank value outside the factor
# levels - confirm this is acceptable.
index.var <- which(
  (tnrs.ncbi$Name_matched_rank %in% c("subspecies", "unknown", "variety") |
     is.na(tnrs.ncbi$Name_matched_rank)) &
    tnrs.ncbi$Taxonomic_status %in% c("Accepted", "No opinion", "Synonym")
)
length(index.var)
certain
or uncertain
names
Select
# Combine the row indices selected above and split the iteration 3 results
# into "certain" (selected) and "uncertain" (all remaining) records.
index.ncbi <- unique(c(index.family, index.genus, index.species, index.var))
tnrs.ncbi.certain <- tnrs.ncbi[index.ncbi, ]
nrow(tnrs.ncbi.certain)
# Output files are passed positionally because the `path` argument name is
# deprecated in recent readr versions.
write_csv(tnrs.ncbi.certain, "../_derived/TNRS_submit/tnrs.ncbi.certain.csv")
# The complement is taken by position with setdiff(): negative indexing with an
# empty index (x[-integer(0), ]) would return zero rows instead of all rows.
tnrs.ncbi.uncertain <- tnrs.ncbi[setdiff(seq_len(nrow(tnrs.ncbi)), index.ncbi), ]
nrow(tnrs.ncbi.uncertain)
write_csv(tnrs.ncbi.uncertain, "../_derived/TNRS_submit/tnrs.ncbi.uncertain.csv")
save(tnrs.ncbi.certain, tnrs.ncbi.uncertain, file = "../_derived/TNRS_submit/tnrs.iter3.RData")
After iteration 3, there are still r nrow(tnrs.ncbi.uncertain)
unresolved taxa.
Iteration 4 - Using `The Plant List` matching tools for unresolved names
Generate names list from `tnrs.ncbi.uncertain` to be matched against `The Plant List`, using `Taxonstand::TPL`.
# Names still unresolved after iteration 3 are saved and matched with
# Taxonstand::TPL(); the TPL() result is written to disk.
# Output files are passed positionally because the `path` argument name is
# deprecated in recent readr versions.
tpl.submit <- tnrs.ncbi.uncertain %>% dplyr::select(Name_submitted)
write_csv(tpl.submit, "../_derived/TPL/tpl.submit.csv")
tpl.ncbi <- TPL(tpl.submit$Name_submitted)
write_csv(tpl.ncbi, "../_derived/TPL/tpl_results_iter4.csv")
# Re-import the saved TPL results with explicit column types.
# The file is produced by write_csv(), which wraps fields in double quotes when
# they contain commas or quotes; the default quote character is therefore kept
# (the former quote = "" would split such fields across columns).
tpl.ncbi <- read_csv(
  "../_derived/TPL/tpl_results_iter4.csv",
  locale = locale(encoding = 'UTF-8'),
  # argument name spelled out in full: `col_type` only worked through
  # partial argument matching
  col_types = cols(
    .default = col_character(),
    Hybrid.marker = col_logical(),
    Plant.Name.Index = col_logical(),
    TPL.version = col_double(),
    Typo = col_logical(),
    WFormat = col_logical(),
    Higher.level = col_logical(),
    Date = col_date(format = "")
  )
)
# Split the TPL results on the logical Plant.Name.Index column: TRUE rows are
# kept as "certain"; for FALSE rows only the Taxon column is kept as
# "uncertain". Rows where Plant.Name.Index is NA fall in neither set, because
# filter() drops them.
# Output files are passed positionally because the `path` argument name is
# deprecated in recent readr versions.
tpl.ncbi.certain <- tpl.ncbi %>%
  filter(Plant.Name.Index)
nrow(tpl.ncbi.certain)
write_csv(tpl.ncbi.certain, "../_derived/TPL/tpl.ncbi.certain.csv")
tpl.ncbi.uncertain <- tpl.ncbi %>%
  filter(!Plant.Name.Index) %>%
  dplyr::select(Taxon)
nrow(tpl.ncbi.uncertain)
write_csv(tpl.ncbi.uncertain, "../_derived/TPL/tpl.ncbi.uncertain.csv")
save(tpl.ncbi.certain, tpl.ncbi.uncertain, file = "../_derived/TNRS_submit/tnrs.iter4.RData")