Skip to content
Snippets Groups Projects
Commit 326b4b98 authored by Francesco Sabatini's avatar Francesco Sabatini
Browse files

Created Backbone combining results of different iterations

parent 005fb3be
Branches
No related tags found
No related merge requests found
......@@ -174,7 +174,8 @@ spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
mutate(Species=gsub('X-', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('×', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('like ', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub(',', '', Species, fixed=TRUE))
mutate(Species=gsub(',', '', Species, fixed=TRUE)) %>%
mutate(Species=gsub('_', ' ', Species))
```
......@@ -195,7 +196,7 @@ Correct some name abbreviations using `taxname.abbr` in `vegdata`:
```{r}
spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
mutate(Species=taxname.abbr(spec.list.TRY.sPlot$Species)) %>%
dplyr::select(OriginalNames, Species) %>%
dplyr::select(OriginalNames, Species, Source) %>%
distinct()
```
......@@ -267,7 +268,7 @@ tnrs.res0 <- readr::read_delim("../_derived/TNRS_submit/tnrs_results_iter1.txt",
```
## Select best match for each submitted name
### Select best match for each submitted name
Best matches are selected in successive steps, depending on the taxonomic level at which each record was matched. Records were sorted by decreasing match score. Matches at low taxonomic levels (variety, subspecies) were favoured over matches at high taxonomic levels (family, section). When records had exactly the same rank, they were ordered based on their source, as explained above.
For each name submitted, only the record having the highest rank was retained.
......@@ -320,8 +321,6 @@ tnrs.res <- tnrs.res0 %>%
After this first step, there are `r sum(tnrs.res$Name_matched=="No suitable matches found.")` names for which no match was found. Another `r sum(tnrs.res$Overall_score<0.9)` names were unreliably matched (overall match score <0.9).
## Select correctly resolved names {#ID}
### General procedure {#ID}
1. Open `tnrs.res` in a spreadsheet program and sort according to `Name_matched_rank`, `Taxonomic_status` and `Family_score`, and select thresholds for selection.
......@@ -395,12 +394,12 @@ index.spermatophyt <- which(tnrs.res$Name_matched == "No suitable matches found.
length(index.spermatophyt)
```
## Select `certain` or `uncertain` names
### Select `certain` or `uncertain` names
Select names that do not fulfill the search criteria, i.e. that were not selected as certain species, for further name matching.
```{r, eval = T}
index.tnrs <- c(index.family, index.forma, index.genus, index.species, index.subspec,
index.variety, index.spermatophyt)
index.tnrs <- unique(c(index.family, index.forma, index.genus, index.species, index.subspec,
index.variety, index.spermatophyt))
tnrs.res.certain <- tnrs.res[index.tnrs,]
dim(tnrs.res.certain)
......@@ -414,7 +413,7 @@ save(tnrs.res.certain, tnrs.res.uncertain, file="../_derived/TNRS_submit/tnrs.it
```
## Manual cleaning, delete subspecies information and rerun match in TNRS
### Manual cleaning, delete subspecies information and rerun match in TNRS
Many unmatched records do contain subspecies information which could not be retrieved in TNRS, although genus and species seem to be spelled correctly. Also, sometimes the mismatch derives from having the word 'species' or 'sp' at the end of the name.
```{r}
#Ancillary function to change to lower case
......@@ -524,8 +523,8 @@ tnrs.submit.iter2 <- data.frame(old=tnrs.res.uncertain$Name_submitted) %>%
mutate(new=gsub('Glycirhiza', 'Glycyrrhiza', new)) %>%
mutate(new=gsub('Abiesnordmannia', 'Abies nordmannia', new)) %>%
mutate(new=gsub('Alnus inca', 'Alnus incana', new)) %>%
mutate(new=gsub('Amalencier alnifolia', 'Amalenchier alnifolia', new))
mutate(new=gsub('Amalencier alnifolia', 'Amalenchier alnifolia', new)) %>%
mutate(new=gsub('"Antylis barba-jovis"', '"Anthyllis barba-jovis"', new))
......@@ -535,19 +534,26 @@ tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
# Extract family name for unidentified species
tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
na.omit() %>%
group_by(old) %>%
mutate(family.lev=str_extract(word(new,1), pattern='([^\\s]+acea)')) %>%
mutate(new=ifelse(is.na(family.lev), new, family.lev)) %>%
dplyr::select(-family.lev)
dplyr::select(-family.lev) %>%
ungroup()
#Cut to the first 2 words in the name string
tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
group_by(new) %>%
group_by(old) %>%
mutate(Name_binomial=paste(word(new, c(1,2)), collapse=" ")) %>%
ungroup() %>%
mutate(Name_binomial=gsub(' NA$', '', Name_binomial))
#save species name list to be submitted to TNRS
write_csv(tnrs.submit.iter2 %>% dplyr::select(Name_binomial), path="../_derived/TNRS_submit/tnrs.submit_iter2.csv")
write_csv(tnrs.submit.iter2 %>%
dplyr::select(Name_binomial) %>%
#After cleaning some names now match to those already resolved in iteration 1. Take them out
filter(!Name_binomial %in% tnrs.res.certain$Name_submitted) %>%
distinct(), path="../_derived/TNRS_submit/tnrs_submit_iter2.csv")
```
## Iteration 2 - Reimport resolved species names from TNRS and mark solved
......@@ -604,7 +610,7 @@ length(index.family)
```
### Genus level
```{r, eval = F}
```{r, eval = T}
index.genus <- which(tnrs.res.iter2$Name_matched_rank == "genus" &
(tnrs.res.iter2$Taxonomic_status %in% c("Accepted","Synonym") &
tnrs.res.iter2$Genus_score >= 0.90 &
......@@ -653,8 +659,8 @@ length(index.spermatophyt)
```{r, eval = T}
index.tnrs.iter2 <- c(index.family, index.forma, index.genus, index.species, index.subspec,
index.variety, index.spermatophyt)
index.tnrs.iter2 <- unique(c(index.family, index.forma, index.genus, index.species, index.subspec,
index.variety, index.spermatophyt))
tnrs.res.iter2.certain <- tnrs.res.iter2[index.tnrs.iter2,]
dim(tnrs.res.iter2.certain)
......@@ -779,20 +785,19 @@ length(index.var)
### Select `certain` or `uncertain` names
```{r, eval = T}
index.ncbi <- c(index.family, index.genus, index.species, index.var)
length(index.ncbi)
index.ncbi <- unique(c(index.family, index.genus, index.species, index.var))
tnrs.ncbi.certain <- tnrs.ncbi[index.ncbi,]
dim(tnrs.ncbi.certain)
nrow(tnrs.ncbi.certain)
write_csv(tnrs.ncbi.certain, path = "../_derived/TNRS_submit/tnrs.ncbi.certain.csv")
tnrs.ncbi.uncertain <- tnrs.ncbi[-index.ncbi,]
dim(tnrs.ncbi.uncertain)
nrow(tnrs.ncbi.uncertain)
write_csv(tnrs.ncbi.uncertain, path = "../_derived/TNRS_submit/tnrs.ncbi.uncertain.csv")
save(tnrs.ncbi.certain, tnrs.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.iter3.RData")
```
After iteration 3, there are still `r nrow(tnrs.ncbi.uncertain)` unresolved taxa.
......@@ -802,141 +807,120 @@ Generate names list from `tnrs.ncbi.uncertain` to be matched against `The Plant
```{r, eval = F}
tpl.submit <- tnrs.ncbi.uncertain %>% dplyr::select(Name_submitted)
#write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
tpl.ncbi <- TPL(tpl.submit)
write_csv(tpl.ncbi, file = "../_derived/TPL/tpl_results_iter4.csv")
tpl.ncbi <- TPL(tpl.submit$Name_submitted)
write_csv(tpl.ncbi, path = "../_derived/TPL/tpl_results_iter4.csv")
```
```{r}
# Re-import the stored TPL matching results of iteration 4 and split them into
# resolved ("certain") and unresolved ("uncertain") names.
# All columns are read as character unless an explicit type is given below.
tpl.ncbi <- read_csv("../_derived/TPL/tpl_results_iter4.csv",
                     locale = locale(encoding = "UTF-8"), quote = "",
                     # spell out `col_types` instead of relying on partial
                     # argument matching of `col_type`
                     col_types = cols(
                       .default = col_character(),
                       Hybrid.marker = col_logical(),
                       Plant.Name.Index = col_logical(),
                       TPL.version = col_double(),
                       Typo = col_logical(),
                       WFormat = col_logical(),
                       Higher.level = col_logical(),
                       Date = col_date(format = "")
                     ))

# Names flagged by Plant.Name.Index are kept as resolved.
# NOTE: filter() drops rows where the condition is NA, so records with a
# missing Plant.Name.Index end up in neither subset.
tpl.ncbi.certain <- tpl.ncbi %>%
  filter(Plant.Name.Index)
nrow(tpl.ncbi.certain)

# Names not flagged by Plant.Name.Index; only the Taxon column is kept.
tpl.ncbi.uncertain <- tpl.ncbi %>%
  filter(!Plant.Name.Index) %>%
  dplyr::select(Taxon)
nrow(tpl.ncbi.uncertain)

# Store both subsets for the merging step.
save(tpl.ncbi.certain, tpl.ncbi.uncertain,
     file = "../_derived/TNRS_submit/tnrs.iter4.RData")
```
# Merge the resolved species lists
## Read files
```{r, eval = F}
```{r, eval = T}
load("../_derived/TNRS_submit/tnrs.iter1.RData")
load("../_derived/TNRS_submit/tnrs.iter2.RData")
load("../_derived/TNRS_submit/tnrs.iter3.RData")
Read in the `tpl.ncbi` table:
```{r, eval = T}
tpl.ncbi <- read_csv("../_derived/TPL/tpl_results_iter4.csv",
col_types = cols(
.default = col_character(),
Hybrid.marker = col_logical(),
Plant.Name.Index = col_logical(),
TPL.version = col_double(),
Typo = col_logical(),
WFormat = col_logical(),
Higher.level = col_logical(),
Date = col_date(format = "")
))
load("../_derived/TNRS_submit/tnrs.iter4.RData")
```
Combine the `certain` data sets:
```{r, eval = F}
```{r, eval = T, warning=F}
Backbone <- spec.list.TRY.sPlot %>%
rename(Name_submitted1=Species) %>%
as.tbl() %>%
rename(Name_sPlot_TRY=OriginalNames,
Name_corrected1=Species) %>%
left_join(tnrs.submit.iter2 %>%
dplyr::select(-new) %>%
rename(Name_submitted1=old, Name_submitted2=Name_binomial),
by="Name_submitted1") %>%
mutate(Name_submitted=ifelse(!is.na(Name_submitted2), Name_submitted2, Name_submitted1)) %>%
rename(Name_corrected1=old, Name_corrected2=Name_binomial),
by="Name_corrected1") %>%
mutate(Name_submitted=ifelse(!is.na(Name_corrected2), Name_corrected2, Name_corrected1)) %>%
dplyr::select(Name_sPlot_TRY, Name_corrected1, Name_corrected2, Source, Name_submitted) %>%
rename(sPlot_Try=Source) %>%
left_join(tnrs.res.certain %>%
bind_rows(tnrs.res.iter2.certain) %>%
bind_rows(tnrs.ncbi.certain),
bind_rows(tnrs.ncbi.certain) %>%
#reformat TPL output to tnrs output
bind_rows(tpl.ncbi.certain %>%
rename(Name_submitted=Taxon,
Name_matched_url=ID,
Taxonomic_status=Taxonomic.status,
Accepted_name_author=New.Authority,
Accepted_name_url=New.ID,
Accepted_name_family=Family,
Selected=Plant.Name.Index) %>%
mutate_at(.vars=vars(New.Hybrid.marker, New.Infraspecific.rank, New.Infraspecific),
.fun=~ifelse(is.na(.), "", .)) %>%
mutate(Accepted_name=paste(New.Genus, New.Hybrid.marker,
New.Species, New.Infraspecific.rank,
New.Infraspecific)) %>%
mutate(Accepted_name=gsub(pattern="\\s+", " ", Accepted_name)) %>%
mutate(Accepted_name_species=paste(New.Genus, New.Hybrid.marker, New.Species)) %>%
mutate(Accepted_name_species=gsub(pattern="\\s+", " ", Accepted_name_species)) %>%
mutate(Accepted_name_rank=ifelse(Higher.level==F, "species", NA)) %>%
mutate(Source=paste("tpl", TPL.version)) %>%
dplyr::select( (data.frame(colmatch=match(colnames(tnrs.ncbi),
names(.))) %>%
filter(!is.na(colmatch)))$colmatch)) %>%
group_by(Name_submitted) %>% #Some double matches. Prioritize first iterations
slice(1),
by="Name_submitted")
tnrs.tpl.all.trop.certain <- rbind(tnrs.tpl.certain, tnrs.trop.small.certain)
dim(tnrs.tpl.all.trop.certain)
```
... and add the four additional columns:
```{r, eval = F}
names(tnrs.tpl.all.trop.certain)
tnrs.tpl.all.trop.certain$Manual.matching <- NA
tnrs.tpl.all.trop.certain$Status.correct <- NA
tnrs.tpl.all.trop.certain$name.correct <- NA
tnrs.tpl.all.trop.certain$rank.correct <- NA
```
### Pick the respective `NCBI` data sets
... for the 8,177 certain species:
```{r, eval = F}
names(tnrs.ncbi.certain.comb)
tnrs.ncbi.certain.comb$rank.correct <- NA
```
Combine them with the big list above:
```{r, eval = F}
tnrs.tpl.all.trop.certain.2 <- rbind(tnrs.tpl.all.trop.certain, tnrs.ncbi.certain.comb)
dim(tnrs.tpl.all.trop.certain.2)
names(tnrs.tpl.all.trop.certain.2)
#Double check
nrow(Backbone) == nrow(spec.list.TRY.sPlot)
```
## Tag names that could not be resolved
Add four additional columns.
If names were not corrected, set `Taxonomic.status == ""`, and assign `No suitable matches found.` to the remaining species.
```{r, eval = T}
# Derive the final status / name / rank columns of the backbone.
# Expressions inside a single mutate() are evaluated in order, so each later
# line sees the column produced by the earlier ones.
Backbone <- Backbone %>%
  mutate(
    # status taken from the matched record; missing values get a text label
    Status_correct = ifelse(!is.na(Taxonomic_status), Taxonomic_status, NA),
    Status_correct = replace(Status_correct,
                             list = is.na(Status_correct),
                             values = "No suitable matches found."),
    Status_correct = factor(Status_correct),
    # accepted name, or the "no match" label when none is available
    Name_correct = ifelse(!is.na(Accepted_name),
                          Accepted_name,
                          "No suitable matches found."),
    # rank of the matched name; records without a rank are tagged "higher"
    Rank_correct = ifelse(!is.na(Name_matched_rank),
                          as.character(Name_matched_rank),
                          "higher"),
    Rank_correct = factor(Rank_correct),
    # species-level accepted name (stays NA when missing)
    Name_short = ifelse(!is.na(Accepted_name_species),
                        Accepted_name_species,
                        NA)
  )

# Frequency tables of the two factor columns
summary(Backbone$Status_correct)
summary(Backbone$Rank_correct)
### Tag names that could not be resolved
If names were not corrected, set `Taxonomic.status == ""`
```{r, eval = F}
ncbi.uncertain.corr.uncertain.2$Status.correct[
ncbi.uncertain.corr.uncertain.2$Status.correct==""] <-
ncbi.uncertain.corr.uncertain.2$Taxonomic.status[
ncbi.uncertain.corr.uncertain.2$Status.correct ==""]
summary(ncbi.uncertain.corr.uncertain.2$Status.correct)
str(ncbi.uncertain.corr.uncertain.2$Status.correct)
```
... and assign `No suitable matches found.` to the remaining species:
```{r, eval = F}
ncbi.uncertain.corr.uncertain.2$Status.correct <-
as.character(ncbi.uncertain.corr.uncertain.2$Status.correct)
ncbi.uncertain.corr.uncertain.2$Status.correct
[is.na(ncbi.uncertain.corr.uncertain.2$Status.correct)] <- "No suitable matches found."
```
Add uncorrected names in column `X` to `name.correct`:
```{r, eval = F}
ncbi.uncertain.corr.uncertain.2$name.correct[
ncbi.uncertain.corr.uncertain.2$Genus.correct==""] <-
as.character(ncbi.uncertain.corr.uncertain.2[,41])[
ncbi.uncertain.corr.uncertain.2$Genus.correct==""]
View(Backbone %>% mutate(n_words = stringr::str_count(Name_short, ' ') + 1) %>% filter(n_words>2))
```
Assign `No suitable matches found.` to remaining species in `name.correct` according to `Status.correct`.
```{r, eval = F}
ncbi.uncertain.corr.uncertain.2$name.correct[ncbi.uncertain.corr.uncertain.2$Status.correct==
"No suitable matches found."] <-
"No suitable matches found."
write.csv(ncbi.uncertain.corr.uncertain.2, file = "ncbi.uncertain.corr.uncertain.2.csv")
```
Done! Use `ncbi.uncertain.corr.uncertain.2` for later merging wit
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment