From 326b4b9829bd5569af321da673a2b656bad1a06c Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Sat, 27 Jul 2019 18:17:45 +0200
Subject: [PATCH] Created Backbone combining results of different iterations

---
 code/03_TaxonomicBackbone.Rmd | 240 ++++++++++++++++------------------
 1 file changed, 112 insertions(+), 128 deletions(-)

diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd
index 0250ac5..e39cd66 100644
--- a/code/03_TaxonomicBackbone.Rmd
+++ b/code/03_TaxonomicBackbone.Rmd
@@ -174,7 +174,8 @@ spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>%
   mutate(Species=gsub('X-', '', Species, fixed=TRUE)) %>%
   mutate(Species=gsub('×', '', Species, fixed=TRUE)) %>%
   mutate(Species=gsub('like ', '', Species, fixed=TRUE)) %>% 
-  mutate(Species=gsub(',', '', Species, fixed=TRUE))
+  mutate(Species=gsub(',', '', Species, fixed=TRUE)) %>%
+  mutate(Species=gsub('_', ' ', Species))
          
          
 ```
@@ -195,7 +196,7 @@ Correct some name abbreviations using `taxname.abbr` in `vegdata`:
 ```{r}
 spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>% 
   mutate(Species=taxname.abbr(spec.list.TRY.sPlot$Species)) %>%
-  dplyr::select(OriginalNames, Species)  %>% 
+  dplyr::select(OriginalNames, Species, Source)  %>% 
   distinct()
 ```
 
@@ -267,7 +268,7 @@ tnrs.res0 <- readr::read_delim("../_derived/TNRS_submit/tnrs_results_iter1.txt",
 ```
 
 
-## Select best match for each submitted name
+### Select best match for each submitted name
 
 Best matches are selected in successive steps, depending at which taxonomic level each record was matched. Records were sorted based on decreasing match scores. Matches at low taxonomic level (variety, subspecies) were favoured over matches at high taxonomic levels (family, sections). When having exactly the same ranks, the records were ranked based on their source, as explained above.  
 For each name submitted, only the record having the highest rank was retained.
@@ -320,8 +321,6 @@ tnrs.res <- tnrs.res0 %>%
 
 After this first step, there are `r sum(tnrs.res$Name_matched=="No suitable matches found.")` for which no match was found. Another `r sum(tnrs.res$Overall_score<0.9)` were unreliably matched (overall match score <0.9). 
 
-
-## Select correctly resolved names {#ID}
 ### General procedure  {#ID}
 
 1. Open `tnrs.res` in a spreadsheet program and sort according to `Name_matched_rank`, `Taxonomic_status` and `Family_score`, and select thresholds for selection.
@@ -395,12 +394,12 @@ index.spermatophyt <- which(tnrs.res$Name_matched == "No suitable matches found.
 length(index.spermatophyt)
 ```
 
-## Select `certain` or `uncertain` names
+### Select `certain` or `uncertain` names
 Select names that do not fulfill the search criteria, i.e. that were not selected as certain species, for further name matching.
 
 ```{r, eval = T}
-index.tnrs <- c(index.family, index.forma, index.genus, index.species, index.subspec,
-               index.variety, index.spermatophyt)
+index.tnrs <- unique(c(index.family, index.forma, index.genus, index.species, index.subspec,
+               index.variety, index.spermatophyt))
 
 tnrs.res.certain <- tnrs.res[index.tnrs,]
 dim(tnrs.res.certain)
@@ -414,7 +413,7 @@ save(tnrs.res.certain, tnrs.res.uncertain, file="../_derived/TNRS_submit/tnrs.it
 
 ```
 
-## Manual cleaning, delete subspecies information and rerun match in TNRS
+### Manual cleaning, delete subspecies information and rerun match in TNRS
 Many unmatched records do contain subspecies information which could not be retrieved in TNRS, although genus and species seem to be spelled correctly. Also, sometimes the mismatch derives from having the word 'species' or 'sp' at the end of the name. 
 ```{r}
 #Ancillary function to change to lower case
@@ -524,8 +523,8 @@ tnrs.submit.iter2 <- data.frame(old=tnrs.res.uncertain$Name_submitted) %>%
   mutate(new=gsub('Glycirhiza', 'Glycyrrhiza', new)) %>%
   mutate(new=gsub('Abiesnordmannia', 'Abies nordmannia', new)) %>%
   mutate(new=gsub('Alnus inca', 'Alnus incana', new)) %>%
-  mutate(new=gsub('Amalencier alnifolia', 'Amalenchier alnifolia', new)) 
-  
+  mutate(new=gsub('Amalencier alnifolia', 'Amalenchier alnifolia', new)) %>% 
+  mutate(new=gsub('"Antylis barba-jovis"', '"Anthyllis barba-jovis"', new))
   
   
 
@@ -535,19 +534,26 @@ tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
   
 # Extract family name for unidentified species
 tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
+  na.omit() %>%
+  group_by(old) %>%
   mutate(family.lev=str_extract(word(new,1), pattern='([^\\s]+acea)')) %>%
   mutate(new=ifelse(is.na(family.lev), new, family.lev)) %>%
-  dplyr::select(-family.lev)
+  dplyr::select(-family.lev) %>%
+  ungroup()
 
 #Cut to the first 2 words in the name string
 tnrs.submit.iter2 <- tnrs.submit.iter2 %>%
-  group_by(new) %>%
+  group_by(old) %>%
   mutate(Name_binomial=paste(word(new, c(1,2)), collapse=" ")) %>%
   ungroup() %>%
   mutate(Name_binomial=gsub(' NA$', '', Name_binomial))
-  
+
 #save species name list to be submitted to TNRS
-write_csv(tnrs.submit.iter2 %>% dplyr::select(Name_binomial), path="../_derived/TNRS_submit/tnrs.submit_iter2.csv")
+write_csv(tnrs.submit.iter2 %>% 
+            dplyr::select(Name_binomial) %>% 
+            # After cleaning, some names coincide with names already resolved in iteration 1; exclude them from the resubmission
+            filter(!Name_binomial %in% tnrs.res.certain$Name_submitted) %>% 
+            distinct(), path="../_derived/TNRS_submit/tnrs_submit_iter2.csv")
 ```
 
 ## Iteration 2 - Reimport resolved species names from TNRS and mark solved
@@ -604,7 +610,7 @@ length(index.family)
 ```
 
 ### Genus level
-```{r, eval = F}
+```{r, eval = T}
 index.genus <- which(tnrs.res.iter2$Name_matched_rank == "genus" &
                                  (tnrs.res.iter2$Taxonomic_status %in% c("Accepted","Synonym") &
                                  tnrs.res.iter2$Genus_score >= 0.90 &
@@ -653,8 +659,8 @@ length(index.spermatophyt)
 
 
 ```{r, eval = T}
-index.tnrs.iter2 <- c(index.family, index.forma, index.genus, index.species, index.subspec,
-               index.variety, index.spermatophyt)
+index.tnrs.iter2 <- unique(c(index.family, index.forma, index.genus, index.species, index.subspec,
+               index.variety, index.spermatophyt))
 
 tnrs.res.iter2.certain <- tnrs.res.iter2[index.tnrs.iter2,]
 dim(tnrs.res.iter2.certain)
@@ -779,20 +785,19 @@ length(index.var)
 
 ### Select `certain` or `uncertain` names
 ```{r, eval = T}
-index.ncbi <- c(index.family, index.genus, index.species, index.var)
-length(index.ncbi)
+index.ncbi <- unique(c(index.family, index.genus, index.species, index.var))
 
 tnrs.ncbi.certain <- tnrs.ncbi[index.ncbi,]
-dim(tnrs.ncbi.certain)
+nrow(tnrs.ncbi.certain)
 write_csv(tnrs.ncbi.certain, path = "../_derived/TNRS_submit/tnrs.ncbi.certain.csv")
 
 tnrs.ncbi.uncertain <- tnrs.ncbi[-index.ncbi,]
-dim(tnrs.ncbi.uncertain)
+nrow(tnrs.ncbi.uncertain)
 write_csv(tnrs.ncbi.uncertain, path = "../_derived/TNRS_submit/tnrs.ncbi.uncertain.csv")
 save(tnrs.ncbi.certain, tnrs.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.iter3.RData")
 ```
 
-
+After iteration 3, there are still `r nrow(tnrs.ncbi.uncertain)` unresolved taxa.
 
 
 
@@ -802,141 +807,120 @@ Generate names list from `tnrs.ncbi.uncertain` to be matched against `The Plant
 
 ```{r, eval = F}
 tpl.submit <- tnrs.ncbi.uncertain %>% dplyr::select(Name_submitted)
-#write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
+write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
 
-tpl.ncbi <- TPL(tpl.submit)
-write_csv(tpl.ncbi, file = "../_derived/TPL/tpl_results_iter4.csv")
+tpl.ncbi <- TPL(tpl.submit$Name_submitted)
+write_csv(tpl.ncbi, path = "../_derived/TPL/tpl_results_iter4.csv")
+```
+
+
+```{r}
+tpl.ncbi <- read_csv("../_derived/TPL/tpl_results_iter4.csv",  # TPL output written by the previous chunk
+                     locale = locale(encoding = 'UTF-8'), quote = "",
+                     col_types = cols(  # spelled out in full: `col_type` only worked through partial argument matching
+                       .default = col_character(),
+                       Hybrid.marker = col_logical(),
+                       Plant.Name.Index = col_logical(),
+                       TPL.version = col_double(),
+                       Typo = col_logical(),
+                       WFormat = col_logical(),
+                       Higher.level = col_logical(),
+                       Date = col_date(format = "")
+                     ))
+tpl.ncbi.certain <- tpl.ncbi %>%  # rows flagged TRUE in Plant.Name.Index
+  filter(Plant.Name.Index == TRUE)
+nrow(tpl.ncbi.certain)
+
+tpl.ncbi.uncertain <- tpl.ncbi %>%  # rows flagged FALSE; only the submitted name is kept
+  filter(Plant.Name.Index == FALSE) %>%
+  dplyr::select(Taxon)
+nrow(tpl.ncbi.uncertain)
+
+save(tpl.ncbi.certain, tpl.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.iter4.RData")  # reloaded in the merge section
 ```
 
 
 # Merge the resolved species lists
 ## Read files
 
-```{r, eval = F}
+```{r, eval = T}
 load("../_derived/TNRS_submit/tnrs.iter1.RData")
 load("../_derived/TNRS_submit/tnrs.iter2.RData")
 load("../_derived/TNRS_submit/tnrs.iter3.RData")
-Read in the `tpl.ncbi` table:
-
-```{r, eval = T}
-tpl.ncbi <- read_csv("../_derived/TPL/tpl_results_iter4.csv", 
-                     col_types = cols(
-                       .default = col_character(),
-                      Hybrid.marker = col_logical(),
-                      Plant.Name.Index = col_logical(),
-                      TPL.version = col_double(),
-                      Typo = col_logical(),
-                      WFormat = col_logical(),
-                      Higher.level = col_logical(),
-                      Date = col_date(format = "")
-                    ))
+load("../_derived/TNRS_submit/tnrs.iter4.RData")
 ```
 
-
 Combine the `certain` data sets:
-```{r, eval = F}
+```{r, eval = T, warning=F}
 
 Backbone <- spec.list.TRY.sPlot %>%
-  rename(Name_submitted1=Species) %>%
+  as_tibble() %>%  # replaces the deprecated as.tbl()
+  rename(Name_sPlot_TRY=OriginalNames,
+         Name_corrected1=Species) %>%
   left_join(tnrs.submit.iter2 %>%
               dplyr::select(-new) %>%
-              rename(Name_submitted1=old, Name_submitted2=Name_binomial),
-            by="Name_submitted1") %>%
-  mutate(Name_submitted=ifelse(!is.na(Name_submitted2), Name_submitted2, Name_submitted1)) %>%
+              rename(Name_corrected1=old, Name_corrected2=Name_binomial),
+            by="Name_corrected1") %>%
+  mutate(Name_submitted=ifelse(!is.na(Name_corrected2), Name_corrected2, Name_corrected1)) %>%  # prefer the iteration-2 binomial when one exists
+  dplyr::select(Name_sPlot_TRY, Name_corrected1, Name_corrected2, Source, Name_submitted) %>%
+  rename(sPlot_Try=Source) %>%
   left_join(tnrs.res.certain %>% 
               bind_rows(tnrs.res.iter2.certain) %>%
-              bind_rows(tnrs.ncbi.certain), 
+              bind_rows(tnrs.ncbi.certain) %>%
+              # Reformat the TPL output so that its columns line up with the TNRS output
+              bind_rows(tpl.ncbi.certain %>%
+                    rename(Name_submitted=Taxon,
+                    Name_matched_url=ID,
+                    Taxonomic_status=Taxonomic.status,
+                    Accepted_name_author=New.Authority,
+                    Accepted_name_url=New.ID,
+                    Accepted_name_family=Family,
+                    Selected=Plant.Name.Index) %>%
+                    mutate_at(.vars=vars(New.Hybrid.marker, New.Infraspecific.rank, New.Infraspecific),
+                              .funs=~ifelse(is.na(.), "", .)) %>%  # blank out NAs so paste() below does not insert "NA"
+                    mutate(Accepted_name=paste(New.Genus, New.Hybrid.marker, 
+                                               New.Species, New.Infraspecific.rank, 
+                                               New.Infraspecific)) %>%
+                    mutate(Accepted_name=gsub(pattern="\\s+", " ", Accepted_name)) %>%
+                    mutate(Accepted_name_species=paste(New.Genus, New.Hybrid.marker, New.Species)) %>%
+                    mutate(Accepted_name_species=gsub(pattern="\\s+", " ", Accepted_name_species)) %>%
+                    mutate(Accepted_name_rank=ifelse(Higher.level==FALSE, "species", NA)) %>%
+                    mutate(Source=paste("tpl", TPL.version)) %>%
+                    dplyr::select( (data.frame(colmatch=match(colnames(tnrs.ncbi), 
+                                                        names(.))) %>%
+                                filter(!is.na(colmatch)))$colmatch)) %>%  # keep only columns that also exist in tnrs.ncbi
+                    group_by(Name_submitted) %>%  # Some names were matched more than once; keep the first row, i.e. the earliest iteration
+                    slice(1) %>% ungroup(),
             by="Name_submitted")
-  
-  
-
-
-tnrs.tpl.all.trop.certain <- rbind(tnrs.tpl.certain, tnrs.trop.small.certain)
-dim(tnrs.tpl.all.trop.certain)
-```
-... and add the four additional columns:
-
-```{r, eval = F}
-names(tnrs.tpl.all.trop.certain)
-
-tnrs.tpl.all.trop.certain$Manual.matching <- NA
-tnrs.tpl.all.trop.certain$Status.correct <- NA
-tnrs.tpl.all.trop.certain$name.correct <- NA
-tnrs.tpl.all.trop.certain$rank.correct <- NA
-```
-
-### Pick the respective `NCBI` data sets
-... for the 8,177 certain species:
-```{r, eval = F}
-names(tnrs.ncbi.certain.comb)
-tnrs.ncbi.certain.comb$rank.correct <- NA
-```
-Combine the with the big list above:
-```{r, eval = F}
-tnrs.tpl.all.trop.certain.2 <- rbind(tnrs.tpl.all.trop.certain, tnrs.ncbi.certain.comb)
-dim(tnrs.tpl.all.trop.certain.2)
-names(tnrs.tpl.all.trop.certain.2)
+#Double check
+nrow(Backbone) == nrow(spec.list.TRY.sPlot)
 ```
 
 
+## Tag names that could not be resolved
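+Add four additional columns: `Status_correct`, `Name_correct`, `Rank_correct` and `Name_short`.
+For resolved names, `Status_correct` and `Name_correct` take the values of `Taxonomic_status` and `Accepted_name`; all remaining names are assigned `No suitable matches found.`.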
+```{r, eval = T}
 
+Backbone <- Backbone %>%
+  mutate(
+    # Resolved names keep their matched status; all others are flagged as unmatched
+    Status_correct=factor(ifelse(is.na(Taxonomic_status), "No suitable matches found.", Taxonomic_status)),
+    Name_correct=ifelse(is.na(Accepted_name), "No suitable matches found.", Accepted_name),
+    # Rows without a matched rank are labelled "higher"
+    Rank_correct=factor(ifelse(is.na(Name_matched_rank), "higher", as.character(Name_matched_rank))),
+    # Species-level accepted name, left as NA when missing
+    Name_short=ifelse(!is.na(Accepted_name_species), Accepted_name_species, NA)
+  )
 
 
 
+summary(Backbone$Status_correct)
+summary(Backbone$Rank_correct)
 
-
-
-
-
-
-
-
-
-
-
-
-
-### Tag names that could not be resolved
-If names were not corrected, set `Taxonomic.status == ""`
-```{r, eval = F}
-ncbi.uncertain.corr.uncertain.2$Status.correct[
-                                    ncbi.uncertain.corr.uncertain.2$Status.correct==""] <-
-    ncbi.uncertain.corr.uncertain.2$Taxonomic.status[
-                                        ncbi.uncertain.corr.uncertain.2$Status.correct ==""]
-
-summary(ncbi.uncertain.corr.uncertain.2$Status.correct)
-str(ncbi.uncertain.corr.uncertain.2$Status.correct)
-```
-
-... and assign `No suitable matches found.` to the remaining species:
-```{r, eval = F}
-ncbi.uncertain.corr.uncertain.2$Status.correct <-
-    as.character(ncbi.uncertain.corr.uncertain.2$Status.correct)
-ncbi.uncertain.corr.uncertain.2$Status.correct
-[is.na(ncbi.uncertain.corr.uncertain.2$Status.correct)] <- "No suitable matches found."
-```
-
-Add uncorrected names in column `X` to `name.correct`:
-```{r, eval = F}
-ncbi.uncertain.corr.uncertain.2$name.correct[
-                                    ncbi.uncertain.corr.uncertain.2$Genus.correct==""] <-
-    as.character(ncbi.uncertain.corr.uncertain.2[,41])[
-        ncbi.uncertain.corr.uncertain.2$Genus.correct==""]
+if (interactive()) View(Backbone %>% mutate(n_words = stringr::str_count(Name_short, ' ') + 1) %>% filter(n_words>2))  # only open the data viewer in interactive sessions
 ```
 
-Assign `No suitable matches found.` to remaining species in `name.correct` according to `Status.correct`.
-
-```{r, eval = F}
-ncbi.uncertain.corr.uncertain.2$name.correct[ncbi.uncertain.corr.uncertain.2$Status.correct==
-                                             "No suitable matches found."] <-
-    "No suitable matches found."
-
-write.csv(ncbi.uncertain.corr.uncertain.2, file = "ncbi.uncertain.corr.uncertain.2.csv")
-```
-Done! Use `ncbi.uncertain.corr.uncertain.2` for later merging wit
-
-
-
 
 
 
-- 
GitLab