From de09f453df1f3f4cee3112721e1d29cc9848adad Mon Sep 17 00:00:00 2001 From: Francesco Sabatini <francesco.sabatini@idiv.de> Date: Wed, 4 Mar 2020 16:42:23 +0100 Subject: [PATCH] First version of 06_buildDT --- code/03_TaxonomicBackbone.Rmd | 82 +++++----- code/06_buildDT.Rmd | 288 ++++++++++++++++++++++++++++++++++ 2 files changed, 333 insertions(+), 37 deletions(-) create mode 100644 code/06_buildDT.Rmd diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd index 2611f12..b7cf96e 100644 --- a/code/03_TaxonomicBackbone.Rmd +++ b/code/03_TaxonomicBackbone.Rmd @@ -51,25 +51,7 @@ library(vegdata) ## Read in taxon names from [sPlot](https://www.idiv.de/sdiv/working_groups/wg_pool/splot/splot_database.html) and [TRY](https://www.try-db.org/TryWeb/Home.php) -```{r, echo=F} -## fungi genera #NOT COMPLETE LIST -mushroom <- c("Mycena", "Boletus", "Russula","Calocybe","Collybia","Amanita","Amanitopsis","Coprinus", - "Galerina","Geoglossum","Hebeloma","Hydnum","Lactarius","Leucocarpia","Naucoria","Otidea","Polyporus", "Involucrothele", - "Sarcodom","Sarcoscyphus","Scleroderma","Stropharia","Tylopilus","Typhula", "Calyptella", "Chrysopsora", "Lacrymaria", "Dermoloma", - "Agaricus","Alnicola", "Amanitina", "Bovista", "Cheilymenia","Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe", - "Laccaria", "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius", "Panaeolus", "Psathyrella", "Psilocybe", - "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria","Amphoroblasia", "Amphoroblastia", "Agrocybe", - "Flammulaster", "Phaeocollybia", "Cortinarius", "Lepiota", "Cystoderma", - "Armillaria", "Athelia", "Ceraceomyces", "Chlorociboria", "Clavariaceae", - "Cystoderma", "Dacrymyces","Dendrographa","Dirina", "Flammulaster","Fomes","Gyrophora", - "Kirschsteiniothelia", "Lasallia","Lepiota","Llimoniella","Mazosia","Microthelia","Mollisia", - "Multiclavula","Phaeocollybia","Phellinus","Plectocarpon","Pleospora","Ramariopsis","Reinkella", - "Roccella","Roccellina","Sigridea","Stereum","Tremella","Tulostoma","Umbilicaria","Unguiculariopsis" , - "Xanthoconium") -``` - - -```{r, eval=F} +```{r} #import and save splot names from DT table DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv", delim="\t", @@ -87,7 +69,25 @@ DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv", x_ = col_double() ) ) -## Exclude fungi + +``` +Create list of species, and exclude fungi +```{r} +## fungi genera #NOT COMPLETE LIST +mushroom <- c("Mycena", "Boletus", "Russula","Calocybe","Collybia","Amanita","Amanitopsis","Coprinus", + "Galerina","Geoglossum","Hebeloma","Hydnum","Lactarius","Leucocarpia","Naucoria","Otidea","Polyporus", + "Sarcodom","Sarcoscyphus","Scleroderma","Stropharia","Tylopilus","Typhula", "Calyptella", "Chrysopsora", "Lacrymaria", "Dermoloma", + "Agaricus","Alnicola", "Amanitina", "Bovista", "Cheilymenia","Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe", "Paxillus", + "Laccaria", "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius", "Panaeolus", "Psathyrella", "Psilocybe", + "Rickenella", "Sarcoscypha", "Vascellum", "Ramaria", "Agrocybe", + "Flammulaster", "Phaeocollybia", "Cortinarius", "Lepiota", "Cystoderma", + "Armillaria", "Athelia", "Ceraceomyces", "Chlorociboria", "Clavariaceae", + "Cystoderma", "Dacrymyces","Dendrographa","Dirina", "Flammulaster","Fomes","Gyrophora", + "Kirschsteiniothelia", "Lepiota","Llimoniella","Mazosia","Mollisia", + "Multiclavula","Phaeocollybia","Phellinus","Plectocarpon","Pleospora","Ramariopsis","Reinkella", + "Roccella","Roccellina","Sigridea","Stereum","Tremella","Tulostoma","Unguiculariopsis" , + "Xanthoconium") + splot.species <- DT0 %>% rename(Species.original=`Turboveg2 concept`, Matched.concept=`Matched concept`) %>% filter(`Taxon group`!="Mushroom") %>% @@ -99,10 +99,10 @@ splot.species <- DT0 %>% filter(fungi==F) %>% dplyr::select(Species.original, Matched.concept) - write_csv(splot.species, path = "../_derived/splot3.0.2.species.csv") ``` + !!! I used the column from TRY with the full species name, not the column with only two-word name strings ```{r, message=F} @@ -330,11 +330,11 @@ tnrs.res <- tnrs.res0 %>% levels=c("Accepted","Synonym", "No opinion","Invalid", "Illegitimate","Misapplied","Rejected name"))) %>% #filter(Taxonomic_status %in% c("Accepted", "Synonym")) %>% - arrange(Name_number, + arrange(Name_number, + desc(Genus_score), + desc(Specific_epithet_score), desc(Infraspecific_epithet_2_score), desc(Infraspecific_epithet_score), - desc(Specific_epithet_score), - desc(Genus_score), desc(Family_score), desc(Name_score), desc(Overall_score), @@ -979,15 +979,16 @@ tnrs.res.iter2 <- tnrs.res.iter2.raw %>% levels=c("Accepted","Synonym", "No opinion", "Invalid","Illegitimate","Misapplied", "Rejected name"))) %>% - arrange(Name_number, + arrange(Name_number, + desc(Genus_score), + desc(Specific_epithet_score), desc(Infraspecific_epithet_2_score), desc(Infraspecific_epithet_score), - desc(Specific_epithet_score), - desc(Genus_score), desc(Family_score), desc(Name_score), desc(Overall_score), - Source) %>% + Source, + Taxonomic_status) %>% group_by(Name_submitted) %>% slice(1) ``` @@ -1107,13 +1108,15 @@ tnrs.ncbi <- tnrs.res.iter3.raw %>% levels=c("Accepted","Synonym", "No opinion","Invalid", "Illegitimate","Misapplied","Rejected name"))) %>% arrange(Name_number, + desc(Genus_score), + desc(Specific_epithet_score), desc(Infraspecific_epithet_2_score), desc(Infraspecific_epithet_score), - desc(Specific_epithet_score), - desc(Genus_score), desc(Family_score), desc(Name_score), - desc(Overall_score)) %>% + desc(Overall_score), + Source, + Taxonomic_status) %>% group_by(Name_submitted) %>% slice(1) ``` @@ -1560,14 +1563,16 @@ Backbone <- Backbone %>% #Records with missing family info sum(is.na(Backbone$Family_correct)) + +nrow(Backbone) == nrow(spec.list.TRY.sPlot) ``` ### Delete records assigned to mushroom families, if any ```{r} mushroom.families <- c("Physalacriaceae", "Clavariaceae","Agaricaceae","Roccellaceae", "Atheliaceae","Meruliaceae","Helotiaceae", "Dacrymycetaceae", "Boletaceae", - "Cortinariaceae", "Polyporaceae", "Umbilicariaceae" , "Pleosporaceae", - "Leotiaceae","Dermateaceae", "Hymenochaetaceae","Stereaceae","Tremellaceae") + "Cortinariaceae", "Polyporaceae", "Pleosporaceae", + "Leotiaceae","Dermateaceae","Hymenochaetaceae","Stereaceae","Tremellaceae") Backbone <- Backbone %>% filter(!Genus_correct %in% mushroom) %>% filter(!Family_correct %in% mushroom.families) @@ -1625,12 +1630,14 @@ lichens <- c("Acarosporaceae" , "Parmeliaceae", "Physciaceae", "Lichinaceae", "Teloschistaceae","Candelariaceae","Rhizocarpaceae","Lecideaceae", "Icmadophilaceae","Cladoniaceae","Collemataceae","Pannariaceae" , "Lobariaceae", "Ophioparmaceae" ,"Psoraceae","Stereocaulaceae", - "Massalongiaceae","Peltigeraceae","Nephromataceae") + "Massalongiaceae","Peltigeraceae","Nephromataceae", "Umbilicariaceae" ) lichen.genera <- c("Amygdalaria", "Anamylospora", "Arthonia", "Pertusaria", "Pyrenula","Opegrapha", "Ochrolechia", "Graphis", "Micarea", "Porpidia", "Arthopyrenia", "Graphina", "Anisomeridium", "Mycobilimbia","Peltula", "Thelotrema", "Arthothelium", "Diploschistes", "Strigula", "Trichothelium", "Melaspilea", "Phaeographis", "Thelenella", "Chaenothecopsis","Fuscidea", - "Dactylospora", "Gyalecta", "Myriotrema", "Placynthium") + "Dactylospora", "Gyalecta", "Myriotrema", "Placynthium", "Umbilicaria", + "Lasallia", "Microthelia", "Lichenochora", "Roselliniopsis", "Homostegia", + "Verrucaria", "Leptorhaphis") mosses <- c("Pilotrichaceae", "Chonecoleaceae", "Hypopterygiaceae", "Scorpidiaceae", "Balantiopsaceae", "Mesoptychiaceae","Octoblepharaceae" ,"Takakiaceae") algae_diatoms <- c("Sargassaceae", "Chordaceae", "Cocconeidaceae", "Desmarestiaceae", @@ -1648,7 +1655,7 @@ Backbone <- Backbone %>% values=T)) %>% mutate(`Taxon group`="Unknown") %>% mutate(`Taxon group`=ifelse((!is.na(is_vascular_species) & is_vascular_species==T), - "Vascular Plant", `Taxon group`)) %>% + "Vascular plant", `Taxon group`)) %>% mutate(`Taxon group`=replace(`Taxon group`, list=Family_correct %in% lichens, values="Lichen")) %>% @@ -1737,7 +1744,8 @@ knitr::kable(Backbone %>% *`Taxon group`* - Taxon group, as in Turboveg. 'Vascular plant', 'Moss' (include liverworts), 'Lichen', 'Algae', 'Unknown ```{r} -save(Backbone, file="../_output/Backbone3.0.RData") +save(Backbone, mushroom, mushroom.families, lichen.genera, + file="../_output/Backbone3.0.RData") ``` ## Export species list to request in TRY diff --git a/code/06_buildDT.Rmd b/code/06_buildDT.Rmd new file mode 100644 index 0000000..26f642a --- /dev/null +++ b/code/06_buildDT.Rmd @@ -0,0 +1,288 @@ +--- +title: "sPlot 3.0 - Build DT" +author: "Francesco Maria Sabatini" +date: "2/24/2020" +output: html_document +--- + + +<center> +  +</center> + + +MEMO CHECK Field cover code! It seems to have species characters + + + +**Timestamp:** `r date()` +**Drafted:** Francesco Maria Sabatini +**Revised:** +**version:** 1.0 + +This report documents the construction of the DT table for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens. + + +```{r results="hide", message=F, warning=F} +knitr::opts_chunk$set(echo = TRUE) +library(tidyverse) +library(readr) +library(xlsx) +library(knitr) +library(kableExtra) + +#save temporary files +write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron')) +write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron')) +#rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp") +``` + + +Search and replace unclosed quotation marks and escape them. Run in Linux terminal +```{bash, eval=F} +# escape all double quotation marks. Run in Linux terminal +# sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv +``` + +## Import data Table +DT table is the species x plot matrix, in long format. +```{r } +DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv", + delim="\t", + col_type = cols( + PlotObservationID = col_double(), + Taxonomy = col_character(), + `Taxon group` = col_character(), + `Taxon group ID` = col_double(), + `Turboveg2 concept` = col_character(), + `Matched concept` = col_character(), + Match = col_double(), + Layer = col_double(), + `Cover %` = col_double(), + `Cover code` = col_character(), + x_ = col_double() + ) + ) +nplots <- length(unique(DT0$PlotObservationID)) +nspecies <- length(unique(DT0$`Matched concept`)) +``` +Species data include `r nrow(DT0)` species * plot records, across `r nplots` plots and including `r nspecies` non-resolved species. +\newline + + +```{r, echo=F} +set.seed <- 1984 +sampled <- sample(unique(DT0$PlotObservationID), 10, replace=F) + +knitr::kable(DT0 %>% + filter(PlotObservationID %in% sampled[1:3]), + caption="Example of initial DT table (3 randomly selected plots shown)") %>% + kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), + full_width = F, position = "center") +``` + +Import taxonomic backbone +```{r} +load("../_output/Backbone3.0.RData") +``` + +## Match species names from DT0 to those in Backbone +```{r} +DT1 <- DT0 %>% + left_join(Backbone %>% + dplyr::select(Name_sPlot_TRY, Name_short, `Taxon group`) %>% + rename(`Matched concept`=Name_sPlot_TRY, + Taxongroup_BB=`Taxon group`), + by="Matched concept") +``` + +## Explore name matching based on Backbone v1.2 +Select species entries that changed after taxonomic standardization, as a way to check the backbone. +```{r} +name.check <- DT1 %>% + dplyr::select(`Turboveg2 concept`:`Matched concept`, Name_short) %>% + rename(Name_TNRS=Name_short) %>% + distinct() %>% + mutate(Matched_short=word(`Matched concept`, start = 1L, end=2L)) %>% + filter(is.na(Name_TNRS) | Matched_short != Name_TNRS) %>% + dplyr::select(-Matched_short) %>% + arrange(Name_TNRS) +``` + +```{r, echo=F} +knitr::kable(name.check %>% sample_n(30), + caption="Check 30 random species names from DT after matching to backbone") %>% + kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), + full_width = F, position = "center") +``` + +Check the most common species names from DT after matching to backbone +```{r} +name.check.freq <- DT1 %>% + dplyr::select(`Turboveg2 concept`:`Matched concept`, Name_short) %>% + rename(Name_TNRS=Name_short) %>% + group_by(`Turboveg2 concept`, `Matched concept`, Name_TNRS) %>% + summarize(n=n()) %>% + mutate(Matched_short=word(`Matched concept`, start = 1L, end=2L)) %>% + filter(is.na(Name_TNRS) | Matched_short != Name_TNRS) %>% + dplyr::select(-Matched_short) %>% + ungroup() %>% + arrange(desc(n)) +``` + +```{r, echo=F} +knitr::kable(name.check.freq %>% slice(1:40), + caption="Check 40 most common species names from DT after matching to backbone") %>% + kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), + full_width = F, position = "center") +``` + +## Complete field `taxon group` + +Coalesce `Taxon group` info from Backbone +```{r} +table(DT1$`Taxon group`, exclude=NULL) + +DT1 <- DT1 %>% + mutate(`Taxon group`=ifelse(`Taxon group`=="Unknown", NA, `Taxon group`)) %>% + mutate(Taxongroup_BB=ifelse(Taxongroup_BB=="Unknown", NA, Taxongroup_BB)) %>% + mutate(`Taxon group`=coalesce(`Taxon group`, Taxongroup_BB)) %>% + dplyr::select(-Taxongroup_BB) + + +table(DT1$`Taxon group`, exclude=NULL) +``` + + + +Cross-complement +```{r} +DT1 <- DT1 %>% + left_join(DT1 %>% + filter(!is.na(Name_short)) %>% + filter(`Taxon group` != "Unknown") %>% + dplyr::select(Name_short, `Taxon group`) %>% + distinct(Name_short, .keep_all=T) %>% + rename(TaxonGroup_compl=`Taxon group`), + by="Name_short") %>% + mutate(`Taxon group`=coalesce(`Taxon group`, TaxonGroup_compl)) %>% + dplyr::select(-TaxonGroup_compl) + +table(DT1$`Taxon group`, exclude=NULL) +``` + +Check species with conflicting `Taxon group` information and fix manually. +```{r} +#Attach genus info +DT1 <- DT1 %>% + left_join(Backbone %>% + dplyr::select(Name_sPlot_TRY, Name_short) %>% + mutate(Genus=word(Name_short, 1, 1)) %>% + dplyr::select(-Name_short) %>% + rename(`Matched concept`=Name_sPlot_TRY), + by="Matched concept") %>% + mutate(`Taxon group`=fct_collapse(`Taxon group`, + Alga_Stonewort=c("Alga", "Stonewort"))) +#manually fix some know problems +mosses.gen <- c("Hypnum", "Brachytheciastrum", + "Brachythecium","Hypnum", "Zygodon", "Oxymitra", "Bryophyta", "Musci", '\\\"Moos\\\"') +vascular.gen <- c("Polystichum", "Hypericum", "Peltaria", "Pancovia", "Calythrix", "Ripogonum", + "Notogrammitis", "Fuscospora", "Lophozonia", "Rostellularia", + "Hesperostipa", "Microsorium", + "Angiosperm","Dicotyledonae", "Spermatophy") +alga.gen <- c("Chara", "Characeae", "Tonina", "Nostoc", "Entermorpha", "Hydrocoleum" ) + +DT1 <- DT1 %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% mosses.gen, + values="Moss")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% vascular.gen, + values="Vascular plant")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% alga.gen, + values="Alga_Stonewort")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% c(lichen.genera, "Lichenes"), + values="Lichen")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% mushroom, + values="Mushroom")) + +table(DT1$`Taxon group`, exclude=NULL) +``` + +```{r, eval=F, echo=F} +#check for conflicts in attribution of genera to Taxon groups +conflict <- DT1 %>% + filter(!is.na(Name_short)) %>% + dplyr::select(Genus, `Taxon group`) %>% + filter(!is.na(`Taxon group`)) %>% + distinct() %>% + group_by(Genus) %>% + summarize(n=n()) %>% + filter(n>1) %>% + arrange(desc(n)) %>% + pull(Genus) +``` + + +Delete all records of fungi +```{r} +DT1f <- DT1 %>% + dplyr::select(-Genus) %>% + left_join(DT1 %>% + distinct(`Matched concept`) %>% + mutate(Genus=word(`Matched concept`, 1)), + by="Matched concept") %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% mushroom, + values = "Mushroom")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% lichen.genera, + values="Lichen")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% mosses.gen, + values="Moss")) %>% + mutate(`Taxon group`=replace(`Taxon group`, + list=Genus %in% vascular.gen, + values="Vascular plant")) %>% + mutate(`Taxon group` = fct_explicit_na(`Taxon group`, "Unknown")) %>% + filter(`Taxon group`!="Mushroom")# %>% + #dplyr::select(-Genus) + +table(DT1f$`Taxon group`, exclude=NULL) +``` + +Check the most frequent species for which we don't have taxon group info +```{r, echo=F, eval=F} +DT1f %>% + filter(`Taxon group` == "Unknown") %>% + group_by(Genus) %>% + summarize(n=n()) %>% + arrange(desc(n)) %>% + slice(1:40) +``` + +Clean DT and export +```{r} +DT2 <- DT1 %>% + dplyr::select(PlotObservationID, Name_short, `Turboveg2 concept`, `Taxon group`, Layer:x_ ) %>% + rename(Species_original=`Turboveg2 concept`, + Species_matched=Name_short, + Taxon_group=`Taxon group`, + Cover_perc=`Cover %`, + Cover_code=`Cover code`) +``` + +The output of the DT table contains `r nrow(DT2)` records, over `r length(unique(DT2$PlotObservationID))` plots. The total number of taxa is `r length(unique(DT2$Species_original))` and `r length(unique(DT2$Species_matched))`, before and after standardization, respectively. Information on the `Taxon group` is available for `r DT2 %>% filter(Taxon_group!="Unknown") %>% distinct(Species_matched) %>% nrow()` standardized species. + +```{r} +save(DT2, file = "../_output/DT_sPlot3.0.RData") +``` + +***!!! ADD Explanation of fields!!!*** + + + +!! It seems there are a few species in Name_sPlot_TRY which are duplicated!!! Check in Backbone -- GitLab