Skip to content
Snippets Groups Projects
03_TaxonomicBackbone.Rmd 80.31 KiB
title: "Taxonomic Backbone - sPlot 3.0"
author: "Francesco Maria Sabatini"
date: "`r format(Sys.time(), '%d %B, %Y')`"
tags: 
- database
- big data
- traits
- taxonomy
output: 
  html_document:
    number_sections: true
    toc: true
    toc_depth: 2
abstract: "This document describes the workflow (with contributions from Oliver Purschke, Jürgen Dengler and Florian Jansen) that was used to generate the taxonomic backbone that standardizes taxon names across (i) the global vegetation plot database sPlot version 3.0 and (ii) the global plant trait database TRY version 5."
urlcolor: blue
![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")

Timestamp: `r date()`
Drafted: Francesco Maria Sabatini
Revised: Helge Bruelheide, Borja Jimenez-Alfaro
Version: 1.1

Changes in Version 1.1: additional manual cleaning of species names by BJA, UJ and HB.


Data preparation

Load packages

library(reshape2)
library(tidyverse)
library(readr)
library(data.table)
library(knitr)
library(kableExtra)
library(stringr)
library(taxize)
library(Taxonstand)
library(vegdata)

Read in taxon names from sPlot and TRY

## Genera of fungi used to screen out non-plant records.
## NOTE: this list is NOT complete.
mushroom <- c(
  "Mycena", "Boletus", "Russula", "Calocybe", "Collybia", "Amanita",
  "Amanitopsis", "Coprinus", "Galerina", "Geoglossum", "Hebeloma", "Hydnum",
  "Lactarius", "Leucocarpia", "Naucoria", "Otidea", "Polyporus",
  "Involucrothele", "Sarcodom", "Sarcoscyphus", "Scleroderma", "Stropharia",
  "Tylopilus", "Typhula", "Calyptella", "Chrysopsora", "Lacrymaria",
  "Dermoloma", "Agaricus", "Alnicola", "Amanitina", "Bovista", "Cheilymenia",
  "Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe", "Laccaria",
  "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius",
  "Panaeolus", "Psathyrella", "Psilocybe", "Rickenella", "Sarcoscypha",
  "Vascellum", "Ramaria", "Amphoroblasia", "Amphoroblastia"
)
# Import the sPlot species-level table (tab-delimited export).
# Column types are declared explicitly, so parsing does not depend on readr's
# type guessing.
DT0 <- readr::read_delim(
  "../sPlot_data_export/sPlot_3_0_2_species.csv",
  delim = "\t",
  # Argument name spelled out in full: the former `col_type` only reached
  # `col_types` through R's partial argument matching.
  col_types = cols(
    PlotObservationID = col_double(),
    Taxonomy = col_character(),
    `Taxon group` = col_character(),
    `Taxon group ID` = col_double(),
    `Turboveg2 concept` = col_character(),
    `Matched concept` = col_character(),
    Match = col_double(),
    Layer = col_double(),
    `Cover %` = col_double(),
    `Cover code` = col_character(),
    x_ = col_double()
  )
)
## Exclude fungi
# Build the list of unique (original name, matched concept) pairs in sPlot.
# Records are dropped when (a) their taxon group is "Mushroom", or (b) the
# first word of the matched concept (the genus) is listed in `mushroom`.
# NOTE(review): filter() also discards rows whose `Taxon group` is NA, because
# the comparison evaluates to NA - confirm that this is intended.
splot.species <- DT0 %>%
  rename(
    Species.original = `Turboveg2 concept`,
    Matched.concept = `Matched concept`
  ) %>%
  filter(`Taxon group` != "Mushroom") %>%
  dplyr::select(Species.original, Matched.concept) %>%
  distinct() %>%
  # word() is vectorised, so the genus can be tested on the whole column at
  # once; no per-concept grouping or helper column is required.
  filter(!(word(Matched.concept, 1) %in% mushroom))

# Output location passed positionally: the argument was called `path` in older
# readr releases and `file` in newer ones, where `path` is deprecated.
write_csv(splot.species, "../_derived/splot3.0.2.species.csv")

Note: I used the TRY column containing the full species name, not the column containing only two-word name strings.

# Re-import the sPlot name list previously written to ../_derived.
splot.species <- read_csv("../_derived/splot3.0.2.species.csv")

# TRY 5 species list: the file has no header row, so readr assigns the names
# X1, X2, ...; columns X6 and X7 are discarded and the rest are renamed.
try.species <- readr::read_csv(
  "../_input/AccSpecies_TRY5.csv",
  col_names = FALSE,
  locale = locale(encoding = "Latin1")
) %>%
  dplyr::select(-X6, -X7) %>%
  rename(
    try.ID = X1,
    FullSpecies = X2,
    Species = X3,
    Genus = X4,
    Family = X5,
    GrowthForm = X8
  )

# Sneak in species from the Alpine database (Borja & Riccardo), as a courtesy
# to Project #18. The file is tab-delimited, has no header row, and its first
# column holds the species name.
alpine.species <- read_delim(
  "../_input/new_alpine_species.txt",
  col_names = FALSE,
  delim = "\t",
  locale = locale(encoding = "Latin1")
) %>%
  rename(Species = X1)

Use the `Matched.concept` column, as it already contains some standardization by Stephan Hennekens according to SynBioSys.

sPlot 3.0.2 contains `r nrow(unique(splot.species[,2]))` different species names.
TRY 5 contains `r nrow(try.species)` species names.
To these I add a list of `r nrow(alpine.species)` alpine species delivered by Riccardo Testolin within sPlot Project #18.

Combine species lists

# Stack the three name lists, tagging each row with its origin
# ("S" = sPlot, "T" = TRY, "A" = Alpine), then collapse to one row per name
# with a `Source` string listing every database that contains it
# (e.g. "ST", "STA").
spec.list.TRY.sPlot <- splot.species %>%
  dplyr::select(Matched.concept) %>%
  rename(Species = Matched.concept) %>%
  mutate(Source = "S") %>%
  bind_rows(
    try.species %>%
      dplyr::select(FullSpecies) %>% ## using the full name from TRY
      rename(Species = FullSpecies) %>%
      mutate(Source = "T")
  ) %>%
  bind_rows(
    alpine.species %>%
      mutate(Source = "A")
  ) %>%
  # Count occurrences of each name per source. The aggregation function and
  # value column are stated explicitly instead of relying on dcast() guessing
  # the value column and falling back to `length` when it finds duplicates.
  reshape2::dcast(
    Species ~ Source,
    fun.aggregate = length,
    value.var = "Source"
  ) %>%
  # Turn each per-source count into its one-letter flag (or an empty string).
  mutate(
    A = ifelse(A >= 1, "A", ""),
    S = ifelse(S >= 1, "S", ""),
    T = ifelse(T >= 1, "T", "")
  ) %>%
  mutate(Source = paste0(S, T, A)) %>%
  dplyr::select(-A, -S, -T)
            
 #Number of species unique and in common across databases

The total number of species in the backbone is `r nrow(spec.list.TRY.sPlot)`.

# Summary table: number of taxa for each combination of source databases.
# The `Source` codes are mapped to readable labels through a factor, which
# also fixes the row order of the table.
knitr::kable(
  spec.list.TRY.sPlot %>%
    mutate(Source = factor(
      Source,
      levels = c("S", "T", "A", "ST", "SA", "TA", "STA"),
      labels = c(
        "sPlot only", "TRY only", "Alpine only",
        "sPlot + TRY", "sPlot + Alpine", "TRY + Alpine",
        "sPlot + TRY + Alpine"
      )
    )) %>%
    group_by(Source) %>%
    summarize(Num.taxa = n()),
  caption = "Number of taxa per database"
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    latex_options = "basic",
    # TRUE/FALSE spelled out: `F` is an ordinary variable that can be reassigned
    full_width = FALSE,
    position = "center"
  )

A-priori cleaning of names