Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
title: "sPlot 3.0 - Build DT"
author: "Francesco Maria Sabatini"
date: "2/24/2020"
output: html_document
![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")

MEMO!! WHAT TO DO WITH LAYER WHEN IS CONSISTENTLY ZERO IN A PLOT? CHANGE TO NA? WHAT TO DO INSTEAD WHEN LAYER==0 IN A PLOT WHERE LAYER INFO IS OTHERWISE AVAILABLE? !!! ADD Explanation of fields!!!

Timestamp: r date()
Drafted: Francesco Maria Sabatini
Revised:
version: 1.0

This report documents the construction of the DT table for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens.

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(readr)
library(xlsx)
library(knitr)
library(kableExtra)

#save temporary files
write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron'))
write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron'))
#rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")

Search and replace unclosed quotation marks and escape them. Run in Linux terminal

# escape all double quotation marks. Run in Linux terminal
# sed 's/"/\\"/g' sPlot_3_0_2_species.csv > sPlot_3_0_2_species_test.csv

Import data Table

DT table is the species x plot matrix, in long format.

DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv", 
                            delim="\t", 
                         col_type = cols(
                                PlotObservationID = col_double(),
                                Taxonomy = col_character(),
                                `Taxon group` = col_character(),
                                `Taxon group ID` = col_double(),
                                `Turboveg2 concept` = col_character(),
                                `Matched concept` = col_character(),
                                Match = col_double(),
                                Layer = col_double(),
                                `Cover %` = col_double(),
                                `Cover code` = col_character(),
                                x_ = col_double()
                              )
                         ) 
nplots <- length(unique(DT0$PlotObservationID))
nspecies <- length(unique(DT0$`Matched concept`))

Species data include r nrow(DT0) species * plot records, across r nplots plots. Before taxonomic resolution, there are r nspecies species .
\newline

set.seed <- 1984
sampled <- sample(unique(DT0$PlotObservationID), 10, replace=F)

knitr::kable(DT0 %>%
               filter(PlotObservationID %in% sampled[1:3]), 
  caption="Example of initial DT table (3 randomly selected plots shown)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

Match species names from DT0 to those in Backbone

Import taxonomic backbone

load("../_output/Backbone3.0.RData")

Match to DT0, using Taxonomic concept as matching key. This is the field that was used to build, and resolve, the Backbone.

DT1 <- DT0 %>% 
  left_join(Backbone %>% 
              dplyr::select(Name_sPlot_TRY, Name_short, `Taxon group`, Rank_correct) %>%
              rename(`Matched concept`=Name_sPlot_TRY,
                     Taxongroup_BB=`Taxon group`), 
            by="Matched concept") %>% 
  # Simplify Rank_correct
  mutate(Rank_correct=fct_collapse(Rank_correct, 
                                   lower=c("subspecies", "variety", "infraspecies", "race", "forma"))) %>% 
  mutate(Rank_correct=fct_explicit_na(Rank_correct, "No_match")) %>% 
  mutate(Name_short=replace(Name_short, 
                            list=Name_short=="No suitable", 
                            values=NA))

Explore name matching based on Backbone v1.2

Select species entries that changed after taxonomic standardization, as a way to check the backbone.

name.check <- DT1 %>% 
  dplyr::select(`Turboveg2 concept`:`Matched concept`, Name_short) %>% 
  rename(Name_TNRS=Name_short) %>% 
  distinct() %>% 
  mutate(Matched_short=word(`Matched concept`, start = 1L, end=2L)) %>% 
  filter(is.na(Name_TNRS) | Matched_short != Name_TNRS) %>%
  dplyr::select(-Matched_short) %>% 
  arrange(Name_TNRS)
knitr::kable(name.check %>% sample_n(30), 
  caption="Check 30 random species names from DT that changed name after matching to backbone") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

Check the most common species names from DT after matching to backbone

name.check.freq <- DT1 %>% 
  dplyr::select(`Turboveg2 concept`:`Matched concept`, Name_short) %>% 
  rename(Name_TNRS=Name_short) %>% 
  group_by(`Turboveg2 concept`, `Matched concept`, Name_TNRS) %>% 
  summarize(n=n()) %>% 
  mutate(Matched_short=word(`Matched concept`, start = 1L, end=2L)) %>% 
  filter(is.na(Name_TNRS) | Matched_short != Name_TNRS) %>%
  dplyr::select(-Matched_short) %>% 
  ungroup() %>% 
  arrange(desc(n)) 
knitr::kable(name.check.freq %>% slice(1:40), 
  caption="Check 40 most common species names from DT that changed name after matching to backbone") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

Complete field taxon group

nknown <- DT1 %>% filter(`Taxon group`!="Unknown") %>% nrow()
nunknown <- DT1 %>% filter(`Taxon group`=="Unknown") %>% nrow()

Taxon group information is only available for r nknown entries, but absent for r nunknown. To improve the completeness of this field, we derive additional info from the Backbone, and merge it with the data already present in DT.

table(DT1$`Taxon group`, exclude=NULL)

DT1 <- DT1 %>% 
  mutate(`Taxon group`=ifelse(`Taxon group`=="Unknown", NA, `Taxon group`)) %>% 
  mutate(Taxongroup_BB=ifelse(Taxongroup_BB=="Unknown", NA, Taxongroup_BB)) %>% 
  mutate(`Taxon group`=coalesce(`Taxon group`, Taxongroup_BB)) %>% 
  dplyr::select(-Taxongroup_BB)


table(DT1$`Taxon group`, exclude=NULL)

Those taxa for which a measuress of Basal Area exists can be safely assumed to belong to vascular plants

DT1 <- DT1 %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=`Cover code`=="x_BA", 
                               values="Vascular plant"))

Cross-complement Taxon group information. This means that, whenever a taxon is marked to belong to one group, then assign the same taxon to that group throughout the DT table.

DT1 <- DT1 %>% 
  left_join(DT1 %>% 
              filter(!is.na(Name_short)) %>% 
              filter(`Taxon group` != "Unknown") %>% 
              dplyr::select(Name_short, `Taxon group`) %>% 
              distinct(Name_short, .keep_all=T) %>% 
              rename(TaxonGroup_compl=`Taxon group`),
            by="Name_short") %>% 
  mutate(`Taxon group`=coalesce(`Taxon group`, TaxonGroup_compl)) %>% 
  dplyr::select(-TaxonGroup_compl)

table(DT1$`Taxon group`, exclude=NULL)

Check species with conflicting Taxon group information and fix manually.

#check for conflicts in attribution of genera to Taxon groups
DT1 %>% 
  filter(!is.na(Name_short)) %>% 
  filter(!is.na(`Taxon group`)) %>% 
  distinct(Name_short, `Taxon group`) %>% 
  mutate(Genus=word(Name_short,1)) %>% 
  dplyr::select(Genus, `Taxon group`) %>% 
  distinct() %>% 
  group_by(Genus) %>% 
  summarize(n=n()) %>% 
  filter(n>1) %>% 
  arrange(desc(n))

Manually fix some known problems in Taxon group attribution. Some lists of taxa (e.g., lichen.genera, mushroom.genera) were defined when building the Backbone.

#Attach genus info
DT1 <- DT1 %>% 
    left_join(Backbone %>% 
              dplyr::select(Name_sPlot_TRY, Name_short) %>%
              mutate(Genus=word(Name_short, 1, 1)) %>%
              dplyr::select(-Name_short) %>% 
              rename(`Matched concept`=Name_sPlot_TRY),
            by="Matched concept") %>% 
    mutate(`Taxon group`=fct_collapse(`Taxon group`, 
                                    Alga_Stonewort=c("Alga", "Stonewort")))
#manually fix some known problems
mosses.gen    <- c("Hypnum", "Brachytheciastrum","Brachythecium","Hypnum",  
                  "Zygodon", "Oxymitra", "Bryophyta", "Musci", '\\\"Moos\\\"')
vascular.gen  <- c("Polystichum", "Hypericum", "Peltaria", "Pancovia", "Calythrix", "Ripogonum",
                  "Notogrammitis", "Fuscospora", "Lophozonia",  "Rostellularia", 
                  "Hesperostipa", "Microsorium", "Angiosperm","Dicotyledonae", "Spermatophy")
alga.gen      <- c("Chara", "Characeae", "Tonina", "Nostoc", "Entermorpha", "Hydrocoleum" )
 
DT1 <- DT1 %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% mosses.gen, 
                               values="Moss")) %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% vascular.gen, 
                               values="Vascular plant")) %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% alga.gen, 
                               values="Alga_Stonewort")) %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% c(lichen.genera, "Lichenes"),
                               values="Lichen")) %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% mushroom, 
                               values="Mushroom"))
  
table(DT1$`Taxon group`, exclude=NULL)

Delete all records of fungi, and use lists of genera to fix additional problems. While in the previous round the matching was done on the resolved Genus name, here the match is based on unresolved Genus names.

DT1 <- DT1 %>% 
  dplyr::select(-Genus) %>% 
  left_join(DT1 %>% 
              distinct(`Matched concept`) %>% 
              mutate(Genus=word(`Matched concept`, 1)), 
            by="Matched concept") %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                                 list=Genus %in% mushroom, 
                                 values = "Mushroom")) %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% lichen.genera, 
                               values="Lichen")) %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% mosses.gen, 
                               values="Moss")) %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
                               list=Genus %in% vascular.gen, 
                               values="Vascular plant")) %>% 
  mutate(`Taxon group` = fct_explicit_na(`Taxon group`, "Unknown")) %>% 
  filter(`Taxon group`!="Mushroom") %>%
  mutate(`Taxon group`=factor(`Taxon group`))
  #dplyr::select(-Genus)

table(DT1$`Taxon group`, exclude=NULL)
nunknown <- DT1 %>% filter(`Taxon group`=="Unknown") %>% nrow()

After cross-checking all sources of information, the number of taxa not having Taxon group information decreased to r nunknown entries

#Check the most frequent genera for which we don't have taxon group info
DT1 %>% 
  filter(`Taxon group` == "Unknown") %>% 
  group_by(Genus) %>% 
  summarize(n=n()) %>% 
  arrange(desc(n)) %>% 
    slice(1:40)

Calculate relative cover per layer per species in each plot

Species abundance information varies across datasets and plots. While for the large majority of plots abundance values are returned as percentage cover, there is a subset where abundance is returned with different scales. These are marked in the column Cover code as follows: \newline \newline x_BA - Basal Area
x_IC - Individual count
x_SC - Stem count
x_IV - Relative Importance
x_RF - Relative Frequency
x - Presence absence
\newline \newline Still, it's not really intuitive that in case Cover code belongs to one of the classes above, then the actual abundance value is stored in the x_ column. This stems from the way this data is stored in TURBOVEG.
To make the cover data more user friendly, I simplify the way cover is stored, so that there are only two columns:
Ab_scale - to report the type of scale used
Abundance - to coalesce the cover\abundance values previously in the columns Cover % and x_.

# Create Ab_scale field
DT1 <- DT1 %>% 
  mutate(Ab_scale = ifelse(`Cover code` %in% 
                             c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF") & !is.na(x_), 
                           `Cover code`, 
                           "CoverPerc"))  

Fix some errors. There are some plots where all species have zeros in the field Cover %. Some of them are marked as p\a (Cover code=="x"), but other not. Consider all this plots as presence\absence and transform Cover % to 1.
!! There are some other plots having layers with all zeros. This should be double-checked, but are not being transformed here !!

allzeroes <- DT1 %>% 
  group_by(PlotObservationID) %>% 
  summarize(allzero=all(`Cover %`==0) ) %>% 
  filter(allzero==T) %>% 
  pull(PlotObservationID)
DT1 <- DT1 %>% 
  mutate(`Cover %`=replace(`Cover %`, 
                           list=(PlotObservationID %in% allzeroes), 
                           values=1)) %>% 
  mutate(`Cover code`=replace(`Cover code`, 
                           list=(PlotObservationID %in% allzeroes), 
                           values="x"))

Consider all plot-layer combinations where Cover code=="x", and all the entries of the field Cover % == 1 as presence\absence data, and transform Ab_scale to "pa". This is done to avoid confusion with plots where Cover code=="x" but "x" has to be intended as a class in the cover scale used. For p\a plots, replace the field Cover % with NA, and assign the value 1 to the field x_.

#plots with at least one entry in Cover code=="x"
sel <- DT1 %>% 
  filter(`Cover code`=="x") %>% 
  distinct(PlotObservationID) %>% 
  pull(PlotObservationID)

DT1 <- DT1 %>% 
  left_join(DT1 %>%
              filter(PlotObservationID %in% sel) %>% 
              group_by(PlotObservationID, Layer) %>% 
              mutate(to.pa= all(`Cover %`==1 & `Cover code`=="x")) %>% 
              distinct(PlotObservationID, Layer, to.pa), 
            by=c("PlotObservationID", "Layer")) %>% 
  replace_na(list(to.pa=F)) %>% 
  mutate(Ab_scale=ifelse(to.pa==T, "pa", Ab_scale)) %>% 
  mutate(`Cover %`=ifelse(to.pa==T, NA, `Cover %`)) %>% 
  mutate(x_=ifelse(to.pa==T, 1, x_)) %>% 
  dplyr::select(-to.pa)

There are also some plots having different cover scales in the same layer. They are not many, and I will reduce their cover value to p\a.
Find these plots first:

mixed <- DT1 %>% 
  distinct(PlotObservationID, Ab_scale, Layer) %>% 
  group_by(PlotObservationID, Layer) %>% 
  summarize(n=n()) %>% 
  filter(n>1) %>% 
  pull(PlotObservationID) %>% 
  unique()
length(mixed)

Transform these plots to p\a and correct field Ab_scale. Note: the column Abundance is only created here.

DT1 <- DT1 %>% 
  mutate(Ab_scale=replace(Ab_scale, 
                           list=PlotObservationID %in% mixed, 
                           values="mixed")) %>%
  mutate(`Cover %`=replace(`Cover %`, 
                           list=Ab_scale=="mixed",
                           values=NA)) %>% 
  mutate(x_=replace(x_,  list=Ab_scale=="mixed", values=1)) %>% 
  mutate(Ab_scale=replace(Ab_scale, list=Ab_scale=="mixed", values="pa")) %>% 
  #Create additional field Abundance to avoid overwriting original data
  mutate(Abundance =ifelse(Ab_scale %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF", "pa"), 
                          x_, `Cover %`)) %>% 
  mutate(Abundance=replace(Abundance, 
                           list=PlotObservationID %in% mixed, 
                           values=1))

Double check and summarize Ab_scales

scale_check <- DT1 %>% 
  distinct(PlotObservationID, Layer, Ab_scale) %>% 
  group_by(PlotObservationID) %>% 
  summarise(Ab_scale_combined=ifelse(length(unique(Ab_scale))==1, 
                                     unique(Ab_scale), 
                                     "Multiple_scales"))

nrow(scale_check)== length(unique(DT1$PlotObservationID))
table(scale_check$Ab_scale_combined)

Transform abundances to relative abundance. For consistency with the previous version of sPlot, this field is called Relative cover.
Watch out - Even plots with p\a information are transformed to relative cover.

DT1 <- DT1 %>% 
  left_join(x=., 
            y={.} %>%
              group_by(PlotObservationID) %>% 
              summarize(tot.abundance=sum(Abundance)), 
            by=c("PlotObservationID")) %>% 
  mutate(Relative.cover=Abundance/tot.abundance)

# check: there should be no plot where the sum of all relative covers !=0
DT1 %>% 
  group_by(PlotObservationID) %>% 
  summarize(tot.cover=sum(Relative.cover), 
            num.layers=sum(unique(Layer))) %>% 
  filter(tot.cover != num.layers) %>% 
  nrow()

Clean DT and export

DT2 <- DT1 %>% 
  dplyr::select(PlotObservationID, Name_short, `Turboveg2 concept`, Rank_correct, `Taxon group`, Layer:x_, Ab_scale, Abundance, Relative.cover ) %>% 
  rename(species_original=`Turboveg2 concept`, 
         species=Name_short,
         taxon_group=`Taxon group`, 
         cover_perc=`Cover %`, 
         cover_code=`Cover code`)

The output of the DT table contains r nrow(DT2) records, over r length(unique(DT2$PlotObservationID)) plots. The total number of taxa is r length(unique(DT2$species_original)) and r length(unique(DT2$species)), before and after standardization, respectively. Information on the Taxon group is available for r DT2 %>% filter(taxon_group!="Unknown") %>% distinct(species) %>% nrow() standardized species.

knitr::kable(DT2 %>%
               filter(PlotObservationID %in% sampled[1:3]), 
  caption="Example of initial DT table (same 3 randomly selected plots shown above)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")
save(DT2, file = "../_output/DT_sPlot3.0.RData")

SessionInfo

sessionInfo()