Skip to content
Snippets Groups Projects
Select Git revision
  • 4a51db3cbdca48c47016f16024eaa6cb10363f2e
  • master default protected
2 results

06_buildDT.Rmd

Blame
  • Code owners
    Assign users and groups as approvers for specific file changes. Learn more.
    title: "sPlot 3.0 - Build DT"
    author: "Francesco Maria Sabatini"
    date: "2/24/2020"
    output: html_document
    ![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")

    MEMO CHECK Field cover code! It seems to have species characters

    Timestamp: r date()
    Drafted: Francesco Maria Sabatini
    Revised:
    version: 1.0

    This report documents the construction of the DT table for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens.

    # Show the source code of every chunk in the rendered report
    knitr::opts_chunk$set(echo = TRUE)
    library(tidyverse)
    library(readr)
    library(xlsx)
    library(knitr)
    library(kableExtra)

    # Save temporary files: write an .Renviron file into the directories currently
    # named by the TMPDIR and R_USER environment variables, each redirecting
    # that variable to the project scratch folder.
    renviron.tmpdir <- file.path(Sys.getenv("TMPDIR"), ".Renviron")
    renviron.ruser <- file.path(Sys.getenv("R_USER"), ".Renviron")
    write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file = renviron.tmpdir)
    write("R_USER = /data/sPlot/users/Francesco/_tmp", file = renviron.ruser)
    #rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")

    Search and replace unclosed quotation marks and escape them. Run in Linux terminal

    # escape all double quotation marks. Run in Linux terminal
    # sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv

    Import data Table

    DT table is the species x plot matrix, in long format.

    # Import the species x plot records (long format, tab-delimited).
    # Column types are declared explicitly for the listed columns.
    DT0 <- readr::read_delim(
      "../sPlot_data_export/sPlot_3_0_2_species_test.csv",
      delim = "\t",
      # argument name spelled in full (`col_types`) rather than relying on
      # partial argument matching of `col_type`
      col_types = cols(
        PlotObservationID = col_double(),
        Taxonomy = col_character(),
        `Taxon group` = col_character(),
        `Taxon group ID` = col_double(),
        `Turboveg2 concept` = col_character(),
        `Matched concept` = col_character(),
        Match = col_double(),
        Layer = col_double(),
        `Cover %` = col_double(),
        `Cover code` = col_character(),
        x_ = col_double()
      )
    )
    # Number of distinct plots and of distinct matched-concept names
    # (NA, if present, counts as one value, as with length(unique()))
    nplots <- n_distinct(DT0$PlotObservationID)
    nspecies <- n_distinct(DT0$`Matched concept`)

    Species data include r nrow(DT0) species * plot records, across r nplots plots and including r nspecies non-resolved species.
    \newline

    # Seed the random number generator so the same plots are drawn on every knit.
    # (`set.seed <- 1984` only created a variable called `set.seed` and left the
    # RNG unseeded.)
    set.seed(1984)
    # Draw 10 plot IDs without replacement; only the first three are displayed
    sampled <- sample(unique(DT0$PlotObservationID), 10, replace = FALSE)

    DT0 %>%
      filter(PlotObservationID %in% sampled[1:3]) %>%
      knitr::kable(
        caption = "Example of initial DT table (3 randomly selected plots shown)"
      ) %>%
      kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = FALSE, position = "center"
      )

    Import taxonomic backbone

    # Load the taxonomic backbone workspace; later chunks use an object named `Backbone`.
    # NOTE(review): `lichen.genera` and `mushroom` are used further down but are not
    # defined anywhere in this file - presumably they are stored in this .RData; confirm.
    load("../_output/Backbone3.0.RData")

    Match species names from DT0 to those in Backbone

    # Backbone columns needed for matching. The key is renamed to line up with
    # DT0's `Matched concept`, and the backbone's taxon group gets its own name
    # so it does not clash with DT0's `Taxon group`.
    backbone.lookup <- Backbone %>%
      dplyr::select(`Matched concept` = Name_sPlot_TRY,
                    Name_short,
                    Taxongroup_BB = `Taxon group`)

    # Attach the backbone info to the species records (left join on DT0)
    DT1 <- left_join(DT0, backbone.lookup, by = "Matched concept")

    Explore name matching based on Backbone v1.2

    Select species entries that changed after taxonomic standardization, as a way to check the backbone.

    # Distinct name combinations for which the backbone name (Name_TNRS) is
    # missing, or differs from the first two words of the matched concept
    name.check <- DT1 %>%
      dplyr::select(`Turboveg2 concept`:`Matched concept`,
                    Name_TNRS = Name_short) %>%
      distinct() %>%
      filter(is.na(Name_TNRS) |
               word(`Matched concept`, start = 1L, end = 2L) != Name_TNRS) %>%
      arrange(Name_TNRS)

    # Show a random sample of 30 of these entries
    name.check %>%
      sample_n(30) %>%
      knitr::kable(
        caption = "Check 30 random species names from DT after matching to backbone"
      ) %>%
      kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = F, position = "center"
      )

    Check the most common species names from DT after matching to backbone

    # Number of records per (original name, matched name, backbone name)
    # combination, restricted to combinations where the backbone name is missing
    # or differs from the first two words of the matched concept; most frequent first
    name.check.freq <- DT1 %>%
      count(`Turboveg2 concept`, `Matched concept`, Name_TNRS = Name_short) %>%
      filter(is.na(Name_TNRS) |
               word(`Matched concept`, start = 1L, end = 2L) != Name_TNRS) %>%
      arrange(desc(n))

    # Show the 40 most frequent combinations
    name.check.freq %>%
      head(n = 40) %>%
      knitr::kable(
        caption = "Check 40 most common species names from DT after matching to backbone"
      ) %>%
      kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = F, position = "center"
      )

    Complete field taxon group

    Coalesce Taxon group info from Backbone

    # Taxon group counts before completion (exclude = NULL keeps NA in the table)
    table(DT1$`Taxon group`, exclude = NULL)

    # Treat "Unknown" as missing in both sources, then fill the gaps in DT's own
    # `Taxon group` with the backbone's value. na_if() preserves the column type,
    # whereas ifelse(x == "Unknown", NA, x) returns a logical vector when no
    # element takes the `no` branch.
    DT1 <- DT1 %>%
      mutate(`Taxon group` = na_if(`Taxon group`, "Unknown"),
             Taxongroup_BB = na_if(Taxongroup_BB, "Unknown"),
             `Taxon group` = coalesce(`Taxon group`, Taxongroup_BB)) %>%
      dplyr::select(-Taxongroup_BB)


    # Taxon group counts after completion
    table(DT1$`Taxon group`, exclude = NULL)

    Cross-complement

    # For each resolved name, take the taxon group of the first record that has
    # one. The `!= "Unknown"` condition also drops records whose group is NA.
    taxongroup.by.name <- DT1 %>%
      filter(!is.na(Name_short), `Taxon group` != "Unknown") %>%
      distinct(Name_short, .keep_all = TRUE) %>%
      dplyr::select(Name_short, TaxonGroup_compl = `Taxon group`)

    # Fill still-missing taxon groups from other records of the same resolved name
    DT1 <- DT1 %>%
      left_join(taxongroup.by.name, by = "Name_short") %>%
      mutate(`Taxon group` = coalesce(`Taxon group`, TaxonGroup_compl)) %>%
      dplyr::select(-TaxonGroup_compl)

    table(DT1$`Taxon group`, exclude = NULL)

    Check species with conflicting Taxon group information and fix manually.

    # Attach genus info: Genus is the first word of the backbone's resolved name,
    # keyed by the sPlot/TRY name so it can be joined on `Matched concept`
    genus.lookup <- Backbone %>%
      mutate(Genus = word(Name_short, 1, 1)) %>%
      dplyr::select(`Matched concept` = Name_sPlot_TRY, Genus)

    DT1 <- DT1 %>%
      left_join(genus.lookup, by = "Matched concept") %>%
      # merge "Alga" and "Stonewort" into one level; fct_collapse() returns a
      # factor, so `Taxon group` is a factor from here on
      mutate(`Taxon group` = fct_collapse(`Taxon group`,
                                          Alga_Stonewort = c("Alga", "Stonewort")))

    # Manually fix some known problems: genera (or higher-level labels) whose
    # taxon group is assigned by hand
    mosses.gen <- c(
      "Hypnum", "Brachytheciastrum",
      "Brachythecium", "Hypnum", "Zygodon", "Oxymitra", "Bryophyta", "Musci",
      '\\\"Moos\\\"'
    )
    vascular.gen <- c(
      "Polystichum", "Hypericum", "Peltaria", "Pancovia", "Calythrix",
      "Ripogonum", "Notogrammitis", "Fuscospora", "Lophozonia", "Rostellularia",
      "Hesperostipa", "Microsorium",
      "Angiosperm", "Dicotyledonae", "Spermatophy"
    )
    # NOTE(review): "Entermorpha" may be a misspelling of "Enteromorpha" - check
    # against the names actually present in the data before changing it
    alga.gen <- c("Chara", "Characeae", "Tonina", "Nostoc", "Entermorpha",
                  "Hydrocoleum")

    # Overwrite the taxon group for the listed genera. The assignments run in
    # order, so a genus present in several lists ends up with the last match.
    # NOTE(review): `Taxon group` is a factor here; assigning a value that is not
    # an existing level would yield NA - confirm all five labels are levels.
    DT1 <- DT1 %>%
      mutate(
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% mosses.gen,
                                values = "Moss"),
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% vascular.gen,
                                values = "Vascular plant"),
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% alga.gen,
                                values = "Alga_Stonewort"),
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% c(lichen.genera, "Lichenes"),
                                values = "Lichen"),
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% mushroom,
                                values = "Mushroom")
      )

    table(DT1$`Taxon group`, exclude = NULL)

    # Check for conflicts in attribution of genera to taxon groups: genera that
    # occur with more than one (non-missing) taxon group, most groups first
    conflict <- DT1 %>%
      filter(!is.na(Name_short), !is.na(`Taxon group`)) %>%
      distinct(Genus, `Taxon group`) %>%
      count(Genus) %>%
      filter(n > 1) %>%
      arrange(desc(n)) %>%
      pull(Genus)

    Delete all records of fungi

    # Genus recomputed as the first word of the matched concept, once per
    # distinct concept; it replaces the backbone-derived Genus below
    genus.from.concept <- DT1 %>%
      distinct(`Matched concept`) %>%
      mutate(Genus = word(`Matched concept`, 1))

    DT1 <- DT1 %>%
      dplyr::select(-Genus) %>%
      left_join(genus.from.concept, by = "Matched concept") %>%
      # re-apply the manual assignments with the new Genus; they run in order,
      # so later assignments override earlier ones
      mutate(
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% mushroom,
                                values = "Mushroom"),
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% lichen.genera,
                                values = "Lichen"),
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% mosses.gen,
                                values = "Moss"),
        `Taxon group` = replace(`Taxon group`,
                                list = Genus %in% vascular.gen,
                                values = "Vascular plant"),
        # make missing groups an explicit "Unknown" level, so the filter below
        # does not drop them along with the mushrooms
        `Taxon group` = fct_explicit_na(`Taxon group`, "Unknown")
      ) %>%
      filter(`Taxon group` != "Mushroom")
    # Genus is left in DT1 (the select(-Genus) step is disabled); the summary of
    # unknown taxa that follows groups by it

    table(DT1$`Taxon group`, exclude = NULL)

    Check the most frequent species for which we don't have taxon group info

    # The 40 genera with the most records still lacking a taxon group
    DT1 %>%
      filter(`Taxon group` == "Unknown") %>%
      count(Genus, sort = TRUE) %>%
      head(n = 40)

    Calculate relative cover per layer per species in each plot

    # Cover value used for the calculation: the `x_` column for the listed cover
    # codes, the percentage cover otherwise
    DT1 <- DT1 %>%
      mutate(tmp.cover = ifelse(
        `Cover code` %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF"),
        x_, `Cover %`
      )) %>%
      # total cover of each layer within each plot, repeated on every record of
      # that plot x layer (NA if any record in the group has no cover value)
      group_by(PlotObservationID, Layer) %>%
      mutate(tot.cover = sum(tmp.cover)) %>%
      ungroup() %>%
      # share of the layer's total cover contributed by each record
      mutate(Relative.cover = tmp.cover / tot.cover)
                  
                

    Clean DT and export

    # Keep only the fields to export and give them their final names:
    # species (resolved name), species_original (name as in Turboveg2),
    # taxon_group, cover_perc and cover_code
    DT2 <- DT1 %>%
      dplyr::select(PlotObservationID,
                    species = Name_short,
                    species_original = `Turboveg2 concept`,
                    taxon_group = `Taxon group`,
                    Layer:x_,
                    Relative.cover) %>%
      rename(cover_perc = `Cover %`,
             cover_code = `Cover code`)

    The output of the DT table contains r nrow(DT2) records, over r length(unique(DT2$PlotObservationID)) plots. The total number of taxa is r length(unique(DT2$species_original)) and r length(unique(DT2$species)), before and after standardization, respectively. Information on the Taxon group is available for r DT2 %>% filter(taxon_group!="Unknown") %>% distinct(species) %>% nrow() standardized species.

    # Export the cleaned DT table (object `DT2`) as an .RData file in the output folder
    save(DT2, file = "../_output/DT_sPlot3.0.RData")

    !!! ADD Explanation of fields!!!