Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
title: "sPlot3.0 - Build CWMs"
author: "Francesco Maria Sabatini"
date: "3/4/2020"
output: html_document
![](/data/sPlot/users/Francesco/_sPlot_Management/splot-long-rgb.png "sPlot Logo")

Timestamp: r date()
Drafted: Francesco Maria Sabatini
Revised:
version: 1.0

This reports documents 1) the construction of Community Weighted Means (CWMs) and Variance (CWVs); and 2) the classification of plots into forest\non-forest based on species growth forms. It complements species composition data from sPlot 3.0 and gap-filled plant functional traits from TRY 5.0, as received by Jens Kattge on Jan 21, 2020.

library(tidyverse)
library(readr)
library(data.table)
library(knitr)
library(kableExtra)
library(stringr)
library(caret)
library(viridis)

1 Data import, preparation and cleaning

load("../_output/DT_sPlot3.0.RData")
load("../_output/Backbone3.0.RData")

Import TRY data

# Species, Genus, Family
try.species <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/hierarchy.info.csv",
  locale = locale(encoding = "latin1")) 
# Original data without gap-filling. With species and trait labels
try.allinfo <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/traits_x_georef_wide_table.csv", 
  locale = locale(encoding = "latin1"), 
                        col_types = paste0(c("dddccccc",rep("c", 84)), collapse=""))
# Individual-level gap-filled data - order as in try.allinfo
try.individuals0 <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/gapfilled_data/mean_gap_filled_back_transformed.csv", 
  locale = locale(encoding = "latin1"))

There are r nrow(try.species) individual observations from r nrow(try.species %>% distinct(Species)) distinct (unresolved) species in r nrow(try.species %>% distinct(Genus)) distinct (unresolved) genera.

\newline \newline

# #Ancillary function to change species names to lower case
# firstup <- function(x) {
#   substr(x, 1, 1) <- toupper(substr(x, 1, 1))
#   x
# }
# 
# try.species <- try.species %>% 
#   mutate(Species=tolower(Species)) %>%
#   mutate(Species=firstup(Species)) 
# 
# try.allinfo <- try.allinfo %>% 
#   mutate(Species=tolower(Species)) %>%
#   mutate(Species=firstup(Species)) 

1.2 Attach resolved names from Backbone

try.species.names <- try.allinfo %>% 
  dplyr::select(Species, Genus, GrowthForm) %>% 
  left_join(Backbone %>% 
              dplyr::select(Name_sPlot_TRY, Name_short) %>% 
              rename(Species=Name_sPlot_TRY), 
            by="Species") %>% 
  dplyr::select(Species, Name_short, Genus, GrowthForm)

After attaching resolved names, TRY data contains information on r try.species.names %>% distinct(Name_short) %>% nrow() species.
\newline \newline Check for how many of the species in sPlot, trait information is available in TRY.

sPlot.species <- DT2 %>% 
  distinct(species) 

sPlot.in.TRY <- sPlot.species %>% 
  filter(species %in% (try.species.names %>% 
                                  distinct(Name_short) %>% 
                                  pull(Name_short))) 

Out of the r nrow(sPlot.species) standardizes species names in sPlot 3.0, r nrow(sPlot.in.TRY) (r round(nrow(sPlot.in.TRY)/nrow(sPlot.species)*100,1)%) also occur in TRY 5.0. This number does not account for matches at the genus level.

1.3 Create legend of trait names

trait.legend <- data.frame(full=try.allinfo %>% 
                             dplyr::select(starts_with("StdValue_")) %>% 
                             colnames() %>% 
                             gsub("StdValue_", "", .) %>% 
                             sort()) %>%
  mutate(full=as.character(full)) %>% 
  mutate(traitcode=parse_number(full)) %>% 
  arrange(traitcode) %>% 
  dplyr::select(traitcode, everything()) %>% 
  mutate(full=gsub(pattern = "^[0-9]+_", replacement="", full)) %>% 
  mutate(short=c("StemDens", "RootingDepth","LeafC.perdrymass", "LeafN","LeafP",
                 "StemDiam","SeedMass", "Seed.length","LeafThickness","LDMC",
                 "LeafNperArea","LeafDryMass.single","Leaf.delta.15N","SeedGerminationRate",
                 "Seed.num.rep.unit","LeafLength","LeafWidth","LeafCN.ratio","Leaffreshmass",
                 "Stem.cond.dens","Chromosome.n","Chromosome.cDNAcont", 
                 "Disp.unit.leng","StemConduitDiameter","Wood.vessel.length",
                 "WoodFiberLength","SpecificRootLength.fine","SpecificRootLength",
                 "PlantHeight.veg","PlantHeight.generative","LeafArea.leaf.noPet",
                 "LeafArea.leaflet.noPet","LeafArea.leaf.wPet","LeafArea.leaflet.wPet",
                 "LeafArea.leaf.undef","LeafArea.leaflet.undef","LeafArea.undef.undef",
                 "SLA.noPet", "SLA.wPet","SLA.undef", "LeafWaterCont")) %>% 
  ## Add SLA missing from allinfo file
  bind_rows(data.frame(traitcode=11, 
                       full="Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA)",
                       short="SLA")) %>% 
  bind_rows(data.frame(traitcode=18, 
                       full="Plant height (vegetative + generative)", 
                       short="PlantHeight")) %>%
  arrange(traitcode) %>% 
  #create a column to mark traits for which gap filled data is available.
  mutate(available=paste0("X", traitcode) %in% colnames(try.individuals0))
knitr::kable(trait.legend, 
  caption="Legend of traits from TRY") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

Use trait legend to change naming of try.individuals0 data.frame of traits

#create string to rename traits
col.to <- trait.legend %>% 
  filter(available==T) %>% 
  pull(short) 
col.from <- trait.legend %>% 
  filter(available==T) %>% 
  mutate(traitcode=paste0("X", traitcode))  %>% 
  pull(traitcode) 

try.individuals <- try.individuals0 %>% 
              rename_at(col.from, .funs=function(x) col.to)

1.3 Fix some known errors in the gap-filled matrix

Check traits at the individual level. There are some traits with unexpected negative entries:

try.species.names %>% 
    dplyr::select(Name_short) %>% 
    bind_cols(try.individuals %>% 
                  dplyr::select(-X1)) %>% 
  gather(variable, value, -Name_short) %>% 
  filter(value<0) %>% 
  group_by(variable) %>% 
  summarize(n=n())

According to Jens Kattge, the entries for Leaf.delta.15N are legitimate, while in the other cases, it may be due to bad predictions. He suggested to delete these negative records.
Similarly, there are records with impossible values for height. Some species incorrectly predicted to have height >100 meters, and some herbs predicted to have a height >10 m.

try.individuals <- try.species.names %>% 
  dplyr::select(Name_short) %>% 
  bind_cols(try.individuals)

toexclude <- try.individuals %>% 
  gather(variable, value, -X1, -Name_short) %>% 
  filter(variable != "Leaf.delta.15N") %>% 
  filter(value<0) %>% 
  pull(X1)

toexclude2 <- try.individuals %>% 
  filter(PlantHeight>100  & (!Name_short %in% c("Pseudotsuga menziesii", "Sequoia sempervirens"))) %>% 
  pull(X1)

toexclude3 <- try.individuals %>% 
  filter(X1 %in% (try.allinfo %>% 
                     filter(GrowthForm=="herb") %>% 
                     pull(X1))) %>% 
  filter(PlantHeight>10) %>% 
  pull(X1)

try.individuals <- try.individuals %>% 
  filter(!X1 %in% c(toexclude, toexclude2, toexclude3)) %>% 
  dplyr::select(-X1)

This results in the exclusion of r length(unique(c(toexclude, toexclude2, toexclude3))) individuals. In this way the total number of species included in TRY reduces to r try.individuals %>% distinct(Name_short) %>% nrow()

1.4 Calculate species and genus level trait means and sd

## Calculate species level trait means and sd. 
try.species.means <- try.individuals %>% 
  group_by(Name_short) %>% 
  #Add a field to indicate the number of observation per taxon
  left_join(x={.} %>% 
              summarize(n=n()), 
            y={.} %>% 
              summarize_at(.vars=vars(StemDens:LeafWaterCont ),
                           .funs=list(mean=~mean(.), sd=~sd(.))),
            by="Name_short") %>% 
  dplyr::select(Name_short, n, everything())

## Calculate genus level trait means and sd.
try.genus.means <- try.individuals %>% 
  mutate(Genus=word(Name_short, 1)) %>% 
  group_by(Genus) %>% 
  left_join(x={.} %>% 
              summarize(n=n()), 
            y={.} %>% 
              summarize_at(.vars=vars(StemDens:LeafWaterCont ),
                           .funs=list(mean=~mean(.), sd=~sd(.))),
            by="Genus") %>% 
  dplyr::select(Genus, n, everything())

The average number of observations per species and genus is r round(mean(try.species.means$n),1) and r round(mean(try.genus.means$n),1), respectively. As many as r sum(try.species.means$n==1) species have only one observation (r sum(try.genus.means$n==1) at the genus level).

knitr::kable(try.species.means %>% 
               sample_n(15), 
  caption="Example of trait means for 15 randomly selected species", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

1.5 Match taxa based on species, if available, or Genus

Combined the trait means based on species and genera into a single object, and check how many of these taxa match to the (resolved) species names in DT2.

try.combined.means <- try.genus.means %>% 
  rename(Taxon_name=Genus) %>% 
  mutate(Rank_correct="genus") %>% 
  bind_rows(try.species.means %>% 
              rename(Taxon_name=Name_short) %>% 
              mutate(Rank_correct="species")) %>% 
  dplyr::select(Taxon_name, Rank_correct, everything())

total.matches <- DT2 %>%
  distinct(species, Rank_correct) %>%
  left_join(try.combined.means %>%
              dplyr::rename(species=Taxon_name), 
            by=c("species", "Rank_correct")) %>% 
  filter(!is.na(SLA_mean)) %>% 
  nrow()

The total number of matched taxa (either at species, or genus level) is r total.matches.

1.6 Calculate summary statistics for species- and genus-level mean traits

mysummary <- try.combined.means %>% 
               group_by(Rank_correct) %>% 
               summarize_at(.vars=vars(StemDens_mean:LeafWaterCont_sd),
                            .funs=list(min=~min(., na.rm=T),
                                       q025=~quantile(., 0.25, na.rm=T), 
                                       q50=~quantile(., 0.50, na.rm=T), 
                                       q75=~quantile(., .75, na.rm=T), 
                                       max=~max(., na.rm=T), 
                                       mean=~mean(., na.rm=T), 
                                       sd=~sd(., na.rm=T))) %>% 
  gather(variable, value, -Rank_correct) %>% 
  separate(variable, sep="_", into=c("variable", "mean.sd", "stat")) %>% 
  mutate(stat=factor(stat, levels=c("min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
  spread(key=stat, value=value) %>% 
  arrange(desc(Rank_correct))
  
knitr::kable(mysummary, 
  caption="Summary statistics for each trait, when summarized across species or genera", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

2 Calculate CWMs and CWVs for each plot

Merge vegetation layers, where necessary. Combine cover values across layers

#Ancillary function
# Combine cover accounting for layers
combine.cover <- function(x){
    while (length(x)>1){
      x[2] <- x[1]+(100-x[1])*x[2]/100
      x <- x[-1]
    }
  return(x)
}

DT2.comb <- DT2 %>% 
  group_by(PlotObservationID, species, Rank_correct) %>% 
  summarize(Relative.cover=combine.cover(Relative.cover)) %>%
  ungroup() %>% 
  # re-normalize to 100%
  left_join(x=., 
            y={.} %>% 
              group_by(PlotObservationID) %>% 
              summarize(Tot.cover=sum(Relative.cover)), 
            by="PlotObservationID") %>% 
  mutate(Relative.cover=Relative.cover/Tot.cover) %>% 
  dplyr::select(-Tot.cover)

Calculate CWMs and CWV, as well as plot coverage statistics (proportion of total cover for which trait info exist, and proportion of species for which we have trait info). To avoid misleading results, CWM is calculated ONLY for plots for which we have some abundance information. All plots where Ab_scale=="pa" in ANY of the layers are therefore excluded.

# Tag plots where at least one layer has only p\a information 
any_pa <- DT2 %>% 
  distinct(PlotObservationID, Ab_scale) %>% 
  group_by(PlotObservationID) %>% 
  summarize(any.pa=any(Ab_scale=="pa")) %>% 
  filter(any.pa==T) %>% 
  pull(PlotObservationID)
length(any_pa)

# Exclude plots above and merge species data table with traits
CWM0 <- DT2.comb %>%
  filter(!PlotObservationID %in% any_pa) %>% 
  left_join(try.combined.means %>%
              dplyr::rename(species=Taxon_name) %>% 
              dplyr::select(species, Rank_correct, ends_with("_mean")), 
            by=c("species", "Rank_correct"))

# Calculate CWM for each trait in each plot
CWM1 <- CWM0 %>%
  group_by(PlotObservationID) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~weighted.mean(., Relative.cover, na.rm=T))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=CWM, -PlotObservationID)

# Calculate coverage for each trait in each plot
CWM2 <- CWM0 %>%
  mutate_at(.funs = list(~if_else(is.na(.),0,1) * Relative.cover), 
            .vars = vars(StemDens_mean:LeafWaterCont_mean)) %>%
  group_by(PlotObservationID) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~sum(., na.rm=T))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=trait.coverage, -PlotObservationID)
  
# Calculate CWV
# Ancillary function
variance2.fun <- function(trait, abu){
  res <- as.double(NA)
  abu <- abu[!is.na(trait)]
  trait <- trait[!is.na(trait)]
  abu <- abu/sum(abu)
  if (length(trait)>1){
    # you need more than 1 observation to calculate variance
    # for calculation see 
    # http://r.789695.n4.nabble.com/Weighted-skewness-and-curtosis-td4709956.html
    m.trait <- weighted.mean(trait,abu)
    res <- sum(abu*(trait-m.trait)^2)
  }
  res
}

CWM3 <- CWM0 %>%
  group_by(PlotObservationID) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~variance2.fun(., Relative.cover))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=CWV, -PlotObservationID)

## Calculate proportion of species having traits
CWM4 <- CWM0 %>%
  group_by(PlotObservationID) %>%
  #distinct(PlotObservationID, species, .keep_all = T) %>% 
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~sum(!is.na(.)))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=n.sp.with.trait, -PlotObservationID)

# Join together
CWM <- CWM1 %>%
  left_join(CWM2, by=c("PlotObservationID", "variable")) %>%
  left_join(CWM3, by=c("PlotObservationID", "variable")) %>%
  left_join(CWM4, by=c("PlotObservationID", "variable")) %>%
  left_join(CWM0 %>% 
              group_by(PlotObservationID) %>%
              summarize(sp.richness=n()), by=c("PlotObservationID")) %>%
  mutate(prop.sp.with.trait=n.sp.with.trait/sp.richness) %>%
  dplyr::select(PlotObservationID, variable, sp.richness, prop.sp.with.trait, trait.coverage, CWM, CWV) %>% 
  arrange(PlotObservationID)

2.1 Explore CWM output

knitr::kable(CWM %>% 
               filter(PlotObservationID %in%
                        sample(unique(DT2$PlotObservationID), 3, replace=F)), 
  caption="Community weighted means of 3 randomly selected plots", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

Scatterplot comparing coverage of traits values across plots, when based on relative cover and when based on proportion of species richness

ggplot(data=CWM %>% 
         #all variables have the same coverage. Showcase with LDMC
         filter(variable=="LDMC_mean"), aes(x=trait.coverage, y=prop.sp.with.trait, col=log(sp.richness))) + 
  geom_point(pch="+", alpha=1/3) + 
  geom_abline(intercept = 0, slope=1, col=2, lty=2, lwd=.7) + 
  xlim(c(0,1)) + 
  ylim(c(0,1)) + 
  scale_color_viridis() + 
  theme_bw() +
  xlab("Trait coverage (Relative  cover)") + 
  ylab("Trait coverage (Proportion of species)") + 
  coord_equal()

Calculate summary statistics for trait coverage in plots

CWM.coverage <- CWM %>% 
  filter(variable=="LDMC_mean") %>% 
  summarize_at(.vars=vars(trait.coverage, prop.sp.with.trait),
                .funs=list(num.0s=~sum(.==0),
                           min=~min(., na.rm=T),
                           q025=~quantile(., 0.25, na.rm=T), 
                           q50=~quantile(., 0.50, na.rm=T), 
                           q75=~quantile(., .75, na.rm=T), 
                           max=~max(., na.rm=T), 
                           mean=~mean(., na.rm=T), 
                           sd=~sd(., na.rm=T))) %>% 
  gather(key=variable, value=value) %>% 
  separate(variable, sep="_", into=c("metric", "stat")) %>% 
  mutate(stat=factor(stat, levels=c("num.0s", "min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
  spread(key=stat, value=value) 
knitr::kable(CWM.coverage, 
  caption="Summary of plot-level coverage of CWM and CWVs", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

Calculate summary statistics for CWMs and CWVs

CWM.summary <- CWM %>% 
  rename(myvar=variable) %>% 
  group_by(myvar) %>% 
  summarize_at(.vars=vars(CWM:CWV),
                .funs=list(min=~min(., na.rm=T),
                           q025=~quantile(., 0.25, na.rm=T), 
                           q50=~quantile(., 0.50, na.rm=T), 
                           q75=~quantile(., .75, na.rm=T), 
                           max=~max(., na.rm=T), 
                           mean=~mean(., na.rm=T), 
                           sd=~sd(., na.rm=T))) %>% 
  gather(key=variable, value=value, -myvar) %>% 
  separate(variable, sep="_", into=c("metric", "stat")) %>% 
  mutate(stat=factor(stat, levels=c("min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
  spread(key=stat, value=value) %>% 
  arrange(metric, myvar)
  
knitr::kable(CWM.summary, 
  caption="Summary of CWMs and CWVs across all plots", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

2.2 Export CWM and species mean trait values

save(try.combined.means, CWM, file="../_output/Traits_CWMs_sPlot3.RData")

3 Classify plots in is.forest or is.non.forest based on species traits

sPlot has two independent systems for classifying plots to vegetation types. The first relies on the expert opinion of data contributors and classifies plots into broad habitat types. These broad habitat types are coded using 5, non-mutually exclusive dummy variables:

  1. Forest
  2. Grassland
  3. Shrubland
  4. Sparse vegetation
  5. Wetland
    A plot may belong to more than one formation, e.g. a Savannah is categorized as Forest + Grassland (FG). This system is, unfortunately, not consistently available across all plots, being the large majority of classified plots only available for Europe.
    There is therefore the need to give at least some indication to the remaining unclassified plots. To achieve this, already from v2.1, sPlot started using a classification into forest and non-forest, based on the share of trees, and the layering of vegetation. Here, we derived the (mutually exclusive) is.forest and is.non.forest classification of plots.

3.1 Derive species level information on Growth Forms

We used different sources of information:

  1. Data from the gap-filled trait matrix
  2. Manual cleaning of the most common species for which growth trait info is not available
  3. Data from TRY (public dataset only) on all species with growth form info (Trait ID = 42)
  4. Cross-match with species assigned to tree layer in DT table. \newline\newline

Step 1: Attach growth form trait information to DT table. Growth form information derives from TRY

DT.gf <- DT2 %>% 
  filter(taxon_group=="Vascular plant") %>% 
  #join with try names, using resolved species names as key
  left_join(try.species.names %>% 
              dplyr::select(Name_short, GrowthForm) %>% 
              rename(species=Name_short) %>% 
              distinct(species, .keep_all=T), 
            by="species") %>% 
  left_join(try.species.means %>% 
              dplyr::select(Name_short, PlantHeight_mean) %>% 
              rename(species=Name_short), 
            by="species")
# number of records withouth Growth Form info
sum(is.na(DT.gf$GrowthForm))

Step 2: Select most common species without growth-trait information to export and check manually

top.gf.nas <- DT.gf %>% 
  filter(is.na(GrowthForm)) %>% 
  group_by(species) %>% 
  summarize(n=n()) %>% 
  arrange(desc(n))
write_csv(top.gf.nas %>% 
            filter(n>1000), 
  path="../_derived/Species_missingGF.csv")

The first r nrow(top.gf.nas) species account for r round(sum(top.gf.nas %>% filter(n>1000) %>% pull(n))/sum(top.gf.nas$n)*100,2)% of the missing records. Assign growth forms manually, reimport and coalesce into DT.gf

# Import manually classified species - this info is also reported in Appendix 1
gf.manual <- read_csv("../_derived/Species_missingGF_complete.csv")

DT.gf <- DT.gf %>% 
  left_join(gf.manual %>% 
              rename(GrowthForm.m=GrowthForm),
            by="species") %>% 
   mutate(GrowthForm=coalesce(GrowthForm, GrowthForm.m)) %>% 
   dplyr::select(-GrowthForm.m)

After manual completion, the number of records without growth form information decresead to r sum(is.na(DT.gf$GrowthForm)).
\newline\newline

Step 3: Import additional data on growth-form from TRY (Accessed 10 March 2020).
All public data on growth form downloaded. First take care of unmatched quotation marks in the txt file. Do this from command line.

# escape all unmatched quotation marks. Run in Linux terminal
#sed 's/"/\\"/g' 8854.txt > 8854_test.csv
#sed "s/'/\\'/g" 8854.txt > 8854_test.csv

Information on growth form is not organized and has a myriad of levels. Extract and simplify to the set of few types used so far. In case a species is attributed to multiple growth forms use a majority vote.

all.gf0 <- read_delim("../_input/TRY5.0_v1.1/8854_test.txt", delim="\t") 

all.gf <- all.gf0 %>% 
  filter(TraitID==42) %>% 
  distinct(AccSpeciesName, OrigValueStr) %>% 
  rename(GrowthForm0=OrigValueStr) %>% 
  mutate(GrowthForm0=tolower(GrowthForm0)) %>%
  filter(AccSpeciesName %in% sPlot.species$species) %>% 
  mutate(GrowthForm_simplified= GrowthForm0) %>% 
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                       list=str_detect(GrowthForm0,
                                       "vine|climber|liana|carnivore|epiphyte|^succulent|lichen|parasite|
                                       hydrohalophyte|aquatic|cactous|parasitic|hydrophytes|carnivorous"), 
                                       "other")) %>%
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                       list=str_detect(GrowthForm0, 
                                                       "tree|conifer|^woody$|palmoid|mangrove|gymnosperm"), 
                                       "tree")) %>% 
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                       list=str_detect(GrowthForm0, "shrub|scrub|bamboo"), "shrub")) %>%
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                       list=str_detect(GrowthForm0,
                                      "herb|sedge|graminoid|fern|forb|herbaceous|grass|chaemaephyte|geophyte|annual"),
                                       "herb")) %>%
  mutate(GrowthForm_simplified=ifelse(GrowthForm_simplified %in% c("other", "herb", "shrub", "tree"), 
                                      GrowthForm_simplified, NA)) %>% 
  filter(!is.na(GrowthForm_simplified)) 

#Some species have multiple attributions - use a majority vote. NA if ties
get.mode <- function(x){
  if(length(unique(x))==1){
    return(as.character(unique(x)))} else{
    tmp <- sort(table(x), decreasing=T)
    if(tmp[1]!=tmp[2]){return(names(tmp)[1])} else {
    return("Unknown")}
    }
  }

all.gf <- all.gf %>% 
  group_by(AccSpeciesName) %>% 
  summarize(GrowthForm_simplified=get.mode(GrowthForm_simplified)) %>% 
  filter(GrowthForm_simplified!="Unknown")

table(all.gf$GrowthForm_simplified, exclude=NULL)  

#coalesce this info into DT.gf
DT.gf <- DT.gf %>% 
  left_join(all.gf %>% 
              rename(species=AccSpeciesName), 
            by="species") %>% 
  mutate(GrowthForm=coalesce(GrowthForm, GrowthForm_simplified)) %>% 
  dplyr::select(-GrowthForm_simplified)

Step 4: Cross-match. Assign all species occurring in at least one relevé in the tree layer as tree. Conservatively, do this only when the record is at species level (exclude records at genus\family level)

other.trees <- DT.gf %>% 
  filter(Layer==1 & is.na(GrowthForm)) %>% 
  filter(Rank_correct=="species") %>% 
  distinct(species, Layer, GrowthForm) %>% 
  pull(species)

DT.gf <- DT.gf %>% 
  mutate(GrowthForm=replace(GrowthForm, 
                            list=species %in% other.trees, 
                            values="tree"))

After cross-matching, the number of records without growth form information decresead to r sum(is.na(DT.gf$GrowthForm)).
\newline\newline

knitr::kable(DT.gf %>% 
              distinct(species, GrowthForm, PlantHeight_mean) %>% 
              group_by(GrowthForm) %>% 
              summarize(Height=mean(PlantHeight_mean, na.rm=T)), 
  caption="Average height per growth form", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                  full_width = F, position = "center")

Classify species as tree or tall shrubs vs. other. Make a compact table of species growth forms and create fields is.tree.or.tall.shrub and is.not.tree.and.small.
Define a species as is.tree.or.tall.shrub when it is either defined as tree, OR has a height >10
Define a species as is.not.tree.or.shrub.and.small when it has a height <10, as long as it's not defined a tree. When height is not available, it is sufficient that the species is classified as "herb" or "other".

GF <- DT.gf %>% 
  distinct(species, GrowthForm, PlantHeight_mean) %>% 
  mutate(GrowthForm=fct_collape(GrowthForm, 
                                 "herb/shrub"=c("herb\\shrub","herb/shrub"), 
                                 "shrub/tree"=c("shrub/tree", "shrub\\tree"))) %>% 
  ## define is.tree.or.tall
  mutate(is.tree.or.tall.shrub=NA) %>% 
  mutate(is.tree.or.tall.shrub=replace(is.tree.or.tall.shrub, 
                                       list=str_detect(GrowthForm, "tree"), 
                                       T)) %>% 
  mutate(is.tree.or.tall.shrub=replace(is.tree.or.tall.shrub, 
                                       list=PlantHeight_mean>=10, 
                                       T)) %>% 
  ## define is.not.tree.or.shrub.and.small 
  mutate(is.not.tree.or.shrub.and.small=NA) %>% 
  mutate(is.not.tree.or.shrub.and.small=replace(is.not.tree.or.shrub.and.small,
                                       list=PlantHeight_mean<10, 
                                       T)) %>% 
  mutate(is.not.tree.or.shrub.and.small=replace(is.not.tree.or.shrub.and.small,
                                       list=is.na(PlantHeight_mean) & str_detect(GrowthForm, "herb|other"), 
                                       T)) %>%   
  ## use each field in turn to define which of the records in the other is F
  mutate(is.not.tree.or.shrub.and.small=replace(is.not.tree.or.shrub.and.small,
                                       list= is.tree.or.tall.shrub==T,
                                       F)) %>% 
  mutate(is.tree.or.tall.shrub=replace(is.tree.or.tall.shrub,
                                       list= is.not.tree.or.shrub.and.small==T,
                                       F)) %>% 
  ## drop redundant field
  dplyr::select(-is.not.tree.or.shrub.and.small)
  

## cross-check classification  
table(GF$GrowthForm, GF$is.tree.or.tall.shrub, exclude=NULL)

## Check for herb species classified as trees
GF %>% 
  filter(is.tree.or.tall.shrub & GrowthForm=="herb")

These are Bamboo species and their hiehgts seems reasonable.

3.2 Classify plots into forest\non-forest

Define a plot as forest if:

  1. Has a total cover of the the tree layer >=25% (from header)
  2. Has a total cover in Layer 1 >=25% (from DT)
  3. Has a total cover of tree or tall shrub species >=25% (from DT + TRY)
  4. Has data on Basal area summing to 10 m2/ha
    \newline\newline

The first three criteria are declined to define non forest as follows:

  1. Info on total cover of the tree layer is available and <25%
  2. Info on total cover in Layer 1 is available and <25%
  3. The relative cover of non tree species is >75%
    \newline\newline

Criteria 2 and 3 only apply to plots having cover data in percentage.
Reimport header file

load("../_output/header_sPlot3.0.RData")
#delete is.forest is.non.forest columns, in case of overwrite
header <- header %>% 
    dplyr::select(PlotObservationID:ESY, `Cover total (%)`:`Maximum height cryptogams (mm)`)

Criterium 1

plot.vegtype1 <- header %>% 
  dplyr::select(PlotObservationID, `Cover tree layer (%)`) %>% 
  rename(Cover_trees=`Cover tree layer (%)`) %>% 
  mutate(is.forest=Cover_trees>=25) 

table(plot.vegtype1 %>% dplyr::select(is.forest), exclude=NULL)

Criterium 2

# Select only plots having cover data in percentage
mysel <- (DT.gf %>% 
            distinct(PlotObservationID, Ab_scale) %>% 
            group_by(PlotObservationID) %>% 
            summarize(AllCovPer=all(Ab_scale=="CoverPerc")) %>% 
            filter(AllCovPer==T) %>% 
            pull(PlotObservationID))
# Excludedd plots
nrow(header)-length(mysel)

plot.vegtype2 <- DT.gf %>% 
  filter(PlotObservationID %in% mysel ) %>% 
  filter(Layer %in% c(1,2,3)) %>% 
  # first sum the cover of all species in a layer
  group_by(PlotObservationID, Layer) %>% 
  summarize(cover_perc=sum(cover_perc)) %>% 
  # then combine cover across layers
  group_by(PlotObservationID) %>% 
  summarize(cover_perc=combine.cover(cover_perc)) %>% 
  mutate(is.forest=cover_perc>=25) 

table(plot.vegtype1 %>% dplyr::select(is.forest), exclude=NULL)

Criterium 3

plot.vegtype3 <- DT.gf %>% 
  #filter plots where all records are recorded as percentage cover
  filter(PlotObservationID %in% mysel ) %>% 
  # combine cover across layers
  group_by(PlotObservationID, species) %>%
  summarize(cover_perc=combine.cover(cover_perc)) %>%
  ungroup() %>% 
  # attach species Growth Form information
  left_join(GF, by="species")%>% 
  group_by(PlotObservationID) %>% 
  summarize(cover_tree=sum(cover_perc*is.tree.or.tall.shrub, na.rm=T), 
            cover_non_tree=sum(cover_perc*(!is.tree.or.tall.shrub), na.rm=T), 
            cover_unknown=sum(cover_perc* is.na(is.tree.or.tall.shrub))) %>% 
  rowwise() %>% 
  ## classify plots based on cover of different growth forms
  mutate(tot.cover=sum(cover_tree, cover_non_tree, cover_unknown, na.rm=T)) %>% 
  mutate(is.forest=cover_tree>=25) %>% 
  mutate(is.non.forest=cover_tree<25 & (cover_non_tree/tot.cover)>.75)

table(plot.vegtype3 %>% dplyr::select(is.forest, is.non.forest), exclude=NULL)

Criterium 4

plot.vegtype4 <-  DT.gf %>% 
  filter(Ab_scale=="x_BA") %>% 
  group_by(PlotObservationID) %>% 
  summarize(tot.ba=sum(Abundance)) %>% 
  mutate(is.forest=tot.ba>10)

table(plot.vegtype4 %>% dplyr::select(is.forest), exclude=NULL)

Combine classifications from the three criteria. Use majority vote to assign plots. In case of ties, a progressively lower priority is given from criterium 1 to criterium 4.

plot.vegtype <- header %>% 
  dplyr::select(PlotObservationID) %>% 
  left_join(plot.vegtype1 %>% 
              dplyr::select(PlotObservationID, is.forest), 
            by="PlotObservationID") %>% 
  left_join(plot.vegtype2 %>% 
              dplyr::select(PlotObservationID, is.forest), 
            by="PlotObservationID") %>% 
  left_join(plot.vegtype3 %>% 
              dplyr::select(PlotObservationID, is.forest, is.non.forest) %>% 
              rename(is.non.forest.x.x=is.non.forest), 
            by="PlotObservationID") %>% 
  left_join(plot.vegtype4 %>% 
              dplyr::select(PlotObservationID, is.forest), 
            by="PlotObservationID") %>% 
  ## assign vegtype based on majority vote. In case of ties use the order of criteria as ranking
  rowwise() %>% 
  mutate(mean.forest=mean(c(is.forest.x, is.forest.y, is.forest.x.x, is.forest.y.y), na.rm=T)) %>% 
  mutate(mean.forest2=coalesce(is.forest.x, is.forest.y, is.forest.x.x, is.forest.y.y)) %>% 
  mutate(is.forest=ifelse(mean.forest==0.5, mean.forest2, mean.forest>0.5)) %>%  
  # same for is.non.forest
  mutate(mean.non.forest=mean(c( (!is.forest.x), (!is.forest.y), is.non.forest.x.x, (!is.forest.y.y)), na.rm=T)) %>% 
  mutate(mean.non.forest2=coalesce( (!is.forest.x), (!is.forest.y), is.non.forest.x.x, (!is.forest.y.y))) %>% 
  mutate(is.non.forest=ifelse(mean.non.forest==0.5, mean.non.forest2, mean.non.forest>0.5)) %>% 
  # when both is.forest & is.non.forest are F transform to NA
  mutate(both.F=ifelse( ( (is.forest==F | is.na(is.forest)) & is.non.forest==F), T, F)) %>% 
  mutate(is.forest=replace(is.forest, list=both.F==T, values=NA)) %>% 
  mutate(is.non.forest=replace(is.non.forest, list=both.F==T, values=NA))

table(plot.vegtype %>% dplyr::select(is.forest, is.non.forest), exclude=NULL)

3.3 Cross-check and validate

Cross check with sPlot's 5-class (incomplete) native classification deriving from data contributors. Build a Confusion matrix.

cross.check <- header %>% 
  dplyr::select(PlotObservationID, Forest) %>% 
  left_join(plot.vegtype %>% 
              dplyr::select(PlotObservationID, is.forest, is.non.forest) %>% 
              rename(Forest=is.forest, Other=is.non.forest) %>% 
              gather(isfor_isnonfor, value, -PlotObservationID) %>% 
              filter(value==T) %>% 
              dplyr::select(-value), 
            by="PlotObservationID") %>% 
  mutate(Other=1*Forest!=1) %>% 
  gather(veg_type, value, -PlotObservationID, -isfor_isnonfor) %>% 
  filter(value==1) %>% 
  dplyr::select(-value)

#Build a confusion matrix to evaluate the comparison  
u <- union(cross.check$isfor_isnonfor, cross.check$veg_type)
t <- table( factor(cross.check$isfor_isnonfor, u), factor(cross.check$veg_type, u))
confm <- caret::confusionMatrix(t)
knitr::kable(confm$table, caption="Confusion matrix between sPlot's native classification of habitats (columns), and classification based on four criteria based on vegetation layers and growth forms (rows)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = T, position = "center")

Formulas of associated statistics are available on the help page of the caret package and associated references. The overall accuracy of the classification based on is.forest\is.non.forest, when tested against sPlot's native habitat classification is r round(confm$overall[1],2), the Kappa statistics is r round(confm$overall[2],2).

knitr::kable((confm$byClass), caption="Associated statistics of confusion matrix by class") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
header.vegtype <- header %>% 
  dplyr::select(PlotObservationID, Forest:Wetland) %>% 
  left_join(plot.vegtype %>% 
              dplyr::select(PlotObservationID, is.forest, is.non.forest),
            by="PlotObservationID")
#check 
nrow(header.vegtype)==nrow(header)

Through the process described above, we managed to classify r plot.vegtype %>% filter(is.forest==T | is.non.forest==T) %>% nrow(), of which r plot.vegtype %>% filter(is.forest==T) %>% nrow() is forest and r plot.vegtype %>% filter(is.non.forest==T) %>% nrow() is non-forest.
\newline\newline The total number of plots with attribution to forest\non-forest (either coming from sPlot's native classification, or from the process above) is: r header.vegtype %>% dplyr::select(-PlotObservationID) %>% filter(rowMeans(is.na(.)) < 1) %>% nrow().

4 Export and update other objects

sPlot.traits <- sPlot.species %>% 
  arrange(species) %>% 
  left_join(GF %>% 
              dplyr::select(species, GrowthForm, is.tree.or.tall.shrub), 
            by="species") %>% 
  left_join(try.combined.means %>% 
              rename(species=Taxon_name), by="species") %>% 
  dplyr::select(-Rank_correct)
  
save(try.combined.means, CWM, sPlot.traits, file="../_output/Traits_CWMs_sPlot3.RData")

header <- header %>% 
  left_join(plot.vegtype %>% 
              dplyr::select(PlotObservationID, is.forest, is.non.forest),
            by="PlotObservationID") %>% 
  dplyr::select(PlotObservationID:ESY, is.forest:is.non.forest, everything())

save(header, file="../_output/header_sPlot3.0.RData")

APPENDIX

Appendix 1 - Growth forms of most common species

As assigned manually.

cat(readLines("../_derived/Species_missingGF_complete.csv"), sep = '\n')

SessionInfo

sessionInfo()