07_buildCWMs.Rmd

title: "sPlot3.0 - Build CWMs"
author: "Francesco Maria Sabatini"
date: "3/4/2020"
output: html_document
library(tidyverse)
library(readr)
library(data.table)
library(knitr)
library(kableExtra)
library(stringr)
library(caret)
library(viridis)
load("../_output/DT_sPlot3.0.RData")
load("../_output/Backbone3.0.RData")
# Species, Genus, Family
try.species <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/hierarchy.info.csv",
  locale = locale(encoding = "latin1"))
# Original data without gap-filling. With species and trait labels
try.allinfo <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/traits_x_georef_wide_table.csv",
  locale = locale(encoding = "latin1"),
                        col_types = paste0(c("dddccccc",rep("c", 84)), collapse=""))
# Individual-level gap-filled data - order as in try.allinfo
try.individuals0 <- read_csv(
  "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/gapfilled_data/mean_gap_filled_back_transformed.csv",
  locale = locale(encoding = "latin1"))
# #Ancillary function to change species names to lower case
# firstup <- function(x) {
#   substr(x, 1, 1) <- toupper(substr(x, 1, 1))
#   x
# }
#
# try.species <- try.species %>%
#   mutate(Species=tolower(Species)) %>%
#   mutate(Species=firstup(Species))
#
# try.allinfo <- try.allinfo %>%
#   mutate(Species=tolower(Species)) %>%
#   mutate(Species=firstup(Species))
try.species.names <- try.allinfo %>%
  dplyr::select(Species, Genus, GrowthForm) %>%
  left_join(Backbone %>%
              dplyr::select(Name_sPlot_TRY, Name_short) %>%
              rename(Species=Name_sPlot_TRY),
            by="Species") %>%
  dplyr::select(Species, Name_short, Genus, GrowthForm)
sPlot.species <- DT2 %>%
  distinct(species)

sPlot.in.TRY <- sPlot.species %>%
  filter(species %in% (try.species.names %>%
                                  distinct(Name_short) %>%
                                  pull(Name_short)))
trait.legend <- data.frame(full=try.allinfo %>%
                             dplyr::select(starts_with("StdValue_")) %>%
                             colnames() %>%
                             gsub("StdValue_", "", .) %>%
                             sort()) %>%
  mutate(full=as.character(full)) %>%
  mutate(traitcode=parse_number(full)) %>%
  arrange(traitcode) %>%
  dplyr::select(traitcode, everything()) %>%
  mutate(full=gsub(pattern = "^[0-9]+_", replacement="", full)) %>%
  mutate(short=c("StemDens", "RootingDepth","LeafC.perdrymass", "LeafN","LeafP",
                 "StemDiam","SeedMass", "Seed.length","LeafThickness","LDMC",
                 "LeafNperArea","LeafDryMass.single","Leaf.delta.15N","SeedGerminationRate",
                 "Seed.num.rep.unit","LeafLength","LeafWidth","LeafCN.ratio","Leaffreshmass",
                 "Stem.cond.dens","Chromosome.n","Chromosome.cDNAcont",
                 "Disp.unit.leng","StemConduitDiameter","Wood.vessel.length",
                 "WoodFiberLength","SpecificRootLength.fine","SpecificRootLength",
                 "PlantHeight.veg","PlantHeight.generative","LeafArea.leaf.noPet",
                 "LeafArea.leaflet.noPet","LeafArea.leaf.wPet","LeafArea.leaflet.wPet",
                 "LeafArea.leaf.undef","LeafArea.leaflet.undef","LeafArea.undef.undef",
                 "SLA.noPet", "SLA.wPet","SLA.undef", "LeafWaterCont")) %>%
  ## Add SLA missing from allinfo file
  bind_rows(data.frame(traitcode=11,
                       full="Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA)",
                       short="SLA")) %>%
  bind_rows(data.frame(traitcode=18,
                       full="Plant height (vegetative + generative)",
                       short="PlantHeight")) %>%
  arrange(traitcode) %>%
  #create a column to mark traits for which gap filled data is available.
  mutate(available=paste0("X", traitcode) %in% colnames(try.individuals0))
knitr::kable(trait.legend,
  caption="Legend of traits from TRY") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
#create string to rename traits
col.to <- trait.legend %>%
  filter(available==T) %>%
  pull(short)
col.from <- trait.legend %>%
  filter(available==T) %>%
  mutate(traitcode=paste0("X", traitcode))  %>%
  pull(traitcode)

try.individuals <- try.individuals0 %>%
              rename_at(col.from, .funs=function(x) col.to)
try.species.names %>%
    dplyr::select(Name_short) %>%
    bind_cols(try.individuals %>%
                  dplyr::select(-X1)) %>%
  gather(variable, value, -Name_short) %>%
  filter(value<0) %>%
  group_by(variable) %>%
  summarize(n=n())
try.individuals <- try.species.names %>%
  dplyr::select(Name_short) %>%
  bind_cols(try.individuals)

toexclude <- try.individuals %>%
  gather(variable, value, -X1, -Name_short) %>%
  filter(variable != "Leaf.delta.15N") %>%
  filter(value<0) %>%
  pull(X1)

toexclude2 <- try.individuals %>%
  filter(PlantHeight>100  & (!Name_short %in% c("Pseudotsuga menziesii", "Sequoia sempervirens"))) %>%
  pull(X1)

toexclude3 <- try.individuals %>%
  filter(X1 %in% (try.allinfo %>%
                     filter(GrowthForm=="herb") %>%
                     pull(X1))) %>%
  filter(PlantHeight>10) %>%
  pull(X1)

try.individuals <- try.individuals %>%
  filter(!X1 %in% c(toexclude, toexclude2, toexclude3)) %>%
  dplyr::select(-X1)
## Calculate species level trait means and sd.
try.species.means <- try.individuals %>%
  group_by(Name_short) %>%
  #Add a field to indicate the number of observation per taxon
  left_join(x={.} %>%
              summarize(n=n()),
            y={.} %>%
              summarize_at(.vars=vars(StemDens:LeafWaterCont ),
                           .funs=list(mean=~mean(.), sd=~sd(.))),
            by="Name_short") %>%
  dplyr::select(Name_short, n, everything())

## Calculate genus level trait means and sd.
try.genus.means <- try.individuals %>%
  mutate(Genus=word(Name_short, 1)) %>%
  group_by(Genus) %>%
  left_join(x={.} %>%
              summarize(n=n()),
            y={.} %>%
              summarize_at(.vars=vars(StemDens:LeafWaterCont ),
                           .funs=list(mean=~mean(.), sd=~sd(.))),
            by="Genus") %>%
  dplyr::select(Genus, n, everything())
knitr::kable(try.species.means %>%
               sample_n(15),
  caption="Example of trait means for 15 randomly selected species", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
try.combined.means <- try.genus.means %>%
  rename(Taxon_name=Genus) %>%
  mutate(Rank_correct="genus") %>%
  bind_rows(try.species.means %>%
              rename(Taxon_name=Name_short) %>%
              mutate(Rank_correct="species")) %>%
  dplyr::select(Taxon_name, Rank_correct, everything())

total.matches <- DT2 %>%
  distinct(species, Rank_correct) %>%
  left_join(try.combined.means %>%
              dplyr::rename(species=Taxon_name),
            by=c("species", "Rank_correct")) %>%
  filter(!is.na(SLA_mean)) %>%
  nrow()
mysummary <- try.combined.means %>%
               group_by(Rank_correct) %>%
               summarize_at(.vars=vars(StemDens_mean:LeafWaterCont_sd),
                            .funs=list(min=~min(., na.rm=T),
                                       q025=~quantile(., 0.25, na.rm=T),
                                       q50=~quantile(., 0.50, na.rm=T),
                                       q75=~quantile(., .75, na.rm=T),
                                       max=~max(., na.rm=T),
                                       mean=~mean(., na.rm=T),
                                       sd=~sd(., na.rm=T))) %>%
  gather(variable, value, -Rank_correct) %>%
  separate(variable, sep="_", into=c("variable", "mean.sd", "stat")) %>%
  mutate(stat=factor(stat, levels=c("min", "q025", "q50", "q75", "max", "mean", "sd"))) %>%
  spread(key=stat, value=value) %>%
  arrange(desc(Rank_correct))

knitr::kable(mysummary,
  caption="Summary statistics for each trait, when summarized across species or genera", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
#Ancillary function
# Combine cover accounting for layers
combine.cover <- function(x){
    while (length(x)>1){
      x[2] <- x[1]+(100-x[1])*x[2]/100
      x <- x[-1]
    }
  return(x)
}

DT2.comb <- DT2 %>%
  group_by(PlotObservationID, species, Rank_correct) %>%
  summarize(Relative.cover=combine.cover(Relative.cover)) %>%
  ungroup() %>%
  # re-normalize to 100%
  left_join(x=.,
            y={.} %>%
              group_by(PlotObservationID) %>%
              summarize(Tot.cover=sum(Relative.cover)),
            by="PlotObservationID") %>%
  mutate(Relative.cover=Relative.cover/Tot.cover) %>%
  dplyr::select(-Tot.cover)

# Tag plots where at least one layer has only p\a information
any_pa <- DT2 %>%
  distinct(PlotObservationID, Ab_scale) %>%
  group_by(PlotObservationID) %>%
  summarize(any.pa=any(Ab_scale=="pa")) %>%
  filter(any.pa==T) %>%
  pull(PlotObservationID)
length(any_pa)

# Exclude plots above and merge species data table with traits
CWM0 <- DT2.comb %>%
  filter(!PlotObservationID %in% any_pa) %>%
  left_join(try.combined.means %>%
              dplyr::rename(species=Taxon_name) %>%
              dplyr::select(species, Rank_correct, ends_with("_mean")),
            by=c("species", "Rank_correct"))

# Calculate CWM for each trait in each plot
CWM1 <- CWM0 %>%
  group_by(PlotObservationID) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~weighted.mean(., Relative.cover, na.rm=T))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=CWM, -PlotObservationID)

# Calculate coverage for each trait in each plot
CWM2 <- CWM0 %>%
  mutate_at(.funs = list(~if_else(is.na(.),0,1) * Relative.cover),
            .vars = vars(StemDens_mean:LeafWaterCont_mean)) %>%
  group_by(PlotObservationID) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~sum(., na.rm=T))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=trait.coverage, -PlotObservationID)

# Calculate CWV
# Ancillary function
variance2.fun <- function(trait, abu){
  res <- as.double(NA)
  abu <- abu[!is.na(trait)]
  trait <- trait[!is.na(trait)]
  abu <- abu/sum(abu)
  if (length(trait)>1){
    # you need more than 1 observation to calculate variance
    # for calculation see
    # http://r.789695.n4.nabble.com/Weighted-skewness-and-curtosis-td4709956.html
    m.trait <- weighted.mean(trait,abu)
    res <- sum(abu*(trait-m.trait)^2)
  }
  res
}

CWM3 <- CWM0 %>%
  group_by(PlotObservationID) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~variance2.fun(., Relative.cover))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=CWV, -PlotObservationID)

## Calculate proportion of species having traits
CWM4 <- CWM0 %>%
  group_by(PlotObservationID) %>%
  #distinct(PlotObservationID, species, .keep_all = T) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~sum(!is.na(.)))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=n.sp.with.trait, -PlotObservationID)

# Join together
CWM <- CWM1 %>%
  left_join(CWM2, by=c("PlotObservationID", "variable")) %>%
  left_join(CWM3, by=c("PlotObservationID", "variable")) %>%
  left_join(CWM4, by=c("PlotObservationID", "variable")) %>%
  left_join(CWM0 %>%
              group_by(PlotObservationID) %>%
              summarize(sp.richness=n()), by=c("PlotObservationID")) %>%
  mutate(prop.sp.with.trait=n.sp.with.trait/sp.richness) %>%
  dplyr::select(PlotObservationID, variable, sp.richness, prop.sp.with.trait, trait.coverage, CWM, CWV) %>%
  arrange(PlotObservationID)

knitr::kable(CWM %>%
               filter(PlotObservationID %in%
                        sample(unique(DT2$PlotObservationID), 3, replace=F)),
  caption="Community weighted means of 3 randomly selected plots", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
ggplot(data=CWM %>%
         #all variables have the same coverage. Showcase with LDMC
         filter(variable=="LDMC_mean"), aes(x=trait.coverage, y=prop.sp.with.trait, col=log(sp.richness))) +
  geom_point(pch="+", alpha=1/3) +
  geom_abline(intercept = 0, slope=1, col=2, lty=2, lwd=.7) +
  xlim(c(0,1)) +
  ylim(c(0,1)) +
  scale_color_viridis() +
  theme_bw() +
  xlab("Trait coverage (Relative  cover)") +
  ylab("Trait coverage (Proportion of species)") +
  coord_equal()
CWM.coverage <- CWM %>%
  filter(variable=="LDMC_mean") %>%
  summarize_at(.vars=vars(trait.coverage, prop.sp.with.trait),
                .funs=list(num.0s=~sum(.==0),
                           min=~min(., na.rm=T),
                           q025=~quantile(., 0.25, na.rm=T),
                           q50=~quantile(., 0.50, na.rm=T),
                           q75=~quantile(., .75, na.rm=T),
                           max=~max(., na.rm=T),
                           mean=~mean(., na.rm=T),
                           sd=~sd(., na.rm=T))) %>%
  gather(key=variable, value=value) %>%
  separate(variable, sep="_", into=c("metric", "stat")) %>%
  mutate(stat=factor(stat, levels=c("num.0s", "min", "q025", "q50", "q75", "max", "mean", "sd"))) %>%
  spread(key=stat, value=value)

knitr::kable(CWM.coverage,
  caption="Summary of plot-level coverage of CWM and CWVs", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
CWM.summary <- CWM %>%
  rename(myvar=variable) %>%
  group_by(myvar) %>%
  summarize_at(.vars=vars(CWM:CWV),
                .funs=list(min=~min(., na.rm=T),
                           q025=~quantile(., 0.25, na.rm=T),
                           q50=~quantile(., 0.50, na.rm=T),
                           q75=~quantile(., .75, na.rm=T),
                           max=~max(., na.rm=T),
                           mean=~mean(., na.rm=T),
                           sd=~sd(., na.rm=T))) %>%
  gather(key=variable, value=value, -myvar) %>%
  separate(variable, sep="_", into=c("metric", "stat")) %>%
  mutate(stat=factor(stat, levels=c("min", "q025", "q50", "q75", "max", "mean", "sd"))) %>%
  spread(key=stat, value=value) %>%
  arrange(metric, myvar)

knitr::kable(CWM.summary,
  caption="Summary of CWMs and CWVs across all plots", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
save(try.combined.means, CWM, file="../_output/Traits_CWMs_sPlot3.RData")
DT.gf <- DT2 %>%
  filter(taxon_group=="Vascular plant") %>%
  #join with try names, using resolved species names as key
  left_join(try.species.names %>%
              dplyr::select(Name_short, GrowthForm) %>%
              rename(species=Name_short) %>%
              distinct(species, .keep_all=T),
            by="species") %>%
  left_join(try.species.means %>%
              dplyr::select(Name_short, PlantHeight_mean) %>%
              rename(species=Name_short),
            by="species")
# number of records withouth Growth Form info
sum(is.na(DT.gf$GrowthForm))
top.gf.nas <- DT.gf %>%
  filter(is.na(GrowthForm)) %>%
  group_by(species) %>%
  summarize(n=n()) %>%
  arrange(desc(n))
write_csv(top.gf.nas %>%
            filter(n>1000),
  path="../_derived/Species_missingGF.csv")
# Import manually classified species - this info is also reported in Appendix 1
gf.manual <- read_csv("../_derived/Species_missingGF_complete.csv")

DT.gf <- DT.gf %>%
  left_join(gf.manual %>%
              rename(GrowthForm.m=GrowthForm),
            by="species") %>%
   mutate(GrowthForm=coalesce(GrowthForm, GrowthForm.m)) %>%
   dplyr::select(-GrowthForm.m)
# escape all unmatched quotation marks. Run in Linux terminal
#sed 's/"/\\"/g' 8854.txt > 8854_test.csv
#sed "s/'/\\'/g" 8854.txt > 8854_test.csv
all.gf0 <- read_delim("../_input/TRY5.0_v1.1/8854_test.txt", delim="\t")

all.gf <- all.gf0 %>%
  filter(TraitID==42) %>%
  distinct(AccSpeciesName, OrigValueStr) %>%
  rename(GrowthForm0=OrigValueStr) %>%
  mutate(GrowthForm0=tolower(GrowthForm0)) %>%
  filter(AccSpeciesName %in% sPlot.species$species) %>%
  mutate(GrowthForm_simplified= GrowthForm0) %>%
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified,
                                       list=str_detect(GrowthForm0,
                                       "vine|climber|liana|carnivore|epiphyte|^succulent|lichen|parasite|
                                       hydrohalophyte|aquatic|cactous|parasitic|hydrophytes|carnivorous"),
                                       "other")) %>%
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified,
                                       list=str_detect(GrowthForm0,
                                                       "tree|conifer|^woody$|palmoid|mangrove|gymnosperm"),
                                       "tree")) %>%
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified,
                                       list=str_detect(GrowthForm0, "shrub|scrub|bamboo"), "shrub")) %>%
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified,
                                       list=str_detect(GrowthForm0,
                                      "herb|sedge|graminoid|fern|forb|herbaceous|grass|chaemaephyte|geophyte|annual"),
                                       "herb")) %>%
  mutate(GrowthForm_simplified=ifelse(GrowthForm_simplified %in% c("other", "herb", "shrub", "tree"),
                                      GrowthForm_simplified, NA)) %>%
  filter(!is.na(GrowthForm_simplified))

#Some species have multiple attributions - use a majority vote. NA if ties
get.mode <- function(x){
  if(length(unique(x))==1){
    return(as.character(unique(x)))} else{
    tmp <- sort(table(x), decreasing=T)
    if(tmp[1]!=tmp[2]){return(names(tmp)[1])} else {
    return("Unknown")}
    }
  }

all.gf <- all.gf %>%
  group_by(AccSpeciesName) %>%
  summarize(GrowthForm_simplified=get.mode(GrowthForm_simplified)) %>%
  filter(GrowthForm_simplified!="Unknown")

table(all.gf$GrowthForm_simplified, exclude=NULL)

#coalesce this info into DT.gf
DT.gf <- DT.gf %>%
  left_join(all.gf %>%
              rename(species=AccSpeciesName),
            by="species") %>%
  mutate(GrowthForm=coalesce(GrowthForm, GrowthForm_simplified)) %>%
  dplyr::select(-GrowthForm_simplified)
other.trees <- DT.gf %>%
  filter(Layer==1 & is.na(GrowthForm)) %>%
  filter(Rank_correct=="species") %>%
  distinct(species, Layer, GrowthForm) %>%
  pull(species)

DT.gf <- DT.gf %>%
  mutate(GrowthForm=replace(GrowthForm,
                            list=species %in% other.trees,
                            values="tree"))
knitr::kable(DT.gf %>%
              distinct(species, GrowthForm, PlantHeight_mean) %>%
              group_by(GrowthForm) %>%
              summarize(Height=mean(PlantHeight_mean, na.rm=T)),
  caption="Average height per growth form", digits = 3) %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                  full_width = F, position = "center")
GF <- DT.gf %>%
  distinct(species, GrowthForm, PlantHeight_mean) %>%
  mutate(GrowthForm=fct_collape(GrowthForm,
                                 "herb/shrub"=c("herb\\shrub","herb/shrub"),
                                 "shrub/tree"=c("shrub/tree", "shrub\\tree"))) %>%
  ## define is.tree.or.tall
  mutate(is.tree.or.tall.shrub=NA) %>%
  mutate(is.tree.or.tall.shrub=replace(is.tree.or.tall.shrub,
                                       list=str_detect(GrowthForm, "tree"),
                                       T)) %>%
  mutate(is.tree.or.tall.shrub=replace(is.tree.or.tall.shrub,
                                       list=PlantHeight_mean>=10,
                                       T)) %>%
  ## define is.not.tree.or.shrub.and.small
  mutate(is.not.tree.or.shrub.and.small=NA) %>%
  mutate(is.not.tree.or.shrub.and.small=replace(is.not.tree.or.shrub.and.small,
                                       list=PlantHeight_mean<10,
                                       T)) %>%
  mutate(is.not.tree.or.shrub.and.small=replace(is.not.tree.or.shrub.and.small,
                                       list=is.na(PlantHeight_mean) & str_detect(GrowthForm, "herb|other"),
                                       T)) %>%
  ## use each field in turn to define which of the records in the other is F
  mutate(is.not.tree.or.shrub.and.small=replace(is.not.tree.or.shrub.and.small,
                                       list= is.tree.or.tall.shrub==T,
                                       F)) %>%
  mutate(is.tree.or.tall.shrub=replace(is.tree.or.tall.shrub,
                                       list= is.not.tree.or.shrub.and.small==T,
                                       F)) %>%
  ## drop redundant field
  dplyr::select(-is.not.tree.or.shrub.and.small)


## cross-check classification
table(GF$GrowthForm, GF$is.tree.or.tall.shrub, exclude=NULL)

## Check for herb species classified as trees
GF %>%
  filter(is.tree.or.tall.shrub & GrowthForm=="herb")
load("../_output/header_sPlot3.0.RData")
#delete is.forest is.non.forest columns, in case of overwrite
header <- header %>%
    dplyr::select(PlotObservationID:ESY, `Cover total (%)`:`Maximum height cryptogams (mm)`)
plot.vegtype1 <- header %>%
  dplyr::select(PlotObservationID, `Cover tree layer (%)`) %>%
  rename(Cover_trees=`Cover tree layer (%)`) %>%
  mutate(is.forest=Cover_trees>=25)

table(plot.vegtype1 %>% dplyr::select(is.forest), exclude=NULL)
# Select only plots having cover data in percentage
mysel <- (DT.gf %>%
            distinct(PlotObservationID, Ab_scale) %>%
            group_by(PlotObservationID) %>%
            summarize(AllCovPer=all(Ab_scale=="CoverPerc")) %>%
            filter(AllCovPer==T) %>%
            pull(PlotObservationID))
# Excludedd plots
nrow(header)-length(mysel)

plot.vegtype2 <- DT.gf %>%
  filter(PlotObservationID %in% mysel ) %>%
  filter(Layer %in% c(1,2,3)) %>%
  # first sum the cover of all species in a layer
  group_by(PlotObservationID, Layer) %>%
  summarize(cover_perc=sum(cover_perc)) %>%
  # then combine cover across layers
  group_by(PlotObservationID) %>%
  summarize(cover_perc=combine.cover(cover_perc)) %>%
  mutate(is.forest=cover_perc>=25)

table(plot.vegtype1 %>% dplyr::select(is.forest), exclude=NULL)

plot.vegtype3 <- DT.gf %>%
  #filter plots where all records are recorded as percentage cover
  filter(PlotObservationID %in% mysel ) %>%
  # combine cover across layers
  group_by(PlotObservationID, species) %>%
  summarize(cover_perc=combine.cover(cover_perc)) %>%
  ungroup() %>%
  # attach species Growth Form information
  left_join(GF, by="species")%>%
  group_by(PlotObservationID) %>%
  summarize(cover_tree=sum(cover_perc*is.tree.or.tall.shrub, na.rm=T),
            cover_non_tree=sum(cover_perc*(!is.tree.or.tall.shrub), na.rm=T),
            cover_unknown=sum(cover_perc* is.na(is.tree.or.tall.shrub))) %>%
  rowwise() %>%
  ## classify plots based on cover of different growth forms
  mutate(tot.cover=sum(cover_tree, cover_non_tree, cover_unknown, na.rm=T)) %>%
  mutate(is.forest=cover_tree>=25) %>%
  mutate(is.non.forest=cover_tree<25 & (cover_non_tree/tot.cover)>.75)

table(plot.vegtype3 %>% dplyr::select(is.forest, is.non.forest), exclude=NULL)
plot.vegtype4 <-  DT.gf %>%
  filter(Ab_scale=="x_BA") %>%
  group_by(PlotObservationID) %>%
  summarize(tot.ba=sum(Abundance)) %>%
  mutate(is.forest=tot.ba>10)

table(plot.vegtype4 %>% dplyr::select(is.forest), exclude=NULL)
plot.vegtype <- header %>%
  dplyr::select(PlotObservationID) %>%
  left_join(plot.vegtype1 %>%
              dplyr::select(PlotObservationID, is.forest),
            by="PlotObservationID") %>%
  left_join(plot.vegtype2 %>%
              dplyr::select(PlotObservationID, is.forest),
            by="PlotObservationID") %>%
  left_join(plot.vegtype3 %>%
              dplyr::select(PlotObservationID, is.forest, is.non.forest) %>%
              rename(is.non.forest.x.x=is.non.forest),
            by="PlotObservationID") %>%
  left_join(plot.vegtype4 %>%
              dplyr::select(PlotObservationID, is.forest),
            by="PlotObservationID") %>%
  ## assign vegtype based on majority vote. In case of ties use the order of criteria as ranking
  rowwise() %>%
  mutate(mean.forest=mean(c(is.forest.x, is.forest.y, is.forest.x.x, is.forest.y.y), na.rm=T)) %>%
  mutate(mean.forest2=coalesce(is.forest.x, is.forest.y, is.forest.x.x, is.forest.y.y)) %>%
  mutate(is.forest=ifelse(mean.forest==0.5, mean.forest2, mean.forest>0.5)) %>%
  # same for is.non.forest
  mutate(mean.non.forest=mean(c( (!is.forest.x), (!is.forest.y), is.non.forest.x.x, (!is.forest.y.y)), na.rm=T)) %>%
  mutate(mean.non.forest2=coalesce( (!is.forest.x), (!is.forest.y), is.non.forest.x.x, (!is.forest.y.y))) %>%
  mutate(is.non.forest=ifelse(mean.non.forest==0.5, mean.non.forest2, mean.non.forest>0.5)) %>%
  # when both is.forest & is.non.forest are F transform to NA
  mutate(both.F=ifelse( ( (is.forest==F | is.na(is.forest)) & is.non.forest==F), T, F)) %>%
  mutate(is.forest=replace(is.forest, list=both.F==T, values=NA)) %>%
  mutate(is.non.forest=replace(is.non.forest, list=both.F==T, values=NA))

table(plot.vegtype %>% dplyr::select(is.forest, is.non.forest), exclude=NULL)
cross.check <- header %>%
  dplyr::select(PlotObservationID, Forest) %>%
  left_join(plot.vegtype %>%
              dplyr::select(PlotObservationID, is.forest, is.non.forest) %>%
              rename(Forest=is.forest, Other=is.non.forest) %>%
              gather(isfor_isnonfor, value, -PlotObservationID) %>%
              filter(value==T) %>%
              dplyr::select(-value),
            by="PlotObservationID") %>%
  mutate(Other=1*Forest!=1) %>%
  gather(veg_type, value, -PlotObservationID, -isfor_isnonfor) %>%
  filter(value==1) %>%
  dplyr::select(-value)

#Build a confusion matrix to evaluate the comparison
u <- union(cross.check$isfor_isnonfor, cross.check$veg_type)
t <- table( factor(cross.check$isfor_isnonfor, u), factor(cross.check$veg_type, u))
confm <- caret::confusionMatrix(t)
knitr::kable(confm$table, caption="Confusion matrix between sPlot's native classification of habitats (columns), and classification based on four criteria based on vegetation layers and growth forms (rows)") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = T, position = "center")
knitr::kable((confm$byClass), caption="Associated statistics of confusion matrix by class") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
header.vegtype <- header %>%
  dplyr::select(PlotObservationID, Forest:Wetland) %>%
  left_join(plot.vegtype %>%
              dplyr::select(PlotObservationID, is.forest, is.non.forest),
            by="PlotObservationID")
#check
nrow(header.vegtype)==nrow(header)
sPlot.traits <- sPlot.species %>%
  arrange(species) %>%
  left_join(GF %>%
              dplyr::select(species, GrowthForm, is.tree.or.tall.shrub),
            by="species") %>%
  left_join(try.combined.means %>%
              rename(species=Taxon_name), by="species") %>%
  dplyr::select(-Rank_correct)

save(try.combined.means, CWM, sPlot.traits, file="../_output/Traits_CWMs_sPlot3.RData")

header <- header %>%
  left_join(plot.vegtype %>%
              dplyr::select(PlotObservationID, is.forest, is.non.forest),
            by="PlotObservationID") %>%
  dplyr::select(PlotObservationID:ESY, is.forest:is.non.forest, everything())

save(header, file="../_output/header_sPlot3.0.RData")
cat(readLines("../_derived/Species_missingGF_complete.csv"), sep = '\n')
sessionInfo()