From 8138532765444fa815e6033ab53c68c836d237cd Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Fri, 6 Mar 2020 11:42:53 +0100
Subject: [PATCH] Edited 06 and 07 to exclude plots with pa info from CWM
 calculations

---
 code/06_buildDT.Rmd   | 69 ++++++++++++++++++++++++++++++++----------
 code/07_buildCWMs.Rmd | 70 +++++++++++++++++++++++++++++--------------
 2 files changed, 100 insertions(+), 39 deletions(-)

diff --git a/code/06_buildDT.Rmd b/code/06_buildDT.Rmd
index 9d504c5..599322a 100644
--- a/code/06_buildDT.Rmd
+++ b/code/06_buildDT.Rmd
@@ -81,12 +81,13 @@ knitr::kable(DT0 %>%
                   full_width = F, position = "center")
 ```
 
+
+## Match species names from DT0 to those in Backbone
 Import taxonomic backbone
 ```{r}
 load("../_output/Backbone3.0.RData")
 ```
-
-## Match species names from DT0 to those in Backbone
+Match to DT0, using `Taxonomic concept` as matching key. This is the field that was used to build, and resolve, the Backbone. 
 ```{r}
 DT1 <- DT0 %>% 
   left_join(Backbone %>% 
@@ -147,7 +148,7 @@ knitr::kable(name.check.freq %>% slice(1:40),
 nknown <- DT1 %>% filter(`Taxon group`!="Unknown") %>% nrow()
 nunknown <- DT1 %>% filter(`Taxon group`=="Unknown") %>% nrow()
 ```
-`Taxon group` information is only available for `r nknown` taxa, but absent for `r nunknown`. To improve the completeness of this field, we derive additional info from the `Backbone`, and merge it with the data already present in `DT`.
+`Taxon group` information is only available for `r nknown` entries, but absent for `r nunknown`. To improve the completeness of this field, we derive additional info from the `Backbone`, and merge it with the data already present in `DT`.
 ```{r}
 table(DT1$`Taxon group`, exclude=NULL)
 
@@ -160,7 +161,7 @@ DT1 <- DT1 %>%
 
 table(DT1$`Taxon group`, exclude=NULL)
 ```
-Those taxon for which measures of Basal Area exist, can be safely assumed to belong to vascular plants 
+Those taxa for which a measuress of Basal Area exists can be safely assumed to belong to vascular plants 
 
 ```{r}
 DT1 <- DT1 %>% 
@@ -172,7 +173,7 @@ DT1 <- DT1 %>%
 
 
 
-Cross-complement `Taxon group` information. This means, whenever a taxon is marked to belong to one group, then assign the same taxon to that group throughout the `DT` table.
+Cross-complement `Taxon group` information. This means that, whenever a taxon is marked to belong to one group, then assign the same taxon to that group throughout the `DT` table.
 ```{r}
 DT1 <- DT1 %>% 
   left_join(DT1 %>% 
@@ -244,7 +245,7 @@ table(DT1$`Taxon group`, exclude=NULL)
 ```
 
 
-Delete all records of fungi, and use lists of genera to fix additional problems. While in the previous round the matching was done on the resolve Genus name, here we match based on the unresolved Genus name.
+Delete all records of fungi, and use lists of genera to fix additional problems. While in the previous round the matching was done on the resolved Genus name, here the match is based on the unresolved Genus name.
 ```{r}
 DT1 <- DT1 %>% 
   dplyr::select(-Genus) %>% 
@@ -299,7 +300,7 @@ Species abundance information varies across datasets and plots. While for the la
 \newline \newline
 Still, it's not really intuitive that in case `Cover code` belongs to one of the classes above, then the actual abundance value is stored in the `x_` column. This stems from the way this data is stored in `TURBOVEG`.  
 To make the cover data more user friendly, I simplify the way cover is stored, so that there are only two columns:  
-`Ab_scale` - to report the type of scale used
+`Ab_scale` - to report the type of scale used  
 `Abundance` - to coalesce the cover\\abundance values previously in the columns `Cover %` and `x_`.  
 
 ```{r}
@@ -311,7 +312,7 @@ DT1 <- DT1 %>%
   mutate(Ab_scale = ifelse(Ab_scale =="x", "pa", Ab_scale)) 
 ```
 
-Fix some error. There are some plots where only p\\a information is available (`Cover code`=="x"), but have zeros in the field `Cover %`. Consider this as presence\\absence and transform `Cover %` to 1.  
+Fix some errors. There are some plots where only p\\a information is available (`Cover code`=="x"), but have zeros in the field `Cover %`. Consider this as presence\\absence and transform `Cover %` to 1.  
 ```{r}
 DT1 <- DT1 %>% 
   mutate(`Cover %`=replace(`Cover %`, 
@@ -323,7 +324,8 @@ DT1 <- DT1 %>%
                                    pull(PlotObservationID))), 
                            values=1))
 ```
-There are also some plots having different cover scales in the same layer. They are not many, and I will reduce their cover value to p\\a.
+There are also some plots having different cover scales in the same layer. They are not many, and I will reduce their cover value to p\\a.  
+Find these plots first:
 ```{r}
 mixed <- DT1 %>% 
   distinct(PlotObservationID, Ab_scale, Layer) %>% 
@@ -333,8 +335,9 @@ mixed <- DT1 %>%
   pull(PlotObservationID) %>% 
   unique()
 length(mixed)
-
-#Transform these plots to p\a and create field Ab_scale to summarize abundance info
+```
+Transform these plots to p\\a and correct field `Ab_scale`. Note: the column `Abundance` is only created here.
+```{r}
 DT1 <- DT1 %>% 
   mutate(Ab_scale=replace(Ab_scale, 
                            list=PlotObservationID %in% mixed, 
@@ -344,12 +347,25 @@ DT1 <- DT1 %>%
                           x_, `Cover %`)) %>% 
   mutate(Abundance=replace(Abundance, 
                            list=PlotObservationID %in% mixed, 
-                           values=1)) %>% 
-  
+                           values=1))
+```
 
+Double check and summarize abundance_scales
+```{r}
+scale_check <- DT1 %>% 
+  distinct(PlotObservationID, Layer, Ab_scale) %>% 
+  group_by(PlotObservationID) %>% 
+  summarise(Ab_scale_combined=ifelse(length(unique(Ab_scale))==1, 
+                                     unique(Ab_scale), 
+                                     "Multiple_scales"))
+
+nrow(scale_check)== length(unique(DT2$PlotObservationID))
+table(scale_check$Ab_scale_combined)
 ```
 
-I then transform abundances to relative abundance, on a layer by layer basis. For consistency with the previous version of sPlot, I call the field `Relative cover`
+
+Transform abundances to relative abundance, on a layer by layer basis. For consistency with the previous version of sPlot, this field is called `Relative cover`.  
+*Watch out* - Even plots with p\\a information are transformed to relative cover.  
 ```{r}
 DT1 <- DT1 %>% 
   left_join(x=., 
@@ -358,8 +374,17 @@ DT1 <- DT1 %>%
               summarize(tot.abundance=sum(Abundance)), 
             by=c("PlotObservationID", "Layer")) %>% 
   mutate(Relative.cover=Abundance/tot.abundance)
+
+DT1 %>% 
+  group_by(PlotObservationID, Layer) %>% 
+  summarize(tot.cover=sum(Relative.cover), 
+            num.layers=sum(unique(Layer))) %>% 
+  filter(tot.cover != num.layers)
 ```
 
+
+
+
 ## Clean DT and export
 ```{r}
 DT2 <- DT1 %>% 
@@ -373,10 +398,22 @@ DT2 <- DT1 %>%
 
 The output of the DT table contains `r nrow(DT2)` records, over `r length(unique(DT2$PlotObservationID))` plots. The total number of taxa is `r length(unique(DT2$species_original))` and `r length(unique(DT2$species))`, before and after standardization, respectively. Information on the `Taxon group` is available for `r DT2 %>% filter(taxon_group!="Unknown") %>% distinct(species) %>% nrow()` standardized species.
 
+```{r, echo=F}
+knitr::kable(DT2 %>%
+               filter(PlotObservationID %in% sampled[1:3]), 
+  caption="Example of initial DT table (same 3 randomly selected plots shown above)") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
+                  full_width = F, position = "center")
+```
+***!!! ADD Explanation of fields!!!***
+
+
 ```{r}
 save(DT2, file = "../_output/DT_sPlot3.0.RData")
 ```
 
-***!!! ADD Explanation of fields!!!***
-
+##SessionInfo
+```{r, echo=F}
+sessionInfo()
+```
 
diff --git a/code/07_buildCWMs.Rmd b/code/07_buildCWMs.Rmd
index 2bdad6c..8994032 100644
--- a/code/07_buildCWMs.Rmd
+++ b/code/07_buildCWMs.Rmd
@@ -20,7 +20,6 @@ This reports documents the construction of Community Weighted Means (CWMs) and V
 Functional Traits were received by [Jens Kattge](jkattge@bgc-jena.mpg.de) on Jan 21, 2020. 
 
 ```{r results="hide", message=F, warning=F}
-library(reshape2)
 library(tidyverse)
 library(readr)
 library(data.table)
@@ -38,7 +37,7 @@ load("../_output/Backbone3.0.RData")
 ```
 
 Import TRY data
-```{r}
+```{r, warning=F, message=F}
 try.species <- read_csv("../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/hierarchy.info.csv", 
                          locale = locale(encoding = "latin1")) 
 try.allinfo <- read_csv("../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/traits_x_georef_wide_table.csv", 
@@ -47,7 +46,7 @@ try.allinfo <- read_csv("../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_da
 try.individuals0 <- read_csv("../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/gapfilled_data/mean_gap_filled_back_transformed.csv", 
          locale = locale(encoding = "latin1"))
 
-#Ancillary function to change to lower case
+#Ancillary function to change species names to lower case
 firstup <- function(x) {
   substr(x, 1, 1) <- toupper(substr(x, 1, 1))
   x
@@ -55,14 +54,12 @@ firstup <- function(x) {
 
 try.species <- try.species %>% 
   mutate(Species=tolower(Species)) %>%
-  mutate(Species=firstup(Species)) %>%
+  mutate(Species=firstup(Species)) 
 
 try.allinfo <- try.allinfo %>% 
   mutate(Species=tolower(Species)) %>%
-  mutate(Species=firstup(Species)) %>%
-
+  mutate(Species=firstup(Species)) 
 
-  
 #calculate statistics of data from TRY
 n.indiv <- try.species %>% 
   group_by(Species) %>% 
@@ -94,11 +91,7 @@ sPlot.species <- DT2 %>%
 sPlot.in.TRY <- sPlot.species %>% 
   filter(species %in% (try.species.names %>% 
                                   distinct(Name_short) %>% 
-                                  pull(Name_short)))
-
-##check match when considering taxa resolved ONLY at genus level
-
-
+                                  pull(Name_short))) %>% 
 ```
 Out of the `r nrow(sPlot.species)` standardizes species names in sPlot 3.0, `r nrow(sPlot.in.TRY)` (`r round(nrow(sPlot.in.TRY)/nrow(sPlot.species)*100,1)`%) also occur in TRY 5.0.
 
@@ -141,6 +134,22 @@ knitr::kable(trait.legend,
                   full_width = F, position = "center")
 ```
 
+There are some traits with unexpected negative entries:
+```{r}
+try.species.names %>% 
+    dplyr::select(Species) %>% 
+    bind_cols(try.individuals0 %>% 
+                  rename_at(col.from, .funs=function(x) col.to) %>% 
+                  dplyr::select(-X1)) %>% 
+  gather(variable, value, -Species) %>% 
+  filter(value<0) %>% 
+  group_by(variable) %>% 
+  summarize(n=n())
+```
+I asked Jens Kattget for clarifications.
+
+
+
 ## Calculate species and genus level trait means and sd
 ```{r}
 #create string to rename traits
@@ -186,8 +195,8 @@ try.genus.means <- try.species.names %>%
 ```
 The average number of observations per species and genus is `r round(mean(try.species.means$n),1)` and `r round(mean(try.genus.means$n),1)`, respectively. As many as `r sum(try.species.means$n==1)` species have only one observation (`r sum(try.genus.means$n==1)` at the genus level).  
 
-Match taxa based on species, if available, or Genus (when rank_correct==Genus)
-```{r}
+## Match taxa based on species, if available, or Genus (when rank_correct==Genus)
+```{r, warning=F}
 try.combined.means <- try.genus.means %>% 
   rename(Taxon_name=Genus) %>% 
   mutate(Rank_correct="genus") %>% 
@@ -204,8 +213,9 @@ total.matches <- DT2 %>%
   filter(!is.na(SLA_mean)) %>% 
   nrow()
 ```
+The total number of matched taxa (either at species, or genus level) is `r total.matches`.  
 
-Calculate summary statistics for species- and genus-level mean traits
+### Calculate summary statistics for species- and genus-level mean traits
 ```{r}
 mysummary <- try.combined.means %>% 
                group_by(Rank_correct) %>% 
@@ -247,20 +257,30 @@ combine.cover <- function(x, datatype){
   return(x)
 }
 
-DT2t <- DT2 %>% 
-  ## temporary!
-  filter(PlotObservationID %in% sample(unique(DT2$PlotObservationID), 100, replace=F)) %>% 
+DT2 <- DT2 %>% 
+  #temporary
+  filter(PlotObservationID %in% sample(unique(DT2$PlotObservationID), 1000, replace=F)) %>% 
   group_by(PlotObservationID, species, Rank_correct) %>%
   summarize(Relative.cover=combine.cover(Relative.cover, cover_code)) %>%
   ungroup()
 
 ```
 
-Calculate CWMs and CWV, as well as plot coverage statistics (proportion of total cover for which trait info exist, and proportion of species for which we have trait info)
+Calculate CWMs and CWV, as well as plot coverage statistics (proportion of total cover for which trait info exist, and proportion of species for which we have trait info). To avoid misleading result, CWM is calculated ONLY for plots for which we have some abundance information. All plots where `Ab_scale`=="pa" in **ANY** of the layers are therefore excluded. 
 
 ```{r, cache=T,  cache.lazy=F}
-# Merge species data table with traits
-CWM0 <- DT2t %>%
+# Tag plots where at least one layer only has p\a information 
+any_pa <- DT2 %>% 
+  distinct(PlotObservationID, Ab_scale) %>% 
+  group_by(PlotObservationID) %>% 
+  summarize(any.pa=any(Ab_scale=="pa")) %>% 
+  filter(any.pa==T) %>% 
+  pull(PlotObservationID)
+length(any_pa)
+
+# Exclude plots above and merge species data table with traits
+CWM0 <- DT2 %>%
+  filter(!PlotObservationID %in% any_pa) %>% 
   left_join(try.combined.means %>%
               dplyr::rename(species=Taxon_name) %>% 
               dplyr::select(species, Rank_correct, ends_with("_mean")), 
@@ -326,8 +346,8 @@ CWM <- CWM1 %>%
   left_join(CWM0 %>% 
               group_by(PlotObservationID) %>%
               summarize(sp.richness=n()), by=c("PlotObservationID")) %>%
-  mutate(sp.with.trait.prop=n.sp.with.trait/sp.richness) %>%
-  dplyr::select(PlotObservationID, variable, sp.richness, sp.with.trait.prop, trait.coverage, CWM, CWV) %>% 
+  mutate(prop.sp.with.trait=n.sp.with.trait/sp.richness) %>%
+  dplyr::select(PlotObservationID, variable, sp.richness, prop.sp.with.trait, trait.coverage, CWM, CWV) %>% 
   arrange(PlotObservationID)
 
 ```
@@ -377,6 +397,10 @@ knitr::kable(CWM.summary,
 save(try.combined.means, CWM, file="../_output/Traits_CWMs.RData")
 ```
 
+##SessionInfo
+```{r}
+sessionInfo()
+```
 
  
 
-- 
GitLab