From 1cde2861c60f2c0a999d34f5f4390e615a8a0537 Mon Sep 17 00:00:00 2001
From: Francesco Sabatini <francesco.sabatini@idiv.de>
Date: Fri, 13 Mar 2020 14:56:40 +0100
Subject: [PATCH] Cross-checked and harmonized output

---
 code/04_buildHeader.Rmd |  24 +++++----
 code/06_buildDT.Rmd     |  20 +++++---
 code/07_buildCWMs.Rmd   | 105 ++++++++++++++++------------------------
 3 files changed, 68 insertions(+), 81 deletions(-)

diff --git a/code/04_buildHeader.Rmd b/code/04_buildHeader.Rmd
index 60122fb..ef14da1 100644
--- a/code/04_buildHeader.Rmd
+++ b/code/04_buildHeader.Rmd
@@ -625,14 +625,18 @@ Reimport output, attach to header and check
 elevation.out <- read_csv("../_derived/elevatr/elevation.out.csv")
 ```
 
-
 ```{r message=F, echo=F}
-knitr::kable(head(elevation.out,10), 
-  caption="Example of elevation output (only first 10 shown)") %>%
+knitr::kable(elevation.out %>% sample_n(10), 
+  caption="Example of elevation output (10 randomly selected plots shown)") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                   full_width = F, position = "center")
 ```
 
+```{r}
+summary(elevation.out %>% 
+          dplyr::select(-PlotObservationID, -elevation))
+
+```
 There are `r sum(is.na(elevation.out$Elevation_median))` plots without elevation info, corresponding to `r round(sum(is.na(elevation.out$Elevation_median))/nrow(header)*100,1)`% of total.  
 There are `r sum(elevation.out$Elevation_median < -1, na.rm=T)` plots with elevation below sea level.  
 \newline
@@ -641,7 +645,8 @@ Join elevation data (only median)
 header <- header %>% 
   left_join(elevation.out %>% 
               dplyr::select(PlotObservationID, Elevation_median) %>% 
-              rename(elevation_dem=Elevation_median), 
+              rename(elevation_dem=Elevation_median) %>% 
+              distinct(PlotObservationID, .keep_all=T), 
             by="PlotObservationID")
 
 ```
@@ -654,17 +659,13 @@ knitr::kable(header %>%
                summarize(num.plot=n()) %>% 
                ungroup() %>% 
                arrange(desc(num.plot)) %>% 
-               slice(1:10), 
-  caption="Dataset with highest number of plots below sea level") %>%
+               sample_n(10), 
+  caption="Dataset with highest number of plots below sea level (10 randomly selected plots shown)") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                   full_width = F, position = "center")
 ```
 
-```{r}
-summary(elevation.out %>% 
-          dplyr::select(-PlotObservationID, -elevation))
 
-```
 
 Create Scatterplot between measured elevation in the field, and elevation derived from DEM  
 ```{r scatterplot, cache=T}
@@ -768,6 +769,9 @@ ggsave(filename="../_pics/PlotDistrib_Dark2_shuffle1984.png", width = 15, height
 ## 6 Fix output and export
 
 ```{r}
+#check 
+nrow(header)==nrow(header0)
+
 header <- header %>% 
   dplyr::select(
     #Metadata
diff --git a/code/06_buildDT.Rmd b/code/06_buildDT.Rmd
index 32501c1..d8eeab1 100644
--- a/code/06_buildDT.Rmd
+++ b/code/06_buildDT.Rmd
@@ -318,17 +318,21 @@ DT1 <- DT1 %>%
                            "CoverPerc"))  
 ```
 
-Fix some errors. There are some plots where only p\\a information is available (`Cover code=="x"`), but have all zeros in the field `Cover %`. Consider this as presence\\absence and transform `Cover %` to 1.  
+Fix some errors. There are some plots where all species have zeros in the field `Cover %`. Some of them are marked as p\\a (`Cover code=="x"`), but other not. Consider all this plots as presence\\absence and transform `Cover %` to 1.  
+!! There are some other plots having layers with all zeros. This should be double-checked, but are not being transformed here !!
 ```{r}
+allzeroes <- DT1 %>% 
+  group_by(PlotObservationID) %>% 
+  summarize(allzero=all(`Cover %`==0) ) %>% 
+  filter(allzero==T) %>% 
+  pull(PlotObservationID)
 DT1 <- DT1 %>% 
   mutate(`Cover %`=replace(`Cover %`, 
-                           list=(PlotObservationID %in% (DT1 %>% 
-                                   group_by(PlotObservationID) %>% 
-                                   mutate(check= (`Cover %`==0 & `Cover code`=="x")) %>% 
-                                   summarize(allzero=mean(check)==1) %>% 
-                                   filter(allzero==T) %>% 
-                                   pull(PlotObservationID))), 
-                           values=1))
+                           list=(PlotObservationID %in% allzeroes), 
+                           values=1)) %>% 
+  mutate(`Cover code`=replace(`Cover code`, 
+                           list=(PlotObservationID %in% allzeroes), 
+                           values="x"))
 ```
 Consider all plot-layer combinations where `Cover code=="x"`, and all the entries of the field `Cover % == 1` as presence\\absence data, and transform `Ab_scale` to "pa". This is done to avoid confusion with plots where `Cover code=="x"` but "x" has to be intended as a class in the cover scale used. For p\\a plots, replace the field `Cover %` with NA, and assign the value 1 to the field `x_`.  
 ```{r}
diff --git a/code/07_buildCWMs.Rmd b/code/07_buildCWMs.Rmd
index 08223de..46718ee 100644
--- a/code/07_buildCWMs.Rmd
+++ b/code/07_buildCWMs.Rmd
@@ -41,7 +41,7 @@ Import TRY data
 try.species <- read_csv(
   "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/hierarchy.info.csv",
   locale = locale(encoding = "latin1")) 
-# Original data without gap-filling. With species and trait lables
+# Original data without gap-filling. With species and trait labels
 try.allinfo <- read_csv(
   "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/input_data/traits_x_georef_wide_table.csv", 
   locale = locale(encoding = "latin1"), 
@@ -50,12 +50,6 @@ try.allinfo <- read_csv(
 try.individuals0 <- read_csv(
   "../_input/TRY5.0_v1.1/TRY_5_GapFilledData_2020/gapfilled_data/mean_gap_filled_back_transformed.csv", 
   locale = locale(encoding = "latin1"))
-
-#calculate statistics of data from TRY
-n.indiv <- try.species %>% 
-  group_by(Species) %>% 
-  summarize(n=n()) %>% 
-  pull(n)
 ```
 There are `r nrow(try.species)` individual observations from `r nrow(try.species %>% distinct(Species))` distinct (unresolved) species in `r nrow(try.species %>% distinct(Genus))` distinct (unresolved) genera. 
 
@@ -141,7 +135,7 @@ trait.legend <- data.frame(full=try.allinfo %>%
 
 ```{r, echo=F}
 knitr::kable(trait.legend, 
-  caption="Legend of trait from TRY") %>%
+  caption="Legend of traits from TRY") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                   full_width = F, position = "center")
 ```
@@ -425,12 +419,13 @@ ggplot(data=CWM %>%
   ylab("Trait coverage (Proportion of species)") + 
   coord_equal()
 ```
-
+Calculate summary statistics for trait coverage in plots
 ```{r, warning=F}
 CWM.coverage <- CWM %>% 
   filter(variable=="LDMC_mean") %>% 
   summarize_at(.vars=vars(trait.coverage, prop.sp.with.trait),
-                .funs=list(min=~min(., na.rm=T),
+                .funs=list(num.0s=~sum(.==0),
+                           min=~min(., na.rm=T),
                            q025=~quantile(., 0.25, na.rm=T), 
                            q50=~quantile(., 0.50, na.rm=T), 
                            q75=~quantile(., .75, na.rm=T), 
@@ -439,7 +434,7 @@ CWM.coverage <- CWM %>%
                            sd=~sd(., na.rm=T))) %>% 
   gather(key=variable, value=value) %>% 
   separate(variable, sep="_", into=c("metric", "stat")) %>% 
-  mutate(stat=factor(stat, levels=c("min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
+  mutate(stat=factor(stat, levels=c("num.0s", "min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
   spread(key=stat, value=value) 
 
 ```
@@ -457,8 +452,7 @@ CWM.summary <- CWM %>%
   rename(myvar=variable) %>% 
   group_by(myvar) %>% 
   summarize_at(.vars=vars(CWM:CWV),
-                .funs=list(num.NAs=~sum(is.na(.)), 
-                           min=~min(., na.rm=T),
+                .funs=list(min=~min(., na.rm=T),
                            q025=~quantile(., 0.25, na.rm=T), 
                            q50=~quantile(., 0.50, na.rm=T), 
                            q75=~quantile(., .75, na.rm=T), 
@@ -467,7 +461,7 @@ CWM.summary <- CWM %>%
                            sd=~sd(., na.rm=T))) %>% 
   gather(key=variable, value=value, -myvar) %>% 
   separate(variable, sep="_", into=c("metric", "stat")) %>% 
-  mutate(stat=factor(stat, levels=c("num.NAs", "min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
+  mutate(stat=factor(stat, levels=c("min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
   spread(key=stat, value=value) %>% 
   arrange(metric, myvar)
   
@@ -480,34 +474,6 @@ knitr::kable(CWM.summary,
                   full_width = F, position = "center")
 ```
 
-
-Calculate summary statistics for trait coverage in plots
-```{r}
-coverage.summary <- CWM %>% 
-  separate(variable, sep="_", into=c("metric", "stat")) %>% 
-  filter(metric=="SLA") %>% 
-  summarize_at(.vars=vars(prop.sp.with.trait:trait.coverage),
-                .funs=list(num.NAs=~sum(is.na(.)), 
-                           min=~min(., na.rm=T),
-                           q025=~quantile(., 0.25, na.rm=T), 
-                           q50=~quantile(., 0.50, na.rm=T), 
-                           q75=~quantile(., .75, na.rm=T), 
-                           max=~max(., na.rm=T), 
-                           mean=~mean(., na.rm=T), 
-                           sd=~sd(., na.rm=T))) %>% 
-  gather(variable, value) %>% 
-  separate(variable, sep="_", into=c("metric", "stat")) %>% 
-  mutate(stat=factor(stat, levels=c("num.NAs", "min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
-  spread(key=stat, value=value)
-```
-
-```{r, echo=F}
-knitr::kable(coverage.summary, 
-  caption="Summary of CWMs and CWVs across all plots", digits = 3) %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
-                  full_width = F, position = "center")
-```
-
 ### 2.2 Export CWM and species mean trait values
 ```{r}
 save(try.combined.means, CWM, file="../_output/Traits_CWMs_sPlot3.RData")
@@ -532,7 +498,7 @@ We used different sources of information:
 4) Cross-match with species assigned to tree layer in DT table.
 \newline\newline
 
-Step 1: Derive growth form trait information to DT table. Growth form information derives from TRY
+**Step 1**: Attach growth form trait information to DT table. Growth form information derives from TRY
 ```{r}
 DT.gf <- DT2 %>% 
   filter(taxon_group=="Vascular plant") %>% 
@@ -550,7 +516,7 @@ DT.gf <- DT2 %>%
 sum(is.na(DT.gf$GrowthForm))
 ```
 
-Step 2: Select most common species without growth-trait information to export and check manually
+**Step 2**: Select most common species without growth-trait information to export and check manually
 ```{r}
 top.gf.nas <- DT.gf %>% 
   filter(is.na(GrowthForm)) %>% 
@@ -558,7 +524,7 @@ top.gf.nas <- DT.gf %>%
   summarize(n=n()) %>% 
   arrange(desc(n))
 ```
-```{r, eval=F}
+```{r, eval=F, message=F}
 write_csv(top.gf.nas %>% 
             filter(n>1000), 
   path="../_derived/Species_missingGF.csv")
@@ -566,6 +532,7 @@ write_csv(top.gf.nas %>%
 The first `r nrow(top.gf.nas)` species account for `r round(sum(top.gf.nas %>% filter(n>1000) %>% pull(n))/sum(top.gf.nas$n)*100,2)`% of the missing records. Assign growth forms manually, reimport and coalesce into `DT.gf`
 
 ```{r}
+# Import manually classified species - this info is also reported in Appendix 1
 gf.manual <- read_csv("../_derived/Species_missingGF_complete.csv")
 
 DT.gf <- DT.gf %>% 
@@ -578,7 +545,7 @@ DT.gf <- DT.gf %>%
 After manual completion, the number of records without growth form information decresead to `r sum(is.na(DT.gf$GrowthForm))`.  
 \newline\newline
 
-Step 3: Import additional data on growth-form from TRY (Accessed 10 March 2020).  
+**Step 3**: Import additional data on growth-form from TRY (Accessed 10 March 2020).  
 All public data on growth form downloaded. First take care of unmatched quotation marks in the txt file. Do this from command line.
 ```{bash, eval=F}
 # escape all unmatched quotation marks. Run in Linux terminal
@@ -586,7 +553,7 @@ All public data on growth form downloaded. First take care of unmatched quotatio
 #sed "s/'/\\'/g" 8854.txt > 8854_test.csv
 ```
 Information on growth form is not organized and has a myriad of levels. Extract and simplify to the set of few types used so far. In case a species is attributed to multiple growth forms use a majority vote. 
-```{r, message=F}
+```{r, message=F, warning=F}
 all.gf0 <- read_delim("../_input/TRY5.0_v1.1/8854_test.txt", delim="\t") 
 
 all.gf <- all.gf0 %>% 
@@ -603,23 +570,24 @@ all.gf <- all.gf0 %>%
                                        "other")) %>%
   mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                        list=str_detect(GrowthForm0, 
-                                                       "tree|conifer|^woody$|palmoid|mangrove|gymnosperm"), "tree")) %>% 
+                                                       "tree|conifer|^woody$|palmoid|mangrove|gymnosperm"), 
+                                       "tree")) %>% 
   mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                        list=str_detect(GrowthForm0, "shrub|scrub|bamboo"), "shrub")) %>%
   mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                        list=str_detect(GrowthForm0,
-                                                       "herb|sedge|graminoid|fern|forb|herbaceous|grass|chaemaephyte|geophyte|annual"),
+                                      "herb|sedge|graminoid|fern|forb|herbaceous|grass|chaemaephyte|geophyte|annual"),
                                        "herb")) %>%
   mutate(GrowthForm_simplified=ifelse(GrowthForm_simplified %in% c("other", "herb", "shrub", "tree"), 
                                       GrowthForm_simplified, NA)) %>% 
   filter(!is.na(GrowthForm_simplified)) 
+
 #Some species have multiple attributions - use a majority vote. NA if ties
 get.mode <- function(x){
   if(length(unique(x))==1){
     return(as.character(unique(x)))} else{
     tmp <- sort(table(x), decreasing=T)
     if(tmp[1]!=tmp[2]){return(names(tmp)[1])} else {
-      #return(paste0(names(tmp)[1:2], collapse="/"))}
     return("Unknown")}
     }
   }
@@ -640,7 +608,7 @@ DT.gf <- DT.gf %>%
   dplyr::select(-GrowthForm_simplified)
 ```
 
-Step 4: Cross-match. Assign all species occurring in at least one relevé in the tree layer as tree. Conservatively, do this only when the record is at species level (exclude records at genus\\family level)
+**Step 4**: Cross-match. Assign all species occurring in at least one relevé in the tree layer as tree. Conservatively, do this only when the record is at species level (exclude records at genus\\family level)
 ```{r}
 other.trees <- DT.gf %>% 
   filter(Layer==1 & is.na(GrowthForm)) %>% 
@@ -672,6 +640,9 @@ Define a species as `is.not.tree.or.shrub.and.small` when it has a height <10, a
 ```{r}
 GF <- DT.gf %>% 
   distinct(species, GrowthForm, PlantHeight_mean) %>% 
+  mutate(GrowthForm=fct_collape(GrowthForm, 
+                                 "herb/shrub"=c("herb\\shrub","herb/shrub"), 
+                                 "shrub/tree"=c("shrub/tree", "shrub\\tree"))) %>% 
   ## define is.tree.or.tall
   mutate(is.tree.or.tall.shrub=NA) %>% 
   mutate(is.tree.or.tall.shrub=replace(is.tree.or.tall.shrub, 
@@ -706,7 +677,7 @@ table(GF$GrowthForm, GF$is.tree.or.tall.shrub, exclude=NULL)
 GF %>% 
   filter(is.tree.or.tall.shrub & GrowthForm=="herb")
 ```
-
+These are Bamboo species and their hiehgts seems reasonable.  
 
 ## 3.2 Classify plots into forest\\non-forest
 Define a plot as forest if:  
@@ -715,11 +686,13 @@ Define a plot as forest if:
 3) Has a total cover of tree or tall shrub species >=25% (from DT + TRY)  
 4) Has data on Basal area summing to 10 m2/ha  
 \newline\newline
+
 The first three criteria are declined to define non forest as follows:  
 1) Info on total cover of the tree layer is available and <25%  
 2) Info on total cover in Layer 1 is available and <25%  
 3) The **relative** cover of non tree species is >75%  
 \newline\newline
+
 Criteria 2 and 3 only apply to plots having cover data in percentage.  
 Reimport header file
 ```{r}
@@ -732,17 +705,17 @@ header <- header %>%
     dplyr::select(PlotObservationID:ESY, `Cover total (%)`:`Maximum height cryptogams (mm)`)
 ```
 
-
+**Criterium 1**
 ```{r}
-# Criterium 1
 plot.vegtype1 <- header %>% 
   dplyr::select(PlotObservationID, `Cover tree layer (%)`) %>% 
   rename(Cover_trees=`Cover tree layer (%)`) %>% 
   mutate(is.forest=Cover_trees>=25) 
 
 table(plot.vegtype1 %>% dplyr::select(is.forest), exclude=NULL)
-
-# Criterium 2
+```
+**Criterium 2**
+```{r}
 # Select only plots having cover data in percentage
 mysel <- (DT.gf %>% 
             distinct(PlotObservationID, Ab_scale) %>% 
@@ -766,7 +739,9 @@ plot.vegtype2 <- DT.gf %>%
 
 table(plot.vegtype1 %>% dplyr::select(is.forest), exclude=NULL)
 
-# Criterium 3
+```
+**Criterium 3**
+```{r}
 plot.vegtype3 <- DT.gf %>% 
   #filter plots where all records are recorded as percentage cover
   filter(PlotObservationID %in% mysel ) %>% 
@@ -787,8 +762,9 @@ plot.vegtype3 <- DT.gf %>%
   mutate(is.non.forest=cover_tree<25 & (cover_non_tree/tot.cover)>.75)
 
 table(plot.vegtype3 %>% dplyr::select(is.forest, is.non.forest), exclude=NULL)
-
-## Criterium 4
+```
+**Criterium 4**
+```{r}
 plot.vegtype4 <-  DT.gf %>% 
   filter(Ab_scale=="x_BA") %>% 
   group_by(PlotObservationID) %>% 
@@ -798,7 +774,7 @@ plot.vegtype4 <-  DT.gf %>%
 table(plot.vegtype4 %>% dplyr::select(is.forest), exclude=NULL)
 ```
 
-Combine classifications from the three criteria. Use majority vote to assign plots. In case of ties, a progressively lower priority is given from criterium 1 to criterim 4. 
+Combine classifications from the three criteria. Use majority vote to assign plots. In case of ties, a progressively lower priority is given from criterium 1 to criterium 4. 
 ```{r}
 plot.vegtype <- header %>% 
   dplyr::select(PlotObservationID) %>% 
@@ -825,7 +801,7 @@ plot.vegtype <- header %>%
   mutate(mean.non.forest2=coalesce( (!is.forest.x), (!is.forest.y), is.non.forest.x.x, (!is.forest.y.y))) %>% 
   mutate(is.non.forest=ifelse(mean.non.forest==0.5, mean.non.forest2, mean.non.forest>0.5)) %>% 
   # when both is.forest & is.non.forest are F transform to NA
-  mutate(both.F=ifelse( (is.forest==F & is.non.forest==F), T, F)) %>% 
+  mutate(both.F=ifelse( ( (is.forest==F | is.na(is.forest)) & is.non.forest==F), T, F)) %>% 
   mutate(is.forest=replace(is.forest, list=both.F==T, values=NA)) %>% 
   mutate(is.non.forest=replace(is.non.forest, list=both.F==T, values=NA))
 
@@ -873,6 +849,8 @@ header.vegtype <- header %>%
   left_join(plot.vegtype %>% 
               dplyr::select(PlotObservationID, is.forest, is.non.forest),
             by="PlotObservationID")
+#check 
+nrow(header.vegtype)==nrow(header)
 ```
 
 Through the process described above, we managed to classify `r plot.vegtype %>% filter(is.forest==T | is.non.forest==T) %>% nrow()`, of which `r plot.vegtype %>% filter(is.forest==T) %>% nrow()` is forest and `r plot.vegtype %>% filter(is.non.forest==T) %>% nrow()` is non-forest.  
@@ -903,8 +881,9 @@ save(header, file="../_output/header_sPlot3.0.RData")
 
 
 
-# Appendix
-## Growth forms of most common species
+# APPENDIX
+## Appendix 1 - Growth forms of most common species
+As assigned manually.
 ```{r comment=''}
 cat(readLines("../_derived/Species_missingGF_complete.csv"), sep = '\n')
 ```
-- 
GitLab