Fixed CWM - matching traits & count of prop sp. w/ traits

fc75685b · Francesco Sabatini · 9f26f171 · fc75685b · fc75685b · fc75685b
Commit fc75685b authored 4 years ago by Francesco Sabatini
--- a/code/06_buildDT.Rmd
+++ b/code/06_buildDT.Rmd
@@ -75,7 +75,7 @@ nplots <- length(unique(DT0$PlotObservationID))
 nspecies <- length(unique(DT0$`Matched concept`))
 # Plots in header but not in DT
 empty.plots <- header %>% 
-  filter(!PlotObservationID %in% unique(DT2$PlotObservationID)) %>% 
+  filter(!PlotObservationID %in% unique(DT0$PlotObservationID)) %>% 
  pull(PlotObservationID)
 ```

@@ -378,8 +378,8 @@ mixed <- DT1 %>%
  group_by(PlotObservationID, Layer) %>% 
  summarize(n=n()) %>% 
  filter(n>1) %>% 
-  pull(PlotObservationID) %>% 
-  unique()
+  distinct(PlotObservationID) %>% 
+  pull(PlotObservationID) 
 length(mixed)
 ```
 Transform these plots to p\\a and correct field `Ab_scale`. Note: the column `Abundance` is only created here.
@@ -427,8 +427,12 @@ DT1 <- DT1 %>%
              summarize(tot.abundance=sum(Abundance)), 
            by=c("PlotObservationID")) %>% 
  mutate(Relative.cover=Abundance/tot.abundance)
+```
+

-# check: there should be no plot where the sum of all relative covers !=0
+```{r echo=F, eval=F}
+# Deprecated - Rel.cover is returned at the level of plot, not layer
+# check: there should be no plot where the sum of all relative covers != number of layers
 DT1 %>% 
  group_by(PlotObservationID) %>% 
  summarize(tot.cover=sum(Relative.cover), 

--- a/code/07_buildCWMs.Rmd
+++ b/code/07_buildCWMs.Rmd
@@ -12,12 +12,13 @@ output: html_document

 **Timestamp:** `r date()`  
 **Drafted:** Francesco Maria Sabatini  
-**Revised:**  
-**version:** 1.0
+**Revised:** Helge Bruelheide  
+**version:** 1.2  
  
 This report documents 1) the construction of Community Weighted Means (CWMs) and Variance (CWVs); and 2) the classification of plots into forest\\non-forest based on species growth forms. It complements species composition data from sPlot 3.0 and gap-filled plant functional traits from TRY 5.0, as received by [Jens Kattge](jkattge@bgc-jena.mpg.de) on Jan 21, 2020. 

 *Changes in version 1.1* - Standardized Growth form names in sPlot_traits.  
+*Changes in version 1.2* - Improved matching of species to traits; accounted for non-standardized species in CWM completeness.  

 ```{r results="hide", message=F, warning=F}
 library(tidyverse)
@@ -28,6 +29,9 @@ library(kableExtra)
 library(stringr)
 library(caret)
 library(viridis)
+#save temporary files
+write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron'))
+write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron'))
 ```

 # 1 Data import, preparation and cleaning 
@@ -97,7 +101,7 @@ sPlot.in.TRY <- sPlot.species %>%
                                  distinct(Name_short) %>% 
                                  pull(Name_short))) 
 ```
-Out of the `r nrow(sPlot.species)` standardizes species names in sPlot 3.0, `r nrow(sPlot.in.TRY)` (`r round(nrow(sPlot.in.TRY)/nrow(sPlot.species)*100,1)`%) also occur in TRY 5.0. This number does not account for matches at the genus level. 
+Out of the `r nrow(sPlot.species)` standardized species names in sPlot 3.0, `r nrow(sPlot.in.TRY)` (`r round(nrow(sPlot.in.TRY)/nrow(sPlot.species)*100,1)`%) also occur in TRY 5.0. This number does not account for matches at the genus level. 



@@ -250,6 +254,10 @@ try.combined.means <- try.genus.means %>%

 total.matches <- DT2 %>%
  distinct(Species, Rank_correct) %>%
+  mutate(Rank_correct=fct_recode(Rank_correct, 
+  #many taxa reported as matched at higher rank level or lower, were nevertheless resolved at species level
+                                   "species"="higher", 
+                                   "species"="lower")) %>% #added in Version1.2
  left_join(try.combined.means %>%
              dplyr::rename(Species=Taxon_name), 
            by=c("Species", "Rank_correct")) %>% 
@@ -302,7 +310,7 @@ combine.cover <- function(x){
 }

 DT2.comb <- DT2 %>% 
-  group_by(PlotObservationID, Species, Rank_correct) %>% 
+  group_by(PlotObservationID, Species,Species_original, Rank_correct) %>% 
  summarize(Relative_cover=combine.cover(Relative_cover)) %>%
  ungroup() %>% 
  # re-normalize to 100%
@@ -331,6 +339,9 @@ length(any_pa)
 # Exclude plots above and merge species data table with traits
 CWM0 <- DT2.comb %>%
  filter(!PlotObservationID %in% any_pa) %>% 
+  mutate(Rank_correct=fct_recode(Rank_correct, 
+                                 "species"="higher", 
+                                 "species"="lower")) %>% #added in Version1.2
  left_join(try.combined.means %>%
              dplyr::rename(Species=Taxon_name) %>% 
              dplyr::select(Species, Rank_correct, ends_with("_mean")), 
@@ -340,25 +351,12 @@ CWM0 <- DT2.comb %>%

 ```{r, cache=T,  cache.lazy=F, warning=F}
 # Calculate CWM for each trait in each plot
-CWM1 <- CWM0 %>%
+CWM1 <- CWM0 %>% 
  group_by(PlotObservationID) %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~weighted.mean(., Relative_cover, na.rm=T))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
-  gather(key=variable, value=CWM, -PlotObservationID)
-```
-
-
-```{r, cache=T,  cache.lazy=F, warning=F}
-# Calculate coverage for each trait in each plot
-CWM2 <- CWM0 %>%
-  mutate_at(.funs = list(~if_else(is.na(.),0,1) * Relative_cover), 
-            .vars = vars(StemDens_mean:LeafWaterCont_mean)) %>%
-  group_by(PlotObservationID) %>%
-  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
-               .funs = list(~sum(., na.rm=T))) %>%
-  dplyr::select(PlotObservationID, order(colnames(.))) %>%
-  gather(key=variable, value=trait.coverage, -PlotObservationID)
+  pivot_longer(-PlotObservationID, names_to="variable", values_to = "CWM")
 ```


@@ -385,12 +383,35 @@ CWM3 <- CWM0 %>%
  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
               .funs = list(~variance2.fun(., Relative_cover))) %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
-  gather(key=variable, value=CWV, -PlotObservationID)
+  pivot_longer(-PlotObservationID, names_to="variable", values_to = "CWV")
 ```

+```{r, warning=F}
+# Calculate coverage for each trait in each plot
+# changed in Version 1.2
+CWM2 <- CWM0 %>%
+  mutate(StemDens_mean=if_else(is.na(StemDens_mean),0,1) * Relative_cover) %>% 
+  group_by(PlotObservationID) %>%
+  summarize(trait.coverage=sum(StemDens_mean, na.rm=T))
+```
+
+
+```{r deprecated2, echo=F, eval=F}
+### DEPRECATED 
+### Unnecessarily slow: since the trait data are gap-filled,
+### it's sufficient to calculate the proportion of species with traits
+### on just one trait
+# Calculate coverage for each trait in each plot
+CWM2 <- CWM0 %>%
+  mutate_at(.funs = list(~if_else(is.na(.),0,1) * Relative_cover), 
+            .vars = vars(StemDens_mean:LeafWaterCont_mean)) %>%
+  group_by(PlotObservationID) %>%
+  summarize_at(.vars= vars(StemDens_mean:LeafWaterCont_mean),
+               .funs = list(~sum(., na.rm=T))) %>%
+  dplyr::select(PlotObservationID, order(colnames(.))) %>%
+  gather(key=variable, value=trait.coverage, -PlotObservationID)
+

-```{r, cache=T,  cache.lazy=F, warning=F}
-## Calculate proportion of species having traits
 CWM4 <- CWM0 %>%
  group_by(PlotObservationID) %>%
  #distinct(PlotObservationID, species, .keep_all = T) %>% 
@@ -399,11 +420,20 @@ CWM4 <- CWM0 %>%
  dplyr::select(PlotObservationID, order(colnames(.))) %>%
  gather(key=variable, value=n.sp.with.trait, -PlotObservationID)

+```
+
+```{r, cache=T,  cache.lazy=F, warning=F}
+## Calculate proportion of species having traits #changes in version 1.2
+CWM4 <- CWM0 %>%
+  group_by(PlotObservationID) %>%
+  summarize(n.sp.with.trait=sum(!is.na(StemDens_mean))) 
+
+
 # Join together
 CWM <- CWM1 %>%
-  left_join(CWM2, by=c("PlotObservationID", "variable")) %>%
+  left_join(CWM2, by=c("PlotObservationID")) %>%
  left_join(CWM3, by=c("PlotObservationID", "variable")) %>%
-  left_join(CWM4, by=c("PlotObservationID", "variable")) %>%
+  left_join(CWM4, by=c("PlotObservationID")) %>%
  left_join(CWM0 %>% 
              group_by(PlotObservationID) %>%
              summarize(sp.richness=n()), by=c("PlotObservationID")) %>%
@@ -414,6 +444,7 @@ CWM <- CWM1 %>%
 ```

 ```{r, echo=F}
+load("../_output/header_sPlot3.0.RData")
 # align to header (to avoid recalculating cached CWMs)
 CWM <- CWM %>% 
  filter(PlotObservationID %in% header$PlotObservationID)
@@ -457,11 +488,10 @@ CWM.coverage <- CWM %>%
                           max=~max(., na.rm=T), 
                           mean=~mean(., na.rm=T), 
                           sd=~sd(., na.rm=T))) %>% 
-  gather(key=variable, value=value) %>% 
+  pivot_longer(cols = 1:ncol(.), names_to="variable", values_to="value") %>% 
  separate(variable, sep="_", into=c("metric", "stat")) %>% 
  mutate(stat=factor(stat, levels=c("num.0s", "min", "q025", "q50", "q75", "max", "mean", "sd"))) %>% 
-  spread(key=stat, value=value) 
-
+  pivot_wider(names_from = "stat")
 ```

 ```{r, echo=F}
@@ -569,12 +599,12 @@ After manual completion, the number of records without growth form information d
 All public data on growth form downloaded. First take care of unmatched quotation marks in the txt file. Do this from command line.
 ```{bash, eval=F}
 # escape all unmatched quotation marks. Run in Linux terminal
-#sed 's/"/\\"/g' 8854.txt > 8854_test.csv
-#sed "s/'/\\'/g" 8854.txt > 8854_test.csv
+#sed 's/"/\\"/g' 8854.txt > 8854_test.txt
+#sed "s/'/\\'/g" 8854_test.txt > 8854_test2.txt
 ```
 Information on growth form is not organized and has a myriad of levels. Extract and simplify to the set of few types used so far. In case a species is attributed to multiple growth forms use a majority vote. 
 ```{r, message=F, warning=F}
-all.gf0 <- read_delim("../_input/TRY5.0_v1.1/8854_test.txt", delim="\t") 
+all.gf0 <- read_delim("../_input/TRY5.0_v1.1/8854_test2.txt", delim="\t") 

 all.gf <- all.gf0 %>% 
  filter(TraitID==42) %>% 
@@ -596,7 +626,7 @@ all.gf <- all.gf0 %>%
                                       list=str_detect(GrowthForm0, "shrub|scrub|bamboo"), "shrub")) %>%
  mutate(GrowthForm_simplified=replace(GrowthForm_simplified, 
                                       list=str_detect(GrowthForm0,
-                                      "herb|sedge|graminoid|fern|forb|herbaceous|grass|chaemaephyte|geophyte|annual"),
+                                                       "herb|sedge|graminoid|fern|forb|herbaceous|grass|chaemaephyte|geophyte|annual"),
                                       "herb")) %>%
  mutate(GrowthForm_simplified=ifelse(GrowthForm_simplified %in% c("other", "herb", "shrub", "tree"), 
                                      GrowthForm_simplified, NA)) %>% 
@@ -869,6 +899,9 @@ header.vegtype <- header %>%
  left_join(plot.vegtype %>% 
              dplyr::select(PlotObservationID, is.forest, is.non.forest),
            by="PlotObservationID")
+```
+
+```{r}
 #check 
 nrow(header.vegtype)==nrow(header)
 ```

--- a/code/08_Documentation.Rmd
+++ b/code/08_Documentation.Rmd
@@ -134,8 +134,8 @@ sPlot 3.0 contains `r nrow(header)` Plots.
 6. `GUID`: GUID identifier, as created in Turboveg 2.  
 <br>
 **Geographic location**  
-7. `Longitude`: Degrees, WGS85.  
-8. `Latitude`: Degrees, WGS85.  
+7. `Longitude`: Degrees, WGS84.  
+8. `Latitude`: Degrees, WGS84.  
 9. `Location uncertainty (m)`: Uncertainty of geographic coordinates. Negative if estimated, instead of measured. 
 10. `Country`: Original country name in Turboveg 3.  
 11. `CONTINENT`: Continent.  

--- a/public/06_buildDT.html
+++ b/public/06_buildDT.html
--- a/public/07_buildCWMs.html
+++ b/public/07_buildCWMs.html