diff --git a/01_Extract_data_Project_31.Rmd b/01_Extract_data_Project_31.Rmd
index 47585766de068a1d38579b016edfa2ca71c04250..de59e4fb2735f66450fd26b0cc9ca84514054ac2 100755
--- a/01_Extract_data_Project_31.Rmd
+++ b/01_Extract_data_Project_31.Rmd
@@ -1,8 +1,8 @@
 ---
 title: 'Project #31 - Data Extraction'
 output:
-html_document: default
-word_document: default
+  html_document: default
+  toc: TRUE
 always_allow_html: yes
 ---
 
@@ -19,33 +19,26 @@ always_allow_html: yes
 **Version:** 1.0 
   
 
-This report documents the data extraction for **sPlot project proposal #31** - *Global variation in fine root traits along climate and soil gradients* as requested by Daniel Laughlin and Alexandra Weigelt.   
+This report documents the data extraction for **sPlot project proposal #31** - *The adaptive value of xylem physiology within and across global ecoregions* as requested by Daniel Laughlin and Jesse Robert Fleri 
+
+
 
 
-This R script:  
-1) Extracts plots from sPlot based on species with root traits,  
-2) Calculates CWM for plots with >80% trait coverage,  
-3) Extracts climate and soils data,  
-4) Classifies plots into broad vegetation types based on species' affinities,  
-5) Shows the distribution of these root traits across vegetation types (forest, grassland, etc)  
-6) Returns scatterplots of root traits along climate and soil variables.  
-~~7) Performs a PCA of CWMs~~  
-  
 ```{r results="hide", message=F}
-library(reshape2)
 library(tidyverse)
-library(dplyr)
-library(data.table)
+
 library(knitr)
 library(kableExtra)
 library(viridis)
 library(grid)
 library(gridExtra)
-library(vegan)
-library(xlsx)
-library(caret)
-library(foreign)
-library(raster)
+
+#library(vegan)
+#library(xlsx)
+#library(caret)
+#library(foreign)
+#library(raster)
+
 library(downloader)
 library(sp)
 library(sf)
@@ -55,102 +48,135 @@ library(rgeos)
 #save temporary files
 write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron'))
 write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron'))
-rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")
+#rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")
 ```
 
-
-
-```{r, cache=T}
+Import data from sPlot 3.0
+```{r}
 #Import sPlot data
-load("/data/sPlot/releases/sPlot2.1/DT2_20161025.RData")
-load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData")
+load("/data/sPlot/releases/sPlot3.0/header_sPlot3.0.RData")
+load("/data/sPlot/releases/sPlot3.0/DT_sPlot3.0.RData")
+load("/data/sPlot/releases/sPlot3.0/Traits_CWMs_sPlot3.RData")
+load("/data/sPlot/releases/sPlot3.0/SoilClim_sPlot3.RData")
+```
+Import data on xylem traits, provided by Jesse Robert Fleri on October 26th, 2020.
+```{r}
+load("xylem_data.RData")
 ```
 
-```{r, echo=F, cache=T}
-source("/data/sPlot/users/Francesco/_sPlot_Management/Fix.header.R")
-header <- fix.header(header, exclude.sophy=F)
-#clean unnecessary columns 
-DT2 <- DT2 %>% 
-  dplyr::select(-Taxon.group, -Matched.concept)
-header <- header %>%
-  dplyr::select(-Longitude, -Latitude)
-# merge vegetation layers, where necessary
-###combine cover values across layers
-combine.cover <- function(x){
-    while (length(x)>1){
-      x[2] <- x[1]+(100-x[1])*x[2]/100
-      x <- x[-1]
-    }
-    return(x)
-}
+# 1 Extract plots from sPlot based on species with xylem traits  
+Extract all plots containing at least one species in the xylem list.
+```{r, message=F, results=F, warning=F}
+species_list <- xylem_data$Species
+plot.sel <- DT2 %>%
+  filter(DT2$species %in% species_list) %>%
+  dplyr::select(PlotObservationID) %>%
+  distinct() %>%
+  pull(PlotObservationID)
+
+#exclude plots without geographic information
+header.xylem <- header %>%
+  filter(PlotObservationID %in% plot.sel) %>% 
+  filter(!is.na(Latitude))
+#refine plot.sel
+plot.sel <- header.xylem$PlotObservationID
+
+DT.xylem <- DT2 %>% 
+  filter(taxon_group %in% c("Vascular plant", "Unknown")) %>% 
+  filter(PlotObservationID %in% plot.sel)
 
-DT2 <- DT2 %>%
-  dplyr::select(PlotObservationID, species, Relative.cover) %>%
-  group_by(PlotObservationID, species) %>%
-  summarize(Relative.cover=combine.cover(Relative.cover)) %>%
-  ungroup()
 ```
 
+Out of the `r length(species_list)` species in the sRoot list, `r sum(unique(DT2$species) %in% species_list)` species are present in sPlot, for a total of `r nrow(DT.xylem %>% filter(species %in% species_list))` plots., across `r length(plot.sel)` plots.
 
+# 2 Extract woody species
+This is partial selection, as we don't have information on the growth form of all species in sPlot
+```{r}
+#load list of woody species, as provided to me by Alexander Zizka, within sPlot project #21
+load("../Project_21/_input/evowood_species_list.rda")
+
+#Select all woody species, as well as those without GrowthForm information
+#extract relevant traits from TRY
+woody_species_traits <- sPlot.traits %>%
+  dplyr::select(species, GrowthForm, is.tree.or.tall.shrub, n,
+                starts_with("StemDens"),
+                starts_with("Stem.cond.dens"), 
+                starts_with("StemConduitDiameter"),
+                starts_with("LDCM"),
+                starts_with("SLA"),
+                starts_with("PlantHeight"), 
+                starts_with("Wood"), 
+                starts_with("SpecificRootLength_mean")) %>%
+  filter( (species %in% species_list) |
+          (species %in% synonyms$name_binomial) |
+          grepl(pattern = "tree|shrub", x = GrowthForm) |
+          is.tree.or.tall.shrub==T
+              )
+
+table(woody_species_traits$GrowthForm, exclude=NULL)
+# MEMO: some standardization needed in sPlot 3.0
+# Including the data from A.Zizka in the selection 
+# increases the number of selected species only marginally (from ~21k to ~22k)
+```
+
+```{r, echo=F}
+knitr::kable(woody_species_traits %>% 
+               sample_n(20), caption="Example of gap-filled trait data from TRY (20 randomly selected species") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
+```
 
+Selected traits are:  
 
-# 1 Extract plots from sPlot based on species with root traits  
-Import data on root traits provided by the sRoot consortium (v 30 sept. 2019). Extract all plots containing at least one species in sRoot list. ~~(Matching could be marginally improved if checking names in sRoot's list with TPL).~~
-```{r, message=F, results=F, warning=F, cache=T}
-root_traits <- read_csv("_input/Mean_values_for_sPlot.csv")
-root_traits <- root_traits %>% #rename column to adapt input data in v1.8
-  rename(RD=Root_diameter, 
-         RootN=Root_N_concentration,
-         RTD=Root_tissue_density,
-         SRL=Specific_root_length) %>% 
-  dplyr::select(full_species:Taxonomic_information, vegtype_version1:vegtype_expert, Mycorrhizal_type_Kuyper,
-                RD, RootN, RTD, SRL) %>% 
-  ### replace all RTD values >.8 with NAs
-  mutate(RTD=replace(RTD, list=RTD>1, values=NA))
-  
-  
-species_list <- root_traits$full_species
+- StemDens - 4 - Stem specific density (SSD) or wood density (stem dry mass per stem fresh volume) (g/cm^3)  
+- SLA - 11 - Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA)  
+- PlantHeight - 18 - Plant height (vegetative + generative)  
+- StemDiam - 21 - Stem diameter (m)  
+- Stem.cond.dens - 169 - Stem conduit density (vessels and tracheids) (mm^-2)  
+- StemConduitDiameter - 281 - Stem conduit diameter (vessels, tracheids)_micro (m)  
+- Wood.vessel.length - 282 - Wood vessel element length; stem conduit (vessel and tracheids) element length_micro (m)  
+- WoodFiberLength - 289 - Wood fiber lengths_micro (m) 	
+- SpecificRootLength - 1080 - Root length per root dry mass (specific root length, SRL) (cm/g)  
 
-plot.sel <- (DT2 %>%
-  filter(DT2$species %in% species_list) %>%
-    dplyr::select(PlotObservationID) %>%
-    distinct())$PlotObservationID
+Codes correspond to those reported in [TRY](https://www.try-db.org/TryWeb/Home.php)
 
-DT.sRoot <- DT2 %>% 
-  filter(DT2$PlotObservationID %in% plot.sel)
+```{r}
+#subset DT.xylem to only retain woody species
+DT.xylem <- DT.xylem %>% 
+  filter(species %in% (woody_species_traits$species))
+nrow(DT.xylem)
 ```
 
-Out of the `r length(species_list)` species in the sRoot list, `r sum(unique(DT2$species) %in% species_list)` species are present in sPlot, for a total of `r length(unique(DT.sRoot$PlotObservationID))` records.
 
-# 2 Calculate CWMs 
+# 3 Calculate CWMs and trait coverage 
 Calculate CWM and trait coverage for each trait and each plot. Select plots having more than 80% coverage for at least one trait.
 ```{r, cache=T,  cache.lazy=F}
 # Merge species data table with traits
-CWM.sRoot0 <- DT.sRoot %>%
+CWM.xylem0 <- DT.xylem %>%
   as.tbl() %>%
   dplyr::select(PlotObservationID, species, Relative.cover) %>%
-  left_join(root_traits %>%
-              dplyr::rename(species=full_species) %>%
-              dplyr::select(species, RD:SRL), 
+  left_join(xylem_data %>%
+              dplyr::rename(species=Species) %>%
+              dplyr::select(species, P50, Ks), 
             by="species")
 
 # Calculate CWM for each trait in each plot
-CWM.sRoot1 <- CWM.sRoot0 %>%
+CWM.xylem1 <- CWM.xylem0 %>%
   group_by(PlotObservationID) %>%
-  summarize_at(.vars= vars(RD:SRL),
+  summarize_at(.vars= vars(P50:Ks),
                .funs = list(~weighted.mean(., Relative.cover, na.rm=T))) %>%
   dplyr::select(PlotObservationID, order(colnames(.))) %>%
-  reshape2::melt(id.vars="PlotObservationID", value.name="trait.value")
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.value")
 
 # Calculate coverage for each trait in each plot
-CWM.sRoot2 <- CWM.sRoot0 %>%
+CWM.xylem2 <- CWM.xylem0 %>%
   mutate_at(.funs = list(~if_else(is.na(.),0,1) * Relative.cover), 
-            .vars = vars(RD:SRL)) %>%
+            .vars = vars(P50:Ks)) %>%
   group_by(PlotObservationID) %>%
-  summarize_at(.vars= vars(RD:SRL),
-               .funs = list(~sum(., na.rm=T))) %>%
+  summarize_at(.vars= vars(P50:Ks),
+               .funs = list(~sum(., na.rm=T)), 
+               .groups="drop") %>%
   dplyr::select(PlotObservationID, order(colnames(.))) %>%
-  reshape2::melt(id.vars="PlotObservationID", value.name="trait.coverage")
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.coverage")
 
 # Calculate CWV
 variance2.fun <- function(trait, abu){
@@ -170,93 +196,77 @@ variance2.fun <- function(trait, abu){
   res
 }
 
-CWM.sRoot3 <- CWM.sRoot0 %>%
+CWM.xylem3 <- CWM.xylem0 %>%
   group_by(PlotObservationID) %>%
-  summarize_at(.vars= vars(RD:SRL),
+  summarize_at(.vars= vars(P50:Ks),
                .funs = list(~variance2.fun(., Relative.cover))) %>%
   dplyr::select(PlotObservationID, order(colnames(.))) %>%
-  reshape2::melt(id.vars="PlotObservationID", value.name="trait.variance")
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.variance")
 
 ## Calculate proportion of species having traits
-CWM.sRoot4 <- CWM.sRoot0 %>%
+CWM.xylem4 <- CWM.xylem0 %>%
   group_by(PlotObservationID) %>%
-  summarize_at(.vars= vars(RD:SRL),
+  summarize_at(.vars= vars(P50:Ks),
                .funs = list(~sum(!is.na(.)))) %>%
   dplyr::select(PlotObservationID, order(colnames(.))) %>%
-  reshape2::melt(id.vars="PlotObservationID", value.name="trait.nspecies")
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.nspecies")
 
 # Join together
-CWM.sRoot <- CWM.sRoot1 %>%
-  left_join(CWM.sRoot2, by=c("PlotObservationID", "variable")) %>%
-  left_join(CWM.sRoot3, by=c("PlotObservationID", "variable")) %>%
-  left_join(CWM.sRoot4, by=c("PlotObservationID", "variable")) %>%
-  left_join(CWM.sRoot0 %>% 
+CWM.xylem <- CWM.xylem1 %>%
+  left_join(CWM.xylem2, by=c("PlotObservationID", "trait")) %>%
+  left_join(CWM.xylem3, by=c("PlotObservationID", "trait")) %>%
+  left_join(CWM.xylem4, by=c("PlotObservationID", "trait")) %>%
+  left_join(CWM.xylem0 %>% 
               group_by(PlotObservationID) %>%
-              summarize(sp.richness=n()), by=c("PlotObservationID")) %>%
+              summarize(sp.richness=n()), by=c("PlotObservationID"), 
+            .groups="drop") %>%
   mutate(trait.coverage.nspecies=trait.nspecies/sp.richness) %>%
   #filter(trait.coverage>=0.8) %>%
   arrange(PlotObservationID)
 ```
-Scatterplot comparing coverage of traits values across plots, when based on relative cover and when based on proportion of species richness
-```{r, eval=T, fig.align="center", fig.width=8, fig.height=4, cache=T, warning=F}
-ggplot(data=CWM.sRoot, aes(x=trait.coverage, y=trait.coverage.nspecies, col=log(sp.richness))) + 
-  geom_point(pch="+", alpha=1/4, cex=.7) + 
-  geom_vline(xintercept = .8, lwd=.7, lty=2) + 
-  geom_abline(intercept = 0, slope=1, col=2, lty=2, lwd=.7) + 
-  xlim(c(0,1)) + 
-  ylim(c(0,1)) + 
-  facet_wrap(.~variable, ncol=4) + 
-  scale_color_viridis() + 
-  xlab("Trait coverage (Relative  cover)") + 
-  ylab("Trait coverage (Proportion of species)")
-```
 
-```{r}
-CWM.sRoot <- CWM.sRoot %>%
-  filter(trait.coverage>=0.8)
-```
+
 
 ```{r, echo=F}
-knitr::kable(CWM.sRoot[1:20,], caption="Example of CWM data file") %>%
+knitr::kable(CWM.xylem[1:20,], caption="Example of CWM data file") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
 ```
 
+```{r}
+CWM.xylem08 <- CWM.xylem %>%
+  filter(trait.coverage>=0.8)
+```
 
 ```{r, echo=F}
-knitr::kable(CWM.sRoot %>%
+knitr::kable(CWM.xylem08 %>%
                group_by(variable) %>%
                summarize("num.plots"=n()), caption="Number of plots with >=.8 coverage per trait") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
 ```
 
 
-
-
-Create list of plots having at least one trait with >=.8 coverage and extract header data
 ```{r}
-plot80perc <- (CWM.sRoot %>%
-  dplyr::select(PlotObservationID) %>%
-  distinct())$PlotObservationID
+#Create list of plots having at least one trait with >=.8 coverage and extract header data
 
-header.sRoot <- header %>%
-  filter(PlotObservationID %in% plot80perc) %>% 
-  filter(!is.na(POINT_X))
+#plot80perc <- (CWM.xylem %>%
+#  dplyr::select(PlotObservationID) %>%
+#  distinct())$PlotObservationID
 
-DT.sRoot08 <- DT.sRoot %>%
-  filter(PlotObservationID %in% header.sRoot$PlotID)
+#DT.xylem08 <- DT.xylem %>%
+#  filter(PlotObservationID %in% header.xylem$PlotID)
 
-CWM.sRoot <- CWM.sRoot %>%
-  filter(PlotObservationID %in% header.sRoot$PlotID)
+#CWM.xylem <- CWM.xylem %>%
+#  filter(PlotObservationID %in% header.xylem$PlotID) 
 ```
 
 Completeness of header data
 ```{r, echo=F}
-knitr::kable(data.frame(Completeness_perc=colSums(!is.na(header.sRoot))/
-                          nrow(header.sRoot)*100)[-c(1,2),,drop=F], 
+knitr::kable(data.frame(Completeness_perc=colSums(!is.na(header.xylem))/
+                          nrow(header.xylem)*100)[-c(1,2),,drop=F], 
              caption="Header file - Columns present and % completeness") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
 ```
-The process results in `r nrow(header.sRoot)` plots selected, for a total of `r nrow(CWM.sRoot)` trait * plot combinations. 
+The process results in `r nrow(header.xylem)` plots selected, for a total of `r nrow(CWM.xylem)` trait * plot combinations. 
   
 
 Geographical distribution of plots
@@ -264,7 +274,7 @@ Geographical distribution of plots
 countries <- map_data("world")
 ggworld <- ggplot(countries, aes(x=long, y=lat, group = group)) +
   geom_polygon(col=NA, lwd=3, fill = gray(0.9)) +
-  geom_point(data=header.sRoot, aes(x=POINT_X, y=POINT_Y, group=1), col="red", alpha=0.5, cex=0.7, shape="+") + 
+  geom_point(data=header.xylem, aes(x=Longitude, y=Latitude, group=1), col="red", alpha=0.5, cex=0.7, shape="+") + 
   theme_bw()
 ggworld
 ```
@@ -274,388 +284,39 @@ Summarize data across data sets in sPlot, and create list of data custodians
 ```{r, warning=F, message=F}
 db.out <- fread("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") %>%
   dplyr::select(`GIVD ID`, Custodian)
-data.origin <- header.sRoot %>% 
+data.origin <- header.xylem %>% 
    group_by(`GIVD ID`) %>% 
    summarize(Num.plot=n()) %>%
    left_join(db.out)
 ```
 
 ```{r, echo=F}
-knitr::kable(data.origin[1:10,], caption="Data Origin [only first 10 rows shown]") %>%
+knitr::kable(data.origin, caption="Data Origin") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
 ```
 
 The data derive from `r nrow(data.origin)` datasets.  
 
-# 3 Extract climate and soils data  
-## ISRIC soil data  
-
-Download ISRIC soil data at 5 cm depth (250 m horizontal resolution)  
-```{r, eval=F}
-# Added in v1.8
-url.bulk <- "https://files.isric.org/soilgrids/data/recent/BLDFIE_M_sl2_250m_ll.tif"
-url.cec <- "https://files.isric.org/soilgrids/data/recent/CECSOL_M_sl2_250m_ll.tif"
-url.cly <- "https://files.isric.org/soilgrids/data/recent/CLYPPT_M_sl2_250m_ll.tif"
-url.crf <- "https://files.isric.org/soilgrids/data/recent/CRFVOL_M_sl2_250m_ll.tif"
-url.orc <- "https://files.isric.org/soilgrids/data/recent/ORCDRC_M_sl2_250m_ll.tif"
-url.pH <- "https://files.isric.org/soilgrids/data/recent/PHIHOX_M_sl2_250m_ll.tif"
-url.slt <- "https://files.isric.org/soilgrids/data/recent/SLTPPT_M_sl2_250m_ll.tif"
-url.snd <- "https://files.isric.org/soilgrids/data/recent/SNDPPT_M_sl2_250m_ll.tif"
-
-download(url.bulk, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/BLDFIE_M_sl2_250m_ll.tif", mode = "wb")
-download(url.cec, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/CECSOL_M_sl2_250m_ll.tif", mode = "wb")
-download(url.cly, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/CLYPPT_M_sl2_250m_ll.tif", mode = "wb")
-download(url.crf, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/CRFVOL_M_sl2_250m_ll.tif", mode = "wb")
-download(url.orc, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/ORCDRC_M_sl2_250m_ll.tif", mode = "wb")
-download(url.pH, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/PHIHOX_M_sl2_250m_ll.tif", mode = "wb")
-download(url.slt, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/SLTPPT_M_sl2_250m_ll.tif", mode = "wb")
-download(url.snd, "/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/SNDPPT_M_sl2_250m_ll.tif", mode = "wb")
-```
-
-Load ISRIC rasters
-```{r, eval=F}
-mycrs <- CRS("+init=epsg:4326")
-BLDFIE <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/BLDFIE_M_sl2_250m_ll.tif")
-CECSOL <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/CECSOL_M_sl2_250m_ll.tif")
-CLYPPT <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/CLYPPT_M_sl2_250m_ll.tif")
-CRFVOL <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/CRFVOL_M_sl2_250m_ll.tif")
-ORCDRC <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/ORCDRC_M_sl2_250m_ll.tif")
-PHIHOX <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/PHIHOX_M_sl2_250m_ll.tif")
-SLTPPT <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/SLTPPT_M_sl2_250m_ll.tif")
-SNDPPT <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/SNDPPT_M_sl2_250m_ll.tif")
-
-P.ret <- raster("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/Generalized P-retention potential (dominant class)1.tif")
-
-# defined functions to calculate mean and mode, which exclude NAs and 0s (if needed)
-robust.mean <- function(x){mean(x, rm.na=T)}
-robust.mode <- function(x){if(any(x!=0)) {
-                              a <- x[which(x!=0)] #exclude zero (i.e. NAs)
-                              return(as.numeric(names(sort(table(a), decreasing=T))[1]))} else
-                                return(NA)
-                          }
-```
-
-Create spatial point dataframe for sPlot data to intersect with ISRIC raster layers
-```{r}
-header.shp <- SpatialPointsDataFrame(coords=header.sRoot %>%
-                                        dplyr::select(POINT_X, POINT_Y),
-                               proj4string = CRS("+init=epsg:4326"), 
-                               data=data.frame(PlotID=header.sRoot$PlotID, 
-                                               loc.uncert=header.sRoot$`Location uncertainty (m)`))
-writeOGR(header.shp, dsn = "_derived", layer = "header.shp", driver = "ESRI Shapefile", overwrite_layer = T)
-```
-
-
-Intersect sPlot SpatialPoints DataFrame with ISRIC raster layers (in Parallel computing)
-```{r, eval=F, warning=F, message=F}
-##go parallel
-library(parallel)
-library(doParallel)
-cl <- makeForkCluster(8, outfile="" )
-#cl <- makeCluster(8)
-registerDoParallel(cl)
- 
- BLDFIE.out <- foreach (i=1:length(header.shp), .packages=c('raster'),  .combine=rbind) %dopar% { #
-   BLDFIE.tmp <- raster::extract(BLDFIE, header.shp[i,], buffer=min(10000,  max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
- }
- save(BLDFIE.out, file="_derived/BLDFIE.out.RData")
- 
- CECSOL.out <- foreach (i=1:length(header.shp), .packages=c('raster'),  .combine=rbind) %dopar% { #
-   CECSOL.tmp <- raster::extract(CECSOL, header.shp[i,], buffer=min(10000,  max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
- }
- save(CECSOL.out, file="_derived/CECSOL.out.RData")  
- 
- 
- CLYPPT.out <- foreach (i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { #
-   CLYPPT.tmp <- raster::extract(CLYPPT, header.shp[i,], buffer=min(10000,  max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
- }
- save(CLYPPT.out, file="_derived/CLYPPT.out.RData")  
- 
- CRFVOL.out <- foreach (i=1:length(header.shp), .packages=c('raster'),  .combine=rbind) %dopar% { #
-   CRFVOL.tmp <- raster::extract(CRFVOL, header.shp[i,], buffer=min(10000,  max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
- }
-save(CRFVOL.out, file="_derived/CRFVOL.out.RData")  
-
-ORCDRC.out <- foreach (i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { #
-  ORCDRC.tmp <- raster::extract(ORCDRC, header.shp[i,], buffer=min(10000, max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
-}
-save(ORCDRC.out, file="_derived/ORCDRC.out.RData")  
-
-PHIHOX.out <- foreach (i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { #
-  PHIHOX.tmp <- raster::extract(PHIHOX, header.shp[i,], buffer=min(10000, max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
-}
-save(PHIHOX.out, file="_derived/PHIHOX.out.RData")  
-
-SLTPPT.out <- foreach (i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { #
-  SLTPPT.tmp <- raster::extract(SLTPPT, header.shp[i,], buffer=min(10000, max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
-}
-save(SLTPPT.out, file="_derived/SLTPPT.out.RData") 
-
-SNDPPT.out <- foreach (i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { #
-  SNDPPT.tmp <- raster::extract(SNDPPT, header.shp[i,], buffer=min(10000, max(250, header.shp@data$loc.uncert[i])), fun=robust.mean)
-}
-save(SNDPPT.out, file="_derived/SNDPPT.out.RData") 
-
-P.ret.out <- foreach (i=1:length(header.shp), .packages=c('raster'), .combine=rbind) %dopar% { #
-  P.ret.tmp <- raster::extract(P.ret, header.shp[i,], buffer=min(10000, max(250, header.shp@data$loc.uncert[i])), fun=robust.mode)
-}
-save(P.ret.out, file="_derived/P.ret.out.RData")
-
-stopCluster(cl)  
-```
-
-Reload and recompile output
-```{r, eval=T, warning=F, message=F}
-load( file="_derived/BLDFIE.out.RData")
-load( file="_derived/CECSOL.out.RData")
-load( file="_derived/CLYPPT.out.RData")
-load( file="_derived/CRFVOL.out.RData")
-load( file="_derived/ORCDRC.out.RData")  
-load( file="_derived/PHIHOX.out.RData") 
-load( file="_derived/SLTPPT.out.RData") 
-load( file="_derived/SNDPPT.out.RData")
-load(file="_derived/P.ret.out.RData")
-
-ISRIC.out <- data.frame(PlotID=header.shp@data$PlotID, 
-             BLDFIE=BLDFIE.out, 
-             CECSOL=CECSOL.out,
-             CLYPPT=CLYPPT.out,
-             CRFVOL=CRFVOL.out,
-             ORCDRC=ORCDRC.out,
-             PHIHOX=PHIHOX.out,
-             SLTPPT=SLTPPT.out,
-             SNDPPT=SNDPPT.out, 
-             P.ret=P.ret.out)
-
-P.ret.key <- read.dbf("/data/sPlot/users/Francesco/Ancillary_Data/ISRIC/Generalized P-retention potential (dominant class)1.tif.vat.dbf")
-
-ISRIC.out <- ISRIC.out %>% 
-  left_join(P.ret.key %>% 
-              dplyr::select(VALUE, MainCLASS) %>% 
-              rename(P.ret=VALUE), by="P.ret") %>% 
-  rename(P.ret.cat=MainCLASS)
-```
-
-## Climate data 
-Join soil and climate data into the `env.sRoot` data.frame
-
-```{r, cache=T}
-
-load("/data/sPlot/releases/sPlot2.1/sPlot_header_chelsa_20161124.RData")
-#load("/data/sPlot/releases/sPlot2.1/sPlot_header_soilgrids_20161124.RData") 
-
-env.sRoot <- header.sRoot %>% 
-  dplyr::select(PlotID) %>% 
-  left_join(climate %>%
-              dplyr::select(-POINT_X, -POINT_Y), 
-            by="PlotID") %>%
-  left_join(ISRIC.out, by="PlotID") %>% 
-  dplyr::select(-P.ret)
-
-```
-
-Download and unzip rasters from [Global Aridity Index and Potential Evapotranspiration (ET0) Climate Database v2](https://figshare.com/articles/Global_Aridity_Index_and_Potential_Evapotranspiration_ET0_Climate_Database_v2/7504448/3)  
-
-```{r, eval=F}
-require(rfigshare)
-##Authentication using key from online tutorial
-client_key = '123456'
-client_secret = '65xyAzi1'
-token_key = 'n4oTQ22l4FxsuQlYZlhCFwYrrSgrlPn1lhIx32uzwzAwn4oTQ22l4FxsuQlYZlhC1F'
-token_secret = '0MNOqkQncNHuKTi6fQ8MuA'
-fs_auth(client_key, client_secret, token_key, token_secret)
-
-## Import map of aridity
-a <- fs_details(7504448, mine=F)
-url.ai <- a$files[[5]]$download_url
-url.et <- a$files[[3]]$download_url
-download(url.ai, "/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/Aridity.zip", mode = "wb")
-download(url.et, "/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/ET.zip", mode = "wb")
-
-unzip ("/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/ET.zip", 
-       exdir ="/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/")
-unzip ("/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/Aridity.zip", 
-       exdir ="/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/")
-
-```
-
-Extract Aridity and PET values for all selected plots
-```{r, cache=T, warning=F, message=F}
-ai <- raster("/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/ai_et0/ai_et0.tif")
-et <- raster("/data/sPlot/users/Francesco/Ancillary_Data/Aridity_Index/et0_yr/et0_yr.tif")
-
-ai.header <- raster::extract(ai, header.shp)
-et.header <- raster::extract(et, header.shp)
-
-env.sRoot <- env.sRoot %>%
-  left_join(data.frame(PlotID=header.shp@data$PlotID,
-                       Aridity_ind=ai.header,
-                       Pot_Evapot=et.header),
-            by="PlotID")
-```
-
-## Estimate soil N based on T and pH
-**as suggested by Thom Kuyper**  
-  
-The basic idea for the plots (true for forests, not fully true for grasslands, admittedly) is that N supply is determined by the decomposition of the organic matter and the nitrogen mineralisation during the decomposition.   
-That means that we discount (1) nitrogen deposition; (2) nitrogen fixation (assuming that very few plots are so dominated by legumes that they overwhelm the signal of nitrogen through mineralisation), (3) application of nitrogen fertiliser.  
-We also assume that most plots will be nitrogen-limited, so losses of nitrogen (when mineralisation is in excess of plant nitrogen demand) is equally discounted in the estimate of nitrogen supply. Under that assumption we can estimate nitrogen supply from soil organic matter. 
-In order to do that we need a brief recap which factors affect nitrogen mineralisation, factors that can be easily included in the model.  
-*Factor 1 - Temperature*  
-For simplicity we assume that the average annual soil temperature in the uppermost 10-20 cm is equal to the average annual temperature.  From models on decomposition we chose a model developed by Yang (a PhD from Wageningen). His temperature function is:
-(1)	At average temperatures below 0 °C there is no decomposition and N mineralisation (as there is no water which is crucial for both enzyme activity and accessibility of soil organic matter in the soil solution). The few arctic and alpine plots will then possibly have estimates of zero N supply.  
-(2)	The model for temperature works for an average temperature of 9° C, so that has the standard value of 1. For other temperatures we then apply a correction factor.  
-(3)	For temperatures between 0 °C and 9 °C, we use: F = 0.1 (T+1); so the actual model yields 0 at T = -1 °C and f = 1 at 9 °C  
-(4)	For temperatures between 10 and 27 °C we use F = 2^(T-9)/9. So at 9 ° C the formula against yields f = 1; for 18 °C the formula yields f = 2 (which suggests the formula has a Q10, aka temperature sensitivity or slightly over 2).  
-
-*Factor 2 - pH and Soil C org*  
-The second correction factor has to do with what is called protection of organic matter by mineral association. Mineral association protects negatively charged organic matter, if the minerals have positive charge. (There is some protection of positively charged organic matter, e.g. basic amino acids, on negatively charged mineral particles such as clay). However, the database might not be very accurate for clay content, and would have not data for different  kinds of clay. So we might overcome this by again simplifying assumptions. I assume that protection is mainly be metal oxides and these have a more positive charge at lower pH (and hence organic matter is better protected against decomposition at lower pH). So I propose to include a second protection factor, based on pH. This simplification is also in QUEFTS and a paper by Sattari et al. (2014) that is based on QUEFTS, a model to estimate soil nutrient supply for agricultural soils, so as to calculate which part of plant nutrient demand can be covered by soil supply rather than fertilizer supply.: 
-
-SN = αN * FN * Corg  
-with  
-αN: an empirical parameter, set at 6.8  
-FN: pH-dependent coefficient to take into account the availability of organic matter in relation to mineral association  
-Corg : the fraction soil organic matter in mg kg-1. C is 50% of SOM content.  
-  
-The factor N is defined as:  
-FN = 0.4 for pH < 4.7  
-FN = 0.25*(pH-3) for 4.7 < pH < 7  
-FN = 1 for pH > 7  
-  
-The combination of Factor 1 and Factor 2 would give a reasonable estimate of N supply  
-
-```{r}
-# Added in v1.8
-env.sRoot <- env.sRoot %>% 
-  mutate(N.f1=ifelse(bio01<0, 0, 
-                     ifelse(bio01<=9,0.1*(bio01+1), 2^((bio01-9)/9)))) %>% 
-  mutate(FN=ifelse((PHIHOX/10)<4.7, 0.4, 
-                   ifelse((PHIHOX/10)<7, 0.25*((PHIHOX/10)-3), 1))) %>% 
-  mutate(N.f2=6.8*FN*(ORCDRC/1000)) %>% 
-  mutate(N.est=N.f1*N.f2) %>% 
-  dplyr::select(-FN)
-# Check range of factors
-summary(env.sRoot$N.f1)
-summary(env.sRoot$N.f2)
-summary(env.sRoot$N.est)
-```
-
-## Continents
-```{r}
-sPDF <- rworldmap::getMap(resolution="coarse")
-continent <- sPDF[,"continent"]
-crs(continent) <- crs(header.shp)
-continent@data[243,"continent"] <- "South America" ## Manually correct missing data
-# create clipped version of continent to avoid going beyond 180 lON
-coords <- data.frame(x=c(-180,180,180,-180),
-                     y=c(-90,-90,90,90))
-bboxc = Polygon(coords)
-bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent))
-continent_clipped <- gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... workaround
-```
-
-Assign plots to continent (as correction to header file)  
-```{r, eval=F}
-continent.out <- sp::over(x=header.shp, y=continent)
-#correct unassigned points to closest continent
-toassign <- header.shp[which(is.na(continent.out$continent)),]
-crs(toassign) <- crs(continent)
-#go parallel
-cl <- makeForkCluster(8, outfile="" )
-#cl <- makeCluster(8)
-registerDoParallel(cl)
-nearestContinent <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { 
-  nearestContinent.tmp <- geosphere::dist2Line(toassign[i,], continent_clipped)
-} #~2 hrs with 8 cpus
-continent.out$continent[which(is.na(continent.out$continent))] <- as.character(continent[-137,]@data[nearestContinent[,"ID"],])
-save(continent.out, file = "_derived/continent.out")
-stopCluster(cl)
-```
-Reload and manipulate continent. Correct header.sRoot
-```{r}
-load("_derived/continent.out")
-header.sRoot$CONTINENT <- continent.out %>% 
-  mutate(CONTINENT=factor(continent, 
-                            levels=c("Africa", "Antarctica", "Australia", "Eurasia", "North America", "South America"),
-                            labels=c("AF", "AN", "AU", "EU", "NA", "SA"))) %>% 
-  pull(CONTINENT)
-```
-
-
-## Ecoregions
-Extract data on ecoregion and join into the `env.sRoot` data.frame. Ecoregion derive from [Olson et al. (2001)](https://doi.org/10.1641/0006-3568(2001)051[0933:TEOTWA]2.0.CO;2).
-
+# 4 Extract climate and soils data  
 
-```{r, message=F}
-##Import spatial data on Ecoregions
-ecoreg <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/Ecoregions_WWF/", layer="wwf_terr_ecos")
-crs(ecoreg) <- CRS("+init=epsg:4326")
-ecoreg@data <- ecoreg@data %>% 
-  dplyr::select(ECO_NAME, eco_code)
-## Intersect plots with ecoregions and bind column
-ecoreg.out <- sp::over(x=header.shp, y=ecoreg) # ~5 mins
-
-```
-Match missing values to nearest ecoregion
-```{r, warning=F, message=F, eval=F}
-library(doParallel)
-library(parallel)
-cl <- makeForkCluster(10, outfile="" )
-registerDoParallel(cl)
-
-toassign <- header.shp[which(is.na(ecoreg.out$ECO_NAME)),]
-nearestEcoreg <- foreach(i=1:length(toassign), .packages=c('raster'), .combine=rbind) %dopar% { 
-  nearestEcoreg.tmp <- tryCatch(data.frame(toassign[i,"PlotID"],
-                                                  geosphere::dist2Line(toassign[i,], ecoreg)),
-               error = function(e){data.frame(toassign[i,"PlotID"], distance=NA, lon=NA, lat=NA,ID=NA)}
-      )
-  return(nearestEcoreg.tmp)
-} #~20 hrs with 10 cpus
-save(nearestEcoreg, file = "_derived/EcoregionExtract.RData")
-stopCluster(cl)
-```
-Recompile and check
 ```{r}
-load(file = "_derived/EcoregionExtract.RData")
-ecoreg.out[which(is.na(ecoreg.out$ECO_NAME)),] <- ecoreg@data[nearestEcoreg[,"ID"],]
-save(ecoreg.out, file = "_derived/Ecoregion.out.RData")
-env.sRoot <- env.sRoot %>% bind_cols(ecoreg.out)
-
-##double check
-header.sRoot %>%
-  left_join(env.sRoot %>%
-              dplyr::select(PlotID, ECO_NAME), 
-            by="PlotID") %>%
-  distinct(ECO_NAME, CONTINENT, .keep_all = FALSE) %>% 
-  arrange(ECO_NAME)
-
-header.shp@data$CONTINENT <- header.sRoot$CONTINENT
-header.shp@data <- header.shp@data %>% 
-  left_join(env.sRoot %>%
-              dplyr::select(PlotID, ECO_NAME), 
-            by="PlotID")
-writeOGR(header.shp, dsn="_derived/", layer="EcoregionContinent", driver = "ESRI Shapefile", overwrite_layer = T)
+soilclim.xylem <- soilclim %>% 
+  filter(PlotObservationID %in% plot.sel) %>% 
+  rename(Elevation=Elevation_median, -Elevation_q2.5, -Elevation_q97.5)
 ```
 
 ```{r, echo=F}
-knitr::kable(env.sRoot %>% 
-    group_by(ECO_NAME) %>% 
-    summarize(n.plot=n()) %>% 
-    filter(n.plot>50), caption="Number of plots for each ecoregion (only those with more than 50 plots shown)") %>%
+knitr::kable(soilclim.xylem %>% 
+               sample_n(20), caption="Example of climatic and soil variables for 20 randomly selected plots. All values represent the median in a circle centered on the plot coordinates, having a radius equal to the plot's location uncertainty (capped to 50 km for computing reasons)") %>%
     kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
 ```
 
+The procedure used to obtain these environmental predictors is described [here](https://git.idiv.de/fs40gaho/splot3_build/-/raw/master/public/05_ExtractEnvironment.html?inline=false) (Click to download the report)
 
 
-## Summary of environmental data available per plot
+Bioclimatic variables (bio01-bio19) derive from [CHELSA](http://chelsa-climate.org/bioclim/)  
 
-```{r, echo=F}
-knitr::kable(env.sRoot[1:10,], caption="Environmental data [only first 10 rows]") %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
-```
-Bioclimatic variables (bio01-bio19) derive from [CHELSA](http://chelsa-climate.org/bioclim/)
-
-Codes:
+Codes:  
 
 Bio1 = Annual Mean Temperature  
 Bio2 = Mean Diurnal Range  
@@ -666,7 +327,7 @@ Bio6 = Min Temperature of Coldest Month
 Bio7 = Temperature Annual Range  
 Bio8 = Mean Temperature of Wettest Quarter  
 Bio9 = Mean Temperature of Driest Quarter  
-Bio10 = Mean Temperature of Warmest Quarter  
+Bio10 = Mean Temperature of Warmest Quarter   
 Bio11 = Mean Temperature of Coldest Quarter  
 Bio12 = Annual Precipitation  
 Bio13 = Precipitation of Wettest Month  
@@ -677,8 +338,9 @@ Bio17 = Precipitation of Driest Quarter
 Bio18 = Precipitation of Warmest Quarter  
 Bio19 = Precipitation of Coldest Quarter  
   
-  
-Soil variables derive from the [ISRIC dataset](https://www.isric.org/), downloaded at 250-m resolution 
+\newline \newline
+
+Soil variables (5 cm depth) derive from the [ISRIC dataset](https://www.isric.org/), downloaded at 250-m resolution 
   
 CECSOL                     Cation Exchange capacity of soil  
 CLYPPT                     Clay mass fraction in %  
@@ -686,320 +348,16 @@ CRFVOL                     Coarse fragments volumetric in %
 ORCDRC                     Soil Organic Carbon Content in g/kg  
 PHIHOX                     Soil pH x 10 in H20  
 SLTPPT                     Silt mass fraction in %  
-SNDPTT                     Sand mass fraction in %  
+SNDPPT                     Sand mass fraction in %  
 BLDFIE                     Bulk Density (fine earth) 	in kg/m3  
 P.ret.cat                  Phosphorous Retention - Categorical value, see [ISRIC 2011-06](https://www.isric.org/documents/document-type/isric-report-201106-global-distribution-soil-phosphorus-retention-potential)  
+\newline \newline
 
-  
-N.f1 - Nitrogen factor 1  
-N.f2 - Nitrogen factor 2  
-N.est - N.f1 * N.f2 (See description above)  
-  
-Aridity Index and Potential Evapotranspiration are downloaded from [Global Aridity Index and Potential Evapotranspiration (ET0) Climate Database v2](https://figshare.com/articles/Global_Aridity_Index_and_Potential_Evapotranspiration_ET0_Climate_Database_v2/7504448/3) at 30 arc-seconds. 
-Trabucco, Antonio; Zomer, Robert (2019): Global Aridity Index and Potential Evapotranspiration (ET0) Climate Database v2. figshare. 
-
-# 4 Classifiy plots into broad vegetation types based on species' affinities
-
-sPlot has two independent systems for classifying plots to vegetation types. The first, classifies plots into forest and non-forest, based on the share of phanerophytes. The second system classifies plots into broad habitat types and relies on the expert opinion of data contributors. This is, unfortunately, not consistently available across all plots, being the large majority of classified plots only available for Europe. These broad habitat types are coded using 5, non-mutually exclusive dummy variables:  
-1) Forest - F  
-2) Grassland - G  
-3) Shrubland - S  
-4) Sparse vegetation - B (Bare)  
-5) Wetland - W  
-A plot may belong to more than one formation, e.g. a Savannah is categorized as Forest + Grassland (FG).
-
-
-```{r, cache=T, cache.lazy=T, echo=F}
-## create synthetic codes for habitats
-
-habitat_convert <- header.sRoot %>%
-                dplyr::select(Forest:Wetland) %>%
-                mutate_all(.funs= list( ~replace_na(.,0))) %>%
-                plyr::adply(., 1, function(x) ifelse(sum(x)!=0,
-                                         paste0(c("F", "G", "S", "B", "W")[x==1], collapse=""),
-                                         NA)) %>%
-                dplyr::rename(habitat.code=V1) %>%
-                dplyr::select(habitat.code) 
-```
-
-On top of the native (but incomplete) classification of plots into vegetation types, we ran ~~two~~ *one* classifications based on ~~a) Cluster analysis, or b)~~ species' affinities  
-*from v1.8 on, classification based on cluster analysis is discontinued*  
-Species affinities were assigned based on expert-opinion. Based on the relative cover of each species, we summarized species based on their habitat affinities, and summarized their relative cover. We then assigned each plot to an habitat based on the following ifelse conditions:  
-  
-wetland>0.5 -> "Wetland"  
-forest>0.3 -> "Forest"  
-grassland+heathland+steppe>0.7 -> "Grassland"  
-"Other", otherwise.
-
-```{r }
-# Added in v1.4 - Add results of cluster analysis
-# based on beal's smoothing as performed by Helge Bruelheide
-# Grasslands belong to cluster #5
-#load("_derived/cluster_5.RData")
-#header.sRoot <- header.sRoot %>%
-#  left_join(data.frame(PlotObservationID=as.integer(names(cluster.5$cluster)),
-#                       Cluster_n=cluster.5$cluster), 
-#            by="PlotObservationID")
-
-# Added in v1.7 - Classify plots into forest\grasslands based on 
-# species' habitat preference
-
-#calculate proportion of relative in cover belonging to species with different habitat affinities
-# and assign most likely habitat
-
-
-
-TopHabitat <- DT.sRoot08 %>%
-  filter(PlotObservationID %in% plot80perc) %>%
-  as.tbl() %>%
-  dplyr::select(PlotObservationID, species, Relative.cover) %>%
-  left_join(root_traits %>%
-              dplyr::rename(species=full_species, habitat=vegtype_expert_homogenized) %>%
-              filter(is.in.sPlot=="WAHR") %>%
-              dplyr::select(species, habitat) %>%
-              mutate(value=1) %>%
-              reshape2::dcast(species ~ habitat, value.var="value", fill=0), 
-            by="species") %>%
-  group_by(PlotObservationID) %>%
-  summarize_at(.vars= vars(forest:wetland),
-               .funs = list(~weighted.mean(., Relative.cover, na.rm=T))) %>%
-  mutate(TopHabitat=ifelse(wetland>0.5, "Wetland", 
-                           ifelse(forest>0.3, "Forest", ifelse(
-                             grassland+heathland+steppe>0.7, "Grassland",
-                             "Other"))))
-
-habitats.sRoot <- CWM.sRoot %>%
-  left_join(header.sRoot %>% 
-              mutate(fornonfor=ifelse(is.forest==T, "for", 
-                                      ifelse(is.non.forest==T, "nonfor", NA))) %>%
-              dplyr::select(PlotObservationID, fornonfor) %>%
-              bind_cols(habitat_convert), 
-    by="PlotObservationID") %>%
-  left_join(TopHabitat %>% 
-              dplyr::select(PlotObservationID, TopHabitat),
-              by="PlotObservationID")
-
-
-#Attach to header
-header.sRoot <- header.sRoot %>%
-  left_join(TopHabitat %>%
-              dplyr::select(PlotObservationID, TopHabitat) %>%
-              distinct(), 
-            by="PlotObservationID")
-```
-
-The match between sPlot's native classification system and the one based on species affinities is evaluated using a confusion matrix
-```{r message=F}
-Habitat.compare <- header.sRoot %>%
-    as.tbl() %>%
-    dplyr::select(PlotObservationID, Forest:Wetland) %>%
-    mutate(Other=ifelse((Sparse.vegetation==1 | Shrubland==1), 1, 0)) %>%
-    dplyr::select(-Sparse.vegetation, - Shrubland) %>%
-    reshape2::melt(id.vars="PlotObservationID") %>%
-    rename(HabitatObs=variable) %>%
-    filter(value==1) %>%
-    left_join(TopHabitat %>% 
-                dplyr::select(PlotObservationID, TopHabitat), 
-              by="PlotObservationID")
-
-#Build a confusion matrix to evaluate the comparison  
-u <- union(Habitat.compare$TopHabitat, Habitat.compare$HabitatObs)
-t <- table( factor(Habitat.compare$TopHabitat, u), factor(Habitat.compare$HabitatObs, u))
-confm <- caret::confusionMatrix(t)
-
-```
-
-```{r echo=F}
-knitr::kable(confm$table, caption="Confusion matrix between sPlot's native classification of habitats (columns), and classification based on species' habitat affinity (rows)") %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
-```
-
-Formulas of associated statistics are available on the help page of the [caret package](https://www.rdocumentation.org/packages/caret/versions/6.0-84/topics/confusionMatrix) and associated references.
-The overall accuracy of the classification based on species' habitat affinity, when tested against sPlot's native habitat classification is `r round(confm$overall[1],2)`, the Kappa statistics is `r round(confm$overall[2],2)`.
-```{r echo=F}
-knitr::kable(t(confm$byClass), caption="Associated statistics of confusion matrix by class") %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
-```
-
-Summary of plots with trait coverage >=.8 across habitat types (based on species affinities)
-```{r, echo=F}
-knitr::kable(as.matrix(table((CWM.sRoot %>%
-                               left_join(header.sRoot %>%
-                                           dplyr::select(PlotObservationID, TopHabitat),
-                                         by="PlotObservationID"))[,c("variable", "TopHabitat")],
-                                  exclude=NULL)), caption="Number of plots across habitats") %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
-```
-
-```{r, echo=F, eval=F}
-#Summary of plots with trait coverage >=.8 across habitat types (based on native sPlot's classification)
-knitr::kable(as.matrix(table(habitats.sRoot[,c("variable", "habitat.code" )],
-                                  exclude=NULL)), caption="Plots per habitat*trait combination") %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
-```
-
-
-
-Summary of plots with trait coverage >=.8 across country/habitat types. 
-```{r, echo=F}
-continent_habitat <- header.sRoot %>%
-  dplyr::select(CONTINENT, TopHabitat)
-
-knitr::kable(as.matrix(table(continent_habitat[,2:1],
-                                  exclude=NULL)), caption="Plots per habitat*continent combination") %>%
-    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
-```
-
-
-# 5 Distribution of CWMs across vegetation types
-
-```{r, fig.width=8, fig.height=6, fig.align="center", eval=F, echo=F}
-#CWMs of different traits across forest and non forest
-ggplot(data=habitats.sRoot, aes(x=fornonfor, y=trait.value, fill=fornonfor)) + 
-    geom_boxplot(outlier.alpha = 1/3, outlier.size = 1.5, outlier.shape = "+") + 
-    #geom_smooth(method = "loess") + 
-    scale_color_viridis(guide=F, discrete = T) + 
-    facet_wrap(. ~ variable, nrow = 2) + 
-    theme_classic() + 
-    theme(axis.title.x=element_blank(),
-        axis.text.x=element_blank(),
-        axis.ticks.x=element_blank(), 
-        axis.title.y=element_blank(),
-        axis.text.y=element_blank(),
-        axis.ticks.y=element_blank())
-```
-
-
-
-```{r, fig.width=8, fig.height=11, fig.align="center"}
-ggplot(data=habitats.sRoot, aes(x=TopHabitat, y=trait.value, fill=TopHabitat)) + 
-    geom_boxplot(outlier.alpha = 1/3, outlier.size = 1.5, outlier.shape = "+") + 
-    #geom_smooth(method = "loess") + 
-    scale_color_viridis(guide=F, discrete = T) + 
-    facet_wrap(. ~ variable, nrow = 4, scales = "free_y") + 
-    theme_classic() + 
-    theme(axis.title.x=element_blank(),
-        axis.text.x=element_blank(),
-        axis.ticks.x=element_blank(), 
-        axis.title.y=element_blank(),
-        #axis.text.y=element_blank(),
-        #axis.ticks.y=element_blank()
-        )
-```
-
-
-
-# 6 Scatterplots of root traits along climate and soil variables
-
-Show geographic distribution and create scatterplots between CWMs and environmental variables.  
-```{r, fig.align="center", fig.height=4, fig.width=6, message=F, warning=F}
-gg.global <- ggplot(countries, aes(x=long, y=lat, group = group)) +
-  geom_polygon(col=NA, lwd=3, fill = gray(0.9)) +
-  geom_point(data=header.sRoot %>%
-               filter(!is.na(TopHabitat)), 
-             aes(x=POINT_X, y=POINT_Y, group=1, col=TopHabitat),
-             alpha=0.5, cex=0.7, shape="+") + 
-  facet_wrap(.~ TopHabitat, ncol=2) + 
-  theme_bw() + 
-  scale_color_brewer(palette="Dark2", guide=F)
-gg.global
-```
-
-
-```{r, fig.align="center", fig.width=2, fig.height=2, warning=F}
-CWM.env <- CWM.sRoot %>%
-  left_join(env.sRoot %>%
-              dplyr::rename(PlotObservationID=PlotID), by="PlotObservationID") %>%
-  left_join(header.sRoot %>%
-              dplyr::select(PlotObservationID, TopHabitat), 
-            by="PlotObservationID") %>%
-  mutate(TopHabitat=factor(TopHabitat))
-
-#Scatterplots
-ggenv <- list()
-for(i in 1:27) {
-  CWM.env$out <- CWM.env[,8+i]
-  labpos.x <- quantile(CWM.env$out, 0.05, na.rm=T)
-  ggenv[[i]] <- ggplot(data=CWM.env, aes(x=out, y=trait.value, col=TopHabitat)) + 
-    geom_point(alpha=0.1, pch=16, cex=.6) + 
-    scale_color_brewer(palette="Dark2", guide=F) + 
-    facet_grid(variable ~ ., scale="free_y") + 
-    theme_classic() + 
-    ggtitle(colnames(CWM.env)[9+i]) + 
-    theme(axis.title.x=element_blank(),
-        axis.title.y=element_blank()) 
-  if(!i %in% c(4,8,12,16,20,24,27)) ggenv[[i]] <- ggenv[[i]] + theme(strip.text.y = element_blank())
-  }
-
-ggenv[[28]] <- cowplot::get_legend(ggplot(data=CWM.env %>%
-                                            filter(!is.na(TopHabitat)), 
-                                          aes(x=out, y=trait.value, col=TopHabitat)) + 
-                             geom_point(pch=16, cex=4) + 
-                             scale_color_brewer(palette="Dark2", name="Habitat"))
-plot(ggenv[[28]])
-```
-
-```{r, fig.align="center", fig.width=8, fig.height=11, cache=T, cache.lazy=F, warning=F}
-gridExtra::grid.arrange(ggenv[[1]], ggenv[[2]], ggenv[[3]], ggenv[[4]], ncol=4, widths=c(1,1,1,1.2))
-gridExtra::grid.arrange(ggenv[[5]], ggenv[[6]], ggenv[[7]], ggenv[[8]], ncol=4, widths=c(1,1,1,1.2))
-gridExtra::grid.arrange(ggenv[[9]], ggenv[[10]], ggenv[[11]], ggenv[[12]], ncol=4, widths=c(1,1,1,1.2))
-gridExtra::grid.arrange(ggenv[[13]], ggenv[[14]], ggenv[[15]], ggenv[[16]], ncol=4, widths=c(1,1,1,1.2))
-gridExtra::grid.arrange(ggenv[[17]], ggenv[[18]], ggenv[[19]], ggenv[[20]], ncol=4, widths=c(1,1,1,1.2))
-gridExtra::grid.arrange(ggenv[[21]], ggenv[[22]], ggenv[[23]], ggenv[[24]], ncol=4, widths=c(1,1,1,1.2))
-gridExtra::grid.arrange(ggenv[[25]], ggenv[[26]], ggenv[[27]], ncol=4, widths=c(1,1,1.2,1))
-```
-
-
-  
-```{r, warning=F, message=F, fig.aligh="center", fig.height=6, fig.width=6, eval=F, echo=F}
-# 6 PCA of CWMs
-## PCA on all traits 
-#CWM.wide <- reshape2::dcast(CWM.sRoot %>% 
-#                              dplyr::select(PlotObservationID:trait.value) %>%
-#                              filter(PlotObservationID %in% 
-#                                       names(cluster.5$cluster[cluster.5$cluster==cl])), 
-#                            PlotObservationID~variable, value.var = "trait.value", fill = NA) %>%
-#  filter(!is.na(rowSums(.)))
-#(CWM.pca <- rda(CWM.wide[,-1], scale=T))
-#ordiplot(CWM.pca)
-#plot(envfit(CWM.pca, CWM.wide[,-1]))
-
-
-## PCA on the most complete 4 traits only
-
-##selected env variables
-env.sRoot.sel <- env.sRoot %>%
-                  filter(env.sRoot$PlotID %in% (header.sRoot %>%
-                                    filter(TopHabitat=="Grassland"))$PlotID) %>%
-                  dplyr::select(PlotID, bio01, bio12, PHIHOX, ORCDRC, CLYPPT, CECSOL, N.est) %>% 
-                  rename(MAT=bio01, MAP=bio12, pH=PHIHOX, OM=ORCDRC, Clay=CLYPPT, CEC=CECSOL, N=N.est) %>%
-                  filter(pH != 0) %>%
-                  filter(CEC != 0) %>%
-                  drop_na %>%
-                  mutate_at(vars(MAT:N),
-                            .funs=~scale(.))
-CWM.wide2 <- reshape2::dcast(CWM.sRoot %>%
-                               filter(variable %in% c("Root_diameter", "SRL", "rootN", "RTD")), 
-                             PlotObservationID~variable, value.var = "trait.value", fill = NA) %>%
-                    filter(PlotObservationID %in% env.sRoot.sel$PlotID) %>%
-  filter(!is.na(rowSums(.))) %>%
-  arrange(PlotObservationID)
-
-env.sRoot.sel <- env.sRoot.sel %>%
-  filter(PlotID %in% CWM.wide2$PlotObservationID) %>%
-  arrange(PlotID)
-
-(CWM.pca2 <- rda(CWM.wide2[,-1], scale=T))
-
-envfit.grasslands <- envfit(CWM.pca2, env.sRoot.sel[,-1])
-biplot (CWM.pca2, display = 'species', scaling = 'species', choices=c(1,2))
-plot(envfit.grasslands)
-```
 
-# 7 Export
+# 4 Export & SessionInfo
 
 ```{r, eval=T}
-save( root_traits, DT.sRoot08, CWM.sRoot, header.sRoot, data.origin, env.sRoot,  #CWM.pca2, #habitats.sRoot,
-      file="_derived/sRoot_sPlot.RData" )
+save( woody_species_traits, DT.xylem, CWM.xylem, header.xylem, 
+      file="_derived/Xylem_sPlot.RData" )
 sessionInfo()
 ```