diff --git a/01_Extract_data_Project_31.Rmd b/01_Extract_data_Project_31.Rmd
new file mode 100755
index 0000000000000000000000000000000000000000..de59e4fb2735f66450fd26b0cc9ca84514054ac2
--- /dev/null
+++ b/01_Extract_data_Project_31.Rmd
@@ -0,0 +1,363 @@
+---
+title: 'Project #31 - Data Extraction'
+output:
+  html_document: default
+  toc: TRUE
+always_allow_html: yes
+---
+
+
+<center>
+  ![](https://www.idiv.de/fileadmin/content/Files_sDiv/sDiv_Workshops_Photos_Docs/sDiv_WS_Documents_sPlot/splot-long-rgb.png "sPlot Logo")
+</center>
+  
+    
+      
+        
+**Timestamp:** `r date()`  
+**Drafted:** Francesco Maria Sabatini  
+**Version:** 1.0 
+  
+
+This report documents the data extraction for **sPlot project proposal #31** - *The adaptive value of xylem physiology within and across global ecoregions* as requested by Daniel Laughlin and Jesse Robert Fleri 
+
+
+
+
+```{r results="hide", message=F}
+library(tidyverse)
+
+library(knitr)
+library(kableExtra)
+library(viridis)
+library(grid)
+library(gridExtra)
+
+#library(vegan)
+#library(xlsx)
+#library(caret)
+#library(foreign)
+#library(raster)
+
+library(downloader)
+library(sp)
+library(sf)
+library(rgdal)
+library(rgeos)
+
+#save temporary files
+write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron'))
+write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron'))
+#rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp")
+```
+
+Import data from sPlot 3.0
+```{r}
+#Import sPlot data
+load("/data/sPlot/releases/sPlot3.0/header_sPlot3.0.RData")
+load("/data/sPlot/releases/sPlot3.0/DT_sPlot3.0.RData")
+load("/data/sPlot/releases/sPlot3.0/Traits_CWMs_sPlot3.RData")
+load("/data/sPlot/releases/sPlot3.0/SoilClim_sPlot3.RData")
+```
+Import data on xylem traits, provided by Jesse Robert Fleri on October 26th, 2020.
+```{r}
+load("xylem_data.RData")
+```
+
+# 1 Extract plots from sPlot based on species with xylem traits  
+Extract all plots containing at least one species in the xylem list.
+```{r, message=F, results=F, warning=F}
+species_list <- xylem_data$Species
+plot.sel <- DT2 %>%
+  filter(DT2$species %in% species_list) %>%
+  dplyr::select(PlotObservationID) %>%
+  distinct() %>%
+  pull(PlotObservationID)
+
+#exclude plots without geographic information
+header.xylem <- header %>%
+  filter(PlotObservationID %in% plot.sel) %>% 
+  filter(!is.na(Latitude))
+#refine plot.sel
+plot.sel <- header.xylem$PlotObservationID
+
+DT.xylem <- DT2 %>% 
+  filter(taxon_group %in% c("Vascular plant", "Unknown")) %>% 
+  filter(PlotObservationID %in% plot.sel)
+
+```
+
+Out of the `r length(species_list)` species in the sRoot list, `r sum(unique(DT2$species) %in% species_list)` species are present in sPlot, for a total of `r nrow(DT.xylem %>% filter(species %in% species_list))` plots., across `r length(plot.sel)` plots.
+
+# 2 Extract woody species
+This is partial selection, as we don't have information on the growth form of all species in sPlot
+```{r}
+#load list of woody species, as provided to me by Alexander Zizka, within sPlot project #21
+load("../Project_21/_input/evowood_species_list.rda")
+
+#Select all woody species, as well as those without GrowthForm information
+#extract relevant traits from TRY
+woody_species_traits <- sPlot.traits %>%
+  dplyr::select(species, GrowthForm, is.tree.or.tall.shrub, n,
+                starts_with("StemDens"),
+                starts_with("Stem.cond.dens"), 
+                starts_with("StemConduitDiameter"),
+                starts_with("LDCM"),
+                starts_with("SLA"),
+                starts_with("PlantHeight"), 
+                starts_with("Wood"), 
+                starts_with("SpecificRootLength_mean")) %>%
+  filter( (species %in% species_list) |
+          (species %in% synonyms$name_binomial) |
+          grepl(pattern = "tree|shrub", x = GrowthForm) |
+          is.tree.or.tall.shrub==T
+              )
+
+table(woody_species_traits$GrowthForm, exclude=NULL)
+# MEMO: some standardization needed in sPlot 3.0
+# Including the data from A.Zizka in the selection 
+# increases the number of selected species only marginally (from ~21k to ~22k)
+```
+
+```{r, echo=F}
+knitr::kable(woody_species_traits %>% 
+               sample_n(20), caption="Example of gap-filled trait data from TRY (20 randomly selected species") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
+```
+
+Selected traits are:  
+
+- StemDens - 4 - Stem specific density (SSD) or wood density (stem dry mass per stem fresh volume) (g/cm^3)  
+- SLA - 11 - Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA)  
+- PlantHeight - 18 - Plant height (vegetative + generative)  
+- StemDiam - 21 - Stem diameter (m)  
+- Stem.cond.dens - 169 - Stem conduit density (vessels and tracheids) (mm^-2)  
+- StemConduitDiameter - 281 - Stem conduit diameter (vessels, tracheids)_micro (m)  
+- Wood.vessel.length - 282 - Wood vessel element length; stem conduit (vessel and tracheids) element length_micro (m)  
+- WoodFiberLength - 289 - Wood fiber lengths_micro (m) 	
+- SpecificRootLength - 1080 - Root length per root dry mass (specific root length, SRL) (cm/g)  
+
+Codes correspond to those reported in [TRY](https://www.try-db.org/TryWeb/Home.php)
+
+```{r}
+#subset DT.xylem to only retain woody species
+DT.xylem <- DT.xylem %>% 
+  filter(species %in% (woody_species_traits$species))
+nrow(DT.xylem)
+```
+
+
+# 3 Calculate CWMs and trait coverage 
+Calculate CWM and trait coverage for each trait and each plot. Select plots having more than 80% coverage for at least one trait.
+```{r, cache=T,  cache.lazy=F}
+# Merge species data table with traits
+CWM.xylem0 <- DT.xylem %>%
+  as.tbl() %>%
+  dplyr::select(PlotObservationID, species, Relative.cover) %>%
+  left_join(xylem_data %>%
+              dplyr::rename(species=Species) %>%
+              dplyr::select(species, P50, Ks), 
+            by="species")
+
+# Calculate CWM for each trait in each plot
+CWM.xylem1 <- CWM.xylem0 %>%
+  group_by(PlotObservationID) %>%
+  summarize_at(.vars= vars(P50:Ks),
+               .funs = list(~weighted.mean(., Relative.cover, na.rm=T))) %>%
+  dplyr::select(PlotObservationID, order(colnames(.))) %>%
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.value")
+
+# Calculate coverage for each trait in each plot
+CWM.xylem2 <- CWM.xylem0 %>%
+  mutate_at(.funs = list(~if_else(is.na(.),0,1) * Relative.cover), 
+            .vars = vars(P50:Ks)) %>%
+  group_by(PlotObservationID) %>%
+  summarize_at(.vars= vars(P50:Ks),
+               .funs = list(~sum(., na.rm=T)), 
+               .groups="drop") %>%
+  dplyr::select(PlotObservationID, order(colnames(.))) %>%
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.coverage")
+
+# Calculate CWV
+variance2.fun <- function(trait, abu){
+  res <- as.double(NA)
+  #nam <- nam[!is.na(trait)]
+  abu <- abu[!is.na(trait)]
+  trait <- trait[!is.na(trait)]
+  abu <- abu/sum(abu)
+  if (length(trait)>1){
+    # you need more than 1 observation to calculate
+    # skewness and kurtosis
+    # for calculation see 
+    # http://r.789695.n4.nabble.com/Weighted-skewness-and-curtosis-td4709956.html
+    m.trait <- weighted.mean(trait,abu)
+    res <- sum(abu*(trait-m.trait)^2)
+  }
+  res
+}
+
+CWM.xylem3 <- CWM.xylem0 %>%
+  group_by(PlotObservationID) %>%
+  summarize_at(.vars= vars(P50:Ks),
+               .funs = list(~variance2.fun(., Relative.cover))) %>%
+  dplyr::select(PlotObservationID, order(colnames(.))) %>%
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.variance")
+
+## Calculate proportion of species having traits
+CWM.xylem4 <- CWM.xylem0 %>%
+  group_by(PlotObservationID) %>%
+  summarize_at(.vars= vars(P50:Ks),
+               .funs = list(~sum(!is.na(.)))) %>%
+  dplyr::select(PlotObservationID, order(colnames(.))) %>%
+  pivot_longer(-PlotObservationID, names_to="trait", values_to="trait.nspecies")
+
+# Join together
+CWM.xylem <- CWM.xylem1 %>%
+  left_join(CWM.xylem2, by=c("PlotObservationID", "trait")) %>%
+  left_join(CWM.xylem3, by=c("PlotObservationID", "trait")) %>%
+  left_join(CWM.xylem4, by=c("PlotObservationID", "trait")) %>%
+  left_join(CWM.xylem0 %>% 
+              group_by(PlotObservationID) %>%
+              summarize(sp.richness=n()), by=c("PlotObservationID"), 
+            .groups="drop") %>%
+  mutate(trait.coverage.nspecies=trait.nspecies/sp.richness) %>%
+  #filter(trait.coverage>=0.8) %>%
+  arrange(PlotObservationID)
+```
+
+
+
+```{r, echo=F}
+knitr::kable(CWM.xylem[1:20,], caption="Example of CWM data file") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
+```
+
+```{r}
+CWM.xylem08 <- CWM.xylem %>%
+  filter(trait.coverage>=0.8)
+```
+
+```{r, echo=F}
+knitr::kable(CWM.xylem08 %>%
+               group_by(variable) %>%
+               summarize("num.plots"=n()), caption="Number of plots with >=.8 coverage per trait") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
+```
+
+
+```{r}
+#Create list of plots having at least one trait with >=.8 coverage and extract header data
+
+#plot80perc <- (CWM.xylem %>%
+#  dplyr::select(PlotObservationID) %>%
+#  distinct())$PlotObservationID
+
+#DT.xylem08 <- DT.xylem %>%
+#  filter(PlotObservationID %in% header.xylem$PlotID)
+
+#CWM.xylem <- CWM.xylem %>%
+#  filter(PlotObservationID %in% header.xylem$PlotID) 
+```
+
+Completeness of header data
+```{r, echo=F}
+knitr::kable(data.frame(Completeness_perc=colSums(!is.na(header.xylem))/
+                          nrow(header.xylem)*100)[-c(1,2),,drop=F], 
+             caption="Header file - Columns present and % completeness") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
+```
+The process results in `r nrow(header.xylem)` plots selected, for a total of `r nrow(CWM.xylem)` trait * plot combinations. 
+  
+
+Geographical distribution of plots
+```{r, fig.align="center", fig.height=4, fig.width=6, message=F, warning=F}
+countries <- map_data("world")
+ggworld <- ggplot(countries, aes(x=long, y=lat, group = group)) +
+  geom_polygon(col=NA, lwd=3, fill = gray(0.9)) +
+  geom_point(data=header.xylem, aes(x=Longitude, y=Latitude, group=1), col="red", alpha=0.5, cex=0.7, shape="+") + 
+  theme_bw()
+ggworld
+```
+
+
+Summarize data across data sets in sPlot, and create list of data custodians
+```{r, warning=F, message=F}
+db.out <- fread("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") %>%
+  dplyr::select(`GIVD ID`, Custodian)
+data.origin <- header.xylem %>% 
+   group_by(`GIVD ID`) %>% 
+   summarize(Num.plot=n()) %>%
+   left_join(db.out)
+```
+
+```{r, echo=F}
+knitr::kable(data.origin, caption="Data Origin") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
+```
+
+The data derive from `r nrow(data.origin)` datasets.  
+
+# 4 Extract climate and soils data  
+
+```{r}
+soilclim.xylem <- soilclim %>% 
+  filter(PlotObservationID %in% plot.sel) %>% 
+  rename(Elevation=Elevation_median, -Elevation_q2.5, -Elevation_q97.5)
+```
+
+```{r, echo=F}
+knitr::kable(soilclim.xylem %>% 
+               sample_n(20), caption="Example of climatic and soil variables for 20 randomly selected plots. All values represent the median in a circle centered on the plot coordinates, having a radius equal to the plot's location uncertainty (capped to 50 km for computing reasons)") %>%
+    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center")
+```
+
+The procedure used to obtain these environmental predictors is described [here](https://git.idiv.de/fs40gaho/splot3_build/-/raw/master/public/05_ExtractEnvironment.html?inline=false) (Click to download the report)
+
+
+Bioclimatic variables (bio01-bio19) derive from [CHELSA](http://chelsa-climate.org/bioclim/)  
+
+Codes:  
+
+Bio1 = Annual Mean Temperature  
+Bio2 = Mean Diurnal Range  
+Bio3 = Isothermality  
+Bio4 = Temperature Seasonality  
+Bio5 = Max Temperature of Warmest Month  
+Bio6 = Min Temperature of Coldest Month  
+Bio7 = Temperature Annual Range  
+Bio8 = Mean Temperature of Wettest Quarter  
+Bio9 = Mean Temperature of Driest Quarter  
+Bio10 = Mean Temperature of Warmest Quarter   
+Bio11 = Mean Temperature of Coldest Quarter  
+Bio12 = Annual Precipitation  
+Bio13 = Precipitation of Wettest Month  
+Bio14 = Precipitation of Driest Month  
+Bio15 = Precipitation Seasonality  
+Bio16 = Precipitation of Wettest Quarter  
+Bio17 = Precipitation of Driest Quarter  
+Bio18 = Precipitation of Warmest Quarter  
+Bio19 = Precipitation of Coldest Quarter  
+  
+\newline \newline
+
+Soil variables (5 cm depth) derive from the [ISRIC dataset](https://www.isric.org/), downloaded at 250-m resolution 
+  
+CECSOL                     Cation Exchange capacity of soil  
+CLYPPT                     Clay mass fraction in %  
+CRFVOL                     Coarse fragments volumetric in %  
+ORCDRC                     Soil Organic Carbon Content in g/kg  
+PHIHOX                     Soil pH x 10 in H20  
+SLTPPT                     Silt mass fraction in %  
+SNDPPT                     Sand mass fraction in %  
+BLDFIE                     Bulk Density (fine earth) 	in kg/m3  
+P.ret.cat                  Phosphorous Retention - Categorical value, see [ISRIC 2011-06](https://www.isric.org/documents/document-type/isric-report-201106-global-distribution-soil-phosphorus-retention-potential)  
+\newline \newline
+
+
+# 4 Export & SessionInfo
+
+```{r, eval=T}
+save( woody_species_traits, DT.xylem, CWM.xylem, header.xylem, 
+      file="_derived/Xylem_sPlot.RData" )
+sessionInfo()
+```