diff --git a/code/03_TaxonomicBackbone.Rmd b/code/03_TaxonomicBackbone.Rmd index cb35c519004c1135bf42a7ac8e76b4079afa315c..575af3ee0bb4d0b740085ab0ab900adf9d6221a1 100644 --- a/code/03_TaxonomicBackbone.Rmd +++ b/code/03_TaxonomicBackbone.Rmd @@ -29,7 +29,7 @@ urlcolor: blue **Version:** 1.3 **Changes to Version 1.1** Additional manual cleaning of species names from BJA, UJ and HB. -**Changes to Version 1.2** Changed order of ranking TNRS databases, when a name is matched across more than 1 DB; Using cleaned version of DT table (after stripping non-closed quotation marks). Additionally check with TPL those species that, even if resolved in TNRS, did not return an accepted name. +**Changes to Version 1.2** Changed order of ranking TNRS databases, when a name is matched across more than 1 DB; Using cleaned version of DT table (after stripping non-closed quotation marks). Additionally check with TPL those species that, even if resolved in TNRS, did not return an accepted name. **Changes to Version 1.3** Manual check of names BEFORE matching with TNRS @@ -696,7 +696,8 @@ spec.list.TRY.sPlot <- spec.list.TRY.sPlot %>% mutate(Species=gsub('Zwstr faurea', 'Faurea', Species)) %>% mutate(Species=gsub('Quercus crispla', 'Quercus crispula', Species)) %>% mutate(Species=gsub('Corallorrhiza', 'Corallorhiza', Species)) %>% - mutate(Species=gsub('Brunella vulgaris', 'Prunella vulgaris', Species)) + mutate(Species=gsub('Brunella vulgaris', 'Prunella vulgaris', Species)) %>% + mutate(Species=gsub('Lamprothamnum', 'Lamprothamnium', Species)) ``` A total of `r nrow(spec.list.TRY.sPlot %>% filter(OriginalNames != Species))` species names were modified. Although substantially improved, the species list has still quite a lot of inconsistencies. @@ -1271,7 +1272,7 @@ load("../_derived/TNRS_submit/tnrs.iter3.RData") load("../_derived/TNRS_submit/tnrs.iter4.RData") #Double check of wrong taxa from TNRS -finalcheck <- c("Salix repens subsp. repens var. repens","Hieracium lachenalii") +finalcheck <- c("Salix repens subsp. repens var. repens","Hieracium lachenalii", "Lamprothamnium papulosum") tpl.ncbi.certain <- tpl.ncbi.certain %>% bind_rows(TPL(finalcheck)) @@ -1291,6 +1292,8 @@ Backbone <- spec.list.TRY.sPlot %>% dplyr::select(Name_sPlot_TRY, Name_string_corr1, Name_string_corr2, Source, Name_submitted) %>% rename(sPlot_TRY=Source) %>% left_join(tnrs.res.certain %>% + #filter out wrongly matches species + filter(!Name_submitted %in% finalcheck) %>% #filter(!is.na(Accepted_name)) %>% bind_rows(tnrs.res.iter2.certain) %>% bind_rows(tnrs.ncbi.certain) %>% @@ -2012,7 +2015,7 @@ algae_diatoms <- c("Sargassaceae", "Chordaceae", "Cocconeidaceae", "Desmarestiac "Chordariaceae", "Dinobryaceae", "Diploneidaceae", "Ectocarpaceae", "Fragilariaceae","Sphacelariaceae","Vaucheriaceae" , "Amphipleuraceae", "Fucaceae", "Gomphonemataceae", "Melosiraceae", - "Laminariaceae","Acinetosporaceae" ,"Botryochloridaceae", + "Laminariaceae","Acinetosporaceae" ,"Botryochloridaceae", "Lamprothamnium", #diatoms below "Thalassiosiraceae", "Cymbellaceae", "Naviculaceae","Bacillariaceae") diff --git a/code/04_buildHeader.Rmd b/code/04_buildHeader.Rmd index b6119120049c648e2e7449b78ec2c5b2480db468..3aae24579880c85674d835ea7e272cc011cceced 100644 --- a/code/04_buildHeader.Rmd +++ b/code/04_buildHeader.Rmd @@ -10,17 +10,19 @@ output: html_document  </center> -MEMO : Exclude plots from CANADA and adjust DT2, CWMs & SoilClim (at the moment code is here, but eval=F) -MEMO : There are ~2000 without country information from these datasets: -EcoPlant Db, Germany_vegetweb, Greece_nat2000, Russia_volga, Spain_sivim_sclerophyllous, Spain_sivim_sclerophyllous_pinus, Egypt Nile delta \newline **Timestamp:** `r date()` **Drafted:** Francesco Maria Sabatini -**Revised:** -**version:** 1.0 +**Revised:** Helge Bruelheide +**Version:** 1.1 -This report documents the construction of the header file for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens. +This report documents the construction of the header file for sPlot 3.0. It is based on dataset sPlot_3.0.2, received on 24/07/2019 from Stephan Hennekens. + +*Changes in version 1.1.* +1) Excluded plots from Canada, as recommended by Custodian +2) Filled missing info from most of the ~2000 plots without country information from these datasets. +3) Corrected mismatched sBiomes and ecoregions ```{r results="hide", message=F, warning=F} @@ -50,6 +52,8 @@ rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp") ``` +## 1 Import data +Import header data. Clean header data from quotation and double quotation marks from linux terminal. ```{bash, eval=F} # escape all double quotation marks. Run in Linux terminal #sed 's/"/\\"/g' sPlot_3_0_2_header.csv > sPlot_3_0_2_header_test.csv @@ -57,12 +61,9 @@ rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp") #more general alternative in case some " are already escaped ##first removing \s before all "s, and then adding \ before all ": #sed 's/\([^\\]\)"/\1\\\"/g; s/"/\\"/g' - - ``` -## 1 Import data -Import header data +Import cleaned header data. ```{r import} header0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_header_test.csv", locale = locale(encoding = 'UTF-8'), delim="\t", col_types=cols( @@ -148,9 +149,9 @@ Fadum01, 02 & 03 - 1707779:1707781 Faers01 - 1707782 Pfe-f-08 - 1707849 Pfe-o-05- 1707854 -```{r, eval=F} +```{r, eval=T} header0 <- header0 %>% - filter(PlotObservationID != c(1707776, 1707779:1707782, 1707849, 1707854)) + filter(!PlotObservationID %in% c(1707776, 1707779:1707782, 1707849, 1707854)) ``` @@ -312,7 +313,7 @@ header <- header %>% ## 4 Assign plots to spatial descriptors Create spatial point dataframe for sPlot data to intersect with spatial layers -```{r} +```{r, eval=F} header.shp <- header %>% filter(!is.na(Longitude) | !is.na(Latitude)) header.shp <- SpatialPointsDataFrame(coords= header.shp %>% @@ -323,6 +324,16 @@ header.shp <- SpatialPointsDataFrame(coords= header.shp %>% `GIVD ID`=header.shp$`GIVD ID`)) writeOGR(header.shp, dsn="../_derived/", layer="header.shp", driver="ESRI Shapefile", overwrite_layer = T) ``` +Reimport shapefile +```{r} +header.shp <- readOGR("../_derived/header.shp.shp") +header.shp@data <- header.shp@data %>% + rename(PlotObservationID=PltObID, + loc.uncert=lc_ncrt, + `GIVD ID`=GIVD_ID) +crs(header.shp) <- CRS("+init=epsg:4326") +``` + ### 4.1 Assign to Continents Download and manipulate map of continents @@ -377,14 +388,14 @@ Reload, manipulate continent and attach to header ```{r} load("../_derived/continent.out") header <- header %>% - left_join(header %>% - filter(!is.na(Longitude) | !is.na(Latitude)) %>% + left_join(header.shp@data %>% dplyr::select(PlotObservationID) %>% - bind_cols(continent.out), + bind_cols(continent.out), by="PlotObservationID") %>% mutate(CONTINENT=factor(continent, levels=c("Africa", "Antarctica", "Australia", "Eurasia", "North America", "South America"), labels=c("AF", "AN", "AU", "EU", "NA", "SA"))) %>% + dplyr::select(-continent) ``` Summarize @@ -425,10 +436,13 @@ stopCluster(cl) Reimport output and join to header ```{r, message=F, warning=F} sBiome.files <- list.files("../_derived/sBiomes", pattern="sBiomes-[0-9]+.csv", full.names=T) +sBiome.files <- sBiome.files[order(as.numeric(str_extract(sBiome.files, pattern="[0-9]+")))] sBiomes.out <- do.call(rbind, lapply(sBiome.files, function(x) {read_csv(x)})) +sBiomes.out <- header.shp@data %>% + dplyr::select(PlotObservationID) %>% + bind_cols(sBiomes.out) header <- header %>% left_join(sBiomes.out %>% - rename(PlotObservationID=X1) %>% dplyr::select(PlotObservationID, Name, BiomeID) %>% dplyr::rename(sBiome=Name, sBiomeID=BiomeID), by="PlotObservationID") @@ -446,6 +460,8 @@ knitr::kable(header %>% full_width = F, position = "center") ``` + + ### 4.3 Extract WWF Ecoregions Extract ecoregion name and ID from [Ecoregions of the World](https://www.worldwildlife.org/publications/terrestrial-ecoregions-of-the-world). Olson et al. 2001 [(BioScience)](https://academic.oup.com/bioscience/article/51/11/933/227116). Computation was performed in EVE HPC cluster using function `A98_PredictorsExtract.R`. Divided in 99 chunks. @@ -477,6 +493,7 @@ stopCluster(cl) Reimport output and join to header ```{r, message=F, warning=F} ecoreg.files <- list.files("../_derived/wwf_ecoregions/", pattern="wwf_terr_ecos-[0-9]+.csv", full.names=T) +ecoreg.files <- ecoreg.files[order(as.numeric(str_extract(ecoreg.files, pattern="[0-9]+")))] ecoreg.out <- do.call(rbind, lapply(ecoreg.files, function(x) {read_csv(x, col_types=cols( .default = col_double(), @@ -484,9 +501,13 @@ ecoreg.out <- do.call(rbind, lapply(ecoreg.files, function(x) {read_csv(x, REALM = col_character(), G200_REGIO = col_character(), eco_code = col_character()))})) + +ecoreg.out <- header.shp@data %>% + dplyr::select(PlotObservationID) %>% + bind_cols(ecoreg.out) + header <- header %>% left_join(ecoreg.out %>% - rename(PlotObservationID=X1) %>% dplyr::select(PlotObservationID, ECO_NAME, ECO_ID) %>% dplyr::rename(Ecoregion=ECO_NAME, EcoregionID=ECO_ID), by="PlotObservationID") @@ -512,7 +533,7 @@ knitr::kable(header %>% Extract elevation for each plot. Loops over tiles of 1 x 1°, projects to mercator, and extract elevation for plot coordinates, as well as 2.5, 50, and 97.5 quantiles for a buffer area having a radius equal to the location uncertainty of each plot (but only if location uncertainty < 50 km). DEM derive from package [elevatr](https://cran.r-project.org/web/packages/elevatr/vignettes/introduction_to_elevatr.html#get_raster_elevation_data), which uses the [Terrain Tiles on Amazon Web Services](https://registry.opendata.aws/terrain-tiles/). Resolutions of DEM rasters vary by region. I set a zoom factor z=10, which corresponds to ~ 75-150 m. Sources are: SRTM, data.gov.at in Austria, NRCAN in Canada, SRTM, NED/3DEP 1/3 arcsec, data.gov.uk in United Kingdom, INEGI in Mexico, ArcticDEM in latitudes above 60°, LINZ in New Zealand, Kartverket in Norway, as described [here](https://github.com/tilezen/joerd/blob/master/docs/data-sources.md). \newline -Split data into tiles of 1 x 1 degrees, and create `sp::SpatialPointsDataFrame` files. Only for plots having a location uncertainty < 50 km. +Split data into tiles of 1 x 1 degrees, and create `sp::SpatialPointsDataFrame` files. Only for plots having a location uncertainty < 50 km, which corresponds to `r header.shp@data %>% mutate(lc_ncrt=abs(loc.uncert)) %>% filter(lc_ncrt <= 50000) %>% nrow()` plots. ```{r create tiles} header.tiles <- header %>% dplyr::select(PlotObservationID, Dataset, Longitude, Latitude, `Location uncertainty (m)`) %>% @@ -665,7 +686,7 @@ summary(elevation.out %>% dplyr::select(-PlotObservationID, -elevation)) ``` -There are `r sum(is.na(elevation.out$Elevation_median))` plots without elevation info, corresponding to `r round(sum(is.na(elevation.out$Elevation_median))/nrow(header)*100,1)`% of total. +There are `r sum(is.na(elevation.out$Elevation_median))` plots without elevation info, corresponding to `r round(sum(is.na(elevation.out$Elevation_median))/nrow(header)*100,1)`% of the number of matched plots. Please not that elevation was extracted only for plots with location uncertainty <50 km, i.e., `r header.shp@data %>% mutate(lc_ncrt=abs(loc.uncert)) %>% filter(lc_ncrt <= 50000) %>% nrow()` plots. There are `r sum(elevation.out$Elevation_median < -1, na.rm=T)` plots with elevation below sea level. \newline Join elevation data (only median) @@ -710,9 +731,42 @@ ggplot(data=mydata) + geom_abline(slope=1, intercept=1, col="Dark green") ``` +## 4.5 Assign to countries +There is a minor number of plots (`r header %>% filter(is.na(Country)) %>% nrow()`), not assigned to any countries. Fix that. +```{r} +countries <- readOGR("../../Ancillary_Data/naturalearth/ne_110m_admin_0_countries.shp") +crs(countries) <- crs(header.shp) +tmp.sel <- header %>% + filter(is.na(Country)) %>% + pull(PlotObservationID) + +header.shp.nocontry <- header.shp[which(header.shp$PlotObservationID %in% tmp.sel),] +countries.out <- over(header.shp.nocontry, countries) + +header$Country <- as.character(header$Country) +header$Country[tmp.sel] <- countries.out$NAME +header$Country <- as.factor(header$Country) +``` +Plots without country info are now only `r header %>% filter(is.na(Country)) %>% nrow()`. + # 5 Map of plots +Update header.shp +```{r} +header.shp@data <- header.shp@data %>% + left_join(header %>% + dplyr::select(PlotObservationID, sBiome, CONTINENT, + Ecoregion, GIVD.ID=`GIVD ID`), + by="PlotObservationID") + +header.sf <- header.shp %>% + st_as_sf() %>% + st_transform(crs = "+proj=eck4") +``` + + +Basic Map of the world in Eckert projection ```{r, message=F, warning=F} countries <- ne_countries(returnclass = "sf") %>% st_transform(crs = "+proj=eck4") %>% @@ -726,7 +780,7 @@ bb <- ne_download(type = "wgs84_bounding_box", category = "physical", st_transform(crs = "+proj=eck4") %>% st_geometry() -## basic graph of the world in Eckert projection + w3a <- ggplot() + geom_sf(data = bb, col = "grey20", fill = "white") + geom_sf(data = graticules, col = "grey20", lwd = 0.1) + @@ -781,10 +835,6 @@ ggsave(filename="../_pics/PlotDensityLog10_vir.png", width = 15, height = 7, uni Graph of plot location by Dataset ```{r, fig.align="center", fig.width=8, fig.height=4, cache=T, message=F} -header.sf <- header.shp %>% - st_as_sf() %>% - st_transform(crs = "+proj=eck4") - (w4 <- w3a + geom_sf(data=header.sf %>% mutate(GIVD.ID=fct_shuffle(GIVD.ID)), aes(col=factor(GIVD.ID)), pch=16, size=0.8, alpha=0.6) + @@ -794,6 +844,51 @@ header.sf <- header.shp %>% ggsave(filename="../_pics/PlotDistrib_Dark2_shuffle1984.png", width = 15, height = 7, units="in", dpi=300, plot=w4) ## takes ~40' to render ``` +Double check attribution to continents, Biomes and Ecoregions. Do it only on a subset of plots +```{r, fig.align="center", fig.width=8, fig.height=7, message=F} +tmp.sel <- header %>% + group_by(sBiome) %>% + sample_n(1000) %>% + pull(PlotObservationID) + +#sBiomes +(w5 <- w3a + + geom_sf(data=header.sf %>% + filter(PlotObservationID %in% tmp.sel), + aes(col=factor(sBiome)), pch=16, size=0.8, alpha=0.6) + + geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3)) + +#continent +tmp.sel <- header %>% + filter(CONTINENT!="AN") %>% + group_by(CONTINENT) %>% + sample_n(1000) %>% + pull(PlotObservationID) +(w6 <- w3a + + geom_sf(data=header.sf %>% + filter(PlotObservationID %in% tmp.sel), + aes(col=factor(CONTINENT)), pch=16, size=0.8, alpha=0.6) + + geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3)) +``` + + +```{r, fig.align="center", fig.width=8, fig.height=15, message=F} +#Ecoregion - Only 10 random ecoregions tested +tmp.sel <- header %>% + filter(Ecoregion %in% sample(unique(header$Ecoregion), 10)) %>% + pull(PlotObservationID) +(w7 <- w3a + + geom_sf(data=header.sf %>% + filter(PlotObservationID %in% tmp.sel) %>% + mutate(Ecoregion=factor(Ecoregion)), + aes(col=factor(Ecoregion)), pch=16, size=0.8, alpha=0.6) + + geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + + theme(legend.position="bottom")) +``` + + + + ## 6 Fix output and export ```{r} @@ -820,8 +915,8 @@ save(header, file = "../_output/header_sPlot3.0.RData") ``` ```{r message=F, echo=F} -knitr::kable(header %>% slice(1:20), - caption="Example of header (first 20 rows)") %>% +knitr::kable(header %>% sample_n(20), + caption="Example of header (20 random rows shown)") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F, position = "center") ```