library(tidyverse)

load("/data/sPlot/releases/sPlot2.1/DT2_20161025.RData")
DT2.0 <- DT2
##### Filter out all species entry of non-vascular plants #####
#use taxon groups from sPlot 3.0
load("/data/sPlot/releases/sPlot3.0/DT_sPlot3.0.RData")
DT3.0 <- DT2
rm(DT2)
dt3.species <- DT3.0 %>% 
  dplyr::rename(Taxon.group=taxon_group) %>% 
  distinct(species, species_original, Taxon.group) %>% 
  filter(Taxon.group != "Unknown") %>% 
  separate(species, sep=" ", into = c("genus", "species")) %>% 
  dplyr::select(genus, Taxon.group) %>% 
  mutate(Taxon.group=replace(Taxon.group, 
                             list=genus=="Friesodielsia", 
                             values="Vascular plant")) %>% 
  mutate(Taxon.group=replace(Taxon.group, 
                             list=genus=="Lamprothamnus", 
                             values="Alga_Stonewort")) %>% 
  filter(!is.na(genus)) %>% 
  filter(!genus %in% c("Hepatica", "×")) %>% 
  distinct()
  

dt2.species <- DT2.0 %>% 
  distinct(species) %>% 
  mutate(species0 = species) %>% 
#  mutate(Taxon.group=replace(Taxon.group, 
#                             list=Taxon.group=="Unknown", 
#                             values=NA)) %>% 
#  mutate(Taxon.group=replace(Taxon.group, 
#                             list=Taxon.group %in% c("Alga", "Stonewort"), 
#                             values="Alga_Stonewort")) %>% 
  separate(species, sep=" ", into = c("genus", "species")) %>% 
  left_join(dt3.species, by="genus") %>% 
#  mutate(Taxon.group=coalesce(Taxon.group.x, as.character(Taxon.group.y))) %>% 
  dplyr::select(species0, Taxon.group) %>% 
  rename(species=species0)

DT2 <- DT2.0 %>% 
  mutate(Taxon.group=replace(Taxon.group, 
                               list=Taxon.group=="Unknown", 
                             values=NA)) %>% 
  mutate(Taxon.group=replace(Taxon.group, 
                               list=Taxon.group %in% c("Alga", "Stonewort"), 
                               values="Alga_Stonewort")) %>% 
  left_join(dt2.species, by="species") %>% 
  #coalesce prioritizing sPlot 3.0
  mutate(Taxon.group=coalesce(as.character(Taxon.group.y), as.character(Taxon.group.x))) %>% 
  dplyr::select(-Taxon.group.x, -Taxon.group.y) %>% 
  filter(!Taxon.group %in% c("Alga_Stonewort", "Lichen", "Moss"))


### exclude all taxa matched at family or higher level #ending "...aceae or ...psida"
DT2.out <- DT2 %>% 
  filter(!grepl(pattern="[A-Za-z]*aceae$|[A-Za-z]*opsida$|^[A-Za-z]*ales$|^[A-Za-z]*phyta$", species))
  
##  filter(species %in% DT2 %>% 
#           distinct(species) %>% 
#           filter(!grepl(pattern="[A-Za-z]*aceae$", species)) %>% 
#           filter(!grepl(pattern="[A-Za-z]*opsida$", species)) %>% 
#           filter(!grepl(pattern="^[A-Za-z]*ales$", species)) %>% 
#           filter(!grepl(pattern="^[A-Za-z]*phyta$", species)) %>% 
#           pull(species))

DT2 <- DT2.out
save(DT2, file = "../sPlot/_derived_data/DT2_20161025_filtered.RData")





#### Add column tree/shrub  ####
load("../sPlot/_derived_data/DT2_20161025_filtered.RData")
load("/data/sPlot2.0/TRY.all.mean.sd.3.by.genus.species.tree.Rdata")

gf <- TRY.all.mean.sd.3.by.genus.species.tree %>% 
  dplyr::select(species=StandSpeciesName,is.tree.or.tall, is.shrub) %>% 
  filter(complete.cases(.))
## only cover ~half of the species

load("/data/sPlot/releases/sPlot3.0/Traits_CWMs_sPlot3.RData")
gf2 <- sPlot.traits %>% 
  mutate(is.shrub=NA) %>% 
  mutate(is.shrub=replace(is.shrub, 
                          list=str_detect(GrowthForm, "shrub"), 
                          values=T)) %>% 
  dplyr::select(species, is.shrub, is.tree.or.tall.shrub) %>% 
  filter(complete.cases(.))
  
  
DT2 <- DT2 %>% 
  left_join(gf, by="species") %>%
  left_join(gf2, by="species") %>% 
  mutate(is.tree.or.tall=coalesce(is.tree.or.tall, is.tree.or.tall.shrub)) %>% 
  mutate(is.shrub=coalesce(is.shrub.x, is.shrub.y)) %>% 
  dplyr::select(-is.tree.or.tall.shrub, -is.shrub.x, -is.shrub.y)

tmp <- DT2 %>% distinct(species, is.tree.or.tall, is.shrub)
table(tmp$is.tree.or.tall, tmp$is.shrub, exclude=NULL)
"        FALSE  TRUE  <NA>
  FALSE 15222  3535     0
  TRUE   6571  1498     0
  <NA>      0     0 31062"  
  
save(DT2, file = "../sPlot/_derived_data/DT2_20161025_filtered.RData")

### use info from header. Assign to is.tree.or.tall all entries from plots
### where only woody species were sampled
load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData")
source("A96_fixheaderPaper11.R")
header <- fix.header11(header)
plants.recorded <- header %>% 
  dplyr::select(PlotObservationID, plants_recorded2) %>% 
  filter(plants_recorded2 != "complete")

gf.complement <- DT2 %>% 
  filter(PlotObservationID %in% (plants.recorded  %>% 
                                   pull(PlotObservationID))) %>% 
  left_join(plants.recorded, by="PlotObservationID") %>% 
  dplyr::select(species, plants_recorded2) %>% 
  distinct(species, .keep_all = T) %>% 
  mutate(is.tree.or.tall2 = ifelse(plants_recorded2=="woody_large", T, NA)) %>% 
  mutate(is.shrub2 = ifelse(plants_recorded2=="woody_large", F, NA)) %>% 
  mutate(is.shrub2 = ifelse(plants_recorded2=="woody_all", T, is.shrub2))
  
  
DT2 <- DT2 %>% 
  left_join(gf.complement, by="species") %>% 
  mutate(is.tree.or.tall=coalesce(is.tree.or.tall, is.tree.or.tall2)) %>% 
  mutate(is.shrub=coalesce(is.shrub, is.shrub2)) %>% 
  dplyr::select(-is.tree.or.tall2, -is.shrub2)

tmp <- DT2 %>% distinct(species, is.tree.or.tall, is.shrub)
table(tmp$is.tree.or.tall, tmp$is.shrub, exclude=NULL)

"        FALSE  TRUE  <NA>
  FALSE 15222  3535     0
  TRUE   9684  1498     0
  <NA>      0  3852 24097"

### check the most species rich genera 
tmp2 <- tmp %>% 
  filter(is.na(is.tree.or.tall) & is.na(is.shrub)) %>% 
  separate(species, into = c("genus", "species")) %>% 
  group_by(genus) %>% 
  summarize(n=n()) %>% 
  arrange(desc(n)) %>% 
  slice(1:200)
sum(tmp2$n) ## 10528 species in the 200 most speciose genera

genera.gf <- c('Carex' = 'h','Astragalus' = 'h','Acacia' = 't','Euphorbia' = NA,'Silene' = 'h',
               'Eucalyptus' = 't','Ranunculus' = 'h','Rubus' = 's','Senecio' = 'h','Viola' = 'h',
               'Galium' = 'h','Erica' = 's','Festuca' = 'h','Ficus' = 't','Hypericum' = 'h',
               'Hieracium' = 'h','Cyperus' = 'h','Centaurea' = 'h','Miconia' = 't','Salix' = 't',
               'Solanum' = 'h','Allium' = 'h','Quercus' = 't','Potentilla' = 'h','Juncus' = 'h',
               'Dianthus' = 'h','Veronica' = 'h','Indigofera' = 's','Syzygium' = 't','Artemisia' = 'h',
               'Campanula' = 'h','Inga' = 't','Saxifraga' = 'h','Poa' = 'h','Taraxacum' = 'h',
               'Eugenia' = 't','Trifolium' = 'h','Cirsium' = 'h','Eragrostis' = 'h','Diospyros' = 't',
               'Alyssum' = 'h','Alchemilla' = 'h','Stipa' = 'h','Geranium' = 'h','Polygala' = 's',
               'Pedicularis' = 'h','Psychotria' = 't','Ocotea' = 't','Piper' = 's','Panicum' = 'h',
               'Aconitum' = 'h','Draba' = 'h','Ilex' = 't','Oxytropis' = 'h','Aristida' = 'h',
               'Polygonum' = 'h','Cerastium' = 'h','Minuartia' = 'h','Erigeron' = 'h',
               'Acantholimon' = 'h','Prunus' = 't','Asplenium' = 'h','Croton' = NA,'Limonium' = 'h',
               'Plantago' = 'h','Thymus' = 'h','Rosa' = 's','Rumex' = 'h','Verbascum' = 'h',
               'Astracantha' = 'h','Atriplex' = NA,'Restio' = NA,'Salvia' = 'h','Aspalathus' = 's',
               'Crotalaria' = NA,'Helichrysum' = 'h','Pouteria' = 't','Sedum' = 'h','Crassula' = 'h',
               'Vicia' = 'h','Rhynchospora' = 'h','Gentiana' = 'h','Elymus' = 'h','Oxalis' = 'h',
               'Linum' = 'h','Erysimum' = 'h','Dryopteris' = 'h','Eleocharis' = 'h',
               'Delphinium' = 'h','Ipomoea' = 'h','Melaleuca' = 't','Clematis' = 's',
               'Epilobium' = 'h','Saussurea' = 'h','Crepis' = 'h','Lathyrus' = 'h','Ribes' = 's',
               'Agrostis' = 'h','Pelargonium' = 'h','Tephrosia' = NA)

genera.gf2 <- tmp2$genus
genera.gf2 <- genera.gf2[which(!genera.gf2 %in% names(genera.gf))]
#paste(paste("'", genera.gf2, "' = ''", sep=""), collapse=",")
genera.gf2 <- c('Gypsophila' = 'h','Asperula' = 'h','Paronychia' = 'h','Arenaria' = 'h',
                'Cliffortia' = 's','Thesium' = 's','Achillea' = 'h','Scutellaria' = 'h',
                'Calamagrostis' = 'h','Elaphoglossum' = 'h','Stachys' = 'h','Huperzia' = 'h',
                'Linaria' = 'h','Gentianella' = 'h','Valeriana' = 'h','Phylica' = 's',
                'Stellaria' = 'h','Cardamine' = 'h','Ficinia' = 'h','Fimbristylis' = 'h',
                'Lobelia' = 's','Ornithogalum' = 'h','Heliotropium' = 'h','Iris' = 'h',
                'Diplostephium' = 't','Scorzonera' = 'h','Baccharis' = 's','Asparagus' = 's',
                'Cheilanthes' = 'h','Solidago' = 'h','Thlaspi' = 'h','Lupinus' = 'h',
                'Anemone' = 'h','Aster' = 'h','Genista' = 's','Peucedanum' = 'h',
                'Selaginella' = 'h','Isatis' = 'h','Aethionema' = 'h','Agathosma' = 's',
                'Erodium' = 'h','Parrya' = 'h','Polystichum' = 'h','Tragopogon' = 'h',
                'Wahlenbergia' = 'h','Acanthophyllum' = 's','Anthemis' = 'h','Athyrium' = 'h',
                'Corydalis' = 'h','Eryngium' = 'h','Hermannia' = 's','Hesperis' = 's',
                'Justicia' = 's','Pentacalia' = NA,'Scleria' = 'h','Scrophularia' = 'h',
                'Berberis' = 's','Bupleurum' = NA,'Castilleja' = 'h','Eremogone' = NA,
                'Gnidia' = NA,'Goodenia' = 'h','Myosotis' = 'h','Papaver' = 'h',
                'Pentaschistis' = 'h','Salsola' = 's','Tetraria' = 's','Teucrium' = 's',
                'Crataegus' = 't','Dioscorea' = 'h','Lachemilla' = NA,'Othonna' = NA,
                'Zygophyllum' = NA,'Carduus' = 'h','Colchicum' = 'h','Convolvulus' = 'h',
                'Desmodium' = 'h','Moraea' = 'h','Muraltia' = 's','Nepeta' = 'h',
                'Onosma' = 'h','Xyris' = 'h','Clinopodium' = 'h','Hibiscus' = NA,
                'Hydrocotyle' = 'h','Knautia' = 'h','Leandra' = NA,'Luzula' = 'h',
                'Packera' = 'h','Pteris' = 'h','Ruschia' = 'h','Androsace' = 'h',
                'Centella' = 'h','Consolida' = 'h','Grevillea' = 't','Lampranthus' = 'h',
                'Lilium' = 'h','Phyllanthus' = 'h','Puccinellia' = 'h','Rhamnus' = 't',
                'Sporobolus' = 'h','Thalictrum' = 'h','Angelica' = 'h','Armeria' = 'h',
                'Calceolaria' = 'h','Lactuca' = 'h','Lotus' = 'h','Nassella' = 'h',
                'Platanthera' = 'h','Scabiosa' = 'h','Symphyotrichum' = 'h','Arabis' = 'h',
                'Asclepias' = 'h','Barleria' = 'h','Begonia' = 'h','Crocus' = 'h',
                'Eriocaulon' = 'h','Eupatorium' = 'h','Gynoxys' = NA,'Jurinea' = 'h',
                'Lepidium' = 'h','Orobanche' = 'h','Paspalum' = 'h','Potamogeton' = 'h',
                'Pterostylis' = 'h','Reseda' = 'h','Seseli' = 'h','Chusquea' = 's',
                'Eriogonum' = 'h','Herniaria' = 'h','Melica' = 'h','Myrcia' = NA,
                'Penstemon' = 'h','Plectranthus' = 'h','Selago' = 'h','Tanacetum' = 'h',
                'Tillandsia' = 'h','Bomarea' = NA,'Boronia' = 's','Bromus' = 'h',
                'Echinops' = 'h','Euphrasia' = 'h','Gagea' = 'h','Hymenophyllum' = 'h',
                'Metalasia' = NA,'Phlox' = 'h','Primula' = 'h','Pultenaea' = 's',
                'Scirpus' = 'h','Sida' = 's','Sideritis' = 'h','Swainsona' = NA,
                'Brachyotum' = 's','Cleome' = NA,'Commelina' = 'h','Cuscuta' = 'h',
                'Cynanchum' = 'h','Espeletia' = 's','Halenia' = 'h','Lonicera' = 'h',
                'Muhlenbergia' = 'h','Puya' = NA,'Sesleria' = 'h','Trigonella' = NA, 
                'Ageratina' = 's','Bartsia' = 'h','Blechnum' = 'h','Chlorophytum' = 'h',
                'Conyza' = 'h','Cousinia' = 'h','Dendrobium' = 'h','Ehrharta' = 'h',
                'Epidendrum' = 'h','Epipactis' = 'h','Helianthemum' = 'h','Hibbertia' = 't',
                'Inula' = 'h','Isoetes' = 'h','Lotononis' = NA,'Oenothera' = 'h',
                'Rhododendron' = 's','Rhynchosia' = 'h','Rorippa' = 'h','Sorbus' = 't',
                'Stylidium' = 'h','Symplocos' = 't','Valerianella' = 'h','Asarum' = 'h',
                'Boechera' = 'h','Bulbophyllum' = 'h','Drimia' = 'h','Drosanthemum' = NA,
                'Gnaphalium' = 'h','Liatris' = 'h')

genera.gf <- c(genera.gf, genera.gf2)

tmp3 <- tmp %>% 
  mutate(species2=species) %>% 
  separate(species2, into = c("genus", "sp")) %>% 
  left_join(data.frame(genus=names(genera.gf), gf=genera.gf), 
            by="genus") %>% 
  mutate(is.tree.or.tall=ifelse(gf=="t", T, F)) %>% 
  mutate(is.shrub=ifelse(gf=="s", T, F))

DT2 <- DT2 %>% 
  left_join(tmp3 %>% 
              dplyr::select(species, is.tree.or.tall, is.shrub), 
            by="species") %>% 
  mutate(is.tree.or.tall=coalesce(is.tree.or.tall.x, is.tree.or.tall.y)) %>% 
  mutate(is.shrub=coalesce(is.shrub.x, is.shrub.y)) %>% 
  dplyr::select(-is.tree.or.tall.x, -is.tree.or.tall.y, -is.shrub.x, -is.shrub.y, -plants_recorded2) 

tmp <- DT2 %>% distinct(species, is.tree.or.tall, is.shrub)
table(tmp$is.tree.or.tall, tmp$is.shrub, exclude=NULL)

"        FALSE  TRUE  <NA>
  FALSE 24841  5155     0
  TRUE  10242  1784     0
  <NA>      0  3256 12610"

save(DT2, file = "../sPlot/_derived_data/DT2_20161025_filtered2.RData")

###### calculate total matrix Mij on the whole database ######
#### resample by species (20 plots per species) to calculate Mij more quickly

DT2 <- DT2 %>% 
  dplyr::rename(SPECIES_NR=species, RELEVE_NR=PlotObservationID)

length(unique(DT2$SPECIES_NR)) 
### delete rare species (<=10 occurrences)
comm.sp <- (DT2 %>%
              group_by(SPECIES_NR) %>%
              summarize(n.occurrences=n()) %>%
              filter(n.occurrences>5) %>%
              dplyr::select(SPECIES_NR))$SPECIES_NR
DT.norare <- as.tbl(DT2) %>%
  filter(SPECIES_NR  %in% comm.sp)
#### subset DT.norare           
#set.seed(1984)
source("../../Ancillary_Functions/stratified_fifer.R")
system.time(resampling0 <- stratified(DT.norare %>% mutate(SPECIES_NR=factor(SPECIES_NR)), 
                                      group="SPECIES_NR", size=20, replace=F)) ### ~300 secs
sum(table(resampling0$SPECIES_NR)<20)  ### 4887 species with less than 10 occurrences
length(unique(resampling0$RELEVE_NR)) # 191540  ### unique plots to cast for MiJ
DT.res <- DT.norare %>% 
  filter(RELEVE_NR %in% unique(resampling0$RELEVE_NR)) %>%
  mutate(presabs=(COV_PERC>0)*1) %>%
  mutate(RELEVE_NR=factor(RELEVE_NR)) %>%
  mutate(SPECIES_NR=factor(SPECIES_NR))
nrow(DT.res) # 4963090 rows

## Much faster using sparsMatrix than cast
system.time(DT.matrix.pa.res <- sparseMatrix(as.integer((DT.res$RELEVE_NR)), 
                                             as.integer((DT.res$SPECIES_NR)), 
                                             x = DT.res$presabs, 
                                             dimnames=list(levels(DT.res$RELEVE_NR), levels(DT.res$SPECIES_NR)))
            ) #~2 secs!!!
system.time(Mij.res <- crossprod(DT.matrix.pa.res, DT.matrix.pa.res)) ### ~ 7 secs
system.time(Mij.res <- sweep(Mij.res, 2, diag(Mij.res), FUN="/")) ###  441.245 secs
save(Mij.res, file="../sPlot/_derived_data/Mij_tot_AllPlots_sparse_filtered.rData")






##### Additional Figures and Tables ######

library(tidyverse)
library(sp)
library(rgdal)
library(viridis)
library(sf)
library(rnaturalearth)
library(dggridR)
library(cowplot)

load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData")
### run Header.fix!
source("A96_fixheaderPaper11.R")
header.filtered <- fix.header11(header)
header.filtered <- header.filtered %>% 
  mutate(rel.area=cut(`Relevé area (m²)`, breaks=c(0,9,50,200,400,800,2000,Inf), right=T)) 

# load mydata
load("../sPlot/_derived_data/Resample1/Mydata_global.RData")
mydata <- mydata %>%
  #mutate(isforest=as.numeric(isforest)) %>% ##classification based on land cover
  mutate(isforest=ifelse(isforest, "for", "nonfor")) %>%
  mutate(isforest=factor(isforest)) %>% 
  mutate(CONTINENT=factor(CONTINENT))
### Relevées used in each iteration for BRTs
nrows <- 99
set.seed(999)
## stratified by biome, forest, realm, releve area - each group is capped to 100 relevées
rel.list <- lapply(1:nrows, function(x){mydata %>% 
    group_by(sBiomeName, isforest, REALM, cut(Rel.area, c(0,150, 600, 1200, Inf))) %>% 
    sample_frac(1) %>% 
    slice(1:100) %>% 
    #sample_n(100, replace=T) %>% 
    ungroup() %>% 
    dplyr::select(RELEVE_NR) %>% 
    distinct() %>%
    pull(RELEVE_NR)})

res <- data.frame(PlotObservationID=unlist(rel.list)) %>% 
  group_by(PlotObservationID) %>% 
  summarize(n=n())

header.resampled <- header.filtered %>% 
  filter(PlotObservationID %in% res$PlotObservationID)

###### graph of distribution of plot size across biomes ######
biome.order <- c('Polar and subpolar zone' ,'Alpine' ,'Boreal zone' ,'Temperate midlatitudes' ,
                 'Dry midlatitudes' ,'Dry tropics and subtropics' ,'Subtrop. with year-round rain' ,
                 'Subtropics with winter rain' ,'Tropics with summer rain' ,'Tropics with year-round rain')
biome.labs <- c('Polar &\n subpolar' ,'Alpine' ,'Boreal zone' ,'Temperate\n midlatitudes' ,
                'Dry midlatitudes' ,'Dry tropics\n & subtropics' ,'Subtropics -\n year-round rain' ,
                'Subtropics -\n winter rain' ,'Tropics -\n summer rain' ,'Tropics -\n year-round rain')

plotdistr <- mydata %>% 
  mutate(sBiomeName=factor(sBiomeName, levels=biome.order, labels=biome.labs)) %>% 
  mutate(plot_size=cut(Rel.area, c(0,150, 600, 1200, Inf), labels=c("(0, 150]", "(150, 600]", "(600, 1200]", ">1200"))) %>% 
  group_by(isforest, sBiomeName, plot_size) %>% 
  mutate(isforest=factor(isforest, levels=c("for", "nonfor"), labels=c("Forest", "Non forest"))) %>% 
  summarize(n=n()) %>% 
  complete(isforest, sBiomeName, plot_size, fill=list(n=NA))
  
ggplotsize <- ggplot(data=plotdistr) + 
  geom_bar(aes(x=sBiomeName, y=n, group=plot_size, fill=plot_size), stat="identity", position ="dodge") + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle=45, hjust=1, vjust = 1)) + 
  facet_grid(isforest~.) +
  scale_y_log10(labels = function(x) format(x, scientific = F)) +
  xlab("Biome") + 
  ylab("Number of plots") + 
  scale_fill_discrete(name= bquote(Plot~size~(m^{2})))

ggsave(filename="../sPlot/_derived_data/Resample1/_pics/FigS3_plotsize.png", device = "png", dpi=400, 
       width = 6, height=4, ggplotsize)

###### graph of distribution sampling completeness across biomes ######
completeness_distr <- mydata %>% 
  mutate(plants_recorded=factor(plants_recorded, 
                                levels=c("woody_large", "woody_all", "complete"),
                                labels=c("only trees", "trees & shrubs", "complete vegetation"))) %>% 
  mutate(sBiomeName=factor(sBiomeName, levels=biome.order, labels=biome.labs)) %>% 
  group_by(sBiomeName, plants_recorded, isforest) %>% 
  mutate(isforest=factor(isforest, levels=c("for", "nonfor"), labels=c("Forest", "Non forest"))) %>% 
  summarize(n=n()) %>% 
  complete(isforest, sBiomeName, plants_recorded, fill=list(n=NA))

ggcompleteness <- ggplot(data=completeness_distr) + 
  geom_bar(aes(x=sBiomeName, y=n, group=plants_recorded, fill=plants_recorded), stat="identity", position ="dodge") + 
  theme_classic() + 
  theme(axis.text.x = element_text(angle=45, hjust=1, vjust = 1), 
        legend.position = "bottom") + 
  facet_grid(isforest~.) +
  scale_y_log10(labels = function(x) format(x, scientific = F)) +
  xlab("Biome") + 
  ylab("Number of plots") + 
  scale_fill_discrete(name= "Plants recorded")

ggsave(filename="../sPlot/_derived_data/Resample1/_pics/FigS4_completeness.png", device = "png", dpi=400, 
       width = 7, height=5, ggcompleteness)

##### Table 1 ##### 
### create table of databases with plot number and biblio
#Import databases and create reference tags
library(bib2df)
bib.db <- bib2df("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/sPlot_References.bib")
#Import database-level information
databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv")

# create citation tags that can be picked up by Manubot
databases <- databases %>% 
  left_join(bib.db %>% 
              dplyr::select(BIBTEXKEY, DOI, URL), 
            by="BIBTEXKEY") %>%
  dplyr::select(-DOI, -URL)


# Create Table 1
databases21 <- fread("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out2.1.csv")
header.resampled <- header.resampled %>%
  mutate(Dataset=replace(Dataset, list=Dataset %in% c("Argentina_Chaco_Espinal","Argentina_Cordoba"), 
                         "Argentina_Central")) %>%
  mutate(Dataset=replace(Dataset, list=Dataset=="SIVIM wetlands", 
                         "Spain_sivim_wetlands")) %>%
  
  left_join(databases21 %>% 
              dplyr::rename(Dataset=`label`) %>%
              dplyr::select(`GIVD ID`, Dataset), by="Dataset")


table1 <- header.resampled %>%
  group_by(`GIVD ID`) %>% 
  summarize(contributed_plots=n(), .groups = 'drop') %>% 
  left_join(databases %>% 
              filter(`Still in sPlot`==T, 
                     Via!="Aggregator") %>% 
              dplyr::select(-Via, -`Still in sPlot`) %>% 
              distinct(),
            by="GIVD ID") %>% 
  filter(!is.na(contributed_plots)) %>% 
  dplyr::select(`GIVD ID`, 
                `Dataset name`=`DB_name GIVD`,  
                `Nr. of unique plots used` = contributed_plots, Ref=BIBTEXKEY) %>% 
  distinct() %>% 
  arrange(`GIVD ID`) %>% 
  replace_na(list(Ref=""))


write_csv(table1, "../sPlot/_output/TableS1_Databases.csv")




##### Fig S1 - PLOT distribution of resampled plots ###############
#### Transform to spatial data.frame
mycrs <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" )
header.filtered.sf <- SpatialPointsDataFrame(coords=header.filtered %>% 
                                               dplyr::select(POINT_X, POINT_Y),
                                             proj4string = mycrs, 
                                             data=header.filtered %>% 
                                               dplyr::select(-POINT_X, -POINT_Y)) %>% 
  st_as_sf() %>% 
  st_transform("+proj=eck4")

header.resampled.sf <- header.filtered.sf[which(header.filtered$PlotObservationID %in% res$PlotObservationID),]



#### Get geographic data
countries <- ne_countries(returnclass = "sf") %>% 
  st_transform(crs = "+proj=eck4") %>% 
  st_geometry()
graticules <- ne_download(type = "graticules_15", category = "physical",
                          returnclass = "sf") %>% 
  st_transform(crs = "+proj=eck4") %>% 
  st_geometry()
bb <- ne_download(type = "wgs84_bounding_box", category = "physical",
                  returnclass = "sf") %>% 
  st_transform(crs = "+proj=eck4") %>% 
  st_geometry()

label_parse <- function(breaks) {
  parse(text = breaks)
}

## basic graph of the world in Eckert projection
w3a <- ggplot() +
  #geom_sf(data = graticules, col = "grey20", lwd = 0.1) +
  geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) +
  geom_sf(data = bb, col = "grey20", fill = NA) +
  coord_sf(crs = "+proj=eck4") +
  theme_minimal() +
  theme(axis.text = element_blank(), 
        legend.title=element_text(size=12), 
        legend.text=element_text(size=12),
        legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 
        legend.key.height = unit(1.1, "cm"), 
        legend.key.width = unit(1.1, "cm")) +
  scale_fill_viridis(label=label_parse)

## plots all releves in header
ggAll0 <- w3a + 
  geom_sf(data=header.filtered.sf, aes(col=plants_recorded2), pch="+", cex=0.9) + 
  scale_color_brewer(palette="Pastel1") +
  ggtitle("All filtered plots")

ggsave(filename="../sPlot/_derived_data/Resample1/_pics/AllfilteredPlot.png", device = "png", dpi=400, 
       width = 6, height=4, ggAll0)

## plots all releves with fornonfor info
ggAll <- w3a + 
  geom_sf(data=header.filtered.sf %>% 
            filter(fornonfor!="out") %>% 
            mutate(fornonf=factor(fornonfor)), aes(col=fornonfor), pch="+", cex=0.9) + 
  scale_color_brewer(palette="Set1", direction=-1) +
  ggtitle("All classified plots")

ggsave(filename="../sPlot/_derived_data/Resample1/_pics/AllClassifiedPlots.png", device = "png", dpi=400, 
       width = 6, height=4, ggAll)

## resampled plots
header.resampled.sf <- header.resampled.sf %>% 
  mutate(fornonfor=fct_recode(fornonfor, Forest="for", `Non Forest`="nonfor")) 
ggRes.for <- w3a + 
  geom_sf(data=header.resampled.sf %>% 
            filter(fornonfor=="Forest"), 
          aes(col=fornonfor), pch="+", cex=0.9, alpha=1/6) + 
  scale_color_brewer(palette="Set1", direction=-1, name = "Vegetation type") +
  #guides(color = guide_legend(override.aes= list(alpha = 1, cex=3))) + 
  theme(legend.position="none", 
        legend.background = element_rect(color = NA), 
        plot.title = element_text(hjust=0.5)) + 
  ggtitle(paste("Forest (n = ", header.resampled.sf %>% 
                  filter(fornonfor=="Forest") %>% nrow(), ")", sep=""))
#ggsave(filename="../sPlot/_derived_data/Resample1/_pics/AllResampledPlots.png", device = "png", dpi=600, 
#       width = 6, height=3.5, ggRes)

ggRes.nonfor <- w3a + 
  geom_sf(data=header.resampled.sf %>% 
            filter(fornonfor=="Non Forest"), 
          aes(col=fornonfor), pch="+", cex=0.9, alpha=1/6) + 
  scale_color_brewer(palette="Set1", direction=-1, name = "Vegetation type") +
  #guides(color = guide_legend(override.aes= list(alpha = 1, cex=3))) + 
  theme(legend.position="none", 
    legend.background = element_rect(color = NA), 
    plot.title = element_text(hjust=0.5))+ 
  ggtitle(paste("Non-Forest (n = ", header.resampled.sf %>% 
                  filter(fornonfor=="Non Forest") %>% nrow(), ")", sep=""))



#ggRes.plants_rec <- w3a + 
#  geom_sf(data=header.resampled.sf, aes(col=plants_recorded2), pch="+", cex=0.9) + 
#  ggtitle("All Resampled plots")

header.resampled.sf.res <- header.resampled.sf %>% 
  left_join(res)
ggRes.leverage <- w3a + 
  geom_sf(data=header.resampled.sf.res, aes(col=n), pch="+", cex=0.9) + 
  scale_color_viridis(name="Plot leverage (%)", limits=c(0,100), breaks=scales::pretty_breaks(n=5)) +
  theme(legend.position="bottom", 
        legend.background = element_rect(color = NA), 
        plot.title = element_text(hjust=0.5))+ 
  ggtitle("Plot Leverage")
#ggsave(filename="../sPlot/_derived_data/Resample1/_pics/AllResampledPlots_leverage.png", device = "png", dpi=600, 
#       width = 6, height=4, ggRes.leverage)

FigS1 <- plot_grid(ggRes.for, ggRes.nonfor, ggRes.leverage, 
                   nrow=3, labels = c("A", "B", "C"), rel_heights = c(1,1,1.4))
ggsave(filename="../sPlot/_derived_data/Resample1/_pics/FigS1_AllResampledPlots_and_leverage.png", device = "png", dpi=600, 
       width = 6, height=9, FigS1)



#### PLOT density of all used plots (for vs nonfor) in hexagons ####
#### build hexagon grid
dggs          <- dgconstruct(spacing=200, metric=T, resround='down')
#Get the corresponding grid cells for each earthquake epicenter (lat-long pair)
header.resampled$cell <- dgGEO_to_SEQNUM(dggs, header.resampled$POINT_X, header.resampled$POINT_Y)$seqnum

#Calculate mean metric for each cell
header.resampled.out   <- header.resampled %>% 
  dplyr::select(cell, fornonfor) %>% 
  group_by(cell) %>% 
  summarise(count=log(n(),10),
            count.for=log(sum(fornonfor=="for"),10),
            count.nonfor=log(sum(fornonfor=="nonfor"),10))
#Get the grid cell boundaries for cells 
grid   <- dgcellstogrid(dggs, header.resampled.out$cell, frame=F) %>%
  st_as_sf() %>% 
  mutate(cell = header.resampled.out$cell) %>% 
  mutate(value.out=header.resampled.out$count) %>% 
  st_transform("+proj=eck4") %>% 
  st_wrap_dateline(options = c("WRAPDATELINE=YES"))

## plotting
ggCount <- w3a + 
    geom_sf(data=grid, aes(fill=value.out),lwd=0, alpha=0.9)    +
    geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 
    labs(fill = "# of Plots\n(Log 10)")
ggsave(filename="../sPlot/_derived_data/Resample1/_pics/FigSXXV_AllResampledPlots.png", device = "png", dpi=400, 
       width = 6, height=3.5, ggCount)
    



#### List of plots used by dataset, with publication ####
db21 <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out2.1.csv")
dblatest <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv")
dblist <- header.resampled %>% 
  mutate(Dataset=replace(Dataset, list=Dataset %in% c("Argentina_Chaco_Espinal","Argentina_Cordoba"), 
                         "Argentina_Central")) %>%
  mutate(Dataset=replace(Dataset, list=Dataset=="SIVIM wetlands", 
                         "Spain_sivim_wetlands")) %>% 
  group_by(Dataset) %>% 
  summarize(n=n()) %>% 
  left_join(db21 %>% 
              dplyr::select(`GIVD ID`,  Dataset=label), 
            by="Dataset") %>% 
  left_join(dblatest %>% 
              dplyr::select(`GIVD ID`,`DB_name GIVD`, Citation), 
            by="GIVD ID")

## get missing citations from givd
givd.dois <- jsonlite::fromJSON('https://www.givd.info/api/givd/byMetaDB/splot') %>% 
  as.data.frame %>% 
  dplyr::select(id, doi) %>% 
  filter(id %in% (dblist %>% 
                    dplyr::filter(is.na(Citation)) %>% 
                    pull(`GIVD ID`))) %>% 
  filter(doi!="" & doi!="not") %>% 
  mutate(doi=str_remove(doi, pattern=" http://hdl.handle.net/|https://doi.org/|doi.org/")) %>% 
  mutate(url=paste0("https://doi.org/", doi))

## download bibtex from URLS:
## from https://stackoverflow.com/questions/57340204/r-convert-list-of-dois-to-bibtex
#library(curl)
# urls <- paste0("https://doi.org/", givd.dois %>% pull(doi))
# bib.path <- "../sPlot/_derived_data/Resample1/doibibs.txt"
# h <- new_handle()
# handle_setheaders(h, "accept" = "application/x-bibtex")
# #curl_download(urls[6], destfile = bib.path, handle = h, mode = "a")
# walk(urls, ~ {
#   curl(., handle = h) %>%
#     readLines(warn = T) %>%
#     write(file = bib.path, append = TRUE) 
# })

dblist <- dblist %>% 
  left_join(givd.dois %>% 
              dplyr::select(`GIVD ID`=id, -doi, url)) %>% 
  mutate(Citation=coalesce(Citation, url)) %>% 
  dplyr::select(`GIVD ID`, `Dataset name`=`DB_name GIVD`, `No. Plots`=n, Reference=Citation) %>% 
  arrange(`GIVD ID`) %>% 
  mutate(Reference=ifelse(is.na(Reference), "", Reference))

write_csv(dblist, path = "../sPlot/_derived_data/Resample1/PlotList.csv")




## Test for mistakes in ecoregion attribution
##mydata.test <- mydata %>% 
##  sample_n(10000) %>% 
##  mutate(ECO_NAME=factor(ECO_NAME))
##(test <- sample(levels(mydata.test$ECO_NAME), 1))
##ggplot(data=mydata.test %>% 
##         mutate(key.eco=ECO_NAME == test), 
##       aes(x=POINT_X, y=POINT_Y, col=key.eco, cex=0.5+key.eco)) + 
##  geom_point(pch="+")


############# Not updated below

load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData")
source("/data/sPlot/users/Francesco/_sPlot_Management/Fix.header.R")
header <- fix.header(header, exclude.sophy = F)

#### select only forest plots, having complete vegetation ## as in Paper #1
header2 <- as.tbl(header) %>%
  filter(!is.na(POINT_X)) %>%
  filter(!is.na(POINT_Y)) %>%
  #filter(!is.na(`Relevé area (m²)`) & `Relevé area (m²)` != -1) %>% 
  mutate(`Relevé area (m²)`=replace(`Relevé area (m²)`, 
                                    list=(`Relevé area (m²)` %in% c(0, -1)),
                                    value=NA)) %>%
  filter(`Location uncertainty (m)`<50000) %>% ##later addition
  filter(is.na(`Plants recorded`) | `Plants recorded` %in% c("all vascular plants", "All vascular plants",
                                                             "All vascular plants and bryophytes", "All vascular plants and dominant cryptogams",
                                                             "complete", "Complete vegetation", "Complete vegetation (including non-terricolous tax",
                                                             "Dominant vascular plants", "#N/A")) %>%
  mutate(is.forest=ifelse(is.na(is.forest), F, is.forest)) %>%
  mutate(is.non.forest=ifelse(is.na(is.non.forest), F, is.non.forest)) %>%
  #mutate(releve.forest.sel=(`Relevé area (m²)`>=100 & `Relevé area (m²)`<=1000 & is.forest==T)) %>%
  #mutate(releve.nonfor.sel=(`Relevé area (m²)`>=5 & `Relevé area (m²)`<=100 & is.non.forest==T)) %>%
  mutate(fornonfor=factor(is.forest+(2*is.non.forest), labels = c("out", "for", "nonfor"))) %>%
  dplyr::select(PlotObservationID, Country, `Relevé area (m²)`, `Herbs identified (y/n)`, `Plants recorded`,
                `is.forest`, `is.non.forest`, `BiomeID`, POINT_X:fornonfor)

xseq <- seq(-2, 4, by=1)
FigS0 <- ggplot(data=header2 %>% 
         filter(fornonfor != "out") %>% 
         mutate(fornonfor=fct_drop(fornonfor)) %>% 
         mutate(fornonfor=factor(fornonfor, levels=c("for", "nonfor"), labels=c("Forest", "Non Forest"))), 
       ) + 
  geom_histogram(aes(x=log10(`Relevé area (m²)`), fill=fornonfor), binwidth = 0.5) + 
  facet_grid(fornonfor~.) + 
  scale_fill_brewer(palette="Dark2") +
  scale_x_continuous(breaks=xseq, labels=10^xseq) +
  theme_bw() + 
  theme(legend.position = "none", 
        strip.background = element_blank(), 
        axis.title.y = element_blank()
        )
  
ggsave(filename = "../sPlot/_pics/FigS0_hist_ReleveArea.png", width = 4, height = 6, dpi=300, 
       FigS0)