read_sPlot_TRY_HB13.R

##############################
### read in sPlot database ###
##############################
rm(list=ls())    
gc()

#path.sPlot <- "/home/oliver/Dokumente/PhD/PostPhD/IDiv/sDiv/sPlot/Analyses/Data/Species/sPlot/sPlot_20_11_2014/"
#path.sPlot <- "C:\\Daten\\iDiv\\sPlot\\2014_December\\"
#path.sPlot <- "D:\\Helge\\sPlot\\"
getwd()
#path.sPlot <- "/data"
#path.sPlot <- "/home/helge/sPlot2/"

#path.sPlot <- "/home/oliver/shared/Backbone_v.2/"
# type in FileZilla in the server path: /home/oliver/shared
#setwd(path.sPlot)

##### DATA ####
sPlot3 <- read.csv(paste(getwd(),"/data/sPlot_14_04_2015_species.csv", 
                         sep = ""), sep="\t", quote = "\"'")
str(sPlot3) # 24241941 obs. of  11 variables:
# column 11 in sPlot_08_04_2015_species.csv is "x_"
# This numeric variable either holds basal area (BA), individual count (IV)
# importance value (IV), per cen frequency (PF) or stem count (SC)
# Which type of information is given, is shown in the Cover.code variable:
levels(sPlot3$Cover.code)
# .... "x"    "x_BA" "x_IC" "x_IV" "x_PF" "x_SC"
# here, "x" stands for p/a data.
# In the first steps of processing the data, the different types of information 
# is combined with the cover values in Cover..
any(is.na(sPlot3$Cover.code)) # T
length(sPlot3$Cover.code[is.na(sPlot3$Cover.code)]) #  30252

###### Backbone #####
load("backbone.v.2.splot.try3.Rdata")
# alternatively:
#sPlot3 <- read.csv(paste(path.sPlot, "backbone.v.2.splot.try3.csv", 
#                         sep = ""), sep=",", quote = "\"'")
str(backbone.splot.try3) 
#data.frame':	122901 obs. of  33 variables:
'$ Name_number                 : int  1 2 3 4 5 6 7 8 9 10 ...
$ names.sPlot.TRY             : chr  "?" "0" "[1269 Chlorophytum platt]" "[1284 Echinochloa]" ...
$ names.corr.string           : chr  "?" "0" "[1269 Chlorophytum platt]" "[1284 Echinochloa]" ...
$ sPlot.TRY                   : chr  "S" "S" "S" "S" ...
$ Name_submitted              : chr  "Spermatophyta sp." "Spermatophyta sp." "Chlorophytum sp. [1269]" "Echinochloa sp." ...
$ Overall_score               : num  0 0 0.9 0.9 0.9 0.9 0.9 0 0.9 0 ...
$ Name_matched                : chr  "No suitable matches found." "No suitable matches found." "Chlorophytum" "Echinochloa" ...
$ Name_matched_rank           : chr  "" "" "genus" "genus" ...
$ Name_score                  : num  0 0 1 1 1 1 1 0 1 0 ...
$ Family_score                : num  0 0 NA NA NA NA 1 0 1 0 ...
$ Name_matched_accepted_family: chr  "" "" "Asparagaceae" "Poaceae" ...
$ Genus_matched               : chr  "" "" "Chlorophytum" "Echinochloa" ...
$ Genus_score                 : num  0 0 1 1 1 1 NA 0 NA 0 ...
$ Specific_epithet_matched    : chr  "" "" "" "" ...
$ Specific_epithet_score      : num  0 0 NA NA NA NA NA 0 NA 0 ...
$ Unmatched_terms             : chr  "" "" "\"\"sp. [1269]" "\"\"sp." ...
$ Taxonomic_status            : chr  "" "" "Accepted" "Accepted" ...
$ Accepted_name               : chr  "" "" "Chlorophytum" "Echinochloa" ...
$ Accepted_name_author        : chr  "" "" "" "" ...
$ Accepted_name_rank          : chr  "" "" "genus" "genus" ...
$ Accepted_name_url           : chr  "" "" "http://www.theplantlist.org/tpl1.1/search?q=Chlorophytum" "http://www.theplantlist.org/tpl1.1/search?q=Echinochloa" ...
$ Accepted_name_species       : chr  "" "" "" "" ...
$ Accepted_name_family        : chr  "" "" "Asparagaceae" "Poaceae" ...
$ Selected                    : chr  "true" "true" "true" "true" ...
$ Source                      : chr  "" "" "tpl" "tpl" ...
$ Warnings                    : chr  " " " " " " " " ...
$ Manual.matching             : chr  NA NA NA NA ...
$ Status.correct              : chr  "No suitable matches found." "No suitable matches found." "Accepted" "Accepted" ...
$ name.correct                : chr  "No suitable matches found." "No suitable matches found." "Chlorophytum" "Echinochloa" ...
$ rank.correct                : chr  "higher" "higher" "genus" "genus" ...
$ family.correct              : chr  "" "" "Asparagaceae" "Poaceae" ...
$ name.short.correct          : chr  NA NA "Chlorophytum" "Echinochloa" ...
$ rank.short.correct          : chr  "higher" "higher" "genus" "genus" ...
'
'The first column "names.sPlot.TRY" are the "raw" names from sPlot and TRY.
The last five columns are the final outcome of the whole matching procedure: 
"Status.correct",Â  "name.correct",Â  "rank.correct", "family.correct",Â  "name.short.correct",Â  "rank.short.correct".
I guess "name.short.correct" will be the most important column and should be used for most of the species-trait matching and other analyses.
'
length(unique(backbone.splot.try3$name.short.correct)) #  86529

###### Match sPlot3 (Matched.concept) with backbone.splot.try3 (name.short.correct)
index1 <- match(sPlot3$Matched.concept,backbone.splot.try3$names.sPlot.TRY)
any(is.na(index1)) #F Excellent
head(backbone.splot.try3$name.short.correct[index1])
any(is.na(backbone.splot.try3$name.short.correct[index1])) # T
length(sPlot3$Matched.concept[!is.na(backbone.splot.try3$name.short.correct[index1])])
# 24221565
length(sPlot3$Matched.concept[is.na(backbone.splot.try3$name.short.correct[index1])])
# 20376
sPlot3$Matched.concept[is.na(backbone.splot.try3$name.short.correct[index1])]
backbone.splot.try3$name.short.correct[index1][is.na(backbone.splot.try3$name.short.correct[index1])]
# there is NA in the name.short.correct
sPlot3$species <- backbone.splot.try3$name.short.correct[index1]
length(sPlot3$species[is.na(sPlot3$species)])
# 20376

#remove NAs
#sPlot4 <- sPlot3[!is.na(sPlot3$species),]
#str(sPlot4) # 24221565 x 11 var.
#rm(sPlot4)

###### Header ######
#splot.header <- read.csv(paste(path.sPlot, "sPlot_header.csv", sep = ""), 
#                         sep = "\t", na.strings=c("","NA"), fileEncoding = "UTF-8",quote = "")
#splot.header <- read.csv(paste("/home/helge/sPlot2/", "sPlot_header.csv", sep = ""), 
#                         sep = "\t", na.strings=c("","NA"), fileEncoding = "UTF-8",quote = "")
splot.header <- read.csv("sPlot_header.csv", 
                         sep = "\t", na.strings=c("","NA"), fileEncoding = "UTF-8",quote = "")
# without comments

dim(splot.header)
# 1117940      41
search.tab <- regexpr("\t",splot.header$Dataset)
head(search.tab)
table(search.tab)
# search.tab
# -1 
# 1117940 
# -> no tabulator in Dataset ???

str(splot.header)
#tail(splot.header)
write.csv(splot.header[,c("PlotObservationID","Longitude","Latitude")],file = 
            paste("/home/helge/sPlot2/", "sPlot_14_04_2015_coord.csv", 
                  sep = ""), row.names = FALSE)
index30 <- match(sPlot3$PlotObservationID, splot.header$PlotObservationID)
any(is.na(index30)) #F
length(index30) #  24241941
length(unique(index30[!is.na(index30)])) # 1117940
#sPlot3[head(which(is.na(index30))),]

index31 <- match(splot.header$PlotObservationID,sPlot3$PlotObservationID)
any(is.na(index31)) #F
length(index31) #   1117940
#length(index31[is.na(index31)]) # 3
length(unique(index31[!is.na(index31)])) # 1117940
#splot.header[which(is.na(index31)),]
#splot.header <- splot.header[-which(is.na(index31)),]
dim(splot.header)
# 1117940      40
#index32 <- match(splot.header$PlotObservationID,sPlot3$PlotObservationID)
#any(is.na(index32)) #F
identical(as.character(splot.header$PlotObservationID),as.character(sPlot3$PlotObservationID[index31]))
# T
head(cbind(splot.header$PlotObservationID,sPlot3$PlotObservationID[index31]))
head(index31)
#head(splot.header$PlotObservationID)

library(data.table)
DT <- data.table(sPlot3)
str(DT) # 24241941 obs. of  12 variables:
tables()
length(unique(DT$PlotObservationID)) # 1117940
setkey(DT,PlotObservationID)

### Species number (more precisely data entry number, as layers counted multiple times) ###
'plot.species.number <- DT[,list(species.number=length(Cover..)), by=PlotObservationID]
str(plot.species.number) #  1117940
hist(plot.species.number$species.number)
min(plot.species.number$species.number) # 1
max(plot.species.number$species.number) # 797
which(plot.species.number$species.number==max(plot.species.number$species.number))
# 51010
'
### Relative Cover ###
plot.total.cover <- DT[,list(total.cover=sum(Cover..)), by=PlotObservationID]
any(is.na(DT$Cover..)) # F
str(plot.total.cover)

any(is.na(plot.total.cover$total.cover)) # F
length(plot.total.cover$total.cover[plot.total.cover$total.cover==0]) 
# 47504
# this includes only plots, in which none of the species has a cover != 0
# there are also cases, in which some spcies have a cover > 0, but others have
# Cover.code=="x" or =="X_BA" etc. or =="NA"

index1a <- match(DT$PlotObservationID,plot.total.cover$PlotObservationID)
length(index1a) # 24241941
any(is.na(index1a)) # F

DT$Relative.cover <-  DT$Cover../plot.total.cover$total.cover[index1a]
str(DT)
any(is.na(DT$Relative.cover)) # T
hist(DT$Relative.cover)
na.cases <- DT[,list(na.cases=any(is.na(Cover.code))), by=PlotObservationID]
str(na.cases)
table(na.cases$na.cases)
'
FALSE    TRUE 
1107234   10706'
head(na.cases[na.cases==T,],100)
dim(DT[is.na(Relative.cover)]) # 136479     13

na.cases2 <- DT[,list(na.species=any(is.na(species))), by=PlotObservationID]
str(na.cases2)
table(na.cases2$na.species,na.cases$na.cases)
'      
FALSE    TRUE
FALSE 1090935   10663
TRUE    16299      43'
# most NAs either in cover values or species

na.cases3 <- DT[,list(x=any(Cover.code=="x"|Cover.code=="x_BA"|Cover.code=="x_IC"|
                              Cover.code=="x_IV"|Cover.code=="x_PF"|Cover.code=="x_SC"|
                              is.na(Cover.code))), by=PlotObservationID]
# this includes "x" -> p/a as well as different types of relative abundance information
str(na.cases3)
table(na.cases3$x)
' FALSE    TRUE
1061602   56338  '

index2 <- match(DT$PlotObservationID,na.cases3$PlotObservationID)
length(index2) #24241941
any(is.na(index2)) # F

na.cases4 <- DT[,list(x=all(Cover.code=="x"|Cover.code=="x_BA"|Cover.code=="x_IC"|
                              Cover.code=="x_IV"|Cover.code=="x_PF"|Cover.code=="x_SC"|
                              is.na(Cover.code))), by=PlotObservationID]
str(na.cases4)
table(na.cases4$x)
' FALSE    TRUE 
1070436   47504  '
# this matches the number of plots with sum of Cover..==0

plot.total.cover2 <- DT[na.cases3$x[index2]==T, list(total.cover=sum(x_,na.rm=T)), 
                        by=PlotObservationID]
any(is.na(DT$x_)) # T
length(DT$PlotObservationID[is.na(DT$x_)]) #  23915391
# 24241941-23915391= 326550 lines with information in the x_ column
str(plot.total.cover2) #56338 obs. of  2 variables
hist(plot.total.cover2$total.cover)
length(plot.total.cover2$total.cover)
# 56338
any(is.na(plot.total.cover2$total.cover)) # F


index3 <- match(plot.total.cover2$PlotObservationID,plot.total.cover$PlotObservationID)
length(index3) #56338
any(is.na(index3)) # F
identical(plot.total.cover2$PlotObservationID,plot.total.cover$PlotObservationID[index3])
# T

table(plot.total.cover2$total.cover>0,plot.total.cover$total.cover[index3]>0)
' 
FALSE  TRUE
FALSE 31459  8674
TRUE  16045   160
# 47504-16045=31459 -> p/a cases for all species in a plot
# all species are assigned 1/species richness as a value for Cover..
16045  -> get value from x_ for all species in a plot
# Cover.. <- x_, but care with assigning NA values
8834 (8674+160) are mixed cover and p/a cases
8674 -> low Cover..=0.01 is assigned
160 -> rescale all values with x_ to 1 and assign that to Cover..
# can be handled together with 11162!
'

index3a <- match(DT$PlotObservationID,plot.total.cover2$PlotObservationID
                 [!plot.total.cover2$total.cover>0 & !plot.total.cover$total.cover[index3]>0])
length(index3a) #   24241941
length(DT$PlotObservationID[!is.na(index3a)])
# 555792
length(unique(DT$PlotObservationID[!is.na(index3a)]))
# 31459
index3a1 <- match(DT$PlotObservationID[!is.na(index3a)],plot.species.number$PlotObservationID)
length(index3a1) #  555792
length(1/plot.species.number$species.number[index3a1])
identical(DT$PlotObservationID[!is.na(index3a)],plot.species.number$PlotObservationID[index3a1])
# T
DT$Cover..[!is.na(index3a)] <- 1/plot.species.number$species.number[index3a1]
# p/a cases for all species in a plot
# all species are assigned 1/species richness as a value for Cover..

index3b <- match(DT$PlotObservationID,plot.total.cover2$PlotObservationID
                 [plot.total.cover2$total.cover>0 & !plot.total.cover$total.cover[index3]>0])
length(index3b) #    24241941
length(DT$PlotObservationID[!is.na(index3b)])
#  355182
length(unique(DT$PlotObservationID[!is.na(index3b)]))
#  16045
index3b1 <- match(DT$PlotObservationID[!is.na(index3b)],plot.total.cover2$PlotObservationID)
length(index3b1) # 355182
identical(DT$PlotObservationID[!is.na(index3b)],plot.total.cover2$PlotObservationID[index3b1])
# T
any(is.na(plot.total.cover2$total.cover[index3b1])) #F
any(is.na(DT$x_[!is.na(index3b)])) #T
length(DT$x_[!is.na(index3b)][!is.na(DT$x_[!is.na(index3b)])]) #  324922
length(DT$x_[!is.na(index3b)][is.na(DT$x_[!is.na(index3b)])]) # 30260
# -> handle !is.na and is.na separately
# !is.na:
DT$Cover..[!is.na(index3b)][!is.na(DT$x_[!is.na(index3b)])] <- 
  DT$x_[!is.na(index3b)][!is.na(DT$x_[!is.na(index3b)])]
# is.na:
'index3b2 <- match(DT$PlotObservationID[!is.na(index3b)]
[is.na(DT$x_[!is.na(index3b)])],DT$PlotObservationID)
# does not work
length(index3b2) # 1939'
plot.species.number2 <- DT[!is.na(index3b)][is.na(DT$x_[!is.na(index3b)])][,list(species.number=length(Cover..)), by=PlotObservationID]
str(plot.species.number2) # 1944
hist(plot.species.number2$species.number)
min(plot.species.number2$species.number) # 1
max(plot.species.number2$species.number) # 612
which(plot.species.number2$species.number==max(plot.species.number2$species.number))
# 1879
plot.species.number2$PlotObservationID[which(plot.species.number2$species.number==max(plot.species.number2$species.number))]
# 51071
#length(DT$Cover..[index3b2]) # 7505
length(DT$Cover..[!is.na(index3b)][is.na(DT$x_[!is.na(index3b)])]) # 30260
index3b3 <- match(DT$PlotObservationID[!is.na(index3b)][is.na(DT$x_[!is.na(index3b)])],
                  plot.species.number2$PlotObservationID)
length(index3b3) #30260
DT$Cover..[!is.na(index3b)][is.na(DT$x_[!is.na(index3b)])] <- 
  1/plot.species.number2$species.number[index3b3]
# p/a cases for all those species in a plot that do not have x_ values
# all those species are assigned 1/species richness as a value for Cover..


index3c <- match(DT$PlotObservationID,plot.total.cover2$PlotObservationID
                 [!plot.total.cover2$total.cover>0 & plot.total.cover$total.cover[index3]>0])
length(index3c) #    24241941
length(DT$PlotObservationID[!is.na(index3c)])
# 115351
length(unique(DT$PlotObservationID[!is.na(index3c)]))
# 8674
# all non-zero cases:
index3c2 <- which(DT$Cover..[!is.na(index3c)]>0)
length(index3c2) #   92103
length(DT$PlotObservationID[!is.na(index3c)][index3c2]) #  92103
min(DT$Cover..[!is.na(index3c)][index3c2]) # 0.01
hist(DT$Cover..[!is.na(index3c)][index3c2]) 
length(DT$Cover..[!is.na(index3c)][index3c2]) #  92103
plot.total.cover3 <- DT[!is.na(index3c)][index3c2][,list(min.cover=min(Cover..)), by=PlotObservationID]
str(plot.total.cover3) # 8674
hist(plot.total.cover3$min.cover)
min(plot.total.cover3$min.cover) # 0.01
max(plot.total.cover3$min.cover) # 100
# all zero cases:
index3c1 <- which(!DT$Cover..[!is.na(index3c)]>0)
length(index3c1) #  23248
length(DT$PlotObservationID[!is.na(index3c)][index3c1]) # 23248
'index3c11 <- match(DT$PlotObservationID[!is.na(index3c)][index3c1],DT$PlotObservationID)
length(index3c11) # 26111
does not work, does mess up the sequence'
plot.species.number3 <- DT[!is.na(index3c)][index3c1][,list(species.number=length(Cover..)), by=PlotObservationID]
str(plot.species.number3) # 8535
hist(plot.species.number3$species.number)
min(plot.species.number3$species.number) # 1
max(plot.species.number3$species.number) # 52
index3c11 <- match(DT$PlotObservationID[!is.na(index3c)][index3c1],
                   plot.total.cover3$PlotObservationID)
length(index3c11) # 23248
DT$Cover..[!is.na(index3c)][index3c1] <- plot.total.cover3$min.cover[index3c11]
# the smallest cover value is assigned that occurred in the plot

index3d <- match(DT$PlotObservationID,plot.total.cover2$PlotObservationID
                 [plot.total.cover2$total.cover>0 & plot.total.cover$total.cover[index3]>0])
length(index3d) #   24241941
length(DT$PlotObservationID[!is.na(index3d)])
# 6472
length(unique(DT$PlotObservationID[!is.na(index3d)]))
# 160
index3d1 <- match(DT$PlotObservationID[!is.na(index3d)],plot.total.cover2$PlotObservationID)
length(index3d1) # 6472
identical(DT$PlotObservationID[!is.na(index3d)],plot.total.cover2$PlotObservationID[index3d1])
# T
any(is.na(plot.total.cover2$total.cover[index3d1])) #F
any(is.na(DT$x_[!is.na(index3d)])) #T
length(DT$x_[!is.na(index3d)][!is.na(DT$x_[!is.na(index3d)])]) #  1628
length(DT$x_[!is.na(index3d)][is.na(DT$x_[!is.na(index3d)])]) # 4844
# -> handle !is.na and is.na separately
# !is.na:
DT$Cover..[!is.na(index3d)][!is.na(DT$x_[!is.na(index3d)])] <- 
  DT$x_[!is.na(index3d)][!is.na(DT$x_[!is.na(index3d)])]
# is.na:
index3d2 <- match(DT$PlotObservationID[!is.na(index3d)]
                  [is.na(DT$x_[!is.na(index3d)])],DT$PlotObservationID)
length(index3d2) # 4844
plot.species.number4 <- DT[index3d2,list(species.number=length(Cover..)), by=PlotObservationID]
str(plot.species.number4) # 160
hist(plot.species.number4$species.number)
min(plot.species.number4$species.number) # 10
max(plot.species.number4$species.number) # 73
length(DT$Cover..[index3d2]) # 4844
index3d3 <- match(DT$PlotObservationID[index3d2],plot.species.number4$PlotObservationID)
length(index3d3) #4844
DT$Cover..[index3d2] <- 1/plot.species.number4$species.number[index3d3]
# p/a cases for all those species in a plot that do not have x_ values
# all those species are assigned 1/species richness as a value for Cover..

### Test for Cover.. ###
hist(DT$Cover..)
any(is.na(DT$Cover..)) #F
#### VERY GOOD !!!! ####
any(DT$Cover..==0) #F
#### VERY GOOD !!!! ####
length(DT$Cover..[DT$Cover..==0]) # 0

min(DT$Cover..) # 0.001
max(DT$Cover..) # 104692
# check
which(DT$Cover..==max(DT$Cover..)) # 1980770
DT[1980770,]
length(DT$PlotObservationID[DT$Cover..==0]) # 0

plot.total.cover4 <- DT[, list(total.cover=sum(Cover..)), by=PlotObservationID]
str(plot.total.cover4) #1117940 
min(plot.total.cover4$total.cover) #0.001
max(plot.total.cover4$total.cover) #104704.1
any(is.na(plot.total.cover4$total.cover)) # F
length(plot.total.cover4$total.cover[plot.total.cover4$total.cover==0]) 
# 0 !!!
# all plots have cover now

index4 <- match(DT$PlotObservationID,plot.total.cover4$PlotObservationID)
length(index4) #24241941
any(is.na(index4)) # F

# recalculate Relative cover
### CAREFUL! This has to be done by layer now ###
DT$Relative.cover <-  DT$Cover../plot.total.cover4$total.cover[index4]
str(DT)
any(is.na(DT$Relative.cover)) # F
hist(DT$Relative.cover)
# between 0 and 1
#### VERY GOOD !!!! ####

write.csv(DT,file = paste("/home/oliver/shared/", 
                          "plots_without_unified_cover_scales_20160120a.csv", sep = ""), row.names = FALSE)