Homogenized Cover abundance in 06_buildDT

b921e1ba · Francesco Sabatini · 99732162 · b921e1ba
Commit b921e1ba authored 5 years ago by Francesco Sabatini
--- a/code/06_buildDT.Rmd
+++ b/code/06_buildDT.Rmd
@@ -11,10 +11,10 @@ output: html_document
 </center>


-MEMO CHECK Field cover code! It seems to have species characters
-    
-      
-        
+MEMO!! WHAT TO DO WITH LAYER WHEN IT IS CONSISTENTLY ZERO IN A PLOT? CHANGE TO NA?
+WHAT TO DO INSTEAD WHEN LAYER==0 IN A PLOT WHERE LAYER INFO IS OTHERWISE AVAILABLE?
+
+
 **Timestamp:** `r date()`  
 **Drafted:** Francesco Maria Sabatini  
 **Revised:**  
@@ -66,7 +66,7 @@ DT0 <- readr::read_delim("../sPlot_data_export/sPlot_3_0_2_species_test.csv",
 nplots <- length(unique(DT0$PlotObservationID))
 nspecies <- length(unique(DT0$`Matched concept`))
 ```
-Species data include `r nrow(DT0)` species * plot records, across `r nplots` plots and including `r nspecies` non-resolved species.  
+Species data include `r nrow(DT0)` species * plot records, across `r nplots` plots. Before taxonomic resolution, there are `r nspecies` species.  
 \newline


@@ -90,10 +90,14 @@ load("../_output/Backbone3.0.RData")
 ```{r}
 DT1 <- DT0 %>% 
  left_join(Backbone %>% 
-              dplyr::select(Name_sPlot_TRY, Name_short, `Taxon group`) %>%
+              dplyr::select(Name_sPlot_TRY, Name_short, `Taxon group`, Rank_correct) %>%
              rename(`Matched concept`=Name_sPlot_TRY,
                     Taxongroup_BB=`Taxon group`), 
-            by="Matched concept")  
+            by="Matched concept") %>% 
+  # Simplify Rank_correct: pool all infraspecific ranks into the single level "lower"
+  mutate(Rank_correct=fct_collapse(Rank_correct, 
+                                   lower=c("subspecies", "variety", "infraspecies", "race", "forma"))) %>% 
+  mutate(Rank_correct=fct_explicit_na(Rank_correct, "No_match"))  # turn NA ranks into an explicit "No_match" level
 ```

 ## Explore name matching based on Backbone v1.2
@@ -139,7 +143,11 @@ knitr::kable(name.check.freq %>% slice(1:40),

 ## Complete field `taxon group` 

-Coalesce `Taxon group` info from Backbone
+```{r echo=F}
+# Count records (rows of DT1) with and without `Taxon group` information.
+# Rows where `Taxon group` is NA are counted in neither total.
+nknown <- sum(DT1$`Taxon group` != "Unknown", na.rm = TRUE)
+nunknown <- sum(DT1$`Taxon group` == "Unknown", na.rm = TRUE)
+```
+`Taxon group` information is available for only `r nknown` records and absent for `r nunknown`. To improve the completeness of this field, we derive additional information from the `Backbone` and merge it with the data already present in `DT`.
 ```{r}
 table(DT1$`Taxon group`, exclude=NULL)

@@ -152,10 +160,19 @@ DT1 <- DT1 %>%

 table(DT1$`Taxon group`, exclude=NULL)
 ```
+Taxa for which Basal Area measurements exist can be safely assumed to be vascular plants. 

+```{r}
+# Records measured as basal area (`Cover code` "x_BA") are set to "Vascular plant"
+DT1 <- mutate(
+  DT1, 
+  `Taxon group` = replace(`Taxon group`, 
+                          list = `Cover code` %in% "x_BA", 
+                          values = "Vascular plant")
+)

+```

-Cross-complement
+
+
+Cross-complement `Taxon group` information: whenever a taxon is marked as belonging to one group, assign the same taxon to that group throughout the `DT` table.
 ```{r}
 DT1 <- DT1 %>% 
  left_join(DT1 %>% 
@@ -172,6 +189,21 @@ table(DT1$`Taxon group`, exclude=NULL)
 ```

 Check species with conflicting `Taxon group` information and fix manually.
+```{r, eval=F}
+# Genera attributed to more than one `Taxon group`, most conflicting first.
+# NOTE(review): this relies on a `Genus` column; the chunk that attaches genus
+# info appears further down the document - confirm `Genus` exists at this point.
+conflict <- DT1 %>% 
+  filter(!is.na(Name_short), !is.na(`Taxon group`)) %>% 
+  distinct(Genus, `Taxon group`) %>% 
+  count(Genus) %>% 
+  filter(n > 1) %>% 
+  arrange(desc(n)) %>% 
+  pull(Genus)
+```
+
+Manually fix some known problems in `Taxon group` attribution. Some lists of taxa (e.g., `lichen.genera`, `mushroom.genera`) derive from the `Backbone`.
 ```{r}
 #Attach genus info
 DT1 <- DT1 %>% 
@@ -183,14 +215,13 @@ DT1 <- DT1 %>%
            by="Matched concept") %>% 
    mutate(`Taxon group`=fct_collapse(`Taxon group`, 
                                    Alga_Stonewort=c("Alga", "Stonewort")))
-#manually fix some know problems
-mosses.gen <- c("Hypnum", "Brachytheciastrum", 
-           "Brachythecium","Hypnum",  "Zygodon", "Oxymitra", "Bryophyta", "Musci", '\\\"Moos\\\"')
-vascular.gen <- c("Polystichum", "Hypericum", "Peltaria", "Pancovia", "Calythrix", "Ripogonum",
+#manually fix some known problems
+mosses.gen    <- c("Hypnum", "Brachytheciastrum","Brachythecium","Hypnum",  
+                  "Zygodon", "Oxymitra", "Bryophyta", "Musci", '\\\"Moos\\\"')
+vascular.gen  <- c("Polystichum", "Hypericum", "Peltaria", "Pancovia", "Calythrix", "Ripogonum",
                  "Notogrammitis", "Fuscospora", "Lophozonia",  "Rostellularia", 
-                  "Hesperostipa", "Microsorium", 
-                  "Angiosperm","Dicotyledonae", "Spermatophy")
-alga.gen <- c("Chara", "Characeae", "Tonina", "Nostoc", "Entermorpha", "Hydrocoleum" )
+                  "Hesperostipa", "Microsorium", "Angiosperm","Dicotyledonae", "Spermatophy")
+alga.gen      <- c("Chara", "Characeae", "Tonina", "Nostoc", "Entermorpha", "Hydrocoleum" )
 
 DT1 <- DT1 %>% 
  mutate(`Taxon group`=replace(`Taxon group`, 
@@ -212,22 +243,8 @@ DT1 <- DT1 %>%
 table(DT1$`Taxon group`, exclude=NULL)
 ```

-```{r, eval=F, echo=F}
-#check for conflicts in attribution of genera to Taxon groups
-conflict <- DT1 %>% 
-  filter(!is.na(Name_short)) %>% 
-  dplyr::select(Genus, `Taxon group`) %>% 
-  filter(!is.na(`Taxon group`)) %>% 
-  distinct() %>% 
-  group_by(Genus) %>% 
-  summarize(n=n()) %>% 
-  filter(n>1) %>% 
-  arrange(desc(n)) %>% 
-  pull(Genus)
-```
-

-Delete all records of fungi
+Delete all records of fungi, and use lists of genera to fix additional problems. While in the previous round the matching was done on the resolved Genus name, here we match based on the unresolved Genus name.
 ```{r}
 DT1 <- DT1 %>% 
  dplyr::select(-Genus) %>% 
@@ -253,9 +270,14 @@ DT1 <- DT1 %>%

 table(DT1$`Taxon group`, exclude=NULL)
 ```
+```{r echo=F}
+# Records still labelled "Unknown" after all fixes (NA rows are not counted)
+nunknown <- sum(DT1$`Taxon group` == "Unknown", na.rm = TRUE)
+```
+
+After cross-checking all sources of information, the number of records without `Taxon group` information decreased to `r nunknown`.

-Check the most frequent species for which we don't have taxon group info
 ```{r, echo=F, eval=F}
+#Check the most frequent species for which we don't have taxon group info
 DT1 %>% 
  filter(`Taxon group` == "Unknown") %>% 
  group_by(Genus) %>% 
@@ -264,28 +286,84 @@ DT1 %>%
    slice(1:40)
 ```

-Calculate relative cover per layer per species in each plot
+## Calculate relative cover per layer per species in each plot
+
+Species abundance information varies across datasets and plots. While for the large majority of plots abundance values are returned as percentage cover, there is a subset where abundance is returned with different scales. These are marked in the column `Cover code` as follows:
+\newline \newline
+*x_BA* - Basal Area  
+*x_IC* - Individual count  
+*x_SC* - Stem count  
+*x_IV* - Relative Importance  
+*x_RF* - Relative Frequency  
+*x* - Presence absence  
+\newline \newline
+It is not intuitive, however, that whenever `Cover code` belongs to one of the classes above, the actual abundance value is stored in the `x_` column. This stems from the way this data is stored in `TURBOVEG`.  
+To make the cover data more user friendly, I simplify the way cover is stored, so that there are only two columns:  
+`Ab_scale` - to report the type of scale used  
+`Abundance` - to coalesce the cover\\abundance values previously in the columns `Cover %` and `x_`.  
+
 ```{r}
+# Create Ab_scale field: the `Cover code` label when it is a non-percentage scale with a value in `x_`, otherwise "CoverPerc"
 DT1 <- DT1 %>% 
-  mutate(tmp.cover=ifelse(`Cover code` %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF"), 
+  mutate(Ab_scale = ifelse(`Cover code` %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF", "x") & !is.na(x_), 
+                               `Cover code`, 
+                               "CoverPerc")) %>% 
+  mutate(Ab_scale = ifelse(Ab_scale =="x", "pa", Ab_scale))  # relabel the presence/absence code "x" as "pa"
+```
+
+Fix some errors. There are some plots where only p\\a information is available (`Cover code`=="x"), but which have zeros in the field `Cover %`. Consider these as presence\\absence and set `Cover %` to 1.  
+```{r}
+# Set `Cover %` to 1 for every record of plots in which all records have
+# `Cover code` "x" and a `Cover %` of zero
+DT1 <- DT1 %>% 
+  mutate(`Cover %` = replace(
+    `Cover %`, 
+    list = PlotObservationID %in% (
+      DT1 %>% 
+        group_by(PlotObservationID) %>% 
+        summarize(allzero = mean(`Cover %` == 0 & `Cover code` == "x") == 1) %>% 
+        filter(allzero) %>% 
+        pull(PlotObservationID)
+    ), 
+    values = 1
+  ))
+```
+There are also some plots having different cover scales in the same layer. They are not many, and I will reduce their cover value to p\\a.
+```{r}
+# Plots in which at least one layer mixes different abundance scales
+mixed <- DT1 %>% 
+  distinct(PlotObservationID, Ab_scale, Layer) %>% 
+  group_by(PlotObservationID, Layer) %>% 
+  summarize(n=n()) %>% 
+  filter(n>1) %>% 
+  pull(PlotObservationID) %>% 
+  unique()
+length(mixed)
+
+#Transform these plots to p\a: set Ab_scale to "pa" for every record of a mixed plot
+DT1 <- DT1 %>% 
+  mutate(Ab_scale=replace(Ab_scale, 
+                           list=PlotObservationID %in% mixed, 
+                           values="pa")) %>% 
+  #Create additional field Abundance to avoid overwriting original data
+  mutate(Abundance =ifelse(Ab_scale %in% c("x_BA", "x_IC", "x_SC", "x_IV", "x_RF"), 
                          x_, `Cover %`)) %>% 
+  #Records of mixed plots get an abundance of 1 (presence)
+  #The pipeline ends here: a trailing %>% with nothing after it is a parse error
+  mutate(Abundance=replace(Abundance, 
+                           list=PlotObservationID %in% mixed, 
+                           values=1))
+  
+
+```
+
+I then transform abundances to relative abundances, on a layer-by-layer basis. For consistency with the previous version of sPlot, I call the field `Relative.cover`.
+```{r}
+DT1 <- DT1 %>% 
  left_join(x=., 
            y={.} %>%
              group_by(PlotObservationID, Layer) %>% 
-              summarize(tot.cover=sum(tmp.cover)), 
+              summarize(tot.abundance=sum(Abundance)),  # total per plot x layer; NA if any Abundance in the group is NA
            by=c("PlotObservationID", "Layer")) %>% 
-  mutate(Relative.cover=tmp.cover/tot.cover)
-              
-            
+  mutate(Relative.cover=Abundance/tot.abundance)  # share of the layer total; NaN when Abundance and the total are both 0
 ```

-
-
-
 ## Clean DT and export
 ```{r}
 DT2 <- DT1 %>% 
-  dplyr::select(PlotObservationID, Name_short, `Turboveg2 concept`, `Taxon group`, Layer:x_, Relative.cover ) %>% 
+  dplyr::select(PlotObservationID, Name_short, `Turboveg2 concept`, Rank_correct, `Taxon group`, Layer:x_, Ab_scale, Abundance, Relative.cover ) %>% 
  rename(species_original=`Turboveg2 concept`, 
         species=Name_short,
         taxon_group=`Taxon group`, 
@@ -293,7 +371,7 @@ DT2 <- DT1 %>%
         cover_code=`Cover code`)
 ```

-The output of the DT table contains `r nrow(DT2)` records, over `r length(unique(DT2$PlotObservationID))` plots. The total number of taxa is `r length(unique(DT2$Species_original))` and `r length(unique(DT2$Species_matched))`, before and after standardization, respectively. Information on the `Taxon group` is available for `r DT2 %>% filter(Taxon_group!="Unknown") %>% distinct(Species_matched) %>% nrow()` standardized species.
+The output of the DT table contains `r nrow(DT2)` records, over `r length(unique(DT2$PlotObservationID))` plots. The total number of taxa is `r length(unique(DT2$species_original))` and `r length(unique(DT2$species))`, before and after standardization, respectively. Information on the `Taxon group` is available for `r DT2 %>% filter(taxon_group!="Unknown") %>% distinct(species) %>% nrow()` standardized species.

 ```{r}
 save(DT2, file = "../_output/DT_sPlot3.0.RData")