Skip to content
Snippets Groups Projects
Commit 460c8115 authored by Francesco Sabatini's avatar Francesco Sabatini
Browse files

Brushed up Backbone, Created Taxon group. Manual clean families

parent ae8bd4fd
Branches
No related tags found
No related merge requests found
......@@ -58,8 +58,14 @@ mushroom <- c("Mycena", "Boletus", "Russula","Calocybe","Collybia","Amanita","Am
"Sarcodom","Sarcoscyphus","Scleroderma","Stropharia","Tylopilus","Typhula", "Calyptella", "Chrysopsora", "Lacrymaria", "Dermoloma",
"Agaricus","Alnicola", "Amanitina", "Bovista", "Cheilymenia","Clavulinopsis", "Clitocybe", "Entoloma", "Geaster", "Inocybe",
"Laccaria", "Laetiporus", "Lepista", "Macrolepiota", "Macrolepis", "Marasmius", "Panaeolus", "Psathyrella", "Psilocybe",
"Rickenella", "Sarcoscypha", "Vascellum", "Ramaria",
"Amphoroblasia", "Amphoroblastia", "Agrocybe")
"Rickenella", "Sarcoscypha", "Vascellum", "Ramaria","Amphoroblasia", "Amphoroblastia", "Agrocybe",
"Flammulaster", "Phaeocollybia", "Cortinarius", "Lepiota", "Cystoderma",
"Armillaria", "Athelia", "Ceraceomyces", "Chlorociboria", "Clavariaceae",
"Cystoderma", "Dacrymyces","Dendrographa","Dirina", "Flammulaster","Fomes","Gyrophora",
"Kirschsteiniothelia", "Lasallia","Lepiota","Llimoniella","Mazosia","Microthelia","Mollisia",
"Multiclavula","Phaeocollybia","Phellinus","Plectocarpon","Pleospora","Ramariopsis","Reinkella",
"Roccella","Roccellina","Sigridea","Stereum","Tremella","Tulostoma","Umbilicaria","Unguiculariopsis" ,
"Xanthoconium")
```
......@@ -1200,7 +1206,7 @@ tpl.submit <- tnrs.res.certain %>% filter(is.na(Accepted_name)) %>% dplyr::selec
nrow(tpl.submit)
write_csv(tpl.submit, path="../_derived/TPL/tpl.submit.csv")
#dividve in 99 batches
#divide in 99 batches
# Row indices of the names to submit. seq_len() returns an empty vector when
# tpl.submit has no rows, whereas 1:nrow() would return c(1, 0).
indices <- seq_len(nrow(tpl.submit))
# Split the indices into at most 99 batches: sorting the remainders of the
# division by 99 assigns consecutive rows to the same batch label (0 to 98).
chunks <- split(indices, sort(indices %% 99))
......@@ -1248,11 +1254,17 @@ save(tpl.ncbi.certain, tpl.ncbi.uncertain, file="../_derived/TNRS_submit/tnrs.it
# Merge the resolved species lists into a Backbone
## Read files
```{r, eval = T}
```{r, eval = T, warning=F}
# Restore the objects saved at each of the four name-resolution iterations
load(file = "../_derived/TNRS_submit/tnrs.iter1.RData")
load(file = "../_derived/TNRS_submit/tnrs.iter2.RData")
load(file = "../_derived/TNRS_submit/tnrs.iter3.RData")
load(file = "../_derived/TNRS_submit/tnrs.iter4.RData")

# Double check of taxa that came out wrong from TNRS: resolve them with TPL()
# and append the result to the names already resolved with certainty
finalcheck <- c(
  "Salix repens subsp. repens var. repens",
  "Hieracium lachenalii"
)
tpl.ncbi.certain <- bind_rows(tpl.ncbi.certain, TPL(finalcheck))
```
Combine the `certain` data sets:
......@@ -1300,11 +1312,11 @@ Backbone <- spec.list.TRY.sPlot %>%
levels=c("Accepted","Synonym", "No opinion","Invalid",
"Illegitimate","Misapplied","Rejected name",
"Unresolved"))) %>%
arrange(Taxonomic_status) %>%
slice(1) %>%
#delete empty spaces at end of names
mutate(Accepted_name=gsub(pattern=" $", replacement="", x=Accepted_name)) %>%
mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)) %>%
arrange(Taxonomic_status) %>%
slice(1),
mutate(Accepted_name_species=gsub(pattern=" $", replacement="", x=Accepted_name_species)),
by="Name_submitted")
#Double check
nrow(Backbone) == nrow(spec.list.TRY.sPlot)
......@@ -1318,7 +1330,7 @@ If names were neither resolved at the accepted or synonym level, set `Status_cor
```{r, eval = T}
Backbone <- Backbone %>%
mutate(Status_correct=fct_collapse(Taxonomic_status,
Other=c("No opinion","Invalid",
Other=c("No opinion","Invalid", "Unresolved",
"Illegitimate","Misapplied","Rejected name"))) %>%
mutate(Status_correct=fct_explicit_na(Status_correct, "No suitable matches found.")) %>%
#Create Name_correct field. Use Accepted names, if any. Otherwise matched names.
......@@ -1338,13 +1350,12 @@ Backbone <- Backbone %>%
summary(Backbone$Status_correct)
summary(Backbone$Rank_correct)
```
There are `r sum(is.na(Backbone$Name_correct))` species names for which we found no match in any of the taxonomic resources we used. Yet, for as many as `r sum(Backbone$Rank_correct %in% c("higher", "family", "genus"))` taxa, the matching did not properly resolve the species name, and we only found a match at genus or higher level.
## Complete list of families
There are `r sum(is.na(Backbone$Accepted_name_family))` records with missing family information.
### Derive info from other species of the same Genera in the Backbone itself
Copy family info for taxa resolved at family level.
There are `r sum(is.na(Backbone$Accepted_name_family))` records with missing family information. Create field `Family_correct`.
```{r}
Backbone <- Backbone %>%
mutate(family.lev=str_extract(word(Name_correct,1), pattern='([^\\s]+aceae)')) %>%
......@@ -1356,38 +1367,6 @@ Backbone <- Backbone %>%
# Remaining records with missing family info
sum((is.na(Backbone$Family_correct)))
```
Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info.
```{r}
# Build a genus -> family lookup table from the accepted names in the Backbone
genera_families <- Backbone %>%
  filter(Taxonomic_status == "Accepted") %>%
  dplyr::select(Genus_correct, Family_correct) %>%
  rename(family = Family_correct) %>%
  na.omit() %>%
  # For some genera there are multiple families assigned
  # (e.g. in case of unresolved species names).
  # Count the records of each genus-family pair. The rows must not be
  # de-duplicated before counting, otherwise every count equals 1 and the
  # most frequent family cannot be identified.
  group_by(Genus_correct, family) %>%
  summarize(n = n()) %>%
  # Keep, for each genus, the family that occurs most often
  group_by(Genus_correct) %>%
  arrange(Genus_correct, desc(n)) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select(-n)

# Assign the family derived from the Backbone to the records of the same
# genus that still lack family information
Backbone <- Backbone %>%
  left_join(genera_families, by = "Genus_correct") %>%
  mutate(Family_correct = ifelse((is.na(Family_correct) & !is.na(family)),
                                 family,
                                 Family_correct)) %>%
  dplyr::select(-family)

# Records with missing family info
sum(is.na(Backbone$Family_correct))
```
### Resolve genera with missing family info with `TNRS`
```{r, eval=F}
......@@ -1443,11 +1422,27 @@ unzip("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/CatLife2019.
```
```{r, message=F, warning=F}
cat.life <- read_delim("/data/sPlot/users/Francesco/Ancillary_Data/Catalogue_of_Life/taxa.txt",
delim="\t")
delim="\t",
col_types = cols(
.default = col_character(),
taxonID = col_double(),
datasetID = col_double(),
acceptedNameUsageID = col_double(),
parentNameUsageID = col_double(),
superfamily = col_logical(),
subgenus = col_logical(),
source = col_logical(),
namePublishedIn = col_logical(),
modified = col_character(),
taxonConceptID = col_double(),
isExtinct = col_logical()
)) %>%
#correct family names to match to the standards in TPL
mutate(family=ifelse(family=="Fabaceae", "Leguminosae", family)) %>%
mutate(family=ifelse(family=="Asteraceae", "Compositae", family))
Genera_missing <- Backbone %>%
filter(is.na(Family_correct) & !is.na(Genus_correct)) %>%
dplyr::select(Genus_correct) %>%
......@@ -1471,39 +1466,116 @@ sum(is.na(Backbone$Family_correct))
```
After matching the remaining genera with the Catalogue of life there are still `r nrow(Backbone %>% filter(is.na(Family_correct)))` records without Family affiliation, for a total of `r nrow(Backbone %>% filter(is.na(Family_correct)) %>% dplyr::select(Genus_correct) %>% distinct())` genera.
### Manually fix residual, known issues
### Manually fix some known issues
```{r}
Backbone <- Backbone %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Coptidium",
list=Genus_correct=="Coptidium",
values="Ranunculaceae")) %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Balanocarpus",
list=Genus_correct=="Balanocarpus",
values="Dipterocarpaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Cardaminopsis",
list=Genus_correct=="Cardaminopsis",
values="Brassicaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Carpolepis",
list=Genus_correct=="Carpolepis",
values="Myrtaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Cathartolinum",
list=Genus_correct=="Cathartolinum",
values="Linaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Didiscus",
list=Genus_correct=="Didiscus",
values="Araliaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Grammadenia",
list=Genus_correct=="Grammadenia",
values="Primulaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=word(Accepted_name_species, 1)=="Antholoma",
values="Elaeocarpaceae" ))
list=Genus_correct=="Antholoma",
values="Elaeocarpaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Odontarrhena",
values="Brassicaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Trichinium",
values="Amaranthaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Limonium",
values="Plumbaginaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Eunanus",
values="Phrymaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Sunaptea",
values="Dipterocarpaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Aconogonon",
values="Polygonaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Cajophora",
values="Loasaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Calobota",
values="Leguminosae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Alsine",
values="Caryophyllaceae" )) %>%
mutate(Family_correct=replace(Family_correct,
list=Genus_correct=="Cyanococcus",
values="Ericaceae" )) %>%
mutate(Family_correct=ifelse(Family_correct %in% c("Papilionaceae", "Fabaceae"),
"Leguminosae", Family_correct)) %>%
mutate(Family_correct=ifelse(Family_correct=="Asteraceae", "Compositae", Family_correct)) %>%
mutate(Family_correct=ifelse(Family_correct=="Unknown", NA, Family_correct))
#Records with missing family info
sum(is.na(Backbone$Family_correct))
```
### Derive info from other species of the same Genera in the Backbone itself
Derive family info from each genus in the backbone, and use this info to complement records from the same genera, but with missing family info.
```{r}
# Build a genus -> family lookup table from the accepted names in the Backbone
genera_families <- Backbone %>%
  filter(Taxonomic_status == "Accepted") %>%
  dplyr::select(Genus_correct, Family_correct) %>%
  rename(family = Family_correct) %>%
  na.omit() %>%
  # For some genera there are multiple families assigned
  # (e.g. in case of unresolved species names).
  # Count the records of each genus-family pair. The rows must not be
  # de-duplicated before counting, otherwise every count equals 1 and the
  # most frequent family cannot be identified.
  group_by(Genus_correct, family) %>%
  summarize(n = n()) %>%
  # Keep, for each genus, the family that occurs most often
  group_by(Genus_correct) %>%
  arrange(Genus_correct, desc(n)) %>%
  slice(1) %>%
  ungroup() %>%
  dplyr::select(-n)
### Create field `Name_short`
# Complement records lacking a family with the family derived, per genus,
# from the Backbone itself. An existing Family_correct is always kept; the
# genus-level family is used only where Family_correct is missing and a
# genus-level family is available.
Backbone <- Backbone %>%
  left_join(genera_families, by = "Genus_correct") %>%
  mutate(
    Family_correct = ifelse(
      !is.na(Family_correct) | is.na(family),
      Family_correct,
      family
    )
  ) %>%
  dplyr::select(-family)

# Number of records still missing family info
sum(is.na(Backbone$Family_correct))
```
### Delete records assigned to mushroom families, if any
```{r}
# Families whose records are dropped from the Backbone
mushroom.families <- c(
  "Physalacriaceae", "Clavariaceae", "Agaricaceae", "Roccellaceae",
  "Atheliaceae", "Meruliaceae", "Helotiaceae", "Dacrymycetaceae",
  "Boletaceae", "Cortinariaceae", "Polyporaceae", "Umbilicariaceae",
  "Pleosporaceae", "Leotiaceae", "Dermateaceae", "Hymenochaetaceae",
  "Stereaceae", "Tremellaceae"
)

# Keep only the records that match neither a genus listed in `mushroom`
# nor a family listed in `mushroom.families`
Backbone <- Backbone %>%
  filter(!(Genus_correct %in% mushroom |
             Family_correct %in% mushroom.families))
```
## Create field `Name_short`
Shorten names that have more than two words and where the second word is an `x`. If there is no species name available, fill in with either genus or family info.
```{r}
Backbone <- Backbone %>%
......@@ -1523,8 +1595,8 @@ sum(is.na(Backbone$Name_correct))
```
## Create Field `is_vascular_plant`
Assign all families that belong to `Tracheophyta` to category `is_vascular_species`, based on `The Catalogue of Life`
## Create fields `is_vascular_species` and `Taxon group`
Attach phylum information from `The Catalogue of Life`.
```{r}
Backbone <- Backbone %>%
left_join(cat.life %>%
......@@ -1532,11 +1604,77 @@ Backbone <- Backbone %>%
distinct() %>%
na.omit() %>%
rename(Family_correct=family),
by="Family_correct") %>%
mutate(is_vascular_species=ifelse(phylum=="Tracheophyta", T, F))
by="Family_correct")
```
Create fields `is_vascular_species` and `Taxon group` based on lists of manually classified families, and on phyla from `The Catalogue of Life`.
Assign all families that belong to `Tracheophyta` to category `is_vascular_species`, based on the phylum information from `The Catalogue of Life`.
```{r}
# Manually compiled lists of families (and lichen genera) used to classify
# the records of the Backbone into taxon groups.

# Families flagged as vascular plants
vascular <- c(
  "Leguminosae", "Alliaceae", "Bombacaceae", "Taxodiaceae",
  "Aceraceae", "Centrolepidaceae", "Callitrichaceae", "Flacourtiaceae",
  "Compositae", "Asclepiadaceae", "Papilionaceae", "Tiliaceae",
  "Mimosaceae", "Xanthorrhoeaceae", "Arthropteridaceae",
  "Valerianaceae", "Grammitidaceae", "Anarthriaceae", "Caesalpiniaceae",
  "Chenopodiaceae", "Corylaceae", "Diervillaceae", "Dipsacaceae",
  "Guttiferae", "Haptanthaceae", "Hymenophyllopsidaceae", "Isoëtaceae",
  "Labiatae", "Lactoridaceae", "Lemnaceae", "Selaginaceae",
  "Sterculiaceae", "Myoporaceae", "Myrsinaceae", "Pyrolaceae",
  "Rhoipteleaceae", "Xanthoceraceae"
)

# Families classified as lichens
lichens <- c(
  "Acarosporaceae", "Parmeliaceae", "Physciaceae", "Lichinaceae",
  "Caliciaceae", "Lecanoraceae", "Venturiaceae", "Sphaerophoraceae",
  "Verrucariaceae", "Tricholomataceae", "Baeomycetaceae",
  "Catillariaceae", "Megasporaceae", "Ramalinaceae", "Pilocarpaceae",
  "Teloschistaceae", "Candelariaceae", "Rhizocarpaceae", "Lecideaceae",
  "Icmadophilaceae", "Cladoniaceae", "Collemataceae", "Pannariaceae",
  "Lobariaceae", "Ophioparmaceae", "Psoraceae", "Stereocaulaceae",
  "Massalongiaceae", "Peltigeraceae", "Nephromataceae"
)

# Genera classified as lichens
lichen.genera <- c(
  "Amygdalaria", "Anamylospora", "Arthonia", "Pertusaria", "Pyrenula",
  "Opegrapha", "Ochrolechia", "Graphis", "Micarea", "Porpidia",
  "Arthopyrenia", "Graphina", "Anisomeridium", "Mycobilimbia", "Peltula",
  "Thelotrema", "Arthothelium", "Diploschistes", "Strigula",
  "Trichothelium", "Melaspilea", "Phaeographis", "Thelenella",
  "Chaenothecopsis", "Fuscidea", "Dactylospora", "Gyalecta", "Myriotrema",
  "Placynthium"
)

# Families classified as mosses
mosses <- c(
  "Pilotrichaceae", "Chonecoleaceae", "Hypopterygiaceae", "Scorpidiaceae",
  "Balantiopsaceae", "Mesoptychiaceae", "Octoblepharaceae", "Takakiaceae"
)

# Families classified as algae (diatom families are listed last)
algae_diatoms <- c(
  "Sargassaceae", "Chordaceae", "Cocconeidaceae", "Desmarestiaceae",
  "Chordariaceae", "Dinobryaceae", "Diploneidaceae", "Ectocarpaceae",
  "Fragilariaceae", "Sphacelariaceae", "Vaucheriaceae",
  "Amphipleuraceae", "Fucaceae", "Gomphonemataceae", "Melosiraceae",
  "Laminariaceae", "Acinetosporaceae", "Botryochloridaceae",
  # diatoms below
  "Thalassiosiraceae", "Cymbellaceae", "Naviculaceae", "Bacillariaceae"
)
# Create `is_vascular_species` and `Taxon group`.
#
# `is_vascular_species` is TRUE when the phylum is Tracheophyta or the family
# is in the manual list `vascular`, NA when the phylum is unknown, and is
# forced to FALSE for every record classified as moss, alga or lichen.
#
# `Taxon group` is assigned with the following precedence (first match wins),
# which reproduces the order in which the categories override each other:
# Moss > Alga > Lichen > Vascular Plant > Unknown.
Backbone <- Backbone %>%
  mutate(is_vascular_species = phylum == "Tracheophyta") %>%
  mutate(is_vascular_species = replace(is_vascular_species,
                                       list = Family_correct %in% vascular,
                                       values = TRUE)) %>%
  mutate(`Taxon group` = case_when(
    # "Marchantiophyta" (liverworts) replaces a duplicated "Bryophyta" entry:
    # liverworts are meant to be part of the 'Moss' group
    Family_correct %in% mosses |
      phylum %in% c("Bryophyta", "Marchantiophyta", "Anthocerotophyta") ~ "Moss",
    Family_correct %in% algae_diatoms |
      phylum %in% c("Glaucophyta", "Rhodophyta",
                    "Charophyta", "Chlorophyta") ~ "Alga",
    Family_correct %in% lichens |
      Genus_correct %in% lichen.genera ~ "Lichen",
    !is.na(is_vascular_species) & is_vascular_species ~ "Vascular Plant",
    TRUE ~ "Unknown"
  )) %>%
  # Mosses, algae and lichens are never vascular, whatever their phylum
  mutate(is_vascular_species = ifelse(`Taxon group` %in% c("Moss", "Alga", "Lichen"),
                                      FALSE, is_vascular_species))

# Frequency of each category, missing values included
table(Backbone$`Taxon group`, exclude = NULL)
table(Backbone$is_vascular_species, exclude = NULL)
```
## Export Backbone
```{r echo=F}
knitr::kable(Backbone %>%
......@@ -1596,6 +1734,7 @@ knitr::kable(Backbone %>%
*Name_short* - First two words of `Name_correct`
*phylum* - As derived from `The Catalogue of Life`
*is_vascular_species* - Derived from the `phylum` in `The Catalogue of Life` (`Tracheophyta`), complemented by a list of manually classified families
*`Taxon group`* - Taxon group, as in Turboveg: 'Vascular Plant', 'Moss' (includes liverworts), 'Lichen', 'Alga', 'Unknown'
```{r}
save(Backbone, file="../_output/Backbone3.0.RData")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment