From 0d1b4cac6b43d7db52b4d0bbe353d304e0d4cbd4 Mon Sep 17 00:00:00 2001 From: Christian Krause <christian.krause@idiv.de> Date: Wed, 25 Jan 2023 22:07:22 +0100 Subject: [PATCH] wip --- .gitignore | 1 + .gitmodules | 3 + Makefile | 10 +++ img/rdm-use-case-merged.dot | 63 +++++++++++++++++ pandoc-cheat-sheet | 1 + rdm.md | 131 ++++++++++++++++++++++++++++++++++++ rdm.yml | 37 ++++++++++ 7 files changed, 246 insertions(+) create mode 100644 img/rdm-use-case-merged.dot create mode 160000 pandoc-cheat-sheet create mode 100644 rdm.md create mode 100644 rdm.yml diff --git a/.gitignore b/.gitignore index 6a6bfce..ce355b1 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ paper.html paper.pdf *.pdf img/staging-area-diff.svg +/img/rdm-use-case-merged.svg diff --git a/.gitmodules b/.gitmodules index 12123b5..736663c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "reveal.js"] path = reveal.js url = https://github.com/hakimel/reveal.js.git +[submodule "pandoc-cheat-sheet"] + path = pandoc-cheat-sheet + url = https://github.com/idiv-biodiversity/pandoc-cheat-sheet.git diff --git a/Makefile b/Makefile index ea9486b..56d6b86 100644 --- a/Makefile +++ b/Makefile @@ -17,12 +17,22 @@ pandoc-papers.pdf: pandoc-papers.md -o pandoc-papers.pdf \ pandoc-papers.md +rdm.pdf: img rdm.md rdm.yml + pandoc \ + --standalone \ + --from=markdown+yaml_metadata_block \ + --pdf-engine=xelatex \ + --template=pandoc-cheat-sheet/cheat-sheet.tex \ + -o rdm.pdf \ + rdm.yml rdm.md + all: subdirs clean: for dir in $(SUBDIRS); do \ $(MAKE) -C $$dir $@; \ done + rm -f rdm.pdf default: all diff --git a/img/rdm-use-case-merged.dot b/img/rdm-use-case-merged.dot new file mode 100644 index 0000000..424b147 --- /dev/null +++ b/img/rdm-use-case-merged.dot @@ -0,0 +1,63 @@ +digraph { + compound = true + node [shape = "box", style = "filled, rounded"] + + subgraph cluster_script { + label = "scripts" + + node [color = lightskyblue] + + script_version_b[label = "nature-v2"] + 
script_version_a[label = "nature-v1"] + } + + subgraph cluster_data { + label = "data repository (not git!)" + + node [color = limegreen] + + data_version_a[label = "doi:blah/blah"] + } + + subgraph cluster_paper { + label = "paper" + + node [color = limegreen] + + paper_version_a[label = "nature-review-1"] + paper_version_b[label = "nature-review-2"] + paper_version_c[label = "nature-final"] + } + + subgraph cluster_software_a { + label = "software A" + + node [color = orchid] + + software_a_version_a[label = "v2.1.6"] + } + + subgraph cluster_software_b { + label = "software B" + + node [color = orchid] + + software_b_version_b[label = "v0.3.4"] + software_b_version_a[label = "v0.2.0"] + } + + script_version_a -> software_a_version_a [label = "\n\n"] + script_version_b -> software_a_version_a [label = "\n\n"] + + script_version_a -> software_b_version_a [label = "\n\n"] + script_version_b -> software_b_version_b [label = "\n\n"] + + paper_version_a -> script_version_a [label = "\n\n"] + paper_version_b -> script_version_b [label = "\n\n"] + paper_version_c -> script_version_b [label = "\n\n"] + + paper_version_c -> data_version_a [label = "\n\n"] + + data_version_a -> script_version_b [label = "\n\n"] + +} diff --git a/pandoc-cheat-sheet b/pandoc-cheat-sheet new file mode 160000 index 0000000..2346a00 --- /dev/null +++ b/pandoc-cheat-sheet @@ -0,0 +1 @@ +Subproject commit 2346a002289757ee612bfd4b56b57db53e0c751b diff --git a/rdm.md b/rdm.md new file mode 100644 index 0000000..2062c3f --- /dev/null +++ b/rdm.md @@ -0,0 +1,131 @@ +git for RDM and reproducibility +=============================== + +checklist +--------- + +- **software** (a generic tool to do *something*) + - [ ] use separate git repo for software + - [ ] tag versions for reproducibility + - [ ] keep software as generic as possible +- **scripts** (*how* to use *software*) + - [ ] use separate git repo for scripts + - [ ] tag versions for reproducibility + - [ ] software is configured here + - [ ] 
reference used software tag +- **data management** + - [ ] publish dataset(s) to scientific data archive system + - [ ] always attach proper metadata + - [ ] get DOI for each version of the dataset(s) for reproducibility + - [ ] reference used scripts tag +- **publishing** + - [ ] use separate git repo for paper/thesis/... + - [ ] tag versions for draft/review/final + - [ ] convert text/source to (binary) products + - [ ] reference used scripts tag + - [ ] reference used data DOI +- **platforms** (GitLab, GitHub) + - [ ] use platforms (GitLab, GitHub) for collaboration + - [ ] review commits / merge requests + - [ ] utilize project management tools + - [ ] utilize automation for testing and publishing + + +intro +----- + +- version control system (VCS) records changes (what, who, when, why) +- use platforms (GitLab, GitHub) for collaboration + + +git use cases +------------- + +### software + +- keep software as generic as possible +- turn configuration/parameters into arguments, e.g. `myapp --seed=42` +- this avoids having to rewrite software for parameter changes +- use software testing to verify software does what it's supposed to do +- tag versions to enable **reproducibility** + +### scripting + +- separate scripting from software + - software: generic + - scripting: software called with specific configuration/arguments +- scripting means **how** to run the software + - i.e. here is where the parameters/arguments go + - think of it as digital lab notes + - this enables **reproducibility** +- specialized script variants for different environments, e.g. + - laptop + - RStudio / terminal server + - HPC cluster +- think about *execution scalability*, i.e. 
not having to change software and + scripting when you want to change parameters +- keep failed attempts in branches to keep history of what you tried and why it + didn't work in commit message + +### publishing + +- for paper, thesis, book, presentation, documentation, blog posts + - use *programming languages* code/scripts for plots, flowcharts, etc. + - write text/paragraphs in markup language (e.g. markdown) +- use automation workflows to + - generate plot/flowchart code to image files + - convert text with pandoc to PDF/PS/HTML/EPUB +- use platforms for review process + +## integration of use cases for reproducibility + + + + +anti patterns +------------- + +> An anti-pattern is a common response to a recurring problem that is usually +> ineffective and risks being highly counterproductive. + +- most git anti-patterns are about *how* to use git +- focus here is on those relating to RDM and reproducibility + +### binary files + +- git as VCS only good for text files + - markdown + - source code, scripts + - (small) CSV +- binary files can't be diff'ed, e.g. + - compiled programs + - MS word, excel + - PDF, PS + - JPEG, PNG +- use textual representation, e.g. + - graphviz dot for flowcharts + - R ggplot and CSV for plots +- use automation to convert textual representation to e.g. images +- use gitignore to never add binary products to the repo + +### scientific data in git repos + +- data is often binary +- git repo should be small, data blows it up, even if text +- data has different release cycles than code +- even git lfs (large file storage) is bad because still big ball of mud + - scientific datasets need metadata! +- use proper archive system for data + + +platforms +--------- + +- enable collaboration + - bug tracker / feature requests + - documentation / wiki +- project management tools + - issue boards, milestones, gantt +- trigger automation +- publish/download releases +- go to https://git.idiv.de log in and create new projects!
diff --git a/rdm.yml b/rdm.yml new file mode 100644 index 0000000..e665af9 --- /dev/null +++ b/rdm.yml @@ -0,0 +1,37 @@ +--- +# these are not shown in the document, they are just for metadata +title: git RDM reproducibility check list cheat sheet +author: Christian Krause +lang: en +keywords: + - git + - RDM + - research data management + - reproducibility + +# highlighting increases readability +linkcolor: blue + +# these LaTeX variables fit as much content on as few pages as possible +documentclass: scrartcl +pagestyle: empty +papersize: a4paper +geometry: + - a4paper + - left=1cm + - right=1cm + - top=1cm + - bottom=1cm +# you can also add "landscape" to geometry if you want more than 2 columns + +# fiddle with these to increase readability +columns: 2 +fontsize: 9pt + +# this essentially disables justification, which can increase readability +ragged: yes + +# color for header background +sectionbg: BurntOrange +subsectionbg: Apricot +... -- GitLab