Verified Commit 09a74453 authored by ck85nori

adds the pipeline

parent c41444d1
@@ -10,3 +10,22 @@ bash chunk-input.sh \
    /data/GROUP/videos-chunked-10 \
    10
```
## the pipeline

The **zamba predict** pipeline consists of two jobs:

1. an array job that runs `zamba predict` on the chunks, one chunk per task
2. a follow-up job that aggregates the individual outputs of the array job tasks into a single CSV
The following command will submit the entire pipeline:
```bash
bash pipeline.sh /data/GROUP/videos-chunked-10
```
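`pipeline.sh` also accepts an optional second argument that overrides where the aggregated CSV is written; the target path in this example is only an illustration:

```bash
# write the aggregated CSV to a custom path instead of the default (example path)
bash pipeline.sh /data/GROUP/videos-chunked-10 /work/$USER/zamba/my-results.csv
```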

The outputs are:

- logs in `/work/$USER/zamba/logs`
- per-chunk CSVs in `/work/$USER/zamba/csvs-DATE/chunks`
- the aggregated CSV in `/work/$USER/zamba/csvs-DATE/output-DATE.csv` (a quick row-count check is sketched below)
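Since the combiner keeps a single header line and appends the data rows of every chunk CSV (see `submit-csv-combiner.sh` below), a quick consistency check is to compare row counts; the two numbers should match. A sketch, assuming the default output locations above, with `DATE` replaced by the actual run timestamp:

```bash
csv_dir=/work/$USER/zamba/csvs-DATE   # substitute the real run timestamp
# data rows in the aggregated CSV (total lines minus its single header)
echo "aggregate: $(( $(wc -l < "$csv_dir/output-DATE.csv") - 1 ))"
# data rows across all chunk CSVs (dropping one header line per chunk)
chunk_rows=$(
    find "$csv_dir/chunks" -name 'chunk-*.csv' |
        while read -r csv ; do sed '1d' "$csv" ; done |
        wc -l
)
echo "chunks:    $chunk_rows"
```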

pipeline.sh

#!/bin/bash
# -----------------------------------------------------------------------------
# init
# -----------------------------------------------------------------------------
pipeline_date=$(date +%F-%H-%M-%S)
script_dir=$(dirname "$0")
# -----------------------------------------------------------------------------
# command line arguments
# -----------------------------------------------------------------------------
function usage { cat << EOF
usage: bash $(basename "$0") chunk_dir [output]
EOF
}
# allow exactly one or two arguments: chunk_dir and an optional output path
[[ $# -eq 1 || $# -eq 2 ]] || {
    usage >&2
    exit 1
}
chunk_dir=$1
[[ -d $chunk_dir ]] || {
    echo "chunk directory does not exist or is not a directory: $chunk_dir" >&2
    exit 1
}
csv_dir=/work/$USER/zamba/csvs-$pipeline_date
csv_chunk_dir=$csv_dir/chunks
if [[ -n $2 ]] ; then
    output=$2
else
    output="$csv_dir/output-$pipeline_date.csv"
fi
# -----------------------------------------------------------------------------
# submission
# -----------------------------------------------------------------------------
# create the log directory (the CSV directories are created by the jobs themselves)
mkdir -p "/work/$USER/zamba/logs"

# one array task per chunk subdirectory
n_chunks=$(find "$chunk_dir" -mindepth 1 -maxdepth 1 -type d | wc -l)
# submit the prediction array job; -terse makes qsub print only the job id
array_job_id=$(
    qsub \
        -terse \
        -v OUTPUT_DIR="$csv_chunk_dir" \
        -t 1-"$n_chunks" \
        "$script_dir"/submit-zamba-array-job.sh \
        "$chunk_dir"
)
# submit the CSV combiner, held until all tasks of the array job have finished
# (for array jobs -terse returns "jobid.task-range", hence stripping after the dot)
qsub \
    -hold_jid "${array_job_id%%.*}" \
    "$script_dir"/submit-csv-combiner.sh \
    "$csv_chunk_dir" \
    "$output"

submit-csv-combiner.sh

#!/bin/bash
#$ -N zamba-predict-csv-combiner
#$ -S /bin/bash
#$ -l h_rt=24:00:00
#$ -l h_vmem=6G
#$ -binding linear:1
#$ -o /work/$USER/zamba/logs/$JOB_NAME-$JOB_ID.log
#$ -j y
#$ -m ae
#$ -r yes
input_dir=$1
output=${2:-"$input_dir/all.csv"}
[[ -d $input_dir ]] || {
    echo "input directory does not exist: $input_dir" >&2
    exit 1
}

[[ -e "$input_dir"/chunk-1.csv ]] || {
    echo "first chunk output does not exist, bailing out" >&2
    exit 1
}
mkdir -p "$(dirname "$output")"
{
    # get header from first chunk
    head -1 "$input_dir"/chunk-1.csv
    # append the data rows of every chunk, dropping each file's header
    find "$input_dir" -type f -name 'chunk-*.csv' |
        while read -r csv ; do
            # drop the header
            sed '1d' "$csv"
        done |
        sort -t, -k1
} > "$output"

submit-zamba-array-job.sh

#!/bin/bash
#$ -N zamba-predict
#$ -S /bin/bash
#$ -l h_rt=24:00:00
#$ -l h_vmem=18G,highmem
#$ -binding linear:1
#$ -o /work/$USER/zamba/logs/$JOB_NAME-$JOB_ID-$TASK_ID.log
#$ -j y
module load zamba/cpu/0.1.6-1
chunk_dir=$1
[[ -d $chunk_dir ]] || {
    echo "input directory does not exist: $chunk_dir" >&2
    exit 1
}
# OUTPUT_DIR is passed in by pipeline.sh via -v; fall back to a job-specific default
mkdir -p "${OUTPUT_DIR:=/work/$USER/zamba/$JOB_NAME-$JOB_ID/chunks}"
output="$OUTPUT_DIR/chunk-$SGE_TASK_ID.csv"
# -----------------------------------------------------------------------------
# issue with xgboost python interface:
# https://github.com/dmlc/xgboost/issues/3425
#
# this is the workaround
export LD_PRELOAD=/usr/local/zamba/cpu/0.1.6-1/lib/python3.6/site-packages/xgboost/lib/libxgboost.so
zamba predict \
    "$chunk_dir/chunk-$SGE_TASK_ID" \
    "$output"