diff --git a/README.md b/README.md
index f0cfb340bcd8be74bd01ceaea03a37819a4c4ec3..5f1fcc930cfda56a89ff5f36b49303dea4b1eb06 100644
--- a/README.md
+++ b/README.md
@@ -10,3 +10,22 @@ bash chunk-input.sh \
 /data/GROUP/videos-chunked-10 \
 10
 ```
+
+## the pipeline
+
+The **zamba predict** pipeline consists of two jobs:
+
+1. an array job that runs `zamba predict` on each chunk
+2. a follow-up job that aggregates the per-chunk outputs of the array job tasks
+
+The following command submits the entire pipeline:
+
+```bash
+bash pipeline.sh /data/GROUP/videos-chunked-10
+```
+
+Outputs are:
+
+- logs in `/work/$USER/zamba/logs`
+- per-chunk CSVs in `/work/$USER/zamba/csvs-DATE/chunks`
+- the aggregated CSV in `/work/$USER/zamba/csvs-DATE/output-DATE.csv`, unless an output path is passed as the optional second argument
diff --git a/pipeline.sh b/pipeline.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1ed2d5c97995fa311eb3ebdc5579e2d59a5668d1
--- /dev/null
+++ b/pipeline.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# -----------------------------------------------------------------------------
+# init
+# -----------------------------------------------------------------------------
+
+pipeline_date=$(date +%F-%H-%M-%S)
+script_dir=$(dirname "$0")
+
+# -----------------------------------------------------------------------------
+# command line arguments
+# -----------------------------------------------------------------------------
+
+function usage { cat << EOF
+usage: bash $(basename "$0") chunk_dir [output]
+EOF
+}
+
+[[ $# -eq 1 || $# -eq 2 ]] || {
+    usage >&2
+    exit 1
+}
+
+chunk_dir=$1
+
+[[ -d $chunk_dir ]] || {
+    echo "chunk directory does not exist or is not a directory" >&2
+    exit 1
+}
+
+csv_dir=/work/$USER/zamba/csvs-$pipeline_date
+csv_chunk_dir=$csv_dir/chunks
+
+if [[ -n $2 ]] ; then
+    output=$2
+else
+    output="$csv_dir/output-$pipeline_date.csv"
+fi
+
+# -----------------------------------------------------------------------------
+# submission
+# -----------------------------------------------------------------------------
+
+# create log directory (csv directories are created by the jobs themselves)
+mkdir -p "/work/$USER/zamba/logs"
+
+n_chunks=$(find "$chunk_dir" -mindepth 1 -maxdepth 1 -type d | wc -l)
+
+# submit prediction
+array_job_id=$(
+    qsub \
+        -terse \
+        -v OUTPUT_DIR="$csv_chunk_dir" \
+        -t 1-"$n_chunks" \
+        "$script_dir"/submit-zamba-predict.sh \
+        "$chunk_dir"
+)
+
+# submit csv combiner
+qsub \
+    -hold_jid "${array_job_id%%.*}" \
+    "$script_dir"/submit-zamba-predict-csv-combiner.sh \
+    "$csv_chunk_dir" \
+    "$output"
diff --git a/submit-zamba-predict-csv-combiner.sh b/submit-zamba-predict-csv-combiner.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e0bb705c5dea1adb5bcf687f3fbdf8241e8d82cb
--- /dev/null
+++ b/submit-zamba-predict-csv-combiner.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+#$ -N zamba-predict-csv-combiner
+
+#$ -S /bin/bash
+
+#$ -l h_rt=24:00:00
+#$ -l h_vmem=6G
+
+#$ -binding linear:1
+
+#$ -o /work/$USER/zamba/logs/$JOB_NAME-$JOB_ID.log
+#$ -j y
+
+#$ -m ae
+
+#$ -r yes
+
+input_dir=$1
+output=${2:-"$input_dir/all.csv"}
+
+[[ -d $input_dir ]] || {
+    echo "input directory does not exist: $input_dir" >&2
+    exit 1
+}
+
+[[ -e "$input_dir"/chunk-1.csv ]] || {
+    echo "first chunk output does not exist, bailing out" >&2
+    exit 1
+}
+
+mkdir -p "$(dirname "$output")"
+
+{
+    # get header from first chunk
+    head -1 "$input_dir"/chunk-1.csv
+
+    # get all other content
+    find "$input_dir" -type f -name 'chunk-*.csv' |
+        while read -r csv ; do
+            # drop the header
+            sed '1d' "$csv"
+        done |
+        sort -t, -k1
+} > "$output"
diff --git a/submit-zamba-predict.sh b/submit-zamba-predict.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1e343467e36298264c40d7e6c59167a55966f9bc
--- /dev/null
+++ b/submit-zamba-predict.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#$ -N zamba-predict
+
+#$ -S /bin/bash
+
+#$ -l h_rt=24:00:00
+#$ -l h_vmem=18G,highmem
+
+#$ -binding linear:1
+
+#$ -o /work/$USER/zamba/logs/$JOB_NAME-$JOB_ID-$TASK_ID.log
+#$ -j y
+
+module load zamba/cpu/0.1.6-1
+
+chunk_dir=$1
+
+[[ -d $chunk_dir ]] || {
+    echo "input directory does not exist: $chunk_dir" >&2
+    exit 1
+}
+
+mkdir -p "${OUTPUT_DIR:=/work/$USER/zamba/$JOB_NAME-$JOB_ID/chunks}"
+
+output="$OUTPUT_DIR/chunk-$SGE_TASK_ID.csv"
+
+# -----------------------------------------------------------------------------
+# issue with xgboost python interface:
+# https://github.com/dmlc/xgboost/issues/3425
+#
+# this is the workaround
+export LD_PRELOAD=/usr/local/zamba/cpu/0.1.6-1/lib/python3.6/site-packages/xgboost/lib/libxgboost.so
+
+zamba predict \
+    "$chunk_dir/chunk-$SGE_TASK_ID" \
+    "$output"
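
A usage note that is not part of the patch above: `pipeline.sh` also accepts an optional second argument naming the aggregated CSV, which overrides the default location under `csvs-DATE`, e.g.:

```bash
# hypothetical run: explicit output path given as the optional second argument
bash pipeline.sh /data/GROUP/videos-chunked-10 /work/$USER/zamba/my-run.csv
```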
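
Also not part of the patch: a minimal sanity check one could run after the combiner job finishes, assuming the chunk directories contain only video files and that `zamba predict` writes one prediction row per video (DATE stands for the actual timestamp, as in the README above):

```bash
# compare the number of input videos with the number of data rows in the
# aggregated CSV (one header line is subtracted)
n_videos=$(find /data/GROUP/videos-chunked-10 -type f | wc -l)
n_rows=$(( $(wc -l < /work/$USER/zamba/csvs-DATE/output-DATE.csv) - 1 ))
echo "videos: $n_videos, prediction rows: $n_rows"
```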