Verified Commit 09a74453 authored by ck85nori

adds the pipeline

parent c41444d1
@@ -10,3 +10,22 @@ bash chunk-input.sh \
    /data/GROUP/videos-chunked-10 \
    10
```
## the pipeline

The **zamba predict** pipeline consists of two jobs:

1. an array job that runs `zamba predict` on the chunks, one chunk per task
2. a follow-up job that aggregates the individual outputs of the array job tasks into a single CSV
The following command will submit the entire pipeline:
```bash
bash pipeline.sh /data/GROUP/videos-chunked-10
```
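`pipeline.sh` also accepts an optional second argument that overrides where the aggregated CSV is written; the target path in this example is only an illustration:

```bash
# write the aggregated CSV to a custom path instead of the default (example path)
bash pipeline.sh /data/GROUP/videos-chunked-10 /work/$USER/zamba/my-results.csv
```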

The outputs are:

- logs in `/work/$USER/zamba/logs`
- per-chunk CSVs in `/work/$USER/zamba/csvs-DATE/chunks`
- the aggregated CSV in `/work/$USER/zamba/csvs-DATE/output-DATE.csv` (a quick row-count check is sketched below)
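Since the combiner keeps a single header line and appends the data rows of every chunk CSV (see `submit-csv-combiner.sh` below), a quick consistency check is to compare row counts; the two numbers should match. A sketch, assuming the default output locations above, with `DATE` replaced by the actual run timestamp:

```bash
csv_dir=/work/$USER/zamba/csvs-DATE   # substitute the real run timestamp
# data rows in the aggregated CSV (total lines minus its single header)
echo "aggregate: $(( $(wc -l < "$csv_dir/output-DATE.csv") - 1 ))"
# data rows across all chunk CSVs (dropping one header line per chunk)
chunk_rows=$(
    find "$csv_dir/chunks" -name 'chunk-*.csv' |
        while read -r csv ; do sed '1d' "$csv" ; done |
        wc -l
)
echo "chunks:    $chunk_rows"
```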

pipeline.sh

#!/bin/bash
# -----------------------------------------------------------------------------
# init
# -----------------------------------------------------------------------------
pipeline_date=$(date +%F-%H-%M-%S)
script_dir=$(dirname "$0")
# -----------------------------------------------------------------------------
# command line arguments
# -----------------------------------------------------------------------------
function usage { cat << EOF
usage: bash $(basename "$0") chunk_dir [output]
EOF
}
# allow exactly one or two arguments: chunk_dir and an optional output path
[[ $# -eq 1 || $# -eq 2 ]] || {
    usage >&2
    exit 1
}
chunk_dir=$1
[[ -d $chunk_dir ]] || {
    echo "chunk directory does not exist or is not a directory: $chunk_dir" >&2
    exit 1
}
csv_dir=/work/$USER/zamba/csvs-$pipeline_date
csv_chunk_dir=$csv_dir/chunks
if [[ -n $2 ]] ; then
    output=$2
else
    output="$csv_dir/output-$pipeline_date.csv"
fi
# -----------------------------------------------------------------------------
# submission
# -----------------------------------------------------------------------------
# create the log directory (the CSV directories are created by the jobs themselves)
mkdir -p "/work/$USER/zamba/logs"

# one array task per chunk subdirectory
n_chunks=$(find "$chunk_dir" -mindepth 1 -maxdepth 1 -type d | wc -l)
# submit the prediction array job; -terse makes qsub print only the job id
array_job_id=$(
    qsub \
        -terse \
        -v OUTPUT_DIR="$csv_chunk_dir" \
        -t 1-"$n_chunks" \
        "$script_dir"/submit-zamba-array-job.sh \
        "$chunk_dir"
)
# submit the CSV combiner, held until all tasks of the array job have finished
# (for array jobs -terse returns "jobid.task-range", hence stripping after the dot)
qsub \
    -hold_jid "${array_job_id%%.*}" \
    "$script_dir"/submit-csv-combiner.sh \
    "$csv_chunk_dir" \
    "$output"

submit-csv-combiner.sh

#!/bin/bash
#$ -N zamba-predict-csv-combiner
#$ -S /bin/bash
#$ -l h_rt=24:00:00
#$ -l h_vmem=6G
#$ -binding linear:1
#$ -o /work/$USER/zamba/logs/$JOB_NAME-$JOB_ID.log
#$ -j y
#$ -m ae
#$ -r yes
input_dir=$1
output=${2:-"$input_dir/all.csv"}
[[ -d $input_dir ]] || {
    echo "input directory does not exist: $input_dir" >&2
    exit 1
}

[[ -e "$input_dir"/chunk-1.csv ]] || {
    echo "first chunk output does not exist, bailing out" >&2
    exit 1
}
mkdir -p "$(dirname "$output")"
{
    # get header from first chunk
    head -1 "$input_dir"/chunk-1.csv
    # append the data rows of every chunk, dropping each file's header
    find "$input_dir" -type f -name 'chunk-*.csv' |
        while read -r csv ; do
            # drop the header
            sed '1d' "$csv"
        done |
        sort -t, -k1
} > "$output"

submit-zamba-array-job.sh

#!/bin/bash
#$ -N zamba-predict
#$ -S /bin/bash
#$ -l h_rt=24:00:00
#$ -l h_vmem=18G,highmem
#$ -binding linear:1
#$ -o /work/$USER/zamba/logs/$JOB_NAME-$JOB_ID-$TASK_ID.log
#$ -j y
module load zamba/cpu/0.1.6-1
chunk_dir=$1
[[ -d $chunk_dir ]] || {
    echo "input directory does not exist: $chunk_dir" >&2
    exit 1
}
# OUTPUT_DIR is passed in by pipeline.sh via -v; fall back to a job-specific default
mkdir -p "${OUTPUT_DIR:=/work/$USER/zamba/$JOB_NAME-$JOB_ID/chunks}"
output="$OUTPUT_DIR/chunk-$SGE_TASK_ID.csv"
# -----------------------------------------------------------------------------
# issue with xgboost python interface:
# https://github.com/dmlc/xgboost/issues/3425
#
# this is the workaround
export LD_PRELOAD=/usr/local/zamba/cpu/0.1.6-1/lib/python3.6/site-packages/xgboost/lib/libxgboost.so
zamba predict \
    "$chunk_dir/chunk-$SGE_TASK_ID" \
    "$output"