#!/bin/bash

# Summarise the tags found in the first regular file matching
# "<dir>/*-data-*-filtered" (comma-separated, tag in column COL_TAG).
#
# Globals set here and read by the rest of the script:
#   this, out1, out2   - script name and the two report file names
#   dir                - directory given as first argument
#   COL_TAG            - 1-based column (as used by `cut -f`) holding the tag
#   HEADER_SIZE        - number of leading lines excluded from LEN
#   in                 - path of the data file
#   LEN_W_HEADER, LEN  - line count of the data file with / without header
#
# NOTE(review): the usage text describes a separate "list" mode, but the
# second argument is never inspected in this part of the script; confirm
# whether the two modes are meant to be distinct.
this=${0##*/}
out1="tags_unknown"
out2="tags_info"
if (( $# < 1 )); then
	echo "Usage:"
	echo "1. $this dir"
	echo "   write info about tags and detections to ${out2}"
	echo ""
	echo "2. $this dir list"
	echo "   write list of unknown tags together with their occurrences to '${out1}'"
	echo ""
	echo "Looks for data-file ( {dir}/*-data-*-filtered )"
	# A missing argument is an error, so do not report success.
	exit 1
fi

dir=$1
COL_TAG=13
HEADER_SIZE=1

# Let the shell expand the pattern instead of parsing `ls` output; this keeps
# names with blanks intact. An unmatched pattern stays literal and is rejected
# by the -f test, as is any matching directory.
in=""
for candidate in "$dir"/*-data-*-filtered; do
	if [[ -f "$candidate" ]]; then
		in=$candidate
		break
	fi
done
if [[ -z "$in" ]]; then
	echo "$this: no data-file matching '${dir}/*-data-*-filtered'" >&2
	exit 1
fi

# Reading from stdin makes wc print only the number; the arithmetic expansion
# drops the padding some wc implementations put in front of it.
LEN_W_HEADER=$(( $(wc -l < "$in") ))
LEN=$(( LEN_W_HEADER - HEADER_SIZE ))
# A file shorter than the header has no data lines at all.
if (( LEN < 0 )); then
	LEN=0
fi
	
# List every unknown tag together with the number of lines mentioning it.
#
# A tag is treated as unknown when one of the last LEN lines of "$in" contains
# "NA" anywhere from column COL_TAG onwards; the tag is the value of column
# COL_TAG on such a line.
#
# Globals:   in, LEN, COL_TAG (read)
# Arguments: $1 - accepted but not used
#            $2 - file in which the lines containing each tag are counted
# Outputs:   one "<count> : <tag>" line per unknown tag on stdout, sorted by
#            count in descending order; nothing when there is no unknown tag
#
# NOTE(review): the count is the number of lines of $2 that contain the tag
# text anywhere, so a tag that is a substring of another tag (or of another
# column) is over-counted; confirm whether an exact match on the tag column
# is wanted.
function unknownTags() {
	local tag n

	# Everything is streamed through one pipeline: no scratch file in the
	# current directory that could hold stale lines or fail to exist when no
	# tag is unknown.
	tail -n "$LEN" "$in" | cut -d',' -f"$COL_TAG"- | grep "NA" | cut -d',' -f1 | sort -u |
	while read -r tag; do
		# read trims surrounding blanks but keeps the tag as one word.
		# An empty tag would match every line, so it is skipped.
		[[ -n "$tag" ]] || continue
		# -F: the tag is literal text, not a regular expression.
		n=$(grep -c -F -- "$tag" "$2")
		printf '%s : %s\n' "$n" "$tag"
	done |
	sort -r -t : -k 1 -g
}

# Print part * 100 / total, cut off (not rounded) after two decimals.
# The text mirrors what `bc` prints at scale=2: no leading zero below 1
# (".49") and a plain "0" for zero. A total of zero or less yields "0"
# instead of a division by zero.
# Arguments: $1 - part, $2 - total (non-negative integers)
function _info_percent() {
	local part=$1 total=$2
	if (( total <= 0 )); then
		printf '0'
		return
	fi
	local hundredths=$(( part * 10000 / total ))
	if (( hundredths == 0 )); then
		printf '0'
	elif (( hundredths < 100 )); then
		printf '.%02d' "$hundredths"
	else
		printf '%d.%02d' "$(( hundredths / 100 ))" "$(( hundredths % 100 ))"
	fi
}

# Print the last LEN lines of "$in", reduced to column COL_TAG and everything
# after it.
function _info_tag_rows() {
	tail -n "$LEN" "$in" | cut -d',' -f"$COL_TAG"-
}

# Print a two-section report about the data file "$in".
#
# "## Tags" counts distinct values of column COL_TAG: all of them, those seen
# on a line without "NA" from column COL_TAG onwards (known) and those seen on
# a line with "NA" there (unknown). "## Detections" counts the lines themselves
# the same way, relative to LEN.
#
# Globals:   in, LEN, COL_TAG (read)
# Arguments: any arguments are ignored; the file is always "$in"
# Outputs:   the report on stdout
function info() {
	local tag_total tag_known tag_unknown
	local detection_known detection_unknown

	echo "## Tags"
	# $(( ... )) strips the padding some wc implementations print.
	tag_total=$(( $(_info_tag_rows | cut -d',' -f1 | sort -u | wc -l) ))
	tag_known=$(( $(_info_tag_rows | grep -v "NA" | cut -d',' -f1 | sort -u | wc -l) ))
	tag_unknown=$(( $(_info_tag_rows | grep "NA" | cut -d',' -f1 | sort -u | wc -l) ))

	printf '  total: %s\n' "$tag_total"
	printf '  known: %s \t(%s%%)\n' "$tag_known" "$(_info_percent "$tag_known" "$tag_total")"
	printf 'unknown: %s \t(%s%%)\n' "$tag_unknown" "$(_info_percent "$tag_unknown" "$tag_total")"
	echo ""

	echo "## Detections"
	# grep -c prints 0 (with a non-zero status) when nothing matches.
	detection_known=$(_info_tag_rows | grep -vc "NA")
	detection_unknown=$(_info_tag_rows | grep -c "NA")

	printf '  total: %s\n' "$LEN"
	printf '  known: %s \t(%s%%)\n' "$detection_known" "$(_info_percent "$detection_known" "$LEN")"
	printf 'unknown: %s\t(%s%%)\n' "$detection_unknown" "$(_info_percent "$detection_unknown" "$LEN")"
}


# Write both reports into the data directory, announcing each one on stdout:
# the summary produced by info goes to "$out2", the unknown-tag list produced
# by unknownTags goes to "$out1".
printf '%s\n' "write info about tags to ${out2}"
info "$in" >"$dir/$out2"

printf '%s\n' "print list of unknown tags along with the number of their occurrences to ${out1}"
unknownTags "$1" "$in" >"$dir/$out1"