diff --git a/src/validation/quartiles-categorized-diff.scala b/src/validation/quartiles-categorized-diff.scala new file mode 100644 index 0000000000000000000000000000000000000000..8a53db577a4596260e1c76fac12246f7a2cf01e8 --- /dev/null +++ b/src/validation/quartiles-categorized-diff.scala @@ -0,0 +1,176 @@ +#!/usr/bin/env amm + +import $ivy.`com.github.tototoshi::scala-csv:1.3.4` +import $ivy.`org.apache.commons:commons-math3:3.6.1` + +import ammonite.ops._ +import com.github.tototoshi.csv._ + +// http://commons.apache.org/proper/commons-math/javadocs/api-3.6.1/index.html +import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics + +import scala.collection.mutable.ListBuffer +import scala.collection.mutable.{ Map => MMap } +import scala.io.Source + +@main +def main(input: Path, output: Path, mapping: Path, years: Seq[Int] = 1999 to 2015, threshold: Double = 0, bufSize: Int = 4194304) = { + + // --- grid id to category mapping + + // open mapping + val file = Source.fromFile(mapping.toIO, bufSize) + val reader = CSVReader.open(file) + + // grid_id -> category + val categories: Map[Int, String] = { + for { + (row, index) <- { + val it = reader.iterator.zipWithIndex + it.next // drop header + it + } + category = row(0) if category != "NA" + } yield index -> category + }.toMap + + reader.close() + + // --- input + + val readers: Array[CSVReader] = { + val files = for { + file <- ls! input |? (_.ext == "csv") + } yield { + Source.fromFile(file.toIO, bufSize) + } + + files.toArray.map(file => CSVReader.open(file)) + } + + // final input data structure + + // file row col cell content + // list list list string + + val iterators: Array[Iterator[Seq[String]]] = readers.map(_.iterator) + + // --- drop headers + + println("dropping headers ...") + + iterators.foreach(_.next) + + // --- stats per year per category + + // year cat bstrap sum + val y_c_b_sum: Map[Int, MMap[String, Array[Double]]] = + years.map(_ -> MMap[String, Array[Double]]()).toMap + + // --- read data and fill stats + + var grid_id = 1 + + while (iterators.forall(_.hasNext)) { + + categories get grid_id match { + case Some(category) => + println(s"""processing line $grid_id with $category ... """) + + // 1000 x 16 x Double + val bootstraps: Array[Seq[Double]] = iterators.map(_.next.map(_.toDouble)) + + for { + bstrap_id <- 0 until 1000 + bootstrap: Seq[Double] = bootstraps(bstrap_id) + year <- years + } { + val value: Double = bootstrap(year - 1999) + + if (value > threshold) { + val c_b_sum: MMap[String, Array[Double]] = + y_c_b_sum(year) + + val bstraps = c_b_sum get category match { + case Some(bstraps) => + bstraps + + case None => + val bstraps = Array.fill(1000)(0.0) + c_b_sum.update(category, bstraps) + bstraps + } + + bstraps(bstrap_id) = bstraps(bstrap_id) + value + } + } + + case None => + println(s"""skipping $grid_id - that cell doesn't have that category ... """) + iterators.map(_.next) + } + + grid_id += 1 + } + + readers.foreach(_.close()) + + // --- calculate stats and write out + + val early_y = y_c_b_sum.keys.min + val later_y = y_c_b_sum.keys.max + + val early_c_b_sum = y_c_b_sum(early_y) + val later_c_b_sum = y_c_b_sum(later_y) + + val stats = for (category <- early_c_b_sum.keys) yield { + val early_b_sum: Array[Double] = early_c_b_sum(category) + val later_b_sum: Array[Double] = later_c_b_sum(category) + + val diff_b_sum: Array[Double] = for { + (early, later) <- early_b_sum zip later_b_sum + } yield (early - later) + + val ds = diff_b_sum.foldLeft(new DescriptiveStatistics) { + (ds, value) => { + ds addValue value + ds + } + } + + category -> ds + } + + val writer = CSVWriter.open(output.toIO) + + val output_header = Seq ( + "category", + "percentile_2.5", + "percentile_25", + "median", + "percentile_75", + "percentile_97.5" + ) + + writer.writeRow(output_header) + + // category stats + // stats: Map[String, DescriptiveStatistics] + + for ((category, ds) <- stats) { + println(s"""writing $category ...""") + + val values = Seq ( + category, + ds.getPercentile(2.5), + ds.getPercentile(25), + ds.getPercentile(50), + ds.getPercentile(75), + ds.getPercentile(97.5) + ) + + writer.writeRow(values) + } + + writer.close() +}