merging from cluster down

Merge branch 'master' of https://git.idiv.de/mv39zilo/orangutan_density_distribution

merging from cluster down
834d97ad · Maria Voigt · 1a91f68e · 4174a592 · 834d97ad
Commit 834d97ad authored Oct 30, 2017 by Maria Voigt
--- a/src/validation/quartiles-categorized-diff.scala
+++ b/src/validation/quartiles-categorized-diff.scala
+#!/usr/bin/env amm
+
+import $ivy.`com.github.tototoshi::scala-csv:1.3.4`
+import $ivy.`org.apache.commons:commons-math3:3.6.1`
+
+import ammonite.ops._
+import com.github.tototoshi.csv._
+
+// http://commons.apache.org/proper/commons-math/javadocs/api-3.6.1/index.html
+import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics
+
+import scala.collection.mutable.ListBuffer
+import scala.collection.mutable.{ Map => MMap }
+import scala.io.Source
+
+@main
+def main(input: Path, output: Path, mapping: Path, years: Seq[Int] = 1999 to 2015, threshold: Double = 0, bufSize: Int = 4194304) = {
+
+  // --- grid id to category mapping
+
+  // open mapping
+  val file = Source.fromFile(mapping.toIO, bufSize)
+  val reader = CSVReader.open(file)
+
+  // grid_id -> category
+  val categories: Map[Int, String] = {
+    for {
+      (row, index) <- {
+        val it = reader.iterator.zipWithIndex
+        it.next // drop header
+        it
+      }
+      category = row(0) if category != "NA"
+    } yield index -> category
+  }.toMap
+
+  reader.close()
+
+  // --- input
+
+  val readers: Array[CSVReader] = {
+    val files = for {
+      file <- ls! input |? (_.ext == "csv")
+    } yield {
+      Source.fromFile(file.toIO, bufSize)
+    }
+
+    files.toArray.map(file => CSVReader.open(file))
+  }
+
+  // final input data structure
+
+  // file   row    col    cell content
+  // list   list   list   string
+
+  val iterators: Array[Iterator[Seq[String]]] = readers.map(_.iterator)
+
+  // --- drop headers
+
+  println("dropping headers ...")
+
+  iterators.foreach(_.next)
+
+  // --- stats per year per category
+
+  //                 year      cat     bstrap sum
+  val y_c_b_sum: Map[Int, MMap[String, Array[Double]]] =
+    years.map(_ -> MMap[String, Array[Double]]()).toMap
+
+  // --- read data and fill stats
+
+  var grid_id = 1
+
+  while (iterators.forall(_.hasNext)) {
+
+    categories get grid_id match {
+      case Some(category) =>
+        println(s"""processing line $grid_id with $category ... """)
+
+        // 1000 x 16 x Double
+        val bootstraps: Array[Seq[Double]] = iterators.map(_.next.map(_.toDouble))
+
+        for {
+          bstrap_id <- 0 until 1000
+          bootstrap: Seq[Double] = bootstraps(bstrap_id)
+          year <- years
+        } {
+          val value: Double = bootstrap(year - 1999)
+
+          if (value > threshold) {
+            val c_b_sum: MMap[String, Array[Double]] =
+              y_c_b_sum(year)
+
+            val bstraps = c_b_sum get category match {
+              case Some(bstraps) =>
+                bstraps
+
+              case None =>
+                val bstraps = Array.fill(1000)(0.0)
+                c_b_sum.update(category, bstraps)
+                bstraps
+            }
+
+            bstraps(bstrap_id) = bstraps(bstrap_id) + value
+          }
+        }
+
+      case None =>
+        println(s"""skipping $grid_id - that cell doesn't have that category ... """)
+        iterators.map(_.next)
+    }
+
+    grid_id += 1
+  }
+
+  readers.foreach(_.close())
+
+  // --- calculate stats and write out
+
+  val early_y = y_c_b_sum.keys.min
+  val later_y = y_c_b_sum.keys.max
+
+  val early_c_b_sum = y_c_b_sum(early_y)
+  val later_c_b_sum = y_c_b_sum(later_y)
+
+  val stats = for (category <- early_c_b_sum.keys) yield {
+    val early_b_sum: Array[Double] = early_c_b_sum(category)
+    val later_b_sum: Array[Double] = later_c_b_sum(category)
+
+    val diff_b_sum: Array[Double] = for {
+      (early, later) <- early_b_sum zip later_b_sum
+    } yield (early - later)
+
+    val ds = diff_b_sum.foldLeft(new DescriptiveStatistics) {
+      (ds, value) => {
+        ds addValue value
+        ds
+      }
+    }
+
+    category -> ds
+  }
+
+  val writer = CSVWriter.open(output.toIO)
+
+  val output_header = Seq (
+    "category",
+    "percentile_2.5",
+    "percentile_25",
+    "median",
+    "percentile_75",
+    "percentile_97.5"
+  )
+
+  writer.writeRow(output_header)
+
+  //            category stats
+  // stats: Map[String,  DescriptiveStatistics]
+
+  for ((category, ds) <- stats) {
+    println(s"""writing $category ...""")
+
+    val values = Seq (
+      category,
+      ds.getPercentile(2.5),
+      ds.getPercentile(25),
+      ds.getPercentile(50),
+      ds.getPercentile(75),
+      ds.getPercentile(97.5)
+    )
+
+    writer.writeRow(values)
+  }
+
+  writer.close()
+}