Skip to content
Snippets Groups Projects
Commit 834d97ad authored by Maria Voigt's avatar Maria Voigt
Browse files

merging from cluster down

parents 1a91f68e 4174a592
Branches
No related tags found
No related merge requests found
#!/usr/bin/env amm
import $ivy.`com.github.tototoshi::scala-csv:1.3.4`
import $ivy.`org.apache.commons:commons-math3:3.6.1`
import ammonite.ops._
import com.github.tototoshi.csv._
// http://commons.apache.org/proper/commons-math/javadocs/api-3.6.1/index.html
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.{ Map => MMap }
import scala.io.Source
/** Summarises, per category, how the bootstrapped sums differ between the
  * earliest and the latest requested year.
  *
  * Every `.csv` file directly inside `input` is treated as one bootstrap
  * replicate. All replicate files and the mapping file are read in lock-step:
  * data row n (1-based, header excluded) of every file belongs to grid cell n.
  * For each replicate, year and category, the cell values strictly greater
  * than `threshold` are summed. For the earliest and latest year the
  * per-replicate difference (earliest - latest) is computed, and its 2.5 / 25 /
  * 50 / 75 / 97.5 percentiles across replicates are written to `output`.
  *
  * @param input     directory containing one CSV file per bootstrap replicate
  * @param output    CSV file to write the per-category percentiles to
  * @param mapping   CSV file whose first column holds the category of each grid
  *                  cell; rows whose category is "NA" are skipped
  * @param years     years to accumulate; only the minimum and maximum end up in
  *                  the output
  * @param threshold only values strictly above this contribute to a sum
  * @param bufSize   buffer size in bytes used when opening each input file
  * @throws IllegalArgumentException if `years` is empty, contains a year before
  *                                  1999, or `input` contains no CSV file
  */
@main
def main(input: Path, output: Path, mapping: Path, years: Seq[Int] = 1999 to 2015, threshold: Double = 0, bufSize: Int = 4194304) = {
  // Column i of a replicate row is read as year (1999 + i).
  // NOTE(review): this offset comes from the original code - confirm it
  // matches the header of the replicate files.
  val firstColumnYear = 1999

  require(years.nonEmpty, "years must not be empty")
  require(
    years.forall(_ >= firstColumnYear),
    s"years must not be earlier than $firstColumnYear"
  )

  // --- grid id to category mapping
  // grid_id -> category. The header row gets index 0 and is dropped, so the
  // first data row is grid id 1, matching the counter used while reading.
  val categories: Map[Int, String] = {
    val reader = CSVReader.open(Source.fromFile(mapping.toIO, bufSize))
    try {
      reader.iterator.zipWithIndex
        .drop(1) // header; also safe on a completely empty file
        .collect { case (row, index) if row(0) != "NA" => index -> row(0) }
        .toMap
    } finally {
      reader.close()
    }
  }

  // --- input: one reader per bootstrap replicate file
  val readers: Array[CSVReader] = {
    val files = for {
      file <- ls! input |? (_.ext == "csv")
    } yield {
      Source.fromFile(file.toIO, bufSize)
    }
    files.toArray.map(file => CSVReader.open(file))
  }
  // Without this guard the read loop below would never terminate, because
  // `forall` over an empty array is always true.
  require(readers.nonEmpty, s"no .csv files found in $input")

  // The number of replicates is taken from the files actually present.
  val bootstrapCount = readers.length

  // --- stats per year per category
  //  year    cat          bstrap sum
  val y_c_b_sum: Map[Int, MMap[String, Array[Double]]] =
    years.map(_ -> MMap[String, Array[Double]]()).toMap

  try {
    //  file  row      col cell content
    val iterators: Array[Iterator[Seq[String]]] = readers.map(_.iterator)

    // --- drop headers
    println("dropping headers ...")
    iterators.foreach(it => if (it.hasNext) it.next())

    // --- read data and fill stats
    var grid_id = 1
    // Stop as soon as any replicate file runs out of rows.
    while (iterators.forall(_.hasNext)) {
      categories.get(grid_id) match {
        case Some(category) =>
          println(s"""processing line $grid_id with $category ... """)
          // one parsed row per replicate
          val bootstraps: Array[Seq[Double]] =
            iterators.map(_.next().map(_.toDouble))
          for {
            bstrap_id <- 0 until bootstrapCount
            year <- years
          } {
            val value: Double = bootstraps(bstrap_id)(year - firstColumnYear)
            if (value > threshold) {
              val bstraps = y_c_b_sum(year)
                .getOrElseUpdate(category, Array.fill(bootstrapCount)(0.0))
              bstraps(bstrap_id) += value
            }
          }
        case None =>
          println(s"""skipping $grid_id - that cell doesn't have that category ... """)
          // Advance every file so all replicates stay aligned on the same cell.
          iterators.foreach(_.next())
      }
      grid_id += 1
    }
  } finally {
    readers.foreach(_.close())
  }

  // --- calculate stats and write out
  val early_c_b_sum = y_c_b_sum(years.min)
  val later_c_b_sum = y_c_b_sum(years.max)

  // A category with no value above the threshold in one of the two years has
  // no entry there; its sum is zero for every replicate.
  val zeros: Array[Double] = Array.fill(bootstrapCount)(0.0)
  // Union of both years so that no category is dropped; sorted so the output
  // file is deterministic.
  val allCategories: Seq[String] =
    (early_c_b_sum.keySet ++ later_c_b_sum.keySet).toSeq.sorted

  // category -> distribution of (early - later) across replicates
  val stats: Seq[(String, DescriptiveStatistics)] = allCategories.map { category =>
    val early_b_sum = early_c_b_sum.getOrElse(category, zeros)
    val later_b_sum = later_c_b_sum.getOrElse(category, zeros)
    val ds = new DescriptiveStatistics
    early_b_sum.zip(later_b_sum).foreach { case (early, later) =>
      ds.addValue(early - later)
    }
    category -> ds
  }

  val writer = CSVWriter.open(output.toIO)
  try {
    val output_header = Seq(
      "category",
      "percentile_2.5",
      "percentile_25",
      "median",
      "percentile_75",
      "percentile_97.5"
    )
    writer.writeRow(output_header)
    for ((category, ds) <- stats) {
      println(s"""writing $category ...""")
      val values = Seq(
        category,
        ds.getPercentile(2.5),
        ds.getPercentile(25),
        ds.getPercentile(50),
        ds.getPercentile(75),
        ds.getPercentile(97.5)
      )
      writer.writeRow(values)
    }
  } finally {
    writer.close()
  }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment