From 05b8704e51d2e946da2aca44a1e08e54ab5af4d7 Mon Sep 17 00:00:00 2001
From: Michael Krabbe Borregaard <mkborregaard@snm.ku.dk>
Date: Wed, 23 Aug 2017 15:27:36 +0200
Subject: [PATCH] Add proportion normalization to histograms (#293)

* Add proportion normalization to histograms

* Fix tests

* Change name to :fraction

* Remove redundant words

* Change implementation when isdensity == true

* Update tests to match new behaviour

* Improve implementation/tests of Histogram normalization in :fraction mode

* Replace :fraction with :probability
---
 src/hist.jl  | 39 +++++++++++++++++++++++++++------------
 test/hist.jl |  9 +++++++++
 2 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/src/hist.jl b/src/hist.jl
index 82b006ea1..691aef965 100644
--- a/src/hist.jl
+++ b/src/hist.jl
@@ -401,9 +401,9 @@ arrays appropriately. See description of `normalize` for details. Returns `h`.
 
         if mode == :none
             # nothing to do
-        elseif mode == :pdf || mode == :density
+        elseif mode == :pdf || mode == :density || mode == :probability
             if h.isdensity
-                if mode == :pdf
+                if mode == :pdf || mode == :probability
                     # histogram already represents a density, just divide weights by norm
                     s = 1/norm(h)
                     weights .*= s
@@ -411,22 +411,31 @@ arrays appropriately. See description of `normalize` for details. Returns `h`.
                         A .*= s
                     end
                 else
-                    # histogram already represents a density, nothing to do
+                    # :density - histogram already represents a density, nothing to do
                 end
             else
-                # Divide weights by bin volume, for :pdf also divide by sum of weights
-                SumT = norm_type(h)
-                vs_0 = (mode == :pdf) ? sum(SumT(x) for x in weights) : one(SumT)
-                @inbounds @nloops $N i weights d->(vs_{$N-d+1} = vs_{$N-d} * _edge_binvolume(SumT, edges[d], i_d)) begin
-                    (@nref $N weights i) /= $(Symbol("vs_$N"))
+                if mode == :pdf || mode == :density
+                    # Divide weights by bin volume, for :pdf also divide by sum of weights
+                    SumT = norm_type(h)
+                    vs_0 = (mode == :pdf) ? sum(SumT(x) for x in weights) : one(SumT)
+                    @inbounds @nloops $N i weights d->(vs_{$N-d+1} = vs_{$N-d} * _edge_binvolume(SumT, edges[d], i_d)) begin
+                        (@nref $N weights i) /= $(Symbol("vs_$N"))
+                        for A in aux_weights
+                            (@nref $N A i) /= $(Symbol("vs_$N"))
+                        end
+                    end
+                    h.isdensity = true
+                else
+                    # :probability - divide weights by sum of weights
+                    nf = inv(sum(weights))
+                    weights .*= nf
                     for A in aux_weights
-                        (@nref $N A i) /= $(Symbol("vs_$N"))
+                        A .*= nf
                     end
                 end
             end
-            h.isdensity = true
-        else mode != :pdf && mode != :density
-            throw(ArgumentError("Normalization mode must be :pdf, :density or :none"))
+        else
+            throw(ArgumentError("Normalization mode must be :pdf, :density, :probability or :none"))
         end
         h
     end
@@ -445,8 +454,14 @@ Valid values for `mode` are:
 * `:density`: Normalize by bin sizes only. Resulting histogram represents
    count density of input and does not have norm 1. Will not modify the
    histogram if it already represents a density (`h.isdensity == 1`).
+* `:probability`: Normalize by sum of weights only. Resulting weights sum to 1
+   (probability mass per bin); the histogram does not have norm 1. If `h`
+   already represents a density, it is scaled to norm 1 instead (as `:pdf`).
 *  `:none`: Leaves histogram unchanged. Useful to simplify code that has to
    conditionally apply different modes of normalization.
+
+Successive application of both `:probability` and `:density` normalization (in
+any order) is equivalent to `:pdf` normalization.
 """
 normalize(h::Histogram{T,N}; mode::Symbol=:pdf) where {T,N} =
     normalize!(deepcopy(float(h)), mode = mode)
diff --git a/test/hist.jl b/test/hist.jl
index 8996da988..f28649080 100644
--- a/test/hist.jl
+++ b/test/hist.jl
@@ -160,6 +160,7 @@ end
     @test @inferred(norm(h_pdf)) ≈ 1
     @test @inferred(normalize(h_pdf, mode = :pdf)) == h_pdf
     @test @inferred(normalize(h_pdf, mode = :density)) == h_pdf
+    @test @inferred(normalize(h_pdf, mode = :probability)) == h_pdf
 
     h_density = normalize(h, mode = :density)
     @test h_density.weights ≈ h.weights ./ bin_vols
@@ -169,6 +170,14 @@ end
         Histogram(h_density.edges, h_density.weights .* (1/norm(h_density)), h_density.closed, true)
     @test normalize(h_density, mode = :pdf).weights ≈ h_pdf.weights
     @test normalize(h_density, mode = :density) == h_density
+    @test normalize(h_density, mode = :probability).weights ≈ h_pdf.weights
+
+    h_fraction = normalize(h, mode = :probability)
+    @test sum(h_fraction.weights) ≈ 1
+    @test h_fraction.isdensity == false
+    @test normalize(h_fraction, mode = :pdf).weights ≈ h_pdf.weights
+    @test normalize(h_fraction, mode = :density).weights ≈ h_pdf.weights
+    @test normalize(h_fraction, mode = :probability).weights ≈ h_fraction.weights
 
     h_copy = deepcopy(float(h))
     @test @inferred(normalize!(h_copy, mode = :density)) == h_copy