package edu.illinois.dais.ttr;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

/**
 * The Statistics class provides static utility methods
 * for statistical computations.
 *
 * @author Bob Carpenter
 * @since LingPipe2.0
 */
public class Statistics {

    // don't allow instances
    private Statistics() { /* do nothing */ }

    /**
     * Returns the Kullback-Leibler divergence of the second specified
     * Dirichlet distribution relative to the first. The Dirichlet
     * distributions are specified by their distribution and concentration,
     * which are folded into a single count argument.
     *

     * The KL divergence between two Dirichlet distributions with
     * parameters xs and ys of dimensionality n is:
     *

     * D_KL(Dirichlet(xs) || Dirichlet(ys))
     * = &#8747; p(&#952;|xs) log ( p(&#952;|xs) / p(&#952;|ys) ) d&#952;
     * = log &#915;(&#931;_{i<n} xs[i])
     *   - log &#915;(&#931;_{i<n} ys[i])
     *   - &#931;_{i<n} log &#915;(xs[i])
     *   + &#931;_{i<n} log &#915;(ys[i])
     *   + &#931;_{i<n} (xs[i] - ys[i]) * (&#936;(xs[i]) - &#936;(&#931;_{k<n} xs[k]))
     *
     * where &#915; is the gamma function (see {@link
     * Mathematics#log2Gamma(double)}), and &#936; is the digamma
     * function (see {@link Mathematics#digamma(double)}).
     *

     * This method, in keeping with the other information-theoretic
     * methods in this class, returns the answer in bits (base 2) rather
     * than nats (base e). The return value is rescaled from the
     * natural-log divergence:
     *

     * klDivergenceDirichlet(xs,ys)
     * = D_KL(Dirichlet(xs) || Dirichlet(ys)) / log_e 2
     *


     * @param xs Dirichlet parameter for the first distribution.
     * @param ys Dirichlet parameter for the second distribution.
     * @return The KL divergence of the second Dirichlet distribution
     * relative to the first.
     */
    public static double klDivergenceDirichlet(double[] xs, double[] ys) {
        verifyDivergenceDirichletArgs(xs,ys);
        double sumXs = sum(xs);
        double sumYs = sum(ys);
        double divergence = logGamma(sumXs) - logGamma(sumYs);
        double digammaSumXs = Mathematics.digamma(sumXs);
        for (int i = 0; i < xs.length; ++i) {
            divergence += (logGamma(ys[i]) - logGamma(xs[i]))
                + (xs[i] - ys[i]) * (Mathematics.digamma(xs[i]) - digammaSumXs);
        }
        return divergence;
    }

    static void verifyDivergenceDirichletArgs(double[] xs, double[] ys) {
        if (xs.length != ys.length) {
            String msg = "Parameter arrays must be the same length."
                + " Found xs.length=" + xs.length
                + " ys.length=" + ys.length;
            throw new IllegalArgumentException(msg);
        }
        for (int i = 0; i < xs.length; ++i) {
            if (xs[i] <= 0.0 || Double.isInfinite(xs[i]) || Double.isNaN(xs[i])) {
                String msg = "All parameters must be positive and finite."
                    + " Found xs[" + i + "]=" + xs[i];
                throw new IllegalArgumentException(msg);
            }
        }
        for (int i = 0; i < ys.length; ++i) {
            if (ys[i] <= 0.0 || Double.isInfinite(ys[i]) || Double.isNaN(ys[i])) {
                String msg = "All parameters must be positive and finite."
                    + " Found ys[" + i + "]=" + ys[i];
                throw new IllegalArgumentException(msg);
            }
        }
    }
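
    // Illustrative usage sketch (not part of the original API; values are
    // made up): comparing a pseudocount-smoothed Dirichlet posterior
    // against a uniform Dirichlet(1,1,1) prior.
    private static void exampleDirichletDivergence() {
        double[] posterior = { 3.0, 1.5, 5.5 };  // hypothetical counts + prior
        double[] prior = { 1.0, 1.0, 1.0 };
        double dkl = klDivergenceDirichlet(posterior, prior);
        double sym = symmetrizedKlDivergenceDirichlet(posterior, prior);
        System.out.println("KL=" + dkl + " symmetrized=" + sym);
    }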

    /**
     * Returns the symmetric version of the Kullback-Leibler divergence
     * between the Dirichlet distributions determined by the specified
     * parameters.
     *
     * The symmetrized KL divergence for Dirichlets is just the
     * average of the two relative divergences:
     *

     * D_SKL(Dirichlet(xs), Dirichlet(ys))
     * = (D_KL(Dirichlet(xs) || Dirichlet(ys))
     *    + D_KL(Dirichlet(ys) || Dirichlet(xs))) / 2
     *

     * See the method documentation for {@link
     * #klDivergenceDirichlet(double[],double[])} for a definition of
     * the relative divergence between Dirichlet distributions.
     *
     * @param xs Dirichlet parameter for the first distribution.
     * @param ys Dirichlet parameter for the second distribution.
     * @return The symmetrized KL divergence of the distributions.
     */
    public static double symmetrizedKlDivergenceDirichlet(double[] xs,
                                                          double[] ys) {
        return (klDivergenceDirichlet(xs,ys)
                + klDivergenceDirichlet(ys,xs)) / 2.0;
    }

    static double logGamma(double x) {
        return Mathematics.log2Gamma(x) / Mathematics.log2(java.lang.Math.E);
    }

    public static double sum(double[] xs) {
        double sum = 0.0;
        for (int i = 0; i < xs.length; ++i)
            sum += xs[i];
        return sum;
    }

    /**
     * Returns the Kullback-Leibler divergence of the second
     * specified multinomial relative to the first.
     *

     * The KL divergence of a multinomial q relative
     * to a multinomial p, both with n outcomes, is:
     *

     * D_KL(p||q) = &#931;_{i<n} p(i) log2 (p(i) / q(i))
     *
     * The value is guaranteed to be non-negative, and is 0.0
     * only if the two distributions are identical. If any outcome
     * has zero probability in q and non-zero probability
     * in p, the result is infinite.
     *

     * KL divergence is not symmetric. That is, there are
     * p and q such that
     * D_KL(p||q) != D_KL(q||p). See {@link
     * #symmetrizedKlDivergence(double[],double[])} and {@link
     * #jsDivergence(double[],double[])} for symmetric variants.
     *

     * KL divergence is the difference between cross entropy and
     * entropy. If H(p,q) is the cross entropy of the
     * distributions p and q, and H(p)
     * is the entropy of p, then:
     *
     * D_KL(p||q) = H(p,q) - H(p)
     *
     * @param p First multinomial distribution.
     * @param q Second multinomial distribution.
     * @return The KL divergence of the second multinomial relative
     * to the first.
     * @throws IllegalArgumentException If the distributions are not
     * the same length or have entries less than zero or greater than 1.
     */
    public static double klDivergence(double[] p, double[] q) {
        verifyDivergenceArgs(p,q);
        double divergence = 0.0;
        int len = p.length;
        for (int i = 0; i < len; ++i) {
            if (p[i] > 0.0 && p[i] != q[i])
                divergence += p[i] * Mathematics.log2(p[i] / q[i]);
        }
        return divergence;
    }

    static void verifyDivergenceArgs(double[] p, double[] q) {
        if (p.length != q.length) {
            String msg = "Input distributions must have same length."
                + " Found p.length=" + p.length
                + " q.length=" + q.length;
            throw new IllegalArgumentException(msg);
        }
        int len = p.length;
        for (int i = 0; i < len; ++i) {
            if (p[i] < 0.0 || p[i] > 1.0
                || Double.isNaN(p[i]) || Double.isInfinite(p[i])) {
                String msg = "p[i] must be between 0.0 and 1.0 inclusive."
                    + " Found p[" + i + "]=" + p[i];
                throw new IllegalArgumentException(msg);
            }
            if (q[i] < 0.0 || q[i] > 1.0
                || Double.isNaN(q[i]) || Double.isInfinite(q[i])) {
                String msg = "q[i] must be between 0.0 and 1.0 inclusive."
                    + " Found q[" + i + "]=" + q[i];
                throw new IllegalArgumentException(msg);
            }
        }
    }
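
    // Illustrative usage sketch (hypothetical values, not part of the
    // original API): KL divergence between two coin-flip distributions,
    // in bits. Note the asymmetry between the two directions.
    private static void exampleKlDivergence() {
        double[] p = { 0.9, 0.1 };
        double[] q = { 0.5, 0.5 };
        System.out.println("D(p||q) = " + klDivergence(p, q));  // ~0.531 bits
        System.out.println("D(q||p) = " + klDivergence(q, p));  // ~0.737 bits
    }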
    /**
     * Returns the symmetrized KL divergence between the specified
     * distributions. The symmetrization is carried out by averaging
     * the two relative divergences:
     *
     * D_SKL(p,q) = ( D_KL(p||q) + D_KL(q||p) ) / 2
     *
     * @param p First multinomial.
     * @param q Second multinomial.
     * @return The symmetrized KL divergence between the multinomials.
     * @throws IllegalArgumentException If the distributions are not
     * the same length or have entries less than zero or greater than 1.
     */
    public static double symmetrizedKlDivergence(double[] p, double[] q) {
        verifyDivergenceArgs(p,q);
        return (klDivergence(p,q) + klDivergence(q,p)) / 2.0;
    }
    /**
     * Returns the Jensen-Shannon divergence between the specified
     * multinomial distributions. The JS divergence is defined by:
     *
     * D_JS(p,q) = ( D_KL(p||m) + D_KL(q||m) ) / 2
     *
     * where m is the balanced linear
     * interpolation (that is, the average) of p and q:
     *
     * m[i] = (p[i] + q[i]) / 2
     *
     * The JS divergence is non-negative, symmetric, and equal to zero
     * only if p and q are the same distribution.
     *
     * @param p First multinomial.
     * @param q Second multinomial.
     * @return The JS divergence between the multinomials.
     * @throws IllegalArgumentException If the distributions are not
     * the same length or have entries less than zero or greater than 1.
     */
    public static double jsDivergence(double[] p, double[] q) {
        verifyDivergenceArgs(p,q);
        double[] m = new double[p.length];
        for (int i = 0; i < p.length; ++i)
            m[i] = (p[i] + q[i]) / 2.0;
        return (klDivergence(p,m) + klDivergence(q,m)) / 2.0;
    }
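
    // Illustrative usage sketch (hypothetical values): the JS divergence
    // is symmetric and finite even when the supports differ, unlike raw KL.
    private static void exampleJsDivergence() {
        double[] p = { 1.0, 0.0 };
        double[] q = { 0.5, 0.5 };
        // klDivergence(q, p) would be infinite; the JS divergence is not
        System.out.println("JS(p,q) = " + jsDivergence(p, q));
        System.out.println("JS(q,p) = " + jsDivergence(q, p));  // same value
    }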
    /**
     * Returns a uniformly random permutation of the integers between
     * 0 and the specified length minus one.
     *
     * @param length Size of permutation to return.
     * @return Permutation of the specified length.
     */
    public static int[] permutation(int length) {
        int[] xs = new int[length];
        for (int i = 0; i < xs.length; ++i)
            xs[i] = i;
        Random random = new Random();
        // Fisher-Yates shuffle; the bound must be i+1 so that
        // position i may also keep its current value
        for (int i = xs.length; --i > 0; ) {
            int pos = random.nextInt(i+1);
            int temp = xs[pos];
            xs[pos] = xs[i];
            xs[i] = temp;
        }
        return xs;
    }
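
    // Illustrative usage sketch: a random ordering of 5 items, e.g. for
    // shuffling a data set before cross-validation.
    private static void examplePermutation() {
        int[] order = permutation(5);
        System.out.println(Arrays.toString(order));  // e.g. [3, 0, 4, 1, 2]
    }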
    /**
     * Returns Pearson's C^2 goodness-of-fit
     * statistic for independence over the specified binary
     * contingency matrix. Asymptotically, this statistic has a
     * &#967;^2 distribution with one degree of freedom. The
     * higher the value, the less likely the two outcomes are
     * independent. Note that this method is just a special case of
     * the general chi-squared independence test:
     *
     * chiSquaredIndependence(both,oneOnly,twoOnly,neither)
     * = chiSquaredIndependence(new double[][] { {both,    oneOnly},
     *                                           {twoOnly, neither} });
     *
     * The specified values make up the following contingency matrix
     * for boolean outcomes labeled One and Two:
     *
     *          +Two       -Two
     * +One     both       oneOnly
     * -One     twoOnly    neither
     *

     * The value both is the count of events where both
     * outcomes occurred, oneOnly the count where only the
     * first outcome occurred, twoOnly the count where only
     * the second outcome occurred, and neither the count where
     * neither occurred. Let totalCount be the sum of all
     * of the cells in the matrix.
     *

     * From the contingency matrix, marginal probabilities for
     * the two outcomes may be computed:
     *
     * P(One) = (both + oneOnly) / totalCount
     * P(Two) = (both + twoOnly) / totalCount
     *
     * If the two outcomes are independent, the expected values of
     * the matrix cells are:
     *
     * E(both)    = totalCount * P(One) * P(Two)
     * E(oneOnly) = totalCount * P(One) * (1-P(Two))
     * E(twoOnly) = totalCount * (1-P(One)) * P(Two)
     * E(neither) = totalCount * (1-P(One)) * (1-P(Two))
     *
     * These are used to derive the independence test statistic, which
     * is the sum of the squared differences between the observed and
     * expected values under the independence assumption, normalized
     * by the expected values:
     *
     * C^2 = (both - E(both))^2 / E(both)
     *     + (oneOnly - E(oneOnly))^2 / E(oneOnly)
     *     + (twoOnly - E(twoOnly))^2 / E(twoOnly)
     *     + (neither - E(neither))^2 / E(neither)
     *
     * Unlike the higher dimensional case, this statistic applies as a
     * hypothesis test only when all expected values are at least 10.
     *
     * @param both Count of samples with both outcomes.
     * @param oneOnly Count of samples with the first and not the
     * second outcome.
     * @param twoOnly Count of samples with the second and not the
     * first outcome.
     * @param neither Count of samples with neither outcome.
     * @throws IllegalArgumentException If any of the arguments are not
     * non-negative finite numbers.
     * @return Pearson's C^2 goodness-of-fit
     * statistic for independence over the specified sample counts.
     */
    public static double chiSquaredIndependence(double both,
                                                double oneOnly,
                                                double twoOnly,
                                                double neither) {
        assertNonNegative("both",both);
        assertNonNegative("oneOnly",oneOnly);
        assertNonNegative("twoOnly",twoOnly);
        assertNonNegative("neither",neither);
        double n = both + oneOnly + twoOnly + neither;
        double p1 = (both + oneOnly) / n;
        double p2 = (both + twoOnly) / n;
        double eBoth = n * p1 * p2;
        double eOneOnly = n * p1 * (1.0 - p2);
        double eTwoOnly = n * (1.0 - p1) * p2;
        double eNeither = n * (1.0 - p1) * (1.0 - p2);
        return csTerm(both,eBoth)
            + csTerm(oneOnly,eOneOnly)
            + csTerm(twoOnly,eTwoOnly)
            + csTerm(neither,eNeither);
    }
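
    // Illustrative usage sketch (made-up counts): testing whether two
    // binary events co-occur more often than independence would predict.
    private static void exampleChiSquared() {
        // e.g. word A and word B co-occurring in 100 documents
        double statistic = chiSquaredIndependence(30.0, 20.0, 10.0, 40.0);
        // compare against the chi-squared critical value with 1 degree
        // of freedom (3.84 at the conventional 5% significance level)
        System.out.println("C^2 = " + statistic);
    }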
    /**
     * Returns a two-element array of linear regression coefficients
     * for the specified x and y values. The coefficients returned,
     * { &#946;0, &#946;1 }, define the linear function:
     *
     * f(x) = &#946;1 * x + &#946;0
     *
     * The coefficients returned produce the linear function f(x)
     * with the smallest squared error:
     *
     * sqErr(f,xs,ys) = &#931;_i (f(xs[i]) - ys[i])^2
     *
     * where the sum is over 0 &lt;= i &lt; xs.length.
     *
     * The function requires only a single pass through the two
     * arrays, with &#946;0 and &#946;1 given by:
     *
     * &#946;1 = ( n * &#931;_i x[i]*y[i] - (&#931;_i x[i]) * (&#931;_i y[i]) )
     *      / ( n * &#931;_i x[i]*x[i] - (&#931;_i x[i])^2 )
     *
     * &#946;0 = ( &#931;_i y[i] - &#946;1 * &#931;_i x[i] ) / n
     *
     * where n = xs.length = ys.length.
     *
     * @param xs Array of x values.
     * @param ys Parallel array of y values.
     * @return Pair of regression coefficients { &#946;0, &#946;1 }.
     * @throws IllegalArgumentException If the arrays are of length
     * less than 2, or if the arrays are not of the same length.
     */
    public static double[] linearRegression(double[] xs, double[] ys) {
        if (xs.length != ys.length) {
            String msg = "Require parallel arrays of x and y values."
                + " Found xs.length=" + xs.length
                + " ys.length=" + ys.length;
            throw new IllegalArgumentException(msg);
        }
        if (xs.length < 2) {
            String msg = "Require arrays of length >= 2."
                + " Found xs.length=" + xs.length;
            throw new IllegalArgumentException(msg);
        }
        double n = xs.length;
        double xSum = 0.0;
        double ySum = 0.0;
        double xySum = 0.0;
        double xxSum = 0.0;
        for (int i = 0; i < xs.length; ++i) {
            double x = xs[i];
            double y = ys[i];
            xSum += x;
            ySum += y;
            xxSum += x * x;
            xySum += x * y;
        }
        double denominator = n * xxSum - xSum * xSum;
        if (denominator == 0.0) {
            String msg = "Ill-formed input. Denominator for beta1 is zero."
                + " Most likely cause is fewer than 2 distinct inputs.";
            throw new IllegalArgumentException(msg);
        }
        double beta1 = (n * xySum - xSum * ySum) / denominator;
        double beta0 = (ySum - beta1 * xSum) / n;
        return new double[] { beta0, beta1 };
    }
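
    // Illustrative usage sketch (made-up points): fitting y = 2x + 1
    // exactly, so the returned coefficients should be {1.0, 2.0}.
    private static void exampleLinearRegression() {
        double[] xs = { 0.0, 1.0, 2.0, 3.0 };
        double[] ys = { 1.0, 3.0, 5.0, 7.0 };
        double[] betas = linearRegression(xs, ys);
        System.out.println("beta0=" + betas[0] + " beta1=" + betas[1]);
    }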
    /**
     * Returns a two-element array of logistic regression coefficients
     * for the specified x and y values. The coefficients returned,
     * { &#946;0, &#946;1 }, define the logistic function:
     *
     *                    L
     * f(x) = ------------------------
     *        1 + e^(&#946;1 * x + &#946;0)
     *
     * with the minimum squared error. See {@link
     * #linearRegression(double[],double[])} for a definition of
     * squared error. This function takes real values in the open
     * interval (0, L).
     *
     * Logistic regression coefficients are computed using
     * linear regression, after transforming the y values. This
     * is possible because of the following linear relation:
     *
     * log ((L - y) / y) = &#946;1 * x + &#946;0
     *
     * @param xs Array of x values.
     * @param ys Array of y values.
     * @param maxValue Maximum value L of the function.
     * @return Pair of logistic regression coefficients { &#946;0, &#946;1 }.
     * @throws IllegalArgumentException If the maximum value is not
     * positive and finite, if the arrays are of length less than 2,
     * or if the arrays are not of the same length.
     */
    public static double[] logisticRegression(double[] xs, double[] ys,
                                              double maxValue) {
        if (maxValue <= 0.0
            || Double.isInfinite(maxValue) || Double.isNaN(maxValue)) {
            String msg = "Require finite max value > 0."
                + " Found maxValue=" + maxValue;
            throw new IllegalArgumentException(msg);
        }
        double[] logisticYs = new double[ys.length];
        for (int i = 0; i < ys.length; ++i)
            logisticYs[i] = java.lang.Math.log((maxValue - ys[i]) / ys[i]);
        return linearRegression(xs,logisticYs);
    }
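
    // Illustrative usage sketch (made-up values): fitting a logistic
    // curve with ceiling L = 1.0 to values strictly inside (0, 1).
    private static void exampleLogisticRegression() {
        double[] xs = { -2.0, -1.0, 0.0, 1.0, 2.0 };
        double[] ys = { 0.12, 0.27, 0.50, 0.73, 0.88 };
        double[] betas = logisticRegression(xs, ys, 1.0);
        System.out.println("beta0=" + betas[0] + " beta1=" + betas[1]);
    }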
    /**
     * Returns the value of Pearson's C^2
     * goodness-of-fit statistic for independence over the specified
     * contingency matrix. Asymptotically, this statistic has a
     * &#967;^2 distribution with
     * (numRows-1)*(numCols-1) degrees of freedom. The
     * higher the value, the less likely the two outcomes are
     * independent.
     *
     * Pearson's C^2 statistic is defined by:
     *
     * C^2 = &#931;_i &#931;_j (matrix[i][j] - expectedCount(i,j,matrix))^2
     *                / expectedCount(i,j,matrix)
     *
     * where the expected count is the total count times the maximum
     * likelihood estimate of the row i probability times
     * that of the column j probability:
     *
     * expectedCount(i,j,matrix)
     * = totalCount(matrix)
     *   * rowCount(i,matrix) / totalCount(matrix)
     *   * colCount(j,matrix) / totalCount(matrix)
     * = rowCount(i,matrix) * colCount(j,matrix) / totalCount(matrix)
     *
     * where
     *
     * rowCount(i,matrix) = &#931;_{0<=j<numCols} matrix[i][j]
     * colCount(j,matrix) = &#931;_{0<=i<numRows} matrix[i][j]
     * totalCount(matrix) = &#931;_{0<=i<numRows} &#931;_{0<=j<numCols} matrix[i][j]
     *
     * The &#967;^2 test is a large-sample test and is only
     * valid if all of the expected counts are at least 5. This
     * restriction is often ignored for ranking purposes.
     *
     * @param contingencyMatrix The specified contingency matrix.
     * @return Pearson's C^2 statistic for independence
     * testing over the contingency matrix.
     * @throws IllegalArgumentException If the matrix is not rectangular
     * or if any value is not a non-negative finite number.
     */
    public static double chiSquaredIndependence(double[][] contingencyMatrix) {
        int numRows = contingencyMatrix.length;
        if (numRows < 2) {
            String msg = "Require at least two rows."
                + " Found numRows=" + numRows;
            throw new IllegalArgumentException(msg);
        }
        int numCols = contingencyMatrix[0].length;
        if (numCols < 2) {
            String msg = "Require at least two cols."
                + " Found numCols=" + numCols;
            throw new IllegalArgumentException(msg);
        }
        double[] rowSums = new double[numRows];
        Arrays.fill(rowSums,0.0);
        double[] colSums = new double[numCols];
        Arrays.fill(colSums,0.0);
        double totalCount = 0.0;
        for (int i = 0; i < numRows; ++i) {
            if (contingencyMatrix[i].length != numCols) {
                String msg = "Matrix must be rectangular."
                    + " Row 0 length=" + numCols
                    + " Row " + i + " length=" + contingencyMatrix[i].length;
                throw new IllegalArgumentException(msg);
            }
            for (int j = 0; j < numCols; ++j) {
                double val = contingencyMatrix[i][j];
                if (Double.isInfinite(val) || val < 0.0 || Double.isNaN(val)) {
                    String msg = "Values must be finite non-negative."
                        + " Found matrix[" + i + "][" + j + "]=" + val;
                    throw new IllegalArgumentException(msg);
                }
                rowSums[i] += val;
                colSums[j] += val;
                totalCount += val;
            }
        }
        double result = 0.0;
        for (int i = 0; i < numRows; ++i)
            for (int j = 0; j < numCols; ++j)
                result += csTerm(contingencyMatrix[i][j],
                                 rowSums[i] * colSums[j] / totalCount);
        return result;
    }
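
    // Illustrative usage sketch (made-up counts): a 2x3 contingency
    // matrix; compare against a chi-squared distribution with
    // (2-1)*(3-1) = 2 degrees of freedom.
    private static void exampleChiSquaredMatrix() {
        double[][] counts = {
            { 20.0, 30.0, 50.0 },
            { 30.0, 30.0, 40.0 }
        };
        System.out.println("C^2 = " + chiSquaredIndependence(counts));
    }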

    /**
     * Returns an array of probabilities resulting from normalizing the
     * specified probability ratios. The resulting array of
     * probabilities is the same length as the input ratio array, and
     * each probability is simply the input array's value divided by
     * the sum of the ratios.
     *
     * Warning: This method is implemented by summing the
     * probability ratios and then dividing each element by the sum.
     * Because of the limited precision of double-based
     * arithmetic, if the largest ratio is much larger than the next
     * largest ratio, then the largest normalized probability will be
     * one and all others will be zero. Java double values follow the
     * IEEE 754 arithmetic standard and thus use 52 bits for their
     * mantissas; only ratios within a factor of roughly
     * 2^52 (about 10^16) of the maximum ratio will be non-zero.
     *
     * @param probabilityRatios Ratios of probabilities.
     * @return Probabilities resulting from normalizing the ratios.
     * @throws IllegalArgumentException If the input contains a value
     * that is not a finite non-negative number, or if the input does
     * not contain at least one non-zero entry.
     */
    public static double[] normalize(double[] probabilityRatios) {
        for (int i = 0; i < probabilityRatios.length; ++i) {
            if (probabilityRatios[i] < 0.0
                || Double.isInfinite(probabilityRatios[i])
                || Double.isNaN(probabilityRatios[i])) {
                String msg = "Probabilities must be finite non-negative."
                    + " Found probabilityRatios[" + i + "]="
                    + probabilityRatios[i];
                throw new IllegalArgumentException(msg);
            }
        }
        double sum = Mathematics.sum(probabilityRatios);
        if (sum <= 0.0) {
            String msg = "Ratios must sum to a number greater than zero."
                + " Found sum=" + sum;
            throw new IllegalArgumentException(msg);
        }
        double[] result = new double[probabilityRatios.length];
        for (int i = 0; i < probabilityRatios.length; ++i)
            result[i] = probabilityRatios[i]/sum;
        return result;
    }
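
    // Illustrative usage sketch: turning unnormalized scores into a
    // distribution; {1.0, 3.0, 4.0} normalizes to {0.125, 0.375, 0.5}.
    private static void exampleNormalize() {
        double[] probs = normalize(new double[] { 1.0, 3.0, 4.0 });
        System.out.println(Arrays.toString(probs));
    }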

    /**
     * Returns the value of the kappa statistic for the specified
     * observed and expected probabilities. The kappa statistic
     * provides a kind of adjustment for the expected (random)
     * difficulty of a problem. It is defined by:
     *
     * kappa(p,e) = (p - e) / (1 - e)
     *

     * The most typical use for kappa is in evaluating
     * classification problems of a machine versus a gold standard or
     * between two human annotators to assess inter-annotator
     * agreement.
     *
     * @param observedProb Observed probability.
     * @param expectedProb Expected probability.
     * @return The value of the kappa statistic for the specified
     * observed and expected probabilities.
     */
    public static double kappa(double observedProb, double expectedProb) {
        return (observedProb - expectedProb) / (1 - expectedProb);
    }
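
    // Illustrative usage sketch (made-up numbers): two annotators agree
    // on 80% of items, while chance agreement is 50%, giving kappa 0.6.
    private static void exampleKappa() {
        System.out.println("kappa = " + kappa(0.8, 0.5));
    }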

    /**
     * Returns the sum of the specified array of input values divided
     * by the specified length. When len == xs.length,
     * this is the mean of the array:
     *
     * mean(xs) = &#931;_{i < xs.length} xs[i] / xs.length
     *
     * If the specified length is zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @param len Divisor for the sum of the values.
     * @return Sum of the values divided by the specified length.
     */
    public static double mean(double[] xs, int len) {
        return Mathematics.sum(xs) / (double) len;
    }
    /**
     * Returns the mean of the specified array of input values. The
     * mean of an array is defined by:
     *
     * mean(xs) = &#931;_{i < xs.length} xs[i] / xs.length
     *
     * If the array is of length zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @return Mean of the array of values.
     */
    public static double mean(double[] xs) {
        return mean(xs, xs.length);
    }

    public static double mean(ArrayList m) {
        return Mathematics.sum(m) / (double) m.size();
    }

    public static double median(double[] m) {
        Arrays.sort(m);  // note: sorts the input array in place
        int middle = m.length / 2;  // index of the middle element
        if (m.length % 2 == 1) {
            // odd number of elements -- return the middle one
            return m[middle];
        } else {
            // even number -- return the average of the middle two
            return (m[middle-1] + m[middle]) / 2.0;
        }
    }
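
    // Illustrative usage sketch: mean and median of a small sample.
    // The array is cloned because median sorts its argument in place.
    private static void exampleMeanMedian() {
        double[] xs = { 3.0, 1.0, 4.0, 1.0, 5.0 };
        System.out.println("mean   = " + mean(xs));            // 2.8
        System.out.println("median = " + median(xs.clone()));  // 3.0
    }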
    /**
     * Returns the variance of the specified array of input values.
     * The variance of an array of values is the mean of the
     * squared differences between the values and the mean:
     *
     * variance(xs) = &#931;_{i < xs.length} (mean(xs) - xs[i])^2 / xs.length
     *
     * If the array is of length zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @return Variance of the array of values.
     */
    public static double variance(double[] xs) {
        return variance(xs, mean(xs));
    }
    /**
     * Returns the standard deviation of the specified array of input
     * values. The standard deviation is just the square root of the
     * variance:
     *
     * standardDeviation(xs) = variance(xs)^(1/2)
     *
     * If the array is of length zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @return Standard deviation of the array of values.
     */
    public static double standardDeviation(double[] xs) {
        return java.lang.Math.sqrt(variance(xs));
    }
    /**
     * Returns the standard deviation of the specified list of input
     * values. The standard deviation is just the square root of the
     * variance:
     *
     * standardDeviation(xs) = variance(xs)^(1/2)
     *
     * If the list is empty, the result is {@link Double#NaN}.
     *
     * @param xs List of values.
     * @return Standard deviation of the list of values.
     */
    public static double standardDeviation(List xs) {
        double[] x = new double[xs.size()];
        for (int i = 0; i < xs.size(); ++i)
            x[i] = ((Number) xs.get(i)).doubleValue();
        return standardDeviation(x);
    }
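
    // Illustrative usage sketch: variance and standard deviation of a
    // sample; this class uses the population (1/n) normalization.
    private static void exampleVariance() {
        double[] xs = { 2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0 };
        System.out.println("var = " + variance(xs));           // 4.0
        System.out.println("sd  = " + standardDeviation(xs));  // 2.0
    }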
, measures the * square error between a best linear fit between the two arrays * of data. Rescaling either array by a positive constant will * not affect the result. * *

The square root r of the correlation * coefficient r2 is the variance * in the second array explained by a linear relation with the * the first array. * *

     * The definition of the correlation coefficient is:
     *
     * correlation(xs,ys)^2
     * = sumSqDiff(xs,ys)^2
     *   / (sumSqDiff(xs,xs) * sumSqDiff(ys,ys))
     *
     * where
     *
     * sumSqDiff(xs,ys)
     * = &#931;_{i < xs.length} (xs[i] - mean(xs)) * (ys[i] - mean(ys))
     *
     * and thus the terms in the denominator reduce using:
     *
     * sumSqDiff(xs,xs)
     * = &#931;_{i < xs.length} (xs[i] - mean(xs))^2
     *


     * @param xs First array of values.
     * @param ys Second array of values.
     * @return The correlation coefficient of the two arrays.
     * @throws IllegalArgumentException If the arrays are not the same
     * length.
     */
    public static double correlation(double[] xs, double[] ys) {
        if (xs.length != ys.length) {
            String msg = "xs and ys must be the same length."
                + " Found xs.length=" + xs.length
                + " ys.length=" + ys.length;
            throw new IllegalArgumentException(msg);
        }
        double meanXs = mean(xs);
        double meanYs = mean(ys);
        double ssXX = sumSquareDiffs(xs,meanXs);
        double ssYY = sumSquareDiffs(ys,meanYs);
        double ssXY = sumSquareDiffs(xs,ys,meanXs,meanYs);
        return java.lang.Math.sqrt((ssXY*ssXY) / (ssXX * ssYY));
    }
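
    // Illustrative usage sketch: perfectly linearly related arrays give
    // correlation 1.0; unrelated data falls toward 0.0.
    private static void exampleCorrelation() {
        double[] xs = { 1.0, 2.0, 3.0, 4.0 };
        double[] ys = { 10.0, 20.0, 30.0, 40.0 };
        System.out.println("r = " + correlation(xs, ys));  // 1.0
    }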

    /**
     * Returns a sample from the discrete distribution represented by the
     * specified cumulative probability ratios, using the specified random
     * number generator.
     *
     * The cumulative probability ratios represent unnormalized
     * probabilities of generating the value of their index or less,
     * that is, unnormalized cumulative probabilities. For instance,
     * consider the cumulative probability ratios { 0.5, 0.5, 3.9, 10.1 }:
     *

     * Outcome    Value    Unnormalized Prob    Prob
     *       0      0.5                  0.5    0.5/10.1
     *       1      0.5                  0.0    0.0/10.1
     *       2      3.9                  3.4    3.4/10.1
     *       3     10.1                  6.2    6.2/10.1
     *
     * A sample is taken by generating a random number x between 0.0 and
     * the value of the last element (here 10.1). The value returned is
     * the index i such that:
     *
     * cumulativeProbRatios[i-1] &lt; x &lt;= cumulativeProbRatios[i]
     *
     * The corresponding probabilities for the sample input are
     * listed in the last column of the table above.
     *

     * Note that if two elements have the same value, there is no
     * chance of generating the outcome with the higher index. In the
     * example above, this corresponds to outcome 1 having
     * probability 0.0.
     *

     * Warning: The cumulative probability ratios are required to meet
     * two conditions. First, all values must be finite and
     * non-negative. Second, the values must be non-decreasing, so that
     * cumulativeProbRatios[i] &lt;= cumulativeProbRatios[i+1].
     * If either of these conditions is not met, the result is undefined.
     *
     * @param cumulativeProbRatios Cumulative probability ratio for each
     * outcome less than or equal to its index.
     * @param random Random number generator for sampling.
     * @return A sample from the specified distribution.
     */
    public static int sample(double[] cumulativeProbRatios, Random random) {
        int low = 0;
        int high = cumulativeProbRatios.length - 1;
        double x = random.nextDouble() * cumulativeProbRatios[high];
        // binary search for the lowest index whose cumulative ratio
        // is at least x
        while (low < high) {
            int mid = (high + low) / 2;
            if (x > cumulativeProbRatios[mid]) {
                low = mid + 1;
            } else if (high == mid) {
                return (x > cumulativeProbRatios[low]) ? mid : low;
            } else {
                high = mid;
            }
        }
        return low;
    }
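
    // Illustrative usage sketch: drawing from the example distribution in
    // the javadoc above; outcome 1 can never be drawn because its
    // cumulative value repeats the previous one.
    private static void exampleSample() {
        double[] cumulative = { 0.5, 0.5, 3.9, 10.1 };
        Random random = new Random(42L);  // fixed seed for repeatability
        int outcome = sample(cumulative, random);
        System.out.println("sampled outcome " + outcome);
    }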

    /**
     * Returns the log (base 2) probability of the specified
     * discrete distribution given a uniform Dirichlet with all
     * concentration parameters equal to the specified value.
     *
     * See {@link #dirichletLog2Prob(double[],double[])} for
     * more information on the Dirichlet distribution. This method
     * returns a result equivalent to the following (though it is more
     * efficiently implemented):
     *

     * double[] alphas = new double[xs.length];
     * java.util.Arrays.fill(alphas, alpha);
     * assert dirichletLog2Prob(alpha, xs) == dirichletLog2Prob(alphas, xs);
     *

     * For the uniform Dirichlet, the distribution simplifies to
     * the following form:
     *

     * p(xs | Dirichlet(&#945;)) = (1/Z(&#945;)) &#928;_{i<k} xs[i]^(&#945;-1)
     *
     * where
     *
     * Z(&#945;) = &#915;(&#945;)^k / &#915;(k * &#945;)
     *

     * Warning: The probability distribution must be proper
     * in the sense of having values between 0 and 1 inclusive and
     * summing to 1.0. This property is not checked by this method.
     *
     * @param alpha Dirichlet concentration parameter to use for each
     * dimension.
     * @param xs The distribution whose probability is returned.
     * @return The log (base 2) probability of the specified
     * distribution under the uniform Dirichlet distribution with
     * concentration parameters equal to alpha.
     * @throws IllegalArgumentException If the values in the distribution
     * are not between 0 and 1 inclusive or if the concentration
     * parameter is not positive and finite.
     */
    public static double dirichletLog2Prob(double alpha, double[] xs) {
        verifyAlpha(alpha);
        verifyDistro(xs);
        int k = xs.length;
        // normalizing term
        double result = Mathematics.log2Gamma(k * alpha)
            - k * Mathematics.log2Gamma(alpha);
        double alphaMinus1 = alpha - 1;
        for (int i = 0; i < k; ++i)
            result += alphaMinus1 * Mathematics.log2(xs[i]);
        return result;
    }
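
    // Illustrative usage sketch: the log (base 2) density of the uniform
    // distribution {1/3, 1/3, 1/3} under a symmetric Dirichlet prior.
    private static void exampleDirichletLog2Prob() {
        double[] xs = { 1.0/3.0, 1.0/3.0, 1.0/3.0 };
        System.out.println("log2 p = " + dirichletLog2Prob(2.0, xs));
    }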

    /**
     * Returns the log (base 2) probability of the specified
     * discrete distribution given the specified Dirichlet
     * distribution concentration parameters. A Dirichlet
     * distribution is a distribution over k-dimensional
     * discrete distributions.
     *
     * The Dirichlet is widely used because it is the conjugate
     * prior for multinomials in a Bayesian setting, and thus may
     * be used to specify a convenient distribution over discrete
     * distributions.
     *
     * A Dirichlet distribution is specified by a dimensionality
     * k and a concentration parameter &#945;[i] > 0
     * for each i < k. The probability of the distribution
     * xs in a Dirichlet distribution of dimension
     * k and concentration parameters &#945; is
     * given (up to a normalizing constant) by:
     *

     * p(xs | Dirichlet(&#945;)) &#8733; &#928;_{i<k} xs[i]^(&#945;[i]-1)
     *
     * The full distribution is:
     *
     * p(xs | Dirichlet(&#945;)) = (1/Z(&#945;)) * &#928;_{i<k} xs[i]^(&#945;[i]-1)
     *
     * where the normalizing constant is given by:
     *
     * Z(&#945;) = &#928;_{i<k} &#915;(&#945;[i]) / &#915;(&#931;_{i<k} &#945;[i])
     *
     * Warning: The probability distribution must be proper
     * in the sense of having values between 0 and 1 inclusive and
     * summing to 1.0. This property is not checked by this method.
     *
     * @param alphas The concentration parameters for the Dirichlet.
     * @param xs The outcome distribution.
     * @return The probability of the outcome distribution given the
     * Dirichlet concentration parameters.
     * @throws IllegalArgumentException If the Dirichlet parameters and
     * distribution are not arrays of the same length, if the distribution
     * values in xs are not between 0 and 1 inclusive, or if any of
     * the concentration parameters is not positive and finite.
     */
    public static double dirichletLog2Prob(double[] alphas, double[] xs) {
        if (alphas.length != xs.length) {
            String msg = "Dirichlet prior alphas and distribution xs"
                + " must be the same length."
                + " Found alphas.length=" + alphas.length
                + " xs.length=" + xs.length;
            throw new IllegalArgumentException(msg);
        }
        for (int i = 0; i < alphas.length; ++i)
            verifyAlpha(alphas[i]);
        verifyDistro(xs);
        int k = xs.length;
        double result = 0.0;
        double alphaSum = 0.0;
        for (int i = 0; i < alphas.length; ++i) {
            alphaSum += alphas[i];
            result -= Mathematics.log2Gamma(alphas[i]);
        }
        result += Mathematics.log2Gamma(alphaSum);
        for (int i = 0; i < k; ++i)
            result += (alphas[i] - 1) * Mathematics.log2(xs[i]);
        return result;
    }

    static void verifyAlpha(double alpha) {
        if (Double.isNaN(alpha) || Double.isInfinite(alpha) || alpha <= 0.0) {
            String msg = "Concentration parameter must be positive and finite."
                + " Found alpha=" + alpha;
            throw new IllegalArgumentException(msg);
        }
    }

    static void verifyDistro(double[] xs) {
        for (int i = 0; i < xs.length; ++i) {
            if (xs[i] < 0.0 || xs[i] > 1.0
                || Double.isNaN(xs[i]) || Double.isInfinite(xs[i])) {
                String msg = "All xs must be between 0.0 and 1.0 inclusive."
                    + " Found xs[" + i + "]=" + xs[i];
                throw new IllegalArgumentException(msg);
            }
        }
    }

    // equivalent to sumSquareDiffs(xs, xs, mean, mean)
    static double sumSquareDiffs(double[] xs, double mean) {
        double sum = 0.0;
        for (int i = 0; i < xs.length; ++i) {
            double diff = xs[i] - mean;
            sum += diff * diff;
        }
        return sum;
    }

    static double sumSquareDiffs(double[] xs, double[] ys,
                                 double meanXs, double meanYs) {
        double sum = 0.0;
        for (int i = 0; i < xs.length; ++i)
            sum += (xs[i] - meanXs) * (ys[i] - meanYs);
        return sum;
    }

    static double variance(double[] xs, double mean) {
        return sumSquareDiffs(xs,mean) / (double) xs.length;
    }

    static void assertNonNegative(String variableName, double value) {
        if (Double.isInfinite(value) || Double.isNaN(value) || value < 0.0) {
            String msg = "Require finite non-negative value."
                + " Found " + variableName + "=" + value;
            throw new IllegalArgumentException(msg);
        }
    }

    private static double csTerm(double found, double expected) {
        double diff = found - expected;
        return diff * diff / expected;
    }

}