package edu.illinois.dais.ttr; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; /** * The Statistics class provides static utility methods * for statistical computations. * * @author Bob Carpenter * @since LingPipe2.0 */ public class Statistics { // don't allow instances private Statistics() { /* do nothing */ } /** * Returns the Kullback-Leibler divergence of the second specified * Dirichlet distribution relative to the first. The Dirichlet * distributions are specified by their distribution and concentration, * which are folded into a single count argument. * *

The KL divergence between two Dirichlet distributions with * parameters xs and ys of dimensionality * n is: * *

     * D_{_KL}(Dirichlet(xs) || Dirichlet(ys))
     * = ∫  p(θ|xs) log ( p(θ|xs) / p(θ|ys) ) dθ
     * = log Γ(Σ_{_{i < n}} xs[i])
     *   - log Γ(Σ_{_{i < n}} ys[i])
     *   - Σ_{_{i < n}} log Γ(xs[i])
     *   + Σ_{_{i < n}} log Γ(ys[i])
     *   + Σ_{_{i < n}} (xs[i] - ys[i]) * (Ψ(xs[i]) - Ψ(Σ_{_{k < n}} xs[i]))

* * where Γ is the gamma function (see {@link * com.aliasi.util.Mathematics#log2Gamma(double)}), and where * Ψ is the digamma function defined in {@link * com.aliasi.util.Mathematics#digamma(double)}. * *

This method in keeping with other information-theoretic * methods returns the answer in bits (base 2) rather than nats (base e). * The return is rescaled from the natural-log based divergence: * *

     * klDivergenceDirichlet(xs,ys)
     * = D_{_KL}(Dirichlet(xs) || Dirichlet(ys)) / log_₂ e

* *

Further descriptions of the computation of KL-divergence between * Dirichlets may be found in: * *

W.D. Penny. 2001. Kullback-Liebler * Divergences of Normal, Gamma, Dirichlet and Wishart * Densities. Technical report, Wellcome Department of * Cognitive Neurology, 2001.
*
Blei, Franks, Jordan and Mian. 2006. Statistical * modeling of biomedical corpora. BMC Bioinformatics * 7:250. (Note: This paper has sign errors; see Penny for * the correct version.)

* * @param xs Dirichlet parameter for the first distribution. * @param ys Dirichlet parameter for the second distribution. * @return The KL-divergence of the second Dirichlet distribution * relative to the first. */ public static double klDivergenceDirichlet(double[] xs, double[] ys) { verifyDivergenceDirichletArgs(xs,ys); // verifyDivergenceArgs(xs,ys); double sumXs = sum(xs); double sumYs = sum(ys); double divergence = (logGamma(sumXs) - logGamma(sumYs)); double digammaSumXs = Mathematics.digamma(sumXs); for (int i = 0; i < xs.length; ++i) { divergence += (logGamma(ys[i]) - logGamma(xs[i])) + (xs[i] - ys[i]) * (Mathematics.digamma(xs[i]) - digammaSumXs); } return divergence; } static void verifyDivergenceDirichletArgs(double[] xs, double[] ys) { if (xs.length != ys.length) { String msg = "Parameter arrays must be the same length." + " Found xs.length=" + xs.length + " ys.length=" + ys.length; throw new IllegalArgumentException(msg); } for (int i = 0; i < xs.length; ++i) { if (xs[i] <= 0.0 || Double.isInfinite(xs[i]) || Double.isNaN(xs[i])) { String msg = "All parameters must be positive and finite." + " Found xs[" + i + "]=" + xs[i]; throw new IllegalArgumentException(msg); } } for (int i = 0; i < ys.length; ++i) { if (ys[i] <= 0.0 || Double.isInfinite(ys[i]) || Double.isNaN(ys[i])) { String msg = "All parameters must be positive and finite." + " Found ys[" + i + "]=" + ys[i]; throw new IllegalArgumentException(msg); } } } /** * Returns the symmetric version of the Kullback-Leibler divergence * between the Dirichlet distributions determined by the specified * parameters. * *

The symmetrized KL divergence for Dirichlets is just the * average of both relative divergences: * *

     * D_{_SKL}(Dirichlet(xs), Dirichlet(ys))
     * = (D_{_KL}(Dirichlet(xs) || Dirichlet(ys)) + D_{_KL}(Dirichlet(ys) || Dirichlet(xs))) / 2
     *

* *

See the method documentation for {@link * #klDivergenceDirichlet(double[],double[])} for a definition of * the relative divergence for Dirichlet distributions. * * @param xs Dirichlet parameter for the first distribution. * @param ys Dirichlet parameter for the second distribution. * @return The symmetrized KL-divergence of the distributions. */ public static double symmetrizedKlDivergenceDirichlet(double[] xs, double[] ys) { return (klDivergenceDirichlet(xs,ys) + klDivergenceDirichlet(ys,xs)) / 2.0; } static double logGamma(double x) { return Mathematics.log2Gamma(x) / Mathematics.log2(java.lang.Math.E); } public static double sum(double[] xs) { double sum = 0.0; for (int i = 0; i < xs.length; ++i) sum += xs[i]; return sum; } /** * Returns the Kullback-Leibler divergence of the second * specified multinomial relative to the first. * *

The K-L divergence of a multinomial q relative * to a multinomial p, both with n * outcomes, is: * *

     * D_{_KL}(p||q)
     * = Σ_{_{i < n}}  p(i) log₂ (p(i) / q(i))

* * The value is guaranteed to be non-negative, and will be 0.0 * only if the two distributions are identicial. If any outcome * has zero probability in q and non-zero probability * in p, the result is infinite. * *

KL divergence is not symmetric. That is, there are * p and q such that * D_{_KL}(p||q) != * D_{_KL}(q||p). See {@link * #symmetrizedKlDivergence(double[],double[])} and {@link * #jsDivergence(double[],double[])} for symmetric variants. * *

KL divergence is equivalent to conditional entropy, although * it is written in the opposite order. If H(p,q) is * the joint entropy of the distributions p and * q, and H(p) is the entropy of * p, then: * *

     * D_{_KL}(p||q) = H(p,q) - H(p)

* * @param p First multinomial distribution. * @param q Second multinomial distribution. * @throws IllegalArgumentException If the distributions are not * the same length or have entries less than zero or greater than * 1. */ public static double klDivergence(double[] p, double[] q) { verifyDivergenceArgs(p,q); double divergence = 0.0; int len = p.length; for (int i = 0; i < len; ++i) { if (p[i] > 0.0 && p[i] != q[i]) divergence += p[i] * Mathematics.log2(p[i] / q[i]); } return divergence; } static void verifyDivergenceArgs(double[] p, double[] q) { if (p.length != q.length) { String msg = "Input distributions must have same length." + " Found p.length=" + p.length + " q.length=" + q.length; throw new IllegalArgumentException(msg); } int len = p.length; for (int i = 0; i < len; ++i) { if (p[i] < 0.0 || p[i] > 1.0 || Double.isNaN(p[i]) || Double.isInfinite(p[i])) { String msg = "p[i] must be between 0.0 and 1.0 inclusive." + " found p[" + i + "]=" + p[i]; throw new IllegalArgumentException(msg); } if (q[i] < 0.0 || q[i] > 1.0 || Double.isNaN(q[i]) || Double.isInfinite(q[i])) { String msg = "q[i] must be between 0.0 and 1.0 inclusive." + " found q[" + i + "] =" + q[i]; throw new IllegalArgumentException(msg); } } } /** * Returns the symmetrized KL-divergence between the specified distributions. * The symmetrization is carried out by averaging their relative * divergences: * *

     * D_{_SKL}(p,q)
     * = ( D_{_KL}(p,q) + D_{_KL}(q,p) ) / 2
     *

* * @param p First multinomial. * @param q Second multinomial. * @return The Symmetrized KL divergence between the multinomials. * @throws IllegalArgumentException If the distributions are not * the same length or have entries less than zero or greater than * 1. */ public static double symmetrizedKlDivergence(double[] p, double[] q) { verifyDivergenceArgs(p,q); return (klDivergence(p,q) + klDivergence(q,p)) / 2.0; } /** * Return the Jenson-Shannon divergence between the specified * multinomial distributions. The JS divergence is defined by * *

     * D_{_JS}(p,q)
     * = ( D_{_KL}(p,m) + D_{_KL}(q,m) ) / 2

* * where m is defined as the balanced linear * interpolation (that is, the average) of p and * q: * *


     * m[i] = (p[i] + q[i]) / 2

* * The JS divergence is non-zero, equal to zero only if p * and q are the same distribution, and symmetric. * * @param p First multinomial. * @param q Second multinomial. * @return The JS divergence between the multinomials. * @throws IllegalArgumentException If the distributions are not * the same length or have entries less than zero or greater than * 1. */ public static double jsDivergence(double[] p, double[] q) { verifyDivergenceArgs(p,q); double[] m = new double[p.length]; for (int i = 0; i < p.length; ++i) m[i] = (p[i] + q[i])/2.0; return (klDivergence(p,m) + klDivergence(q,m)) / 2.0; } /** * Returns a permutation of the integers between 0 and * the specified length minus one. * * @param length Size of permutation to return. * @return Permutation of the specified length. */ public static int[] permutation(int length) { int[] xs = new int[length]; for (int i = 0; i < xs.length; ++i) xs[i] = i; Random random = new Random(); for (int i = xs.length; --i > 0; ) { int pos = random.nextInt(i); int temp = xs[pos]; xs[pos] = xs[i]; xs[i] = temp; } return xs; } /** * Returns Pearson's C_₂ goodness-of-fit * statistic for independence over the specified binary * contingency matrix. Asymptotically, this statistic has a * χ² distribution with one degree of freedom. The * higher the value, the less likely the two outcomes are * independent. Note that this method is just a special case of * the general chi-squared independence test: * *

     * chiSquaredIndependence(both,oneOnly,twoOnly,neither)
     * = chiSquaredIndependence(new double[][] { {both, oneOnly},
     *                                           {twoOnly, neither} });
     *

* * The specified values make up the following contingency matrix * for boolean outcomes labeled One and * Two: * *

* * * * *
+Two -Two
+One both oneOnly
-One twoOnly neither
*

* *

The value both is the count of events where both * outcomes occurred, oneOnly for where only the * first outcome occurred, twoOnly for where only * the second outcome occurred, and neither for * when neither occurred. Let totalCount be * the sum of all of the cells in the matrix. * *

From the contingency matrix, marginal probabilities for * the two outcomes may be computed: * *

* P(One) = (both + oneOnly) / totalCount * * P(Two) = (both + twoOnly) / totalCount *

* * If these probabilities are independent, the expected values of * the matrix cells are: * *

* E(both)= totalCount * P(One) * P(Two) * * E(oneOnly) = totalCount * P(One) * (1-P(Two)) * * E(twoOnly) = totalCount * (1-P(One)) * P(Two) * * E(neither) = totalCount * (1-P(One)) * (1-P(Two)) *

* * These are used to derive the independence test statistic, which * is the square differences between observed and expected values * under the independence assumption, normalized by the expected * values: * *

* C_₂ * = (both - E(both))^² / E(both) * * + (oneOnly - E(oneOnly))^² / E(oneOnly) * * + (twoOnly - E(twoOnly))^² / E(twoOnly) * * + (neither - E(neither))^² / E(neither) *

* * Unlike the higher dimensional case, this statistic applies as a * hypothesis test only in the case when all expected values are * at least 10. * @param both Count of samples of both outcomes. * @param oneOnly Count of samples with the first and not the * second outcome. * @param twoOnly Count of samples with the second and not the * first outcome. * @param neither Count of samples with with neither outcome. * @throws IllegalArgumentException If any of the arguments are not * non-negative finite numbers. * @return Pearson's C_₂ goodness-of-fit * statistic for independence over the specified sample counts. */ public static double chiSquaredIndependence(double both, double oneOnly, double twoOnly, double neither) { assertNonNegative("both",both); assertNonNegative("oneOnly",oneOnly); assertNonNegative("twoOnly",twoOnly); assertNonNegative("neither",neither); double n = both + oneOnly + twoOnly + neither; double p1 = (both + oneOnly) / n; double p2 = (both + twoOnly) / n; double eBoth = n * p1 * p2; double eOneOnly = n * p1 * (1.0 - p2); double eTwoOnly = n * (1.0 - p1) * p2; double eNeither = n * (1.0 - p1) * (1.0 - p2); return csTerm(both,eBoth) + csTerm(oneOnly,eOneOnly) + csTerm(twoOnly,eTwoOnly) + csTerm(neither,eNeither); } /** * Returns a two-element array of lineary regression coefficients * for the specified x and y values. The coefficients returned, * { β0, β1 }, define a linear function: * *

* f(x) = β1 * x + β0 *

* * The coefficients returned produce the linear function f(x) * with the smallest squared error: * *

* sqErr(f,xs,ys) = * Σ_{_i} * (f(xs[i]) - ys[i])² *

* * where all sums are for 0 << i < xs.length. * * The funciton requires only a single pass through the two * arrays, with β0 and β1 * given by: * *

     * β1 = n * Σ_i x[i] * y[i]  -  (Σ_i x[i])(Σ_i y[i])
     *      ------------------------------------------
     *          n * Σ_i x[i]*x[i]  -  (Σ_i x[i])²
     *

* *

     * β0 = Σ_i y[i] - β1 Σ_i x[i]
     *      ---------------------
     *              n
     *

* * where n = xs.length = ys.length. * * @param xs Array of x values. * @param ys Parallel array of y values. * @return Pair of regression coefficients. * @throws IllegalArgumentException If the arrays are of length * less than 2, or if the arrays are not of the same length. */ public static double[] linearRegression(double[] xs, double[] ys) { if (xs.length != ys.length) { String msg = "Require parallel arrays of x and y values." + " Found xs.length=" + xs.length + " ys.length=" + ys.length; throw new IllegalArgumentException(msg); } if (xs.length < 2) { String msg = "Require arrays of length >= 2." + " Found xs.length=" + xs.length; throw new IllegalArgumentException(msg); } double n = xs.length; double xSum = 0.0; double ySum = 0.0; double xySum = 0.0; double xxSum = 0.0; for (int i = 0; i < xs.length; ++i) { double x = xs[i]; double y = ys[i]; xSum += x; ySum += y; xxSum += x * x; xySum += x * y; } double denominator = n * xxSum - xSum * xSum; if (denominator == 0.0) { String msg = "Ill formed input. Denominator for beta1 is zero." + " Most likely cause is fewer than 2 distinct inputs."; throw new IllegalArgumentException(msg); } double beta1 = (n * xySum - xSum * ySum) / denominator; double beta0 = (ySum - beta1 * xSum) / n; return new double[] { beta0, beta1 }; } /** * Returns a two-element array of logistic regression coefficients * for the specified x and y values. The coefficients returned, * { β0, β1 }, define the logistic function: * *

     *                L
     * f(x) =  ---------------
     *         1 + e ^{^{β1 * x + β0}}
     *

* * with the minimum squared error. See {@link * #linearRegression(double[],double[])} for a definition of * squared error. This function takes real values in the the open * interval (0, L). * *

Logistic regression coefficients are computed using * linear regression, after transforming the y values. This * is possible because of the following linear relation: * *

     * log ((L - y) / y) = β1 * x + β0
     *

* * @param xs Array of x values. * @param ys Array of y values. * @param maxValue Maximum value of function. * @return Binary array of logistic regression coordinates. * @throws IllegalArgumentException If the maximum value is not * positive and finite, if the arrays are of length less than 2, * or if the arrays are not of the same length. */ public static double[] logisticRegression(double[] xs, double[] ys, double maxValue) { if (maxValue <= 0.0 || Double.isInfinite(maxValue) || Double.isNaN(maxValue)) { String msg = "Require finite max value > 0." + " Found maxValue=" + maxValue; throw new IllegalArgumentException(msg); } double[] logisticYs = new double[ys.length]; for (int i = 0; i < ys.length; ++i) logisticYs[i] = java.lang.Math.log((maxValue - ys[i]) / ys[i]); return linearRegression(xs,logisticYs); } /** * Returns the value of Pearson's C_₂ * goodness of fit statistic for independence over the specified * contingency matrix. Asymptotically, this statistic has a * χ² distribution with * (numRows-1)*(numCols-1) degrees of freedom. The * higher the value, the less likely the two outcomes are * independent. * * Pearson's C_₂ statistic is defined as follows: * *

* C_₂ * = Σ_{_i} * Σ_{_j} * (matrix[i][j] - expected(i,j,matrix))² / expectedCount(i,j,matrix) *

* * where the expected count is the total count times the max * likelihood estimates of row i probability times * column j probability: * *

* expectedCount(i,j,matrix) * * = totalCount(matrix) * * * rowCount(i,matrix)/totalCount(matrix) * * * colCount(j,matrix)/totalCount(matrix) * * = rowCount(i,matrix) * colCount(j,matrix) / totalCount(matrix) *

* * where * *

* rowCount(i,matrix) * = Σ_{_{0<=j<=numCols}} * matrix[i][j] * * * colCount(j,matrix) * = Σ_{_{0<=i<=numRows}} * matrix[i][j] * * totalCount(matrix) * = Σ_{_{0<=i<=numRows}} * = Σ_{_{0<=j<=numCols}} * matrix[i][j] *

* *

The χ² test is a large sample test and is only * valid if all of the expected counts are at least 5. This restriction * is often ignored for ranking purposes. * * @param contingencyMatrix The specified contingency matrix. * @return Pearson's C_₂ statistic for the independence * testing over the contingency matrix. * @throws Illegal argument exception if the matrix is not rectangular * or if all values are not non-negative finite numbers. */ public static double chiSquaredIndependence(double[][] contingencyMatrix) { int numRows = contingencyMatrix.length; if (numRows < 2) { String msg = "Require at least two rows." + " Found numRows=" + numRows; throw new IllegalArgumentException(msg); } int numCols = contingencyMatrix[0].length; if (numCols < 2) { String msg = "Require at least two cols." + " Found numCols=" + numCols; throw new IllegalArgumentException(msg); } double[] rowSums = new double[numRows]; Arrays.fill(rowSums,0.0); double[] colSums = new double[numCols]; Arrays.fill(colSums,0.0); double totalCount = 0.0; for (int i = 0; i < numRows; ++i) { if (contingencyMatrix[i].length != numCols) { String msg = "Matrix must be rectangular." + "Row 0 length=" + numCols + "Row " + i + " length=" + contingencyMatrix[i].length; throw new IllegalArgumentException(msg); } for (int j = 0; j < numCols; ++j) { double val = contingencyMatrix[i][j]; if (Double.isInfinite(val) || val < 0.0 || Double.isNaN(val)) { String msg = "Values must be finite non-negative." + " Found matrix[" + i + "][" + j + "]=" + val; throw new IllegalArgumentException(msg); } rowSums[i] += val; colSums[j] += val; totalCount += val; } } double result = 0.0; for (int i = 0; i < numRows; ++i) for (int j = 0; j < numCols; ++j) result += csTerm(contingencyMatrix[i][j], rowSums[i] * colSums[j] / totalCount); return result; } /** * Return an array of probabilities resulting from normalizing the * specified probability ratios. The resulting array of * probabilities is the same length as the input ratio array and * each probability is simply the input array's value divided by * the sum of the ratios. * *

Warning: This method is implemented by summing the * probability ratios and then dividing each element by the sum. * Because of the limited precision of double-based * arithmetic, if the largest ratio is much larger than the next * largest ratio, then the largest normalized probability will be * one and all others will be zero. Java double values follow the * IEEE 754 arithmetic standard and thus use 52 bits for their * mantissas. Thus only ratios within * 2^⁵²~10^¹⁶ of * the maximum ratio will be non-zero. * * @param probabilityRatios Ratios of probabilities. * @return Probabilities resulting from normalizing the ratios. * @throws IllegalArgumentException If the input contains a value * that is not a finite non-negative number, or if the input does * not contain at least one non-zero entry. */ public static double[] normalize(double[] probabilityRatios) { for (int i = 0; i < probabilityRatios.length; ++i) { if (probabilityRatios[i] < 0.0 || Double.isInfinite(probabilityRatios[i]) || Double.isNaN(probabilityRatios[i])) { String msg = "Probabilities must be finite non-negative." + " Found probabilityRatios[" + i + "]=" + probabilityRatios[i]; throw new IllegalArgumentException(msg); } } double sum = Mathematics.sum(probabilityRatios); if (sum <= 0.0) { String msg = "Ratios must sum to number greater than zero." + " Found sum=" + sum; throw new IllegalArgumentException(msg); } double[] result = new double[probabilityRatios.length]; for (int i = 0; i < probabilityRatios.length; ++i) result[i] = probabilityRatios[i]/sum; return result; } /** * Returns the value of the kappa statistic for the specified * observed and expected probabilities. The kappa statistic * provides a kind of adjustment for the exptected (random) * difficulty of a problem. It is defined by: * *

* kappa(p,e) = (p - e) / (1 - e) *


     *
     * The most typical use for kappa is in evaluating
     * classification problems of a machine versus a gold standard or
     * between two human annotators to asses inter-annotator
     * agreement.
     *
     * @param observedProb Observed probability.
     * @param expectedProb Expected probability.
     * @return The value of the kappa statistic for the specified
     * probability 8 and expected probability.
     */
    public static double kappa(double observedProb, double expectedProb) {
        return (observedProb - expectedProb)/(1 - expectedProb);
    }

    /**
     * Returns the mean of the specified array of input values.  The mean
     * of an array is defined by:
     *
     * 

     *   mean(xs)
     *   = Σ_{_{i < xs.length}}
     *     xs[i] / xs.length
     * 
     *
     * If the array is of length zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @return Mean of array of values.
     */
    public static double mean(double[] xs, int len) {
        return Mathematics.sum(xs) / (double) len;
    }
    
    
    /**
     * Returns the mean of the specified array of input values.  The mean
     * of an array is defined by:
     *
     * 
     *   mean(xs)
     *   = Σ_{_{i < xs.length}}
     *     xs[i] / xs.length
     * 
     *
     * If the array is of length zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @return Mean of array of values.
     */
    public static double mean(double[] xs) {
        return mean(xs, xs.length);
    }
    
    public static double mean(ArrayList m) {
    	return Mathematics.sum(m) / (double) m.size();
    }
        
    public static double median(double[] m) {
    	Arrays.sort(m);
        int middle = m.length/2;  // subscript of middle element
        if (m.length%2 == 1) {
            // Odd number of elements -- return the middle one.
            return m[middle];
        } else {
           // Even number -- return average of middle two
           // Must cast the numbers to double before dividing.
           return (m[middle-1] + m[middle]) / 2.0;
        }
    }

    /**
     * Returns the variance of the specified array of input values.
     * The variance of an array of values is the mean of the
     * squared differences between the values and the mean:
     *
     * 
     *  variance(xs)
     *   = Σ_{_{i < xs.length}}
     *     (mean(xs) - xs[i])^² / xs.length
     * 
     *
     * If the array is of length zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @return Variance of array of values.
     */
    public static double variance(double[] xs) {
		return variance(xs, mean(xs));
	}


    /**
	 * Returns the standard deviation of the specified array of input values.
	 * The standard deviation is just the square root of the variance:
	 * 
	 * 
     * standardDeviation(xs) = variance(xs)^{^(1/2)}
     * 
	 * 
	 * If the array is of length zero, the result is {@link Double#NaN}.
	 * 
	 * @param xs
	 *            Array of values.
	 * @return Standard deviation of array of values.
	 */
    public static double standardDeviation(double[] xs) {
        return java.lang.Math.sqrt(variance(xs));
    }
    
    /**
     * Returns the standard deviation of the specified array of input
     * values. The standard deviation is just the square root of the
     * variance:
     *
     * 
     * standardDeviation(xs) = variance(xs)^{^(1/2)}
     * 
     *
     * If the array is of length zero, the result is {@link Double#NaN}.
     *
     * @param xs Array of values.
     * @return Standard deviation of array of values.
     */
    public static double standardDeviation(List xs) {
    	double[] x = new double[xs.size()];
    	for(int i=0; ir²

, measures the * square error between a best linear fit between the two arrays * of data. Rescaling either array by a positive constant will * not affect the result. * *

The square root r of the correlation * coefficient r² is the variance * in the second array explained by a linear relation with the * the first array. * *

The definition of the correlation coefficient is: * *

* correlation(xs,ys)^² * = sumSqDiff(xs,ys)^² * / (sumSqDiff(xs,xs) * sumSqDiff(xs,xs)) *

* * where * *

* sumSqDiff(xs,ys) * = Σ_{_i<xs.length} * (xs[i] - mean(xs)) * (ys[i] - mean(ys)) *

* * and thus the terms in the denominator above reduce using: * *

* sumSqDiffs(xs,xs) * = Σ_{_i<xs.length} * (xs[i] - mean(xs))^² *

* *

See the following for more details: * *

Eric W. Weisstein. "Correlation Coefficient." From * MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/CorrelationCoefficient.html * *
Wikipedia. Pearson * product-moment correlation coefficient. * *

* * @param xs First array of values. * @param ys Second array of values. * @return The correlation coefficient of the two arrays. * @throws IllegalArgumentException If the arrays are not the same * length. */ public static double correlation(double[] xs, double[] ys) { if (xs.length != ys.length) { String msg = "xs and ys must be the same length." + " Found xs.length=" + xs.length + " ys.length=" + ys.length; throw new IllegalArgumentException(msg); } double meanXs = mean(xs); double meanYs = mean(ys); double ssXX = sumSquareDiffs(xs,meanXs); double ssYY = sumSquareDiffs(ys,meanYs); double ssXY = sumSquareDiffs(xs,ys,meanXs,meanYs); return java.lang.Math.sqrt((ssXY*ssXY) / (ssXX * ssYY)); } /** * Returns a sample from the discrete distribution represented by the * specified cumulative probability ratios, using the specified random * number generator. * *

The cumulative probability ratios represent unnormalized probabilities * of generating the value of their index or less, that is, unnormalized * cumulative probabilities. For instance, consider * the cumulative probability ratios { 0.5, 0.5, 3.9, 10.1}: * *

* * * * * *
Outcome Value Unnormalized Prob Prob
0 0.5 0.5 0.5/10.1
1 0.5 0.0 0.0/10.1
2 3.9 3.4 3.4/10.1
3 10.1 6.2 6.2/10.1

Outcome	Value	Unnormalized Prob	Prob
0	0.5	0.5	0.5/10.1
1	0.5	0.0	0.0/10.1
2	3.9	3.4	3.4/10.1
3	10.1	6.2	6.2/10.1

* * A sample is taken by generating a random number x between 0.0 and * the value of the last element (here 10.1). The value returned is * the index i such that: * *

     * cumulativeProbRatios[i-1] < x <= cumulativeProbRatios[i]

* * The corresponding probabilities given the sample input are * listed in the last column in the table above. * *

Note that if two * elements have the same value, there is no chance of generating * the outcome with the higher index. In the example above, this * corresponds to outcome 1 having probaiblity 0.0 * *

Warning: The cumulative probability ratios are required to meet * two conditions. First, all values must be finite and non-negative. Second, * the values must be non-decreasing, so that * cumulativeProbRatios[i] <= cumulativeProbRatios[i+1]. * If either of these conditions is not met, the result is undefined. * * @param cumulativeProbRatios Cumulative probability for outcome less than * or equal to index. * @param random Random number generator for sampling. * @return A sample from the specified distribution. */ public static int sample(double[] cumulativeProbRatios, Random random) { int low = 0; int high = cumulativeProbRatios.length - 1; double x = random.nextDouble() * cumulativeProbRatios[high]; while (low < high) { int mid = (high + low)/2; if (x > cumulativeProbRatios[mid]) { low = mid+1; } else if (high == mid) { return (x > cumulativeProbRatios[low]) ? mid : low; } else { high = mid; } } return low; } /** * Returns the log (base 2) of the probability of the specified * discrete distribution given the specified uniform Dirichlet * with concentration parameters equal to the specified value. * *

See {@link #dirichletLog2Prob(double[],double[])} for * more information on the Dirichlet distribution. This method * returns a result equivalent to the following (though is more * efficiently implemented): * *

     * double[] alphas = new double[xs.length];
     * java.util.Arrays.fill(alphas,alpha);
     * assert(dirichletLog2Prob(alpha,xs) == dirichletLog2Prob(alphas,xs));

* *

For the uniform Dirichlet, the distribution simplifies to * the following form: * *

     * p(xs | Dirichlet(α)) = (1/Z(α)) Π_{_{i < k}} xs[i]^α-1

* * where * *

     * Z(α) = Γ(α)^k / Γ(k * α)

* *

Warning: The probability distribution must be proper * in the sense of having values between 0 and 1 inclusive and * summing to 1.0. This property is not checked by this method. * * @param alpha Dirichlet concentration parameter to use for each * dimension. * @param xs The distribution whose probability is returned. * @return The log (base 2) probability of the specified * distribution in the uniform Dirichlet distribution with concentration * parameters equal to alpha. * @throws IllegalArgumentException If the values in the distribution * are not between 0 and 1 inclusive or if the concentration parameter is * not positive and finite. */ public static double dirichletLog2Prob(double alpha, double[] xs) { verifyAlpha(alpha); verifyDistro(xs); int k = xs.length; // normalizing term double result = Mathematics.log2Gamma(k * alpha) - k * Mathematics.log2Gamma(alpha); double alphaMinus1 = alpha - 1; for (int i = 0; i < k; ++i) result += alphaMinus1 * Mathematics.log2(xs[i]); return result; } /** * Returns the log (base 2) of the probability of the specified * discrete distribution given the specified Dirichlet * distribution concentration parameters. A Dirichlet * distribution is a distribution over k-dimensional * discrete distributions. * *

The Dirichlet is widely used because it is the conjugate * prior for multinomials in a Bayesian setting, and thus may * be used to specify a convenient distribution over discrete * distributions.

* *

A Dirichlet distribution is specified by a dimensionality * k and a concentration parameter α[i] > 0 * for each i < k. * The probability of the distribution xs * in a Dirichlet distribution of dimension k * and concentration parameters α is * given (up to a normalizing constant) by: * *

     * p(xs | Dirichlet(α))
     * ∝ Π_{_{i < k}} xs[i]^α[i]-1

* * The full distribution is: * *

     * p(xs | Dirichlet(α))
     * = (1/Z(α)) * Π_{_{i < k}} xs[i]^α[i]-1

* * where the normalizing constant is given by: * *

     * Z(α) = Π_{_{i < k}} Γ(α[i])
     *        / Γ(Σ_{_{i < k}} α[i])

* *

Warning: The probability distribution must be proper * in the sense of having values between 0 and 1 inclusive and * summing to 1.0. This property is not checked by this method. * * @param alphas The concentration parameters for the uniform Dirichlet. * @param xs The outcome distribution * @return The probability of the outcome distribution given the * Dirichlet concentratioin parameter. * @throws IllegalArgumentException If the Dirichlet parameters and * distribution are not arrays of the same length or if the distribution * parameters in xs are not between 0 and 1 inclusive, or if any of * the concentration parameters is not positive and finite. */ public static double dirichletLog2Prob(double[] alphas, double[] xs) { if (alphas.length != xs.length) { String msg = "Dirichlet prior alphas and distribution xs must be the same length." + " Found alphas.length=" + alphas.length + " xs.length=" + xs.length; throw new IllegalArgumentException(msg); } for (int i = 0; i < alphas.length; ++i) verifyAlpha(alphas[i]); verifyDistro(xs); int k = xs.length; double result = 0.0; double alphaSum = 0.0; for (int i = 0; i < alphas.length; ++i) { alphaSum += alphas[i]; result -= Mathematics.log2Gamma(alphas[i]); } result += Mathematics.log2Gamma(alphaSum); for (int i = 0; i < k; ++i) result += (alphas[i] - 1) * Mathematics.log2(xs[i]); return result; } static void verifyAlpha(double alpha) { if (Double.isNaN(alpha) || Double.isInfinite(alpha) || alpha <= 0.0) { String msg = "Concentration parameter must be positive and finite." + " Found alpha=" + alpha; throw new IllegalArgumentException(msg); } } static void verifyDistro(double[] xs) { for (int i = 0; i < xs.length; ++i) { if (xs[i] < 0.0 || xs[i] > 1.0 || Double.isNaN(xs[i]) || Double.isInfinite(xs[i])) { String msg = "All xs must be betwee 0.0 and 1.0 inclusive." + " Found xs[" + i + "]=" + xs[i]; throw new IllegalArgumentException(msg); } } } // = sumSquareDiffs(xs,mean,xs,mean); static double sumSquareDiffs(double[] xs, double mean) { double sum = 0.0; for (int i = 0; i < xs.length; ++i) { double diff = xs[i] - mean; sum += diff * diff; } return sum; } static double sumSquareDiffs(double[] xs, double[] ys, double meanXs, double meanYs) { double sum = 0.0; for (int i = 0; i < xs.length; ++i) sum += (xs[i] - meanXs) * (ys[i] - meanYs); return sum; } static double variance(double[] xs, double mean) { return sumSquareDiffs(xs,mean) / (double) xs.length; } static void assertNonNegative(String variableName, double value) { if (Double.isInfinite(value) || Double.isNaN(value) || value < 0.0) { String msg = "Require finite non-negative value." + " Found " + variableName + " =" + value; throw new IllegalArgumentException(msg); } } private static double csTerm(double found, double expected) { double diff = found - expected; return diff * diff / expected; } }