[Scipy-svn] r4141 - in trunk/scipy/stats: . tests
scipy-svn@scip...
scipy-svn@scip...
Mon Apr 14 14:01:09 CDT 2008
Author: pierregm
Date: 2008-04-14 14:01:01 -0500 (Mon, 14 Apr 2008)
New Revision: 4141
Added:
trunk/scipy/stats/mmorestats.py
trunk/scipy/stats/mstats.py
trunk/scipy/stats/tests/test_mmorestats.py
trunk/scipy/stats/tests/test_mstats.py
Log:
mstats/mmorestats : collection of statistical functions with support of missing values.
Added: trunk/scipy/stats/mmorestats.py
===================================================================
--- trunk/scipy/stats/mmorestats.py 2008-04-13 22:17:05 UTC (rev 4140)
+++ trunk/scipy/stats/mmorestats.py 2008-04-14 19:01:01 UTC (rev 4141)
@@ -0,0 +1,390 @@
+"""
+Additional statistics functions, with support to MA.
+
+:author: Pierre GF Gerard-Marchant
+:contact: pierregm_at_uga_edu
+:date: $Date: 2007-10-29 17:18:13 +0200 (Mon, 29 Oct 2007) $
+:version: $Id: morestats.py 3473 2007-10-29 15:18:13Z jarrod.millman $
+"""
+__author__ = "Pierre GF Gerard-Marchant"
+__docformat__ = "restructuredtext en"
+
+
+__all__ = ['compare_median_ms',
+ 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
+ 'idealfourths',
+ 'median_cihs','mjci','mquantiles_cimj',
+ 'rsh',
+ 'trimmed_mean_ci',]
+
+import numpy as np
+from numpy import bool_, float_, int_, ndarray, array as narray
+
+import numpy.ma as ma
+from numpy.ma import masked, nomask, MaskedArray
+#from numpy.ma.extras import apply_along_axis, dot, median
+
+import scipy.stats.mstats as mstats
+#from numpy.ma.mstats import trim_both, trimmed_stde, mquantiles, stde_median
+
+from scipy.stats.distributions import norm, beta, t, binom
+from scipy.stats.morestats import find_repeats
+
+
+
+#####--------------------------------------------------------------------------
+#---- --- Quantiles ---
+#####--------------------------------------------------------------------------
+def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
+ """Computes quantile estimates with the Harrell-Davis method, where the estimates
+are calculated as a weighted linear combination of order statistics.
+
+Parameters
+----------
+ data: ndarray
+ Data array.
+ prob: sequence
+ Sequence of quantiles to compute.
+ axis : int
+ Axis along which to compute the quantiles. If None, use a flattened array.
+ var : boolean
+ Whether to return the variance of the estimate.
+
+Returns
+-------
+ A (p,) array of quantiles (if ``var`` is False), or a (2,p) array of quantiles
+ and variances (if ``var`` is True), where ``p`` is the number of quantiles.
+
+Notes
+-----
+ The function is restricted to 2D arrays.
+
+ """
+ def _hd_1D(data,prob,var):
+ "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
+ xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
+ # Don't use length here, in case we have a numpy scalar
+ n = xsorted.size
+ #.........
+ hd = np.empty((2,len(prob)), float_)
+ if n < 2:
+ hd.flat = np.nan
+ if var:
+ return hd
+ return hd[0]
+ #.........
+ v = np.arange(n+1) / float(n)
+ betacdf = beta.cdf
+ for (i,p) in enumerate(prob):
+ _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
+ w = _w[1:] - _w[:-1]
+ hd_mean = np.dot(w, xsorted)
+ hd[0,i] = hd_mean
+ #
+ hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
+ #
+ hd[0, prob == 0] = xsorted[0]
+ hd[0, prob == 1] = xsorted[-1]
+ if var:
+ hd[1, prob == 0] = hd[1, prob == 1] = np.nan
+ return hd
+ return hd[0]
+ # Initialization & checks ---------
+ data = ma.array(data, copy=False, dtype=float_)
+ p = np.array(prob, copy=False, ndmin=1)
+ # Computes quantiles along axis (or globally)
+ if (axis is None) or (data.ndim == 1):
+ result = _hd_1D(data, p, var)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
+ #
+ return ma.fix_invalid(result, copy=False)
+
+#..............................................................................
+def hdmedian(data, axis=-1, var=False):
+ """Returns the Harrell-Davis estimate of the median along the given axis.
+
+Parameters
+----------
+ data: ndarray
+ Data array.
+ axis : int
+ Axis along which to compute the quantiles. If None, use a flattened array.
+ var : boolean
+ Whether to return the variance of the estimate.
+
+ """
+ result = hdquantiles(data,[0.5], axis=axis, var=var)
+ return result.squeeze()
+
+
+#..............................................................................
+def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
+ """Computes the standard error of the Harrell-Davis quantile estimates by jackknife.
+
+
+Parameters
+----------
+ data: ndarray
+ Data array.
+ prob: sequence
+ Sequence of quantiles to compute.
+ axis : int
+ Axis along which to compute the quantiles. If None, use a flattened array.
+
+Notes
+-----
+ The function is restricted to 2D arrays.
+
+ """
+ def _hdsd_1D(data,prob):
+ "Computes the std error for 1D arrays."
+ xsorted = np.sort(data.compressed())
+ n = len(xsorted)
+ #.........
+ hdsd = np.empty(len(prob), float_)
+ if n < 2:
+ hdsd.flat = np.nan
+ #.........
+ vv = np.arange(n) / float(n-1)
+ betacdf = beta.cdf
+ #
+ for (i,p) in enumerate(prob):
+ _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
+ w = _w[1:] - _w[:-1]
+ mx_ = np.fromiter([np.dot(w,xsorted[np.r_[range(0,k),
+ range(k+1,n)].astype(int_)])
+ for k in range(n)], dtype=float_)
+ mx_var = np.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
+ hdsd[i] = float(n-1) * np.sqrt(np.diag(mx_var).diagonal() / float(n))
+ return hdsd
+ # Initialization & checks ---------
+ data = ma.array(data, copy=False, dtype=float_)
+ p = np.array(prob, copy=False, ndmin=1)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ result = _hdsd_1D(data.compressed(), p)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
+ #
+ return ma.fix_invalid(result, copy=False).ravel()
+
+
+#####--------------------------------------------------------------------------
+#---- --- Confidence intervals ---
+#####--------------------------------------------------------------------------
+
+def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
+ alpha=0.05, axis=None):
+ """Returns the selected confidence interval of the trimmed mean along the
+given axis.
+
+Parameters
+----------
+ data : sequence
+ Input data. The data is transformed to a masked array
+ proportiontocut : float
+ Proportion of the data to cut from each side of the data .
+ As a result, (2*proportiontocut*n) values are actually trimmed.
+ alpha : float
+ Confidence level of the intervals.
+ inclusive : tuple of boolean
+ If relative==False, tuple indicating whether values exactly equal to the
+ absolute limits are allowed.
+ If relative==True, tuple indicating whether the number of data being masked
+ on each side should be rounded (True) or truncated (False).
+ axis : int
+ Axis along which to cut. If None, uses a flattened version of the input.
+
+ """
+ data = ma.array(data, copy=False)
+ trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
+ tmean = trimmed.mean(axis)
+ tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
+ df = trimmed.count(axis) - 1
+ tppf = t.ppf(1-alpha/2.,df)
+ return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
+
+#..............................................................................
+def mjci(data, prob=[0.25,0.5,0.75], axis=None):
+ """Returns the Maritz-Jarrett estimators of the standard error of selected
+experimental quantiles of the data.
+
+Parameters
+-----------
+ data: ndarray
+ Data array.
+ prob: sequence
+ Sequence of quantiles to compute.
+ axis : int
+ Axis along which to compute the quantiles. If None, use a flattened array.
+
+ """
+ def _mjci_1D(data, p):
+ data = np.sort(data.compressed())
+ n = data.size
+ prob = (np.array(p) * n + 0.5).astype(int_)
+ betacdf = beta.cdf
+ #
+ mj = np.empty(len(prob), float_)
+ x = np.arange(1,n+1, dtype=float_) / n
+ y = x - 1./n
+ for (i,m) in enumerate(prob):
+ (m1,m2) = (m-1, n-m)
+ W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
+ C1 = np.dot(W,data)
+ C2 = np.dot(W,data**2)
+ mj[i] = np.sqrt(C2 - C1**2)
+ return mj
+ #
+ data = ma.array(data, copy=False)
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ p = np.array(prob, copy=False, ndmin=1)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ return _mjci_1D(data, p)
+ else:
+ return ma.apply_along_axis(_mjci_1D, axis, data, p)
+
+#..............................................................................
+def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
+ """Computes the alpha confidence interval for the selected quantiles of the
+data, with Maritz-Jarrett estimators.
+
+Parameters
+----------
+ data: ndarray
+ Data array.
+ prob: sequence
+ Sequence of quantiles to compute.
+ alpha : float
+ Confidence level of the intervals.
+ axis : integer
+ Axis along which to compute the quantiles. If None, use a flattened array.
+ """
+ alpha = min(alpha, 1-alpha)
+ z = norm.ppf(1-alpha/2.)
+ xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
+ smj = mjci(data, prob, axis=axis)
+ return (xq - z * smj, xq + z * smj)
+
+
+#.............................................................................
+def median_cihs(data, alpha=0.05, axis=None):
+ """Computes the alpha-level confidence interval for the median of the data,
+following the Hettmasperger-Sheather method.
+
+Parameters
+----------
+ data : sequence
+ Input data. Masked values are discarded. The input should be 1D only, or
+ axis should be set to None.
+ alpha : float
+ Confidence level of the intervals.
+ axis : integer
+ Axis along which to compute the quantiles. If None, use a flattened array.
+ """
+ def _cihs_1D(data, alpha):
+ data = np.sort(data.compressed())
+ n = len(data)
+ alpha = min(alpha, 1-alpha)
+ k = int(binom._ppf(alpha/2., n, 0.5))
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
+ if gk < 1-alpha:
+ k -= 1
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
+ gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
+ I = (gk - 1 + alpha)/(gk - gkk)
+ lambd = (n-k) * I / float(k + (n-2*k)*I)
+ lims = (lambd*data[k] + (1-lambd)*data[k-1],
+ lambd*data[n-k-1] + (1-lambd)*data[n-k])
+ return lims
+ data = ma.rray(data, copy=False)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ result = _cihs_1D(data.compressed(), alpha)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
+ #
+ return result
+
+#..............................................................................
+def compare_medians_ms(group_1, group_2, axis=None):
+ """Compares the medians from two independent groups along the given axis.
+
+The comparison is performed using the McKean-Schrader estimate of the standard
+error of the medians.
+
+Parameters
+----------
+ group_1 : {sequence}
+ First dataset.
+ group_2 : {sequence}
+ Second dataset.
+ axis : {integer}
+ Axis along which the medians are estimated. If None, the arrays are flattened.
+
+Returns
+-------
+ A (p,) array of comparison values.
+
+ """
+ (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
+ (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
+ mstats.stde_median(group_2, axis=axis))
+ W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
+ return 1 - norm.cdf(W)
+
+
+def idealfourths(data, axis=None):
+ """Returns an estimate of the lower and upper quartiles of the data along
+ the given axis, as computed with the ideal fourths.
+ """
+ def _idf(data):
+ x = data.compressed()
+ n = len(x)
+ if n < 3:
+ return [np.nan,np.nan]
+ (j,h) = divmod(n/4. + 5/12.,1)
+ qlo = (1-h)*x[j-1] + h*x[j]
+ k = n - j
+ qup = (1-h)*x[k] + h*x[k-1]
+ return [qlo, qup]
+ data = ma.sort(data, axis=axis).view(MaskedArray)
+ if (axis is None):
+ return _idf(data)
+ else:
+ return ma.apply_along_axis(_idf, axis, data)
+
+
+def rsh(data, points=None):
+ """Evaluates Rosenblatt's shifted histogram estimators for each point
+on the dataset 'data'.
+
+Parameters
+ data : sequence
+ Input data. Masked values are ignored.
+ points : sequence
+ Sequence of points where to evaluate Rosenblatt shifted histogram.
+ If None, use the data.
+ """
+ data = ma.array(data, copy=False)
+ if points is None:
+ points = data
+ else:
+ points = np.array(points, copy=False, ndmin=1)
+ if data.ndim != 1:
+ raise AttributeError("The input array should be 1D only !")
+ n = data.count()
+ r = idealfourths(data, axis=None)
+ h = 1.2 * (r[-1]-r[0]) / n**(1./5)
+ nhi = (data[:,None] <= points[None,:] + h).sum(0)
+ nlo = (data[:,None] < points[None,:] - h).sum(0)
+ return (nhi-nlo) / (2.*n*h)
+
+
+###############################################################################
+
Added: trunk/scipy/stats/mstats.py
===================================================================
--- trunk/scipy/stats/mstats.py 2008-04-13 22:17:05 UTC (rev 4140)
+++ trunk/scipy/stats/mstats.py 2008-04-14 19:01:01 UTC (rev 4141)
@@ -0,0 +1,1961 @@
+"""
+An extension of scipy.stats.stats to support masked arrays
+
+:author: Pierre GF Gerard-Marchant
+:contact: pierregm_at_uga_edu
+"""
+#TODO : f_value_wilks_lambda looks botched... what are dfnum & dfden for ?
+#TODO : ttest_reel looks botched: what are x1,x2,v1,v2 for ?
+#TODO : reimplement ksonesamp
+
+__author__ = "Pierre GF Gerard-Marchant"
+__docformat__ = "restructuredtext en"
+
+__all__ = ['argstoarray',
+ 'betai',
+ 'chisquare','corrcoef','count_tied_groups','cov',
+ 'describe',
+ 'f_oneway','f_value_wilks_lambda','find_repeats','friedmanchisquare',
+ 'gmean',
+ 'hmean',
+ 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
+ 'ks_twosamp','ks_2samp','kurtosis','kurtosistest',
+ 'linregress',
+ 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
+ 'normaltest',
+ 'obrientransform',
+ 'pearsonr','plotting_positions','pointbiserialr',
+ 'rankdata',
+ 'samplestd','samplevar','scoreatpercentile','sem','std',
+ 'sen_seasonal_slopes','signaltonoise','skew','skewtest','spearmanr',
+ 'stderr',
+ 'theilslopes','threshold','tmax','tmean','tmin','trim','trimboth',
+ 'trimtail','trima','trimr','trimmed_mean','trimmed_std',
+ 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
+ 'ttest_ind','ttest_rel','tvar',
+ 'var','variation',
+ 'winsorize',
+ 'z','zmap','zs'
+ ]
+
+import numpy as np
+from numpy import ndarray
+import numpy.ma as ma
+from numpy.ma import MaskedArray, masked, nomask
+
+import itertools
+import warnings
+
+
+import scipy.stats as stats
+import scipy.special as special
+import scipy.misc as misc
+import scipy.stats.futil as futil
+
+
+genmissingvaldoc = """
+Notes
+-----
+ Missing values are considered pair-wise: if a value is missing in x,
+ the corresponding value in y is masked.
+"""
+#------------------------------------------------------------------------------
+def _chk_asarray(a, axis):
+ if axis is None:
+ a = ma.ravel(a)
+ outaxis = 0
+ else:
+ a = ma.asanyarray(a)
+ outaxis = axis
+ return a, outaxis
+
+def _chk2_asarray(a, b, axis):
+ if axis is None:
+ a = ma.ravel(a)
+ b = ma.ravel(b)
+ outaxis = 0
+ else:
+ a = ma.asanyarray(a)
+ b = ma.asanyarray(b)
+ outaxis = axis
+ return a, b, outaxis
+
+def _chk_size(a,b):
+ a = ma.asanyarray(a)
+ b = ma.asanyarray(b)
+ (na, nb) = (a.size, b.size)
+ if na != nb:
+ raise ValueError("The size of the input array should match!"\
+ " (%s <> %s)" % (na,nb))
+ return (a,b,na)
+
+def argstoarray(*args):
+ """Constructs a 2D array from a sequence of sequences. Sequences are filled
+ with missing values to match the length of the longest sequence.
+
+ Returns
+ -------
+ output : MaskedArray
+ a (mxn) masked array, where m is the number of arguments and n the
+ length of the longest argument.
+ """
+ if len(args) == 1 and not isinstance(args[0], ndarray):
+ output = ma.asarray(args[0])
+ assert(output.ndim == 2, "The input should be 2D!")
+ else:
+ n = len(args)
+ m = max([len(k) for k in args])
+ output = ma.array(np.empty((n,m), dtype=float), mask=True)
+ for (k,v) in enumerate(args):
+ output[k,:len(v)] = v
+ output[np.logical_not(np.isfinite(output._data))] = masked
+ return output
+
+
+
+#####--------------------------------------------------------------------------
+#---- --- Ranking ---
+#####--------------------------------------------------------------------------
+
+def find_repeats(arr):
+ """Find repeats in arr and return a tuple (repeats, repeat_count).
+ Masked values are discarded.
+
+Parameters
+----------
+ arr : sequence
+ Input array. The array is flattened if it is not 1D.
+
+Returns
+-------
+ repeats : ndarray
+ Array of repeated values.
+ counts : ndarray
+ Array of counts.
+
+ """
+ marr = ma.compressed(arr)
+ if not marr.size:
+ return (np.array(0), np.array(0))
+ (v1, v2, n) = futil.dfreps(ma.array(ma.compressed(arr), copy=True))
+ return (v1[:n], v2[:n])
+
+
+def count_tied_groups(x, use_missing=False):
+ """Counts the number of tied values in x, and returns a dictionary
+ (nb of ties: nb of groups).
+
+Parameters
+----------
+ x : sequence
+ Sequence of data on which to counts the ties
+ use_missing : boolean
+ Whether to consider missing values as tied.
+
+Example
+-------
+ >>>z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
+ >>>count_tied_groups(z)
+ >>>{2:1, 3:2}
+ >>># The ties were 0 (3x), 2 (3x) and 3 (2x)
+ >>>z = ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
+ >>>count_tied_groups(z)
+ >>>{2:2, 3:1}
+ >>># The ties were 0 (2x), 2 (3x) and 3 (2x)
+ >>>z[[1,-1]] = masked
+ >>>count_tied_groups(z)
+ >>>{2:2, 3:1}
+ >>># The ties were 2 (3x), 3 (2x) and masked (2x)
+ """
+ nmasked = ma.getmask(x).sum()
+ # We need the copy as find_repeats will overwrite the initial data
+ data = ma.compressed(x).copy()
+ (ties, counts) = find_repeats(data)
+ nties = {}
+ if len(ties):
+ nties = dict(zip(np.unique(counts), itertools.repeat(1)))
+ nties.update(dict(zip(*find_repeats(counts))))
+ if nmasked and use_missing:
+ try:
+ nties[nmasked] += 1
+ except KeyError:
+ nties[nmasked] = 1
+ return nties
+
+
+def rankdata(data, axis=None, use_missing=False):
+ """Returns the rank (also known as order statistics) of each data point
+ along the given axis.
+
+ If some values are tied, their rank is averaged.
+ If some values are masked, their rank is set to 0 if use_missing is False,
+ or set to the average rank of the unmasked values if use_missing is True.
+
+ Parameters
+ ----------
+ data : sequence
+ Input data. The data is transformed to a masked array
+ axis : {None,int} optional
+ Axis along which to perform the ranking.
+ If None, the array is first flattened. An exception is raised if
+ the axis is specified for arrays with a dimension larger than 2
+ use_missing : {boolean} optional
+ Whether the masked values have a rank of 0 (False) or equal to the
+ average rank of the unmasked values (True).
+ """
+ #
+ def _rank1d(data, use_missing=False):
+ n = data.count()
+ rk = np.empty(data.size, dtype=float)
+ idx = data.argsort()
+ rk[idx[:n]] = np.arange(1,n+1)
+ #
+ if use_missing:
+ rk[idx[n:]] = (n+1)/2.
+ else:
+ rk[idx[n:]] = 0
+ #
+ repeats = find_repeats(data.copy())
+ for r in repeats[0]:
+ condition = (data==r).filled(False)
+ rk[condition] = rk[condition].mean()
+ return rk
+ #
+ data = ma.array(data, copy=False)
+ if axis is None:
+ if data.ndim > 1:
+ return _rank1d(data.ravel(), use_missing).reshape(data.shape)
+ else:
+ return _rank1d(data, use_missing)
+ else:
+ return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)
+
+
+#####--------------------------------------------------------------------------
+#---- --- Central tendency ---
+#####--------------------------------------------------------------------------
+
+def gmean(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ log_a = ma.log(a)
+ return ma.exp(log_a.mean(axis=axis))
+gmean.__doc__ = stats.gmean.__doc__
+
+
+def hmean(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ if isinstance(a, MaskedArray):
+ size = a.count(axis)
+ else:
+ size = a.shape[axis]
+ return size / (1.0/a).sum(axis)
+hmean.__doc__ = stats.hmean.__doc__
+
+
+def mode(a, axis=0):
+ def _mode1D(a):
+ (rep,cnt) = find_repeats(a)
+ if not cnt.ndim:
+ return (0,0)
+ elif cnt.size:
+ return (rep[cnt.argmax()], cnt.max())
+ return (a[0], 1)
+ #
+ if axis is None:
+ output = _mode1D(ma.ravel(a))
+ else:
+ output = ma.apply_along_axis(_mode1D, axis, a)
+ return output
+mode.__doc__ = stats.mode.__doc__
+
+
+#####--------------------------------------------------------------------------
+#---- --- Probabilities ---
+#####--------------------------------------------------------------------------
+
+def betai(a, b, x):
+ x = np.asanyarray(x)
+ x = ma.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0
+ return special.betainc(a, b, x)
+betai.__doc__ = stats.betai.__doc__
+
+
+#####--------------------------------------------------------------------------
+#---- --- Correlation ---
+#####--------------------------------------------------------------------------
+
+def msign(x):
+ """Returns the sign of x, or 0 if x is masked."""
+ return ma.filled(np.sign(x), 0)
+
+
+def cov(x, y=None, rowvar=False, bias=False, allow_masked=True):
+ """Estimates the covariance matrix.
+
+Normalization is by (N-1) where N is the number of observations (unbiased
+estimate). If bias is True then normalization is by N.
+
+
+
+Parameters
+----------
+ x : ndarray
+ Input data. If x is a 1D array, returns the variance. If x is a 2D array,
+ returns the covariance matrix.
+ y : {None, ndarray} optional
+ Optional set of variables.
+ rowvar : {False, True} optional
+ If rowvar is true, then each row is a variable with obersvations in columns.
+ If rowvar is False, each column is a variable and the observations are in
+ the rows.
+ bias : {False, True} optional
+ Whether to use a biased (True) or unbiased (False) estimate of the covariance.
+ If bias is True, then the normalization is by N, the number of observations.
+ Otherwise, the normalization is by (N-1).
+ allow_masked : {True, False} optional
+ If True, masked values are propagated pair-wise: if a value is masked in x,
+ the corresponding value is masked in y.
+ If False, raises an exception.
+ """
+ x = ma.asarray(x)
+ if y is None:
+ y = x
+ else:
+ y = ma.asarray(y)
+ common_mask = ma.mask_or(ma.getmask(x), ma.getmask(y))
+ if allow_masked:
+ x.unshare_mask()
+ y.unshare_mask()
+ x._mask = y._mask = common_mask
+ elif common_mask is not nomask:
+ raise ValueError("Cannot process masked data...")
+ n = x.count()
+ #
+ if rowvar:
+ (x, y) = (x.T, y.T)
+ #
+ x -= x.mean(0)
+ y -= y.mean(0)
+ result = np.dot(x.filled(0).T, y.filled(0).conj()).squeeze()
+ if bias:
+ result /= float(n)
+ else:
+ result /= (n-1.)
+ return result
+
+
+def corrcoef(x, y=None, rowvar=False, bias=False, allow_masked=True):
+ """The correlation coefficients formed from 2-d array x, where the
+ rows are the observations, and the columns are variables.
+
+ corrcoef(x,y) where x and y are 1d arrays is the same as
+ corrcoef(transpose([x,y]))
+
+Parameters
+----------
+ x : ndarray
+ Input data. If x is a 1D array, returns the variance. If x is a 2D array,
+ returns the covariance matrix.
+ y : {None, ndarray} optional
+ Optional set of variables.
+ rowvar : {False, True} optional
+ If True, then each row is a variable with obersvations in columns.
+ If False, each column is a variable and the observations are in the rows.
+ bias : {False, True} optional
+ Whether to use a biased (True) or unbiased (False) estimate of the
+ covariance.
+ If True, then the normalization is by N, the number of observations.
+ Otherwise, the normalization is by (N-1).
+ allow_masked : {True, False} optional
+ If True, masked values are propagated pair-wise: if a value is masked in x,
+ the corresponding value is masked in y.
+ If False, raises an exception.
+ """
+ if y is not None:
+ x = ma.column_stack([x,y])
+ y = None
+ c = cov(x, y, rowvar=rowvar, bias=bias, allow_masked=allow_masked)
+ d = ma.diagonal(c)
+ return c/ma.sqrt(ma.multiply.outer(d,d))
+
+
+def pearsonr(x,y):
+ """Calculates a Pearson correlation coefficient and the p-value for testing
+ non-correlation.
+
+ The Pearson correlation coefficient measures the linear relationship
+ between two datasets. Strictly speaking, Pearson's correlation requires
+ that each dataset be normally distributed. Like other correlation
+ coefficients, this one varies between -1 and +1 with 0 implying no
+ correlation. Correlations of -1 or +1 imply an exact linear
+ relationship. Positive correlations imply that as x increases, so does
+ y. Negative correlations imply that as x increases, y decreases.
+
+ The p-value roughly indicates the probability of an uncorrelated system
+ producing datasets that have a Pearson correlation at least as extreme
+ as the one computed from these datasets. The p-values are not entirely
+ reliable but are probably reasonable for datasets larger than 500 or so.
+
+ Parameters
+ ----------
+ x : 1D array
+ y : 1D array the same length as x
+
+ Returns
+ -------
+ (Pearson's correlation coefficient,
+ 2-tailed p-value)
+
+ References
+ ----------
+ http://www.statsoft.com/textbook/glosp.html#Pearson%20Correlation
+ """
+ (x, y, n) = _chk_size(x, y)
+ (x, y) = (x.ravel(), y.ravel())
+ # Get the common mask and the total nb of unmasked elements
+ m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+ n -= m.sum()
+ df = n-2
+ if df < 0:
+ return (masked, masked)
+ #
+ (mx, my) = (x.mean(), y.mean())
+ (xm, ym) = (x-mx, y-my)
+ #
+ r_num = n*(ma.add.reduce(xm*ym))
+ r_den = n*ma.sqrt(ma.dot(xm,xm)*ma.dot(ym,ym))
+ r = (r_num / r_den)
+ # Presumably, if r > 1, then it is only some small artifact of floating
+ # point arithmetic.
+ r = min(r, 1.0)
+ r = max(r, -1.0)
+ df = n-2
+ #
+ t = ma.sqrt(df/((1.0-r)*(1.0+r))) * r
+ if t is masked:
+ prob = 0.
+ else:
+ prob = betai(0.5*df,0.5,df/(df+t*t))
+ return (r,prob)
+
+
+def spearmanr(x, y, use_ties=True):
+ """Calculates a Spearman rank-order correlation coefficient and the p-value
+ to test for non-correlation.
+
+ The Spearman correlation is a nonparametric measure of the linear
+ relationship between two datasets. Unlike the Pearson correlation, the
+ Spearman correlation does not assume that both datasets are normally
+ distributed. Like other correlation coefficients, this one varies
+ between -1 and +1 with 0 implying no correlation. Correlations of -1 or
+ +1 imply an exact linear relationship. Positive correlations imply that
+ as x increases, so does y. Negative correlations imply that as x
+ increases, y decreases.
+
+ Missing values are discarded pair-wise: if a value is missing in x, the
+ corresponding value in y is masked.
+
+ The p-value roughly indicates the probability of an uncorrelated system
+ producing datasets that have a Spearman correlation at least as extreme
+ as the one computed from these datasets. The p-values are not entirely
+ reliable but are probably reasonable for datasets larger than 500 or so.
+
+Parameters
+----------
+ x : 1D array
+ y : 1D array the same length as x
+ The lengths of both arrays must be > 2.
+ use_ties : {True, False} optional
+ Whether the correction for ties should be computed.
+
+Returns
+-------
+ (Spearman correlation coefficient,
+ 2-tailed p-value)
+
+ References
+ ----------
+ [CRCProbStat2000] section 14.7
+ """
+ (x, y, n) = _chk_size(x, y)
+ (x, y) = (x.ravel(), y.ravel())
+ #
+ m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+ n -= m.sum()
+ if m is not nomask:
+ x = ma.array(x, mask=m, copy=True)
+ y = ma.array(y, mask=m, copy=True)
+ df = n-2
+ if df < 0:
+ raise ValueError("The input must have at least 3 entries!")
+ # Gets the ranks and rank differences
+ rankx = rankdata(x)
+ ranky = rankdata(y)
+ dsq = np.add.reduce((rankx-ranky)**2)
+ # Tie correction
+ if use_ties:
+ xties = count_tied_groups(x)
+ yties = count_tied_groups(y)
+ corr_x = np.sum(v*k*(k**2-1) for (k,v) in xties.iteritems())/12.
+ corr_y = np.sum(v*k*(k**2-1) for (k,v) in yties.iteritems())/12.
+ else:
+ corr_x = corr_y = 0
+ denom = n*(n**2 - 1)/6.
+ if corr_x != 0 or corr_y != 0:
+ rho = denom - dsq - corr_x - corr_y
+ rho /= ma.sqrt((denom-2*corr_x)*(denom-2*corr_y))
+ else:
+ rho = 1. - dsq/denom
+ #
+ t = ma.sqrt(ma.divide(df,(rho+1.0)*(1.0-rho))) * rho
+ if t is masked:
+ prob = 0.
+ else:
+ prob = betai(0.5*df,0.5,df/(df+t*t))
+ return rho, prob
+
+
+def kendalltau(x, y, use_ties=True, use_missing=False):
+ """Computes Kendall's rank correlation tau on two variables *x* and *y*.
+
+Parameters
+----------
+ xdata: sequence
+ First data list (for example, time).
+ ydata: sequence
+ Second data list.
+ use_ties: {True, False} optional
+ Whether ties correction should be performed.
+ use_missing: {False, True} optional
+ Whether missing data should be allocated a rank of 0 (False) or the
+ average rank (True)
+ """
+ (x, y, n) = _chk_size(x, y)
+ (x, y) = (x.flatten(), y.flatten())
+ m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+ if m is not nomask:
+ x = ma.array(x, mask=m, copy=True)
+ y = ma.array(y, mask=m, copy=True)
+ n -= m.sum()
+ #
+ rx = ma.masked_equal(rankdata(x, use_missing=use_missing),0)
+ ry = ma.masked_equal(rankdata(y, use_missing=use_missing),0)
+ idx = rx.argsort()
+ (rx, ry) = (rx[idx], ry[idx])
+ C = np.sum((((ry[i+1:]>ry[i])*(rx[i+1:]>rx[i])).filled(0).sum()
+ for i in range(len(ry)-1)))
+ D = np.sum((((ry[i+1:]<ry[i])*(rx[i+1:]>rx[i])).filled(0).sum()
+ for i in range(len(ry)-1)))
+ if use_ties:
+ xties = count_tied_groups(x)
+ yties = count_tied_groups(y)
+ corr_x = np.sum(v*k*(k-1) for (k,v) in xties.iteritems())
+ corr_y = np.sum(v*k*(k-1) for (k,v) in yties.iteritems())
+ denom = ma.sqrt((n*(n-1)-corr_x) * (n*(n-1)-corr_y)) / 2.
+ else:
+ denom = n*(n-1)/2.
+ tau = (C-D) / denom
+ #
+ var_s = n*(n-1)*(2*n+5)
+ if use_ties:
+ var_s -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in xties.iteritems())
+ var_s -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in yties.iteritems())
+ v1 = np.sum(v*k*(k-1) for (k,v) in xties.iteritems()) * \
+ np.sum(v*k*(k-1) for (k,v) in yties.iteritems())
+ v1 /= 2.*n*(n-1)
+ v2 = np.sum(v*k*(k-1)*(k-2) for (k,v) in xties.iteritems()) * \
+ np.sum(v*k*(k-1)*(k-2) for (k,v) in yties.iteritems())
+ v2 /= 9.*n*(n-1)*(n-2)
+ else:
+ v1 = v2 = 0
+ var_s /= 18.
+ var_s += (v1 + v2)
+ z = (C-D)/np.sqrt(var_s)
+ prob = special.erfc(abs(z)/np.sqrt(2))
+ return (tau,prob)
+
+
+def kendalltau_seasonal(x):
+ """Computes a multivariate extension Kendall's rank correlation tau, designed
+ for seasonal data.
+
+Parameters
+----------
+ x: 2D array
+ Array of seasonal data, with seasons in columns.
+ """
+ x = ma.array(x, subok=True, copy=False, ndmin=2)
+ (n,m) = x.shape
+ n_p = x.count(0)
+ #
+ S_szn = np.sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
+ S_tot = S_szn.sum()
+ #
+ n_tot = x.count()
+ ties = count_tied_groups(x.compressed())
+ corr_ties = np.sum(v*k*(k-1) for (k,v) in ties.iteritems())
+ denom_tot = ma.sqrt(n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.
+ #
+ R = rankdata(x, axis=0, use_missing=True)
+ K = ma.empty((m,m), dtype=int)
+ covmat = ma.empty((m,m), dtype=float)
+# cov_jj = ma.empty(m, dtype=float)
+ denom_szn = ma.empty(m, dtype=float)
+ for j in range(m):
+ ties_j = count_tied_groups(x[:,j].compressed())
+ corr_j = np.sum(v*k*(k-1) for (k,v) in ties_j.iteritems())
+ cmb = n_p[j]*(n_p[j]-1)
+ for k in range(j,m,1):
+ K[j,k] = np.sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
+ for i in range(n))
+ covmat[j,k] = (K[j,k] +4*(R[:,j]*R[:,k]).sum() - \
+ n*(n_p[j]+1)*(n_p[k]+1))/3.
+ K[k,j] = K[j,k]
+ covmat[k,j] = covmat[j,k]
+# cov_jj[j] = (nn_p*(2*n_p[j]+5))
+# cov_jj[j] -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in ties_j.iteritems())
+# cov_jj[j] /= 18.
+ denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.
+ var_szn = covmat.diagonal()
+ #
+ z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
+ z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
+ z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())
+ #
+ prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
+ prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
+ prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))
+ #
+ chi2_tot = (z_szn*z_szn).sum()
+ chi2_trd = m * z_szn.mean()**2
+ output = {'seasonal tau': S_szn/denom_szn,
+ 'global tau': S_tot/denom_tot,
+ 'global tau (alt)': S_tot/denom_szn.sum(),
+ 'seasonal p-value': prob_szn,
+ 'global p-value (indep)': prob_tot_ind,
+ 'global p-value (dep)': prob_tot_dep,
+ 'chi2 total': chi2_tot,
+ 'chi2 trend': chi2_trd,
+ }
+ return output
+
+
+def pointbiserialr(x, y):
+ x = ma.fix_invalid(x, copy=True).astype(bool)
+ y = ma.fix_invalid(y, copy=True).astype(float)
+ # Get rid of the missing data ..........
+ m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+ if m is not nomask:
+ unmask = np.logical_not(m)
+ x = x[unmask]
+ y = y[unmask]
+ #
+ n = len(x)
+ # phat is the fraction of x values that are True
+ phat = x.sum() / float(n)
+ y0 = y[~x] # y-values where x is False
+ y1 = y[x] # y-values where x is True
+ y0m = y0.mean()
+ y1m = y1.mean()
+ #
+ rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()
+ #
+ df = n-2
+ t = rpb*ma.sqrt(df/(1.0-rpb**2))
+ prob = betai(0.5*df, 0.5, df/(df+t*t))
+ return rpb, prob
+pointbiserialr.__doc__ = stats.pointbiserialr.__doc__ + genmissingvaldoc
+
+
+def linregress(*args):
+ if len(args) == 1: # more than 1D array?
+ args = ma.array(args[0], copy=True)
+ if len(args) == 2:
+ x = args[0]
+ y = args[1]
+ else:
+ x = args[:,0]
+ y = args[:,1]
+ else:
+ x = ma.array(args[0]).flatten()
+ y = ma.array(args[1]).flatten()
+ m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+ if m is not nomask:
+ x = ma.array(x,mask=m)
+ y = ma.array(y,mask=m)
+ n = len(x)
+ (xmean, ymean) = (x.mean(), y.mean())
+ (xm, ym) = (x-xmean, y-ymean)
+ (Sxx, Syy) = (ma.add.reduce(xm*xm), ma.add.reduce(ym*ym))
+ Sxy = ma.add.reduce(xm*ym)
+ r_den = ma.sqrt(Sxx*Syy)
+ if r_den == 0.0:
+ r = 0.0
+ else:
+ r = Sxy / r_den
+ if (r > 1.0):
+ r = 1.0 # from numerical error
+ #z = 0.5*log((1.0+r+TINY)/(1.0-r+TINY))
+ df = n-2
+ t = r * ma.sqrt(df/(1.0-r*r))
+ prob = betai(0.5*df,0.5,df/(df+t*t))
+ slope = Sxy / Sxx
+ intercept = ymean - slope*xmean
+ sterrest = ma.sqrt(1.-r*r) * y.std()
+ return slope, intercept, r, prob, sterrest, Syy/Sxx
+linregress.__doc__ = stats.linregress.__doc__ + genmissingvaldoc
+
+
+def theilslopes(y, x=None, alpha=0.05):
+ """Computes the Theil slope over the dataset (x,y), as the median of all slopes
+ between paired values.
+
+ Parameters
+ ----------
+ y : sequence
+ Dependent variable.
+ x : {None, sequence} optional
+ Independent variable. If None, use arange(len(y)) instead.
+ alpha : float
+ Confidence degree.
+
+ """
+ y = ma.asarray(y).flatten()
+ y[-1] = masked
+ n = len(y)
+ if x is None:
+ x = ma.arange(len(y), dtype=float)
+ else:
+ x = ma.asarray(x).flatten()
+ if len(x) != n:
+ raise ValueError, "Incompatible lengths ! (%s<>%s)" % (n,len(x))
+ m = ma.mask_or(ma.getmask(x), ma.getmask(y))
+ y._mask = x._mask = m
+ ny = y.count()
+ #
+ slopes = ma.hstack([(y[i+1:]-y[i])/(x[i+1:]-x[i]) for i in range(n-1)])
+ slopes.sort()
+ medslope = ma.median(slopes)
+ medinter = ma.median(y) - medslope*ma.median(x)
+ #
+ if alpha > 0.5:
+ alpha = 1.-alpha
+ z = stats.distributions.norm.ppf(alpha/2.)
+ #
+ (xties, yties) = (count_tied_groups(x), count_tied_groups(y))
+ nt = ny*(ny-1)/2.
+ sigsq = (ny*(ny-1)*(2*ny+5)/18.)
+ sigsq -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in xties.iteritems())
+ sigsq -= np.sum(v*k*(k-1)*(2*k+5) for (k,v) in yties.iteritems())
+ sigma = np.sqrt(sigsq)
+
+ Ru = np.round((nt - z*sigma)/2. + 1)
+ Rl = np.round((nt + z*sigma)/2.)
+ delta = slopes[[Rl,Ru]]
+ return medslope, medinter, delta
+
+
+def sen_seasonal_slopes(x):
+ x = ma.array(x, subok=True, copy=False, ndmin=2)
+ (n,_) = x.shape
+ # Get list of slopes per season
+ szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
+ for i in range(n)])
+ szn_medslopes = ma.median(szn_slopes, axis=0)
+ medslope = ma.median(szn_slopes, axis=None)
+ return szn_medslopes, medslope
+
+
+#####--------------------------------------------------------------------------
+#---- --- Inferential statistics ---
+#####--------------------------------------------------------------------------
+
+def ttest_onesamp(a, popmean):
+ a = ma.asarray(a)
+ x = a.mean(axis=None)
+ v = a.var(axis=None,ddof=1)
+ n = a.count(axis=None)
+ df = n-1
+ svar = ((n-1)*v) / float(df)
+ t = (x-popmean)/ma.sqrt(svar*(1.0/n))
+ prob = betai(0.5*df,0.5,df/(df+t*t))
+ return t,prob
+ttest_onesamp.__doc__ = stats.ttest_1samp.__doc__
+ttest_1samp = ttest_onesamp
+
+
+def ttest_ind(a, b, axis=0):
+ a, b, axis = _chk2_asarray(a, b, axis)
+ (x1, x2) = (a.mean(axis), b.mean(axis))
+ (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
+ (n1, n2) = (a.count(axis), b.count(axis))
+ df = n1+n2-2
+ svar = ((n1-1)*v1+(n2-1)*v2) / float(df)
+ svar == 0
+ t = (x1-x2)/ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # N-D COMPUTATION HERE!!!!!!
+ t = ma.filled(t, 1) # replace NaN t-values with 1.0
+ probs = betai(0.5*df,0.5,float(df)/(df+t*t)).reshape(t.shape)
+ return t, probs.squeeze()
+ttest_ind.__doc__ = stats.ttest_ind.__doc__
+
+
+def ttest_rel(a,b,axis=None):
+ a, b, axis = _chk2_asarray(a, b, axis)
+ if len(a)!=len(b):
+ raise ValueError, 'unequal length arrays'
+ (x1, x2) = (a.mean(axis), b.mean(axis))
+ (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
+ n = a.count(axis)
+ df = (n-1.0)
+ d = (a-b).astype('d')
+ denom = ma.sqrt((n*ma.add.reduce(d*d,axis) - ma.add.reduce(d,axis)**2) /df)
+ #zerodivproblem = denom == 0
+ t = ma.add.reduce(d, axis) / denom
+ t = ma.filled(t, 1)
+ probs = betai(0.5*df,0.5,df/(df+t*t)).reshape(t.shape).squeeze()
+ return t, probs
+ttest_rel.__doc__ = stats.ttest_rel.__doc__
+
+
+def chisquare(f_obs, f_exp=None):
+ f_obs = ma.asarray(f_obs)
+ if f_exp is None:
+ f_exp = ma.array([f_obs.mean(axis=0)] * len(f_obs))
+ f_exp = f_exp.astype(float)
+ chisq = ma.add.reduce((f_obs-f_exp)**2 / f_exp)
+ return chisq, stats.chisqprob(chisq, f_obs.count(0)-1)
+chisquare.__doc__ = stats.chisquare.__doc__
+
+
+def mannwhitneyu(x,y, use_continuity=True):
+ """Computes the Mann-Whitney on samples x and y.
+ Missing values in x and/or y are discarded.
+
+ Parameters
+ ----------
+ x : sequence
+ y : sequence
+ use_continuity : {True, False} optional
+ Whether a continuity correction (1/2.) should be taken into account.
+
+ Returns
+ -------
+ u : float
+ The Mann-Whitney statistics
+ prob : float
+ Approximate p-value assuming a normal distribution.
+
+ """
+ x = ma.asarray(x).compressed().view(ndarray)
+ y = ma.asarray(y).compressed().view(ndarray)
+ ranks = rankdata(np.concatenate([x,y]))
+ (nx, ny) = (len(x), len(y))
+ nt = nx + ny
+ U = ranks[:nx].sum() - nx*(nx+1)/2.
+ U = max(U, nx*ny - U)
+ u = nx*ny - U
+ #
+ mu = (nx*ny)/2.
+ sigsq = (nt**3 - nt)/12.
+ ties = count_tied_groups(ranks)
+ sigsq -= np.sum(v*(k**3-k) for (k,v) in ties.iteritems())/12.
+ sigsq *= nx*ny/float(nt*(nt-1))
+ #
+ if use_continuity:
+ z = (U - 1/2. - mu) / ma.sqrt(sigsq)
+ else:
+ z = (U - mu) / ma.sqrt(sigsq)
+ prob = special.erfc(abs(z)/np.sqrt(2))
+ return (u, prob)
+
+
+def kruskalwallis(*args):
+ output = argstoarray(*args)
+ ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
+ sumrk = ranks.sum(-1)
+ ngrp = ranks.count(-1)
+ ntot = ranks.count()
+# ssbg = (sumrk**2/ranks.count(-1)).sum() - ranks.sum()**2/ntotal
+# H = ssbg / (ntotal*(ntotal+1)/12.)
+ H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
+ # Tie correction
+ ties = count_tied_groups(ranks)
+ T = 1. - np.sum(v*(k**3-k) for (k,v) in ties.iteritems())/float(ntot**3-ntot)
+ if T == 0:
+ raise ValueError, 'All numbers are identical in kruskal'
+ H /= T
+ #
+ df = len(output) - 1
+ prob = stats.chisqprob(H,df)
+ return (H, prob)
+kruskal = kruskalwallis
+kruskalwallis.__doc__ = stats.kruskal.__doc__
+
+
+_kolmog2 = special.kolmogorov
+def _kolmog1(x,n):
+ if x <= 0:
+ return 0
+ if x >= 1:
+ return 1
+ j = np.arange(np.floor(n*(1-x))+1)
+ return 1 - x * np.sum(np.exp(np.log(misc.comb(n,j))
+ + (n-j) * np.log(1-x-j/float(n))
+ + (j-1) * np.log(x+j/float(n))))
+
+
+def ks_twosamp(data1, data2, alternative="two_sided"):
+ """Computes the Kolmogorov-Smirnov test on two samples.
+ Missing values are discarded.
+
+ Parameters
+ ----------
+ data1 : sequence
+ First data set
+ data2 : sequence
+ Second data set
+ alternative : {'two_sided', 'less', 'greater'} optional
+ Indicates the alternative hypothesis.
+
+ Returns
+ -------
+ d : float
+ Value of the Kolmogorov Smirnov test
+ p : float
+ Corresponding p-value.
+
+ """
+ (data1, data2) = (ma.asarray(data1), ma.asarray(data2))
+ (n1, n2) = (data1.count(), data2.count())
+ n = (n1*n2/float(n1+n2))
+ mix = ma.concatenate((data1.compressed(), data2.compressed()))
+ mixsort = mix.argsort(kind='mergesort')
+ csum = np.where(mixsort<n1, 1./n1, -1./n2).cumsum()
+ # Check for ties
+ if len(np.unique(mix)) < (n1+n2):
+ csum = csum[np.r_[np.diff(mix[mixsort]).nonzero()[0],-1]]
+ #
+ alternative = str(alternative).lower()[0]
+ if alternative == 't':
+ d = ma.abs(csum).max()
+ prob = _kolmog2(np.sqrt(n)*d)
+ elif alternative == 'l':
+ d = -csum.min()
+ prob = np.exp(-2*n*d**2)
+ elif alternative == 'g':
+ d = csum.max()
+ prob = np.exp(-2*n*d**2)
+ else:
+ raise ValueError("Invalid value for the alternative hypothesis: "\
+ "should be in 'two_sided', 'less' or 'greater'")
+ return (d, prob)
+ks_2samp = ks_twosamp
+
+
+def ks_twosamp_old(data1, data2):
+ """ Computes the Kolmogorov-Smirnov statistic on 2 samples.
+ Returns: KS D-value, p-value
+ #"""
+ (data1, data2) = [ma.asarray(d).compressed() for d in (data1,data2)]
+ return stats.ks_2samp(data1,data2)
+
+
+#####--------------------------------------------------------------------------
+#---- --- Trimming ---
+#####--------------------------------------------------------------------------
+
+def threshold(a, threshmin=None, threshmax=None, newval=0):
+ """Clip array to a given value.
+
+Similar to numpy.clip(), except that values less than threshmin or
+greater than threshmax are replaced by newval, instead of by
+threshmin and threshmax respectively.
+
+Parameters
+----------
+ a : ndarray
+ Input data
+ threshmin : {None, float} optional
+ Lower threshold. If None, set to the minimum value.
+ threshmax : {None, float} optional
+ Upper threshold. If None, set to the maximum value.
+ newval : {0, float} optional
+ Value outside the thresholds.
+
+Returns
+-------
+ a, with values less (greater) than threshmin (threshmax) replaced with newval.
+
+"""
+ a = ma.array(a, copy=True)
+ mask = np.zeros(a.shape, dtype=bool)
+ if threshmin is not None:
+ mask |= (a < threshmin).filled(False)
+ if threshmax is not None:
+ mask |= (a > threshmax).filled(False)
+ a[mask] = newval
+ return a
+
+
+def trima(a, limits=None, inclusive=(True,True)):
+ """Trims an array by masking the data outside some given limits.
+ Returns a masked version of the input array.
+
+ Parameters
+ ----------
+ a : sequence
+ Input array.
+ limits : {None, tuple} optional
+ Tuple of (lower limit, upper limit) in absolute values.
+ Values of the input array lower (greater) than the lower (upper) limit
+ will be masked. A limit is None indicates an open interval.
+ inclusive : {(True,True) tuple} optional
+ Tuple of (lower flag, upper flag), indicating whether values exactly
+ equal to the lower (upper) limit are allowed.
+
+ """
+ a = ma.asarray(a)
+ a.unshare_mask()
+ if limits is None:
+ return a
+ (lower_lim, upper_lim) = limits
+ (lower_in, upper_in) = inclusive
+ condition = False
+ if lower_lim is not None:
+ if lower_in:
+ condition |= (a < lower_lim)
+ else:
+ condition |= (a <= lower_lim)
+ if upper_lim is not None:
+ if upper_in:
+ condition |= (a > upper_lim)
+ else:
+ condition |= (a >= upper_lim)
+ a[condition.filled(True)] = masked
+ return a
+
+
+def trimr(a, limits=None, inclusive=(True, True), axis=None):
+ """Trims an array by masking some proportion of the data on each end.
+ Returns a masked version of the input array.
+
+ Parameters
+ ----------
+ a : sequence
+ Input array.
+ limits : {None, tuple} optional
+ Tuple of the percentages to cut on each side of the array, with respect
+ to the number of unmasked data, as floats between 0. and 1.
+ Noting n the number of unmasked data before trimming, the (n*limits[0])th
+ smallest data and the (n*limits[1])th largest data are masked, and the
+ total number of unmasked data after trimming is n*(1.-sum(limits))
+ The value of one limit can be set to None to indicate an open interval.
+ inclusive : {(True,True) tuple} optional
+ Tuple of flags indicating whether the number of data being masked on the
+ left (right) end should be truncated (True) or rounded (False) to integers.
+ axis : {None,int} optional
+ Axis along which to trim. If None, the whole array is trimmed, but its
+ shape is maintained.
+
+ """
+ def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
+ n = a.count()
+ idx = a.argsort()
+ if low_limit:
+ if low_inclusive:
+ lowidx = int(low_limit*n)
+ else:
+ lowidx = np.round(low_limit*n)
+ a[idx[:lowidx]] = masked
+ if up_limit is not None:
+ if up_inclusive:
+ upidx = n - int(n*up_limit)
+ else:
+ upidx = n- np.round(n*up_limit)
+ a[idx[upidx:]] = masked
+ return a
+ #
+ a = ma.asarray(a)
+ a.unshare_mask()
+ if limits is None:
+ return a
+ # Check the limits
+ (lolim, uplim) = limits
+ errmsg = "The proportion to cut from the %s should be between 0. and 1."
+ if lolim is not None:
+ if lolim > 1. or lolim < 0:
+ raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
+ if uplim is not None:
+ if uplim > 1. or uplim < 0:
+ raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
+ #
+ (loinc, upinc) = inclusive
+ #
+ if axis is None:
+ shp = a.shape
+ return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
+ else:
+ return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
+
+trimdoc = """
+ Parameters
+ ----------
+ a : sequence
+ Input array
+ limits : {None, tuple} optional
+ If relative == False, tuple (lower limit, upper limit) in absolute values.
+ Values of the input array lower (greater) than the lower (upper) limit are
+ masked.
+ If relative == True, tuple (lower percentage, upper percentage) to cut
+ on each side of the array, with respect to the number of unmasked data.
+ Noting n the number of unmasked data before trimming, the (n*limits[0])th
+ smallest data and the (n*limits[1])th largest data are masked, and the
+ total number of unmasked data after trimming is n*(1.-sum(limits))
+ In each case, the value of one limit can be set to None to indicate an
+ open interval.
+ If limits is None, no trimming is performed
+ inclusive : {(True, True) tuple} optional
+ If relative==False, tuple indicating whether values exactly equal to the
+ absolute limits are allowed.
+ If relative==True, tuple indicating whether the number of data being masked
+ on each side should be rounded (True) or truncated (False).
+ relative : {False, True} optional
+ Whether to consider the limits as absolute values (False) or proportions
+ to cut (True).
+ axis : {None, integer}, optional
+ Axis along which to trim.
+"""
+
+
+def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
+ """Trims an array by masking the data outside some given limits.
+ Returns a masked version of the input array.
+ %s
+
+ Examples
+ --------
+ >>>z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
+ >>>trim(z,(3,8))
+ [--,--, 3, 4, 5, 6, 7, 8,--,--]
+ >>>trim(z,(0.1,0.2),relative=True)
+ [--, 2, 3, 4, 5, 6, 7, 8,--,--]
+
+
+ """
+ if relative:
+ return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
+ else:
+ return trima(a, limits=limits, inclusive=inclusive)
+trim.__doc__ = trim.__doc__ % trimdoc
+
+
+def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
+ """Trims the data by masking the int(proportiontocut*n) smallest and
+ int(proportiontocut*n) largest values of data along the given axis, where n
+ is the number of unmasked values before trimming.
+
+Parameters
+----------
+ data : ndarray
+ Data to trim.
+ proportiontocut : {0.2, float} optional
+ Percentage of trimming (as a float between 0 and 1).
+ If n is the number of unmasked values before trimming, the number of
+ values after trimming is:
+ (1-2*proportiontocut)*n.
+ inclusive : {(True, True) tuple} optional
+ Tuple indicating whether the number of data being masked on each side
+ should be rounded (True) or truncated (False).
+ axis : {None, integer}, optional
+ Axis along which to perform the trimming.
+ If None, the input array is first flattened.
+
+ """
+ return trimr(data, limits=(proportiontocut,proportiontocut),
+ inclusive=inclusive, axis=axis)
+
+#..............................................................................
+def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
+ axis=None):
+ """Trims the data by masking int(trim*n) values from ONE tail of the
+ data along the given axis, where n is the number of unmasked values.
+
+Parameters
+----------
+ data : {ndarray}
+ Data to trim.
+ proportiontocut : {0.2, float} optional
+ Percentage of trimming. If n is the number of unmasked values
+ before trimming, the number of values after trimming is
+ (1-proportiontocut)*n.
+ tail : {'left','right'} optional
+ If left (right), the ``proportiontocut`` lowest (greatest) values will
+ be masked.
+ inclusive : {(True, True) tuple} optional
+ Tuple indicating whether the number of data being masked on each side
+ should be rounded (True) or truncated (False).
+ axis : {None, integer}, optional
+ Axis along which to perform the trimming.
+ If None, the input array is first flattened.
+
+ """
+ tail = str(tail).lower()[0]
+ if tail == 'l':
+ limits = (proportiontocut,None)
+ elif tail == 'r':
+ limits = (None, proportiontocut)
+ else:
+ raise TypeError("The tail argument should be in ('left','right')")
+ return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
+
+trim1 = trimtail
+
+def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
+ axis=None):
+ """Returns the trimmed mean of the data along the given axis.
+
+ %s
+
+ """ % trimdoc
+ if (not isinstance(limits,tuple)) and isinstance(limits,float):
+ limits = (limits, limits)
+ if relative:
+ return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
+ else:
+ return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
+
+
+def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
+ axis=None, ddof=0):
+ """Returns the trimmed variance of the data along the given axis.
+
+ %s
+ ddof : {0,integer}, optional
+ Means Delta Degrees of Freedom. The denominator used during computations
+ is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
+ biased estimate of the variance.
+
+ """ % trimdoc
+ if (not isinstance(limits,tuple)) and isinstance(limits,float):
+ limits = (limits, limits)
+ if relative:
+ out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
+ else:
+ out = trima(a,limits=limits,inclusive=inclusive)
+ return out.var(axis=axis, ddof=ddof)
+
+
+def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
+ axis=None, ddof=0):
+ """Returns the trimmed standard deviation of the data along the given axis.
+
+ %s
+ ddof : {0,integer}, optional
+ Means Delta Degrees of Freedom. The denominator used during computations
+ is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
+ biased estimate of the variance.
+
+ """ % trimdoc
+ if (not isinstance(limits,tuple)) and isinstance(limits,float):
+ limits = (limits, limits)
+ if relative:
+ out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
+ else:
+ out = trima(a,limits=limits,inclusive=inclusive)
+ return out.std(axis=axis,ddof=ddof)
+
+
+def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
+ """Returns the standard error of the trimmed mean of the data along the given
+ axis.
+ Parameters
+ ----------
+ a : sequence
+ Input array
+ limits : {(0.1,0.1), tuple of float} optional
+ tuple (lower percentage, upper percentage) to cut on each side of the
+ array, with respect to the number of unmasked data.
+ Noting n the number of unmasked data before trimming, the (n*limits[0])th
+ smallest data and the (n*limits[1])th largest data are masked, and the
+ total number of unmasked data after trimming is n*(1.-sum(limits))
+ In each case, the value of one limit can be set to None to indicate an
+ open interval.
+ If limits is None, no trimming is performed
+ inclusive : {(True, True) tuple} optional
+ Tuple indicating whether the number of data being masked on each side
+ should be rounded (True) or truncated (False).
+ axis : {None, integer}, optional
+ Axis along which to trim.
+ """
+ #........................
+ def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
+ "Returns the standard error of the trimmed mean for a 1D input data."
+ n = a.count()
+ idx = a.argsort()
+ if low_limit:
+ if low_inclusive:
+ lowidx = int(low_limit*n)
+ else:
+ lowidx = np.round(low_limit*n)
+ a[idx[:lowidx]] = masked
+ if up_limit is not None:
+ if up_inclusive:
+ upidx = n - int(n*up_limit)
+ else:
+ upidx = n- np.round(n*up_limit)
+ a[idx[upidx:]] = masked
+ nsize = a.count()
+ a[idx[:lowidx]] = a[idx[lowidx]]
+ a[idx[upidx:]] = a[idx[upidx-1]]
+ winstd = a.std(ddof=1)
+ return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
+ #........................
+ a = ma.array(a, copy=True, subok=True)
+ a.unshare_mask()
+ if limits is None:
+ return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
+ if (not isinstance(limits,tuple)) and isinstance(limits,float):
+ limits = (limits, limits)
+ # Check the limits
+ (lolim, uplim) = limits
+ errmsg = "The proportion to cut from the %s should be between 0. and 1."
+ if lolim is not None:
+ if lolim > 1. or lolim < 0:
+ raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
+ if uplim is not None:
+ if uplim > 1. or uplim < 0:
+ raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
+ #
+ (loinc, upinc) = inclusive
+ if (axis is None):
+ shp = a.shape
+ return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
+ else:
+ assert a.ndim <= 2, "Array should be 2D at most !"
+ return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
+ lolim,uplim,loinc,upinc)
+
+
+def tmean(a, limits=None, inclusive=(True,True)):
+ return trima(a, limits=limits, inclusive=inclusive).mean()
+tmean.__doc__ = stats.tmean.__doc__
+
+def tvar(a, limits=None, inclusive=(True,True)):
+ return trima(a, limits=limits, inclusive=inclusive).var()
+tvar.__doc__ = stats.tvar.__doc__
+
+def tmin(a, lowerlimit=None, axis=0, inclusive=True):
+ a, axis = _chk_asarray(a, axis)
+ am = trima(a, (lowerlimit, None), (inclusive, False))
+ return ma.minimum.reduce(am, axis)
+tmin.__doc__ = stats.tmin.__doc__
+
+def tmax(a, upperlimit, axis=0, inclusive=True):
+ a, axis = _chk_asarray(a, axis)
+ am = trima(a, (None, upperlimit), (False, inclusive))
+ return ma.maximum.reduce(am, axis)
+tmax.__doc__ = stats.tmax.__doc__
+
+def tsem(a, limits=None, inclusive=(True,True)):
+ a = ma.asarray(a).ravel()
+ if limits is None:
+ n = float(a.count())
+ return a.std()/ma.sqrt(n)
+ am = trima(a.ravel(), limits, inclusive)
+ sd = np.sqrt(am.var())
+ return sd / am.count()
+tsem.__doc__ = stats.tsem.__doc__
+
+
+def winsorize(a, limits=None, inclusive=(True,True), inplace=False, axis=None):
+ """Returns a Winsorized version of the input array.
+
+ The (limits[0])th lowest values are set to the (limits[0])th percentile,
+ and the (limits[1])th highest values are set to the (limits[1])th
+ percentile.
+ Masked values are skipped.
+
+
+ Parameters
+ ----------
+ a : sequence
+ Input array.
+ limits : {None, tuple of float} optional
+ Tuple of the percentages to cut on each side of the array, with respect
+ to the number of unmasked data, as floats between 0. and 1.
+ Noting n the number of unmasked data before trimming, the (n*limits[0])th
+ smallest data and the (n*limits[1])th largest data are masked, and the
+ total number of unmasked data after trimming is n*(1.-sum(limits))
+ The value of one limit can be set to None to indicate an open interval.
+ inclusive : {(True, True) tuple} optional
+ Tuple indicating whether the number of data being masked on each side
+ should be rounded (True) or truncated (False).
+ inplace : {False, True} optional
+ Whether to winsorize in place (True) or to use a copy (False)
+ axis : {None, int} optional
+ Axis along which to trim. If None, the whole array is trimmed, but its
+ shape is maintained.
+
+ """
+ def _winsorize1D(a, low_limit, up_limit, low_include, up_include):
+ n = a.count()
+ idx = a.argsort()
+ if low_limit:
+ if low_include:
+ lowidx = int(low_limit*n)
+ else:
+ lowidx = np.round(low_limit*n)
+ a[idx[:lowidx]] = a[idx[lowidx]]
+ if up_limit is not None:
+ if up_include:
+ upidx = n - int(n*up_limit)
+ else:
+ upidx = n- np.round(n*up_limit)
+ a[idx[upidx:]] = a[idx[upidx-1]]
+ return a
+ # We gonna modify a: better make a copy
+ a = ma.array(a, copy=np.logical_not(inplace))
+ #
+ if limits is None:
+ return a
+ if (not isinstance(limits,tuple)) and isinstance(limits,float):
+ limits = (limits, limits)
+ # Check the limits
+ (lolim, uplim) = limits
+ errmsg = "The proportion to cut from the %s should be between 0. and 1."
+ if lolim is not None:
+ if lolim > 1. or lolim < 0:
+ raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
+ if uplim is not None:
+ if uplim > 1. or uplim < 0:
+ raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
+ #
+ (loinc, upinc) = inclusive
+ #
+ if axis is None:
+ shp = a.shape
+ return _winsorize1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
+ else:
+ return ma.apply_along_axis(_winsorize1D, axis,a,lolim,uplim,loinc,upinc)
+
+
+#####--------------------------------------------------------------------------
+#---- --- Moments ---
+#####--------------------------------------------------------------------------
+
+def moment(a, moment=1, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ if moment == 1:
+ # By definition the first moment about the mean is 0.
+ shape = list(a.shape)
+ del shape[axis]
+ if shape:
+ # return an actual array of the appropriate shape
+ return np.zeros(shape, dtype=float)
+ else:
+ # the input was 1D, so return a scalar instead of a rank-0 array
+ return np.float64(0.0)
+ else:
+ mn = ma.expand_dims(a.mean(axis=axis), axis)
+ s = ma.power((a-mn), moment)
+ return s.mean(axis=axis)
+moment.__doc__ = stats.moment.__doc__
+
+
+def variation(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ return a.std(axis)/a.mean(axis)
+variation.__doc__ = stats.variation.__doc__
+
+
+def skew(a, axis=0, bias=True):
+ a, axis = _chk_asarray(a,axis)
+ n = a.count(axis)
+ m2 = moment(a, 2, axis)
+ m3 = moment(a, 3, axis)
+ vals = ma.where(m2 == 0, 0, m3 / m2**1.5)
+ if not bias:
+ can_correct = (n > 2) & (m2 > 0)
+ if can_correct.any():
+ m2 = np.extract(can_correct, m2)
+ m3 = np.extract(can_correct, m3)
+ nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
+ np.place(vals, can_correct, nval)
+ return vals
+skew.__doc__ = stats.skew.__doc__
+
+
+def kurtosis(a, axis=0, fisher=True, bias=True):
+ a, axis = _chk_asarray(a, axis)
+ n = a.count(axis)
+ m2 = moment(a,2,axis)
+ m4 = moment(a,4,axis)
+ vals = ma.where(m2 == 0, 0, m4/ m2**2.0)
+ if not bias:
+ can_correct = (n > 3) & (m2 > 0)
+ if can_correct.any():
+ m2 = np.extract(can_correct, m2)
+ m4 = np.extract(can_correct, m4)
+ nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
+ np.place(vals, can_correct, nval+3.0)
+ if fisher:
+ return vals - 3
+ else:
+ return vals
+kurtosis.__doc__ = stats.kurtosis.__doc__
+
+def describe(a, axis=0):
+ """Computes several descriptive statistics of the passed array.
+
+ Parameters
+ ----------
+ a : array
+ axis : int or None
+
+ Returns
+ -------
+ (size of the data (discarding missing values),
+ (min, max),
+ arithmetic mean,
+ unbiased variance,
+ biased skewness,
+ biased kurtosis)
+ """
+ a, axis = _chk_asarray(a, axis)
+ n = a.count(axis)
+ mm = (ma.minimum.reduce(a), ma.maximum.reduce(a))
+ m = a.mean(axis)
+ v = a.var(axis)
+ sk = skew(a, axis)
+ kurt = kurtosis(a, axis)
+ return n, mm, m, v, sk, kurt
+
+#.............................................................................
+def stde_median(data, axis=None):
+ """Returns the McKean-Schrader estimate of the standard error of the sample
+median along the given axis. masked values are discarded.
+
+ Parameters
+ ----------
+ data : ndarray
+ Data to trim.
+ axis : {None,int} optional
+ Axis along which to perform the trimming.
+ If None, the input array is first flattened.
+
+ """
+ def _stdemed_1D(data):
+ data = np.sort(data.compressed())
+ n = len(sorted)
+ z = 2.5758293035489004
+ k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
+ return ((data[n-k] - data[k-1])/(2.*z))
+ #
+ data = ma.array(data, copy=False, subok=True)
+ if (axis is None):
+ return _stdemed_1D(data)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return ma.apply_along_axis(_stdemed_1D, axis, data)
+
+#####--------------------------------------------------------------------------
+#---- --- Normality Tests ---
+#####--------------------------------------------------------------------------
+
+def skewtest(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ if axis is None:
+ a = a.ravel()
+ axis = 0
+ b2 = skew(a,axis)
+ n = a.count(axis)
+ if np.min(n) < 8:
+ warnings.warn(
+ "skewtest only valid for n>=8 ... continuing anyway, n=%i" %
+ np.min(n))
+ y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)) )
+ beta2 = ( 3.0*(n*n+27*n-70)*(n+1)*(n+3) ) / ( (n-2.0)*(n+5)*(n+7)*(n+9) )
+ W2 = -1 + ma.sqrt(2*(beta2-1))
+ delta = 1/ma.sqrt(0.5*ma.log(W2))
+ alpha = ma.sqrt(2.0/(W2-1))
+ y = ma.where(y==0, 1, y)
+ Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
+ return Z, (1.0 - stats.zprob(Z))*2
+skewtest.__doc__ = stats.skewtest.__doc__
+
+def kurtosistest(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ n = a.count(axis=axis).astype(float)
+ if np.min(n) < 20:
+ warnings.warn(
+ "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" %
+ np.min(n))
+ b2 = kurtosis(a, axis, fisher=False)
+ E = 3.0*(n-1) /(n+1)
+ varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5))
+ x = (b2-E)/ma.sqrt(varb2)
+ sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5))/
+ (n*(n-2)*(n-3)))
+ A = 6.0 + 8.0/sqrtbeta1 *(2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
+ term1 = 1 - 2./(9.0*A)
+ denom = 1 + x*ma.sqrt(2/(A-4.0))
+ denom[denom < 0] = masked
+ term2 = ma.power((1-2.0/A)/denom,1/3.0)
+ Z = ( term1 - term2 ) / np.sqrt(2/(9.0*A))
+ return Z, (1.0-stats.zprob(Z))*2
+kurtosistest.__doc__ = stats.kurtosistest.__doc__
+
+
+def normaltest(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ s,_ = skewtest(a,axis)
+ k,_ = kurtosistest(a,axis)
+ k2 = s*s + k*k
+ return k2, stats.chisqprob(k2,2)
+normaltest.__doc__ = stats.normaltest.__doc__
+
+# Martinez-Iglewicz test
+# K-S test
+
+
+#####--------------------------------------------------------------------------
+#---- --- Percentiles ---
+#####--------------------------------------------------------------------------
+
+
+def mquantiles(data, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None,
+ limit=()):
+ """Computes empirical quantiles for a *1xN* data array.
+Samples quantile are defined by:
+*Q(p) = (1-g).x[i] +g.x[i+1]*
+where *x[j]* is the jth order statistic,
+with *i = (floor(n*p+m))*, *m=alpha+p*(1-alpha-beta)* and *g = n*p + m - i)*.
+
+Typical values of (alpha,beta) are:
+
+ - (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
+ - (.5,.5) : *p(k) = (k+1/2.)/n* : piecewise linear function (R, type 5)
+ - (0,0) : *p(k) = k/(n+1)* : (R type 6)
+ - (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
+ That's R default (R type 7)
+ - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
+ The resulting quantile estimates are approximately median-unbiased
+ regardless of the distribution of x. (R type 8)
+ - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
+ The resulting quantile estimates are approximately unbiased
+ if x is normally distributed (R type 9)
+ - (.4,.4) : approximately quantile unbiased (Cunnane)
+ - (.35,.35): APL, used with PWM
+
+Parameters
+----------
+ x : sequence
+ Input data, as a sequence or array of dimension at most 2.
+ prob : sequence
+ List of quantiles to compute.
+ alpha : {0.4, float} optional
+ Plotting positions parameter.
+ beta : {0.4, float} optional
+ Plotting positions parameter.
+ axis : {None, int} optional
+ Axis along which to perform the trimming.
+ If None, the input array is first flattened.
+ limit : tuple
+ Tuple of (lower, upper) values. Values of a outside this closed interval
+ are ignored.
+ """
+ def _quantiles1D(data,m,p):
+ x = np.sort(data.compressed())
+ n = len(x)
+ if n == 0:
+ return ma.array(np.empty(len(p), dtype=float), mask=True)
+ elif n == 1:
+ return ma.array(np.resize(x, p.shape), mask=nomask)
+ aleph = (n*p + m)
+ k = np.floor(aleph.clip(1, n-1)).astype(int)
+ gamma = (aleph-k).clip(0,1)
+ return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
+
+ # Initialization & checks ---------
+ data = ma.array(data, copy=False)
+ if limit:
+ condition = (limit[0]<data) & (data<limit[1])
+ data[condition.filled(True)] = masked
+ p = np.array(prob, copy=False, ndmin=1)
+ m = alphap + p*(1.-alphap-betap)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ return _quantiles1D(data, m, p)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
+
+
+def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
+ """Calculate the score at the given 'per' percentile of the
+ sequence a. For example, the score at per=50 is the median.
+
+ This function is a shortcut to mquantile
+
+ """
+ if (per < 0) or (per > 100.):
+ raise ValueError("The percentile should be between 0. and 100. !"\
+ " (got %s)" % per)
+ return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
+ limit=limit, axis=0).squeeze()
+
+
+def plotting_positions(data, alpha=0.4, beta=0.4):
+ """Returns the plotting positions (or empirical percentile points) for the
+ data.
+ Plotting positions are defined as (i-alpha)/(n-alpha-beta), where:
+ - i is the rank order statistics
+ - n is the number of unmasked values along the given axis
+ - alpha and beta are two parameters.
+
+ Typical values for alpha and beta are:
+ - (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
+ - (.5,.5) : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
+ - (0,0) : *p(k) = k/(n+1)* : Weibull (R type 6)
+ - (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
+ That's R default (R type 7)
+ - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
+ The resulting quantile estimates are approximately median-unbiased
+ regardless of the distribution of x. (R type 8)
+ - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
+ The resulting quantile estimates are approximately unbiased
+ if x is normally distributed (R type 9)
+ - (.4,.4) : approximately quantile unbiased (Cunnane)
+ - (.35,.35): APL, used with PWM
+
+Parameters
+----------
+ x : sequence
+ Input data, as a sequence or array of dimension at most 2.
+ prob : sequence
+ List of quantiles to compute.
+ alpha : {0.4, float} optional
+ Plotting positions parameter.
+ beta : {0.4, float} optional
+ Plotting positions parameter.
+
+ """
+ data = ma.array(data, copy=False).reshape(1,-1)
+ n = data.count()
+ plpos = np.empty(data.size, dtype=float)
+ plpos[n:] = 0
+ plpos[data.argsort()[:n]] = (np.arange(1,n+1) - alpha)/(n+1-alpha-beta)
+ return ma.array(plpos, mask=data._mask)
+
+meppf = plotting_positions
+
+#####--------------------------------------------------------------------------
+#---- --- Variability ---
+#####--------------------------------------------------------------------------
+
+def obrientransform(*args):
+ """
+Computes a transform on input data (any number of columns). Used to
+test for homogeneity of variance prior to running one-way stats. Each
+array in *args is one level of a factor. If an F_oneway() run on the
+transformed data and found significant, variances are unequal. From
+Maxwell and Delaney, p.112.
+
+Returns: transformed data for use in an ANOVA
+ """
+ data = argstoarray(*args).T
+ v = data.var(axis=0,ddof=1)
+ m = data.mean(0)
+ n = data.count(0).astype(float)
+ # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
+ data -= m
+ data **= 2
+ data *= (n-1.5)*n
+ data -= 0.5*v*(n-1)
+ data /= (n-1.)*(n-2.)
+ if not ma.allclose(v,data.mean(0)):
+ raise ValueError("Lack of convergence in obrientransform.")
+ return data
+
+
+def signaltonoise(data, axis=0):
+ """Calculates the signal-to-noise ratio, as the ratio of the mean over
+ standard deviation along the given axis.
+
+ Parameters
+ ----------
+ data : sequence
+ Input data
+ axis : {0, int} optional
+ Axis along which to compute. If None, the computation is performed
+ on a flat version of the array.
+"""
+ data = ma.array(data, copy=False)
+ m = data.mean(axis)
+ sd = data.std(axis, ddof=0)
+ return m/sd
+
+
+def samplevar(data, axis=0):
+ """Returns a biased estimate of the variance of the data, as the average
+ of the squared deviations from the mean.
+
+ Parameters
+ ----------
+ data : sequence
+ Input data
+ axis : {0, int} optional
+ Axis along which to compute. If None, the computation is performed
+ on a flat version of the array.
+ """
+ return ma.asarray(data).var(axis=axis,ddof=0)
+
+
+def samplestd(data, axis=0):
+ """Returns a biased estimate of the standard deviation of the data, as the
+ square root of the average squared deviations from the mean.
+
+ Parameters
+ ----------
+ data : sequence
+ Input data
+ axis : {0,int} optional
+ Axis along which to compute. If None, the computation is performed
+ on a flat version of the array.
+
+ Notes
+ -----
+ samplestd(a) is equivalent to a.std(ddof=0)
+
+ """
+ return ma.asarray(data).std(axis=axis,ddof=0)
+
+
+def var(a,axis=None):
+ return ma.asarray(a).var(axis=axis,ddof=1)
+var.__doc__ = stats.var.__doc__
+
+def std(a,axis=None):
+ return ma.asarray(a).std(axis=axis,ddof=1)
+std.__doc__ = stats.std.__doc__
+
+def stderr(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ return a.std(axis=axis, ddof=1) / ma.sqrt(a.count(axis=axis))
+stderr.__doc__ = stats.stderr.__doc__
+
+def sem(a, axis=0):
+ a, axis = _chk_asarray(a, axis)
+ n = a.count(axis=axis)
+ s = a.std(axis=axis,ddof=0) / ma.sqrt(n-1)
+ return s
+sem.__doc__ = stats.sem.__doc__
+
+def z(a, score):
+ a = ma.asarray(a)
+ z = (score-a.mean(None)) / a.std(axis=None, ddof=1)
+ return z
+z.__doc__ = stats.z.__doc__
+
+def zs(a):
+ a = ma.asarray(a)
+ mu = a.mean(axis=0)
+ sigma = a.std(axis=0,ddof=0)
+ return (a-mu)/sigma
+zs.__doc__ = stats.zs.__doc__
+
+def zmap(scores, compare, axis=0):
+ (scores, compare) = (ma.asarray(scores), ma.asarray(compare))
+ mns = compare.mean(axis=axis)
+ sstd = compare.std(axis=0, ddof=0)
+ return (scores - mns) / sstd
+zmap.__doc__ = stats.zmap.__doc__
+
+
+#####--------------------------------------------------------------------------
+#---- --- ANOVA ---
+#####--------------------------------------------------------------------------
+
+
+def f_oneway(*args):
+ """
+Performs a 1-way ANOVA, returning an F-value and probability given
+any number of groups. From Heiman, pp.394-7.
+
+Usage: f_oneway (*args) where *args is 2 or more arrays, one per
+ treatment group
+Returns: f-value, probability
+"""
+ # Construct a single array of arguments: each row is a group
+ data = argstoarray(*args)
+ ngroups = len(data)
+ ntot = data.count()
+ sstot = (data**2).sum() - (data.sum())**2/float(ntot)
+ ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
+ sswg = sstot-ssbg
+ dfbg = ngroups-1
+ dfwg = ntot - ngroups
+ msb = ssbg/float(dfbg)
+ msw = sswg/float(dfwg)
+ f = msb/msw
+ prob = stats.fprob(dfbg,dfwg,f)
+ return f, prob
+
+
+def f_value_wilks_lambda(ER, EF, dfnum, dfden, a, b):
+ """Calculation of Wilks lambda F-statistic for multivarite data, per
+ Maxwell & Delaney p.657.
+ """
+ ER = ma.array(ER, copy=False, ndmin=2)
+ EF = ma.array(EF, copy=False, ndmin=2)
+ if ma.getmask(ER).any() or ma.getmask(EF).any():
+ raise NotImplementedError("Not implemented when the inputs "\
+ "have missing data")
+ lmbda = np.linalg.det(EF) / np.linalg.det(ER)
+ q = ma.sqrt( ((a-1)**2*(b-1)**2 - 2) / ((a-1)**2 + (b-1)**2 -5) )
+ q = ma.filled(q, 1)
+ n_um = (1 - lmbda**(1.0/q))*(a-1)*(b-1)
+ d_en = lmbda**(1.0/q) / (n_um*q - 0.5*(a-1)*(b-1) + 1)
+ return n_um / d_en
+
+
+
+def friedmanchisquare(*args):
+ """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
+ This function calculates the Friedman Chi-square test for repeated measures
+ and returns the result, along with the associated probability value.
+
+ Each input is considered a given group. Ideally, the number of treatments
+ among each group should be equal. If this is not the case, only the first
+ n treatments are taken into account, where n is the number of treatments
+ of the smallest group.
+ If a group has some missing values, the corresponding treatments are masked
+ in the other groups.
+ The test statistic is corrected for ties.
+
+ Masked values in one group are propagated to the other groups.
+
+ Returns: chi-square statistic, associated p-value
+ """
+ data = argstoarray(*args).astype(float)
+ k = len(data)
+ if k < 3:
+ raise ValueError("Less than 3 groups (%i): " % k +\
+ "the Friedman test is NOT appropriate.")
+ ranked = ma.masked_values(rankdata(data, axis=0), 0)
+ if ranked._mask is not nomask:
+ ranked = ma.mask_cols(ranked)
+ ranked = ranked.compressed().reshape(k,-1).view(ndarray)
+ else:
+ ranked = ranked._data
+ (k,n) = ranked.shape
+ # Ties correction
+ repeats = np.array([find_repeats(_) for _ in ranked.T], dtype=object)
+ ties = repeats[repeats.nonzero()].reshape(-1,2)[:,-1].astype(int)
+ tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
+ #
+ ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
+ chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
+ return chisq, stats.chisqprob(chisq,k-1)
+
+#-############################################################################-#
Added: trunk/scipy/stats/tests/test_mmorestats.py
===================================================================
--- trunk/scipy/stats/tests/test_mmorestats.py 2008-04-13 22:17:05 UTC (rev 4140)
+++ trunk/scipy/stats/tests/test_mmorestats.py 2008-04-14 19:01:01 UTC (rev 4141)
@@ -0,0 +1,103 @@
+# pylint: disable-msg=W0611, W0612, W0511,R0201
+"""Tests suite for maskedArray statistics.
+
+:author: Pierre Gerard-Marchant
+:contact: pierregm_at_uga_dot_edu
+"""
+__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)"
+
+import numpy as np
+
+import numpy.ma as ma
+from numpy.ma import masked
+
+import scipy.stats.mstats as ms
+import scipy.stats.mmorestats as mms
+
+from scipy.testing import *
+
+
+class TestMisc(TestCase):
+ #
+ def __init__(self, *args, **kwargs):
+ TestCase.__init__(self, *args, **kwargs)
+ #
+ def test_mjci(self):
+ "Tests the Marits-Jarrett estimator"
+ data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(mms.mjci(data),[55.76819,45.84028,198.87875],5)
+ #
+ def test_trimmedmeanci(self):
+ "Tests the confidence intervals of the trimmed mean."
+ data = ma.array([545,555,558,572,575,576,578,580,
+ 594,605,635,651,653,661,666])
+ assert_almost_equal(ms.trimmed_mean(data,0.2), 596.2, 1)
+ assert_equal(np.round(mms.trimmed_mean_ci(data,(0.2,0.2)),1),
+ [561.8, 630.6])
+ #
+ def test_idealfourths(self):
+ "Tests ideal-fourths"
+ test = np.arange(100)
+ assert_almost_equal(np.asarray(mms.idealfourths(test)),
+ [24.416667,74.583333],6)
+ test_2D = test.repeat(3).reshape(-1,3)
+ assert_almost_equal(mms.idealfourths(test_2D, axis=0),
+ [[24.416667,24.416667,24.416667],
+ [74.583333,74.583333,74.583333]],6)
+ assert_almost_equal(mms.idealfourths(test_2D, axis=1),
+ test.repeat(2).reshape(-1,2))
+ test = [0,0]
+ _result = mms.idealfourths(test)
+ assert(np.isnan(_result).all())
+
+#..............................................................................
+class TestQuantiles(TestCase):
+ #
+ def __init__(self, *args, **kwargs):
+ TestCase.__init__(self, *args, **kwargs)
+ #
+ def test_hdquantiles(self):
+ data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
+ 0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
+ 0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
+ 0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
+ 0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
+ 0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
+ 0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
+ 0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
+ 0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
+ 0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
+ 0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
+ 0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
+ 0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
+ 0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
+ 0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
+ 0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
+ 0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
+ 0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
+ 0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
+ 0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]
+ #
+ assert_almost_equal(mms.hdquantiles(data,[0., 1.]),
+ [0.006514031, 0.995309248])
+ hdq = mms.hdquantiles(data,[0.25, 0.5, 0.75])
+ assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,])
+ hdq = mms.hdquantiles_sd(data,[0.25, 0.5, 0.75])
+ assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4)
+ #
+ data = np.array(data).reshape(10,10)
+ hdq = mms.hdquantiles(data,[0.25,0.5,0.75],axis=0)
+ assert_almost_equal(hdq[:,0], mms.hdquantiles(data[:,0],[0.25,0.5,0.75]))
+ assert_almost_equal(hdq[:,-1], mms.hdquantiles(data[:,-1],[0.25,0.5,0.75]))
+ hdq = mms.hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True)
+ assert_almost_equal(hdq[...,0],
+ mms.hdquantiles(data[:,0],[0.25,0.5,0.75],var=True))
+ assert_almost_equal(hdq[...,-1],
+ mms.hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True))
+
+
+###############################################################################
+
+if __name__ == "__main__":
+ nose.run(argv=['', __file__])
Added: trunk/scipy/stats/tests/test_mstats.py
===================================================================
--- trunk/scipy/stats/tests/test_mstats.py 2008-04-13 22:17:05 UTC (rev 4140)
+++ trunk/scipy/stats/tests/test_mstats.py 2008-04-14 19:01:01 UTC (rev 4141)
@@ -0,0 +1,506 @@
+"""
+Tests for the stats.mstats module (support for maskd arrays)
+"""
+
+
+import numpy as np
+from numpy import nan
+import numpy.ma as ma
+from numpy.ma import masked, nomask
+
+import scipy.stats.mstats as mstats
+from scipy.testing import *
+from numpy.ma.testutils import assert_equal, assert_almost_equal, \
+ assert_array_almost_equal
+
+
+class TestGMean(TestCase):
+ def test_1D(self):
+ a = (1,2,3,4)
+ actual= mstats.gmean(a)
+ desired = np.power(1*2*3*4,1./4.)
+ assert_almost_equal(actual, desired,decimal=14)
+
+ desired1 = mstats.gmean(a,axis=-1)
+ assert_almost_equal(actual, desired1, decimal=14)
+ assert not isinstance(desired1, ma.MaskedArray)
+ #
+ a = ma.array((1,2,3,4),mask=(0,0,0,1))
+ actual= mstats.gmean(a)
+ desired = np.power(1*2*3,1./3.)
+ assert_almost_equal(actual, desired,decimal=14)
+
+ desired1 = mstats.gmean(a,axis=-1)
+ assert_almost_equal(actual, desired1, decimal=14)
+ #
+ def test_2D(self):
+ a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)),
+ mask=((0,0,0,0),(1,0,0,1),(0,1,1,0)))
+ actual= mstats.gmean(a)
+ desired = np.array((1,2,3,4))
+ assert_array_almost_equal(actual, desired, decimal=14)
+ #
+ desired1 = mstats.gmean(a,axis=0)
+ assert_array_almost_equal(actual, desired1, decimal=14)
+ #
+ actual= mstats.gmean(a, -1)
+ desired = ma.array((np.power(1*2*3*4,1./4.),
+ np.power(2*3,1./2.),
+ np.power(1*4,1./2.)))
+ assert_array_almost_equal(actual, desired, decimal=14)
+
+class TestHMean(TestCase):
+ def test_1D(self):
+ a = (1,2,3,4)
+ actual= mstats.hmean(a)
+ desired = 4. / (1./1 + 1./2 + 1./3 + 1./4)
+ assert_almost_equal(actual, desired, decimal=14)
+ desired1 = mstats.hmean(ma.array(a),axis=-1)
+ assert_almost_equal(actual, desired1, decimal=14)
+ #
+ a = ma.array((1,2,3,4),mask=(0,0,0,1))
+ actual= mstats.hmean(a)
+ desired = 3. / (1./1 + 1./2 + 1./3)
+ assert_almost_equal(actual, desired,decimal=14)
+ desired1 = mstats.hmean(a,axis=-1)
+ assert_almost_equal(actual, desired1, decimal=14)
+
+ def test_2D(self):
+ a = ma.array(((1,2,3,4),(1,2,3,4),(1,2,3,4)),
+ mask=((0,0,0,0),(1,0,0,1),(0,1,1,0)))
+ actual= mstats.hmean(a)
+ desired = ma.array((1,2,3,4))
+ assert_array_almost_equal(actual, desired, decimal=14)
+ #
+ actual1 = mstats.hmean(a,axis=-1)
+ desired = (4./(1/1.+1/2.+1/3.+1/4.),
+ 2./(1/2.+1/3.),
+ 2./(1/1.+1/4.)
+ )
+ assert_array_almost_equal(actual1, desired, decimal=14)
+
+
+class TestRanking(TestCase):
+ #
+ def __init__(self, *args, **kwargs):
+ TestCase.__init__(self, *args, **kwargs)
+ #
+ def test_ranking(self):
+ x = ma.array([0,1,1,1,2,3,4,5,5,6,])
+ assert_almost_equal(mstats.rankdata(x),[1,3,3,3,5,6,7,8.5,8.5,10])
+ x[[3,4]] = masked
+ assert_almost_equal(mstats.rankdata(x),[1,2.5,2.5,0,0,4,5,6.5,6.5,8])
+ assert_almost_equal(mstats.rankdata(x,use_missing=True),
+ [1,2.5,2.5,4.5,4.5,4,5,6.5,6.5,8])
+ x = ma.array([0,1,5,1,2,4,3,5,1,6,])
+ assert_almost_equal(mstats.rankdata(x),[1,3,8.5,3,5,7,6,8.5,3,10])
+ x = ma.array([[0,1,1,1,2], [3,4,5,5,6,]])
+ assert_almost_equal(mstats.rankdata(x),[[1,3,3,3,5],[6,7,8.5,8.5,10]])
+ assert_almost_equal(mstats.rankdata(x,axis=1),[[1,3,3,3,5],[1,2,3.5,3.5,5]])
+ assert_almost_equal(mstats.rankdata(x,axis=0),[[1,1,1,1,1],[2,2,2,2,2,]])
+
+
+class TestCorr(TestCase):
+ #
+ def test_pearsonr(self):
+ "Tests some computations of Pearson's r"
+ x = ma.arange(10)
+ assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0)
+ assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0)
+ #
+ x = ma.array(x, mask=True)
+ pr = mstats.pearsonr(x,x)
+ assert(pr[0] is masked)
+ assert(pr[1] is masked)
+ #
+ def test_spearmanr(self):
+ "Tests some computations of Spearman's rho"
+ (x, y) = ([5.05,6.75,3.21,2.66],[1.65,2.64,2.64,6.95])
+ assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555)
+ (x, y) = ([5.05,6.75,3.21,2.66,np.nan],[1.65,2.64,2.64,6.95,np.nan])
+ (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
+ assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555)
+ #
+ x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
+ 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7]
+ y = [22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
+ 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4]
+ assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299)
+ x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
+ 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan]
+ y = [22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
+ 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan]
+ (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
+ assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299)
+ #
+ def test_kendalltau(self):
+ "Tests some computations of Kendall's tau"
+ x = ma.fix_invalid([5.05, 6.75, 3.21, 2.66,np.nan])
+ y = ma.fix_invalid([1.65, 26.5, -5.93, 7.96, np.nan])
+ z = ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan])
+ assert_almost_equal(np.asarray(mstats.kendalltau(x,y)),
+ [+0.3333333,0.4969059])
+ assert_almost_equal(np.asarray(mstats.kendalltau(x,z)),
+ [-0.5477226,0.2785987])
+ #
+ x = ma.fix_invalid([ 0, 0, 0, 0,20,20, 0,60, 0,20,
+ 10,10, 0,40, 0,20, 0, 0, 0, 0, 0, np.nan])
+ y = ma.fix_invalid([ 0,80,80,80,10,33,60, 0,67,27,
+ 25,80,80,80,80,80,80, 0,10,45, np.nan, 0])
+ result = mstats.kendalltau(x,y)
+ assert_almost_equal(np.asarray(result), [-0.1585188, 0.4128009])
+ #
+ def test_kendalltau_seasonal(self):
+ "Tests the seasonal Kendall tau."
+ x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
+ [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
+ [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
+ [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
+ x = ma.fix_invalid(x).T
+ output = mstats.kendalltau_seasonal(x)
+ assert_almost_equal(output['global p-value (indep)'], 0.008, 3)
+ assert_almost_equal(output['seasonal p-value'].round(2),
+ [0.18,0.53,0.20,0.04])
+ #
+ def test_pointbiserial(self):
+ "Tests point biserial"
+ x = [1,0,1,1,1,1,0,1,0,0,0,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,1,0,
+ 0,0,0,0,1,-1]
+ y = [14.8,13.8,12.4,10.1,7.1,6.1,5.8,4.6,4.3,3.5,3.3,3.2,3.0,
+ 2.8,2.8,2.5,2.4,2.3,2.1,1.7,1.7,1.5,1.3,1.3,1.2,1.2,1.1,
+ 0.8,0.7,0.6,0.5,0.2,0.2,0.1,np.nan]
+ assert_almost_equal(mstats.pointbiserialr(x, y)[0], 0.36149, 5)
+ #
+ def test_cov(self):
+ "Tests the cov function."
+ x = ma.array([[1,2,3],[4,5,6]], mask=[[1,0,0],[0,0,0]])
+ c = mstats.cov(x[0])
+ assert_equal(c, x[0].var(ddof=1))
+ c = mstats.cov(x[1])
+ assert_equal(c, x[1].var(ddof=1))
+ c = mstats.cov(x)
+ assert_equal(c[1,0], (x[0].anom()*x[1].anom()).sum())
+ #
+ x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
+ [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
+ [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
+ [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
+ x = ma.fix_invalid(x).T
+ (winter,spring,summer,fall) = x.T
+ #
+ assert_almost_equal(mstats.cov(winter,winter,bias=True),
+ winter.var(ddof=0))
+ assert_almost_equal(mstats.cov(winter,winter,bias=False),
+ winter.var(ddof=1))
+ assert_almost_equal(mstats.cov(winter,spring), 7.7)
+ assert_almost_equal(mstats.cov(winter,summer), 19.1111111, 7)
+ assert_almost_equal(mstats.cov(winter,fall), 20)
+
+
+class TestTrimming(TestCase):
+ #
+ def test_trim(self):
+ "Tests trimming"
+ a = ma.arange(10)
+ assert_equal(mstats.trim(a), [0,1,2,3,4,5,6,7,8,9])
+ a = ma.arange(10)
+ assert_equal(mstats.trim(a,(2,8)), [None,None,2,3,4,5,6,7,8,None])
+ a = ma.arange(10)
+ assert_equal(mstats.trim(a,limits=(2,8),inclusive=(False,False)),
+ [None,None,None,3,4,5,6,7,None,None])
+ a = ma.arange(10)
+ assert_equal(mstats.trim(a,limits=(0.1,0.2),relative=True),
+ [None,1,2,3,4,5,6,7,None,None])
+ #
+ a = ma.arange(12)
+ a[[0,-1]] = a[5] = masked
+ assert_equal(mstats.trim(a,(2,8)),
+ [None,None,2,3,4,None,6,7,8,None,None,None])
+ #
+ x = ma.arange(100).reshape(10,10)
+ trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None)
+ assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20)
+ trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0)
+ assert_equal(trimx._mask.ravel(),[1]*10+[0]*70+[1]*20)
+ trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=-1)
+ assert_equal(trimx._mask.T.ravel(),[1]*10+[0]*70+[1]*20)
+ #
+ x = ma.arange(110).reshape(11,10)
+ x[1] = masked
+ trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=None)
+ assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20)
+ trimx = mstats.trim(x,(0.1,0.2),relative=True,axis=0)
+ assert_equal(trimx._mask.ravel(),[1]*20+[0]*70+[1]*20)
+ trimx = mstats.trim(x.T,(0.1,0.2),relative=True,axis=-1)
+ assert_equal(trimx.T._mask.ravel(),[1]*20+[0]*70+[1]*20)
+ #
+ def test_trim_old(self):
+ "Tests trimming."
+ x = ma.arange(100)
+ assert_equal(mstats.trimboth(x).count(), 60)
+ assert_equal(mstats.trimtail(x,tail='r').count(), 80)
+ x[50:70] = masked
+ trimx = mstats.trimboth(x)
+ assert_equal(trimx.count(), 48)
+ assert_equal(trimx._mask, [1]*16 + [0]*34 + [1]*20 + [0]*14 + [1]*16)
+ x._mask = nomask
+ x.shape = (10,10)
+ assert_equal(mstats.trimboth(x).count(), 60)
+ assert_equal(mstats.trimtail(x).count(), 80)
+ #
+ def test_trimmedmean(self):
+ "Tests the trimmed mean."
+ data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(mstats.trimmed_mean(data,0.1), 343, 0)
+ assert_almost_equal(mstats.trimmed_mean(data,(0.1,0.1)), 343, 0)
+ assert_almost_equal(mstats.trimmed_mean(data,(0.2,0.2)), 283, 0)
+ #
+ def test_trimmed_stde(self):
+ "Tests the trimmed mean standard error."
+ data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(mstats.trimmed_stde(data,(0.2,0.2)), 56.13193, 5)
+ assert_almost_equal(mstats.trimmed_stde(data,0.2), 56.13193, 5)
+ #
+ def test_winsorization(self):
+ "Tests the Winsorization of the data."
+ data = ma.array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(mstats.winsorize(data,(0.2,0.2)).var(ddof=1),
+ 21551.4, 1)
+ data[5] = masked
+ winsorized = mstats.winsorize(data)
+ assert_equal(winsorized.mask, data.mask)
+
+
+class TestMoments(TestCase):
+ """
+ Comparison numbers are found using R v.1.5.1
+ note that length(testcase) = 4
+ testmathworks comes from documentation for the
+ Statistics Toolbox for Matlab and can be found at both
+ http://www.mathworks.com/access/helpdesk/help/toolbox/stats/kurtosis.shtml
+ http://www.mathworks.com/access/helpdesk/help/toolbox/stats/skewness.shtml
+ Note that both test cases came from here.
+ """
+ testcase = [1,2,3,4]
+ testmathworks = ma.fix_invalid([1.165 , 0.6268, 0.0751, 0.3516, -0.6965,
+ np.nan])
+ def test_moment(self):
+ """
+ mean((testcase-mean(testcase))**power,axis=0),axis=0))**power))"""
+ y = mstats.moment(self.testcase,1)
+ assert_almost_equal(y,0.0,10)
+ y = mstats.moment(self.testcase,2)
+ assert_almost_equal(y,1.25)
+ y = mstats.moment(self.testcase,3)
+ assert_almost_equal(y,0.0)
+ y = mstats.moment(self.testcase,4)
+ assert_almost_equal(y,2.5625)
+ def test_variation(self):
+ """variation = samplestd/mean """
+## y = stats.variation(self.shoes[0])
+## assert_almost_equal(y,21.8770668)
+ y = mstats.variation(self.testcase)
+ assert_almost_equal(y,0.44721359549996, 10)
+
+ def test_skewness(self):
+ """
+ sum((testmathworks-mean(testmathworks,axis=0))**3,axis=0)/((sqrt(var(testmathworks)*4/5))**3)/5
+ """
+ y = mstats.skew(self.testmathworks)
+ assert_almost_equal(y,-0.29322304336607,10)
+ y = mstats.skew(self.testmathworks,bias=0)
+ assert_almost_equal(y,-0.437111105023940,10)
+ y = mstats.skew(self.testcase)
+ assert_almost_equal(y,0.0,10)
+
+ def test_kurtosis(self):
+ """
+ sum((testcase-mean(testcase,axis=0))**4,axis=0)/((sqrt(var(testcase)*3/4))**4)/4
+ sum((test2-mean(testmathworks,axis=0))**4,axis=0)/((sqrt(var(testmathworks)*4/5))**4)/5
+ Set flags for axis = 0 and
+ fisher=0 (Pearson's definition of kurtosis for compatibility with Matlab)
+ """
+ y = mstats.kurtosis(self.testmathworks,0,fisher=0,bias=1)
+ assert_almost_equal(y, 2.1658856802973,10)
+ # Note that MATLAB has confusing docs for the following case
+ # kurtosis(x,0) gives an unbiased estimate of Pearson's skewness
+ # kurtosis(x) gives a biased estimate of Fisher's skewness (Pearson-3)
+ # The MATLAB docs imply that both should give Fisher's
+ y = mstats.kurtosis(self.testmathworks,fisher=0,bias=0)
+ assert_almost_equal(y, 3.663542721189047,10)
+ y = mstats.kurtosis(self.testcase,0,0)
+ assert_almost_equal(y,1.64)
+ #
+ def test_mode(self):
+ "Tests the mode"
+ #
+ a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7]
+ a2 = np.reshape(a1, (3,5))
+ ma1 = ma.masked_where(ma.array(a1)>2,a1)
+ ma2 = ma.masked_where(a2>2, a2)
+ assert_equal(mstats.mode(a1, axis=None), (3,4))
+ assert_equal(mstats.mode(ma1, axis=None), (0,3))
+ assert_equal(mstats.mode(a2, axis=None), (3,4))
+ assert_equal(mstats.mode(ma2, axis=None), (0,3))
+ assert_equal(mstats.mode(a2, axis=0), [[0,0,0,1,1],[1,1,1,1,1]])
+ assert_equal(mstats.mode(ma2, axis=0), [[0,0,0,1,1],[1,1,1,1,1]])
+ assert_equal(mstats.mode(a2, axis=-1), [[0,3],[3,3],[3,1]])
+ assert_equal(mstats.mode(ma2, axis=-1), [[0,3],[1,1],[0,0]])
+
+
+class TestPercentile(TestCase):
+ def setUp(self):
+ self.a1 = [3,4,5,10,-3,-5,6]
+ self.a2 = [3,-6,-2,8,7,4,2,1]
+ self.a3 = [3.,4,5,10,-3,-5,-6,7.0]
+
+ def test_percentile(self):
+ x = np.arange(8) * 0.5
+ assert_equal(mstats.scoreatpercentile(x, 0), 0.)
+ assert_equal(mstats.scoreatpercentile(x, 100), 3.5)
+ assert_equal(mstats.scoreatpercentile(x, 50), 1.75)
+
+ def test_2D(self):
+ x = ma.array([[1, 1, 1],
+ [1, 1, 1],
+ [4, 4, 3],
+ [1, 1, 1],
+ [1, 1, 1]])
+ assert_equal(mstats.scoreatpercentile(x,50), [1,1,1])
+
+
+class TestVariability(TestCase):
+ """ Comparison numbers are found using R v.1.5.1
+ note that length(testcase) = 4
+ """
+ testcase = ma.fix_invalid([1,2,3,4,np.nan])
+ #
+ def test_std(self):
+ y = mstats.std(self.testcase)
+ assert_almost_equal(y,1.290994449)
+
+ def test_var(self):
+ """
+ var(testcase) = 1.666666667 """
+ #y = stats.var(self.shoes[0])
+ #assert_approx_equal(y,6.009)
+ y = mstats.var(self.testcase)
+ assert_almost_equal(y,1.666666667)
+
+ def test_samplevar(self):
+ """
+ R does not have 'samplevar' so the following was used
+ var(testcase)*(4-1)/4 where 4 = length(testcase)
+ """
+ #y = stats.samplevar(self.shoes[0])
+ #assert_approx_equal(y,5.4081)
+ y = mstats.samplevar(self.testcase)
+ assert_almost_equal(y,1.25)
+
+ def test_samplestd(self):
+ #y = stats.samplestd(self.shoes[0])
+ #assert_approx_equal(y,2.325532197)
+ y = mstats.samplestd(self.testcase)
+ assert_almost_equal(y,1.118033989)
+
+ def test_signaltonoise(self):
+ """
+ this is not in R, so used
+ mean(testcase,axis=0)/(sqrt(var(testcase)*3/4)) """
+ #y = stats.signaltonoise(self.shoes[0])
+ #assert_approx_equal(y,4.5709967)
+ y = mstats.signaltonoise(self.testcase)
+ assert_almost_equal(y,2.236067977)
+
+ def test_stderr(self):
+ """
+ this is not in R, so used
+ sqrt(var(testcase))/sqrt(4)
+ """
+## y = stats.stderr(self.shoes[0])
+## assert_approx_equal(y,0.775177399)
+ y = mstats.stderr(self.testcase)
+ assert_almost_equal(y,0.6454972244)
+
+ def test_sem(self):
+ """
+ this is not in R, so used
+ sqrt(var(testcase)*3/4)/sqrt(3)
+ """
+ #y = stats.sem(self.shoes[0])
+ #assert_approx_equal(y,0.775177399)
+ y = mstats.sem(self.testcase)
+ assert_almost_equal(y,0.6454972244)
+
+ def test_z(self):
+ """
+ not in R, so used
+ (10-mean(testcase,axis=0))/sqrt(var(testcase)*3/4)
+ """
+ y = mstats.z(self.testcase, ma.array(self.testcase).mean())
+ assert_almost_equal(y,0.0)
+
+ def test_zs(self):
+ """
+ not in R, so tested by using
+ (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4)
+ """
+ y = mstats.zs(self.testcase)
+ desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996 ,
+ 0.44721359549996 , 1.3416407864999, np.nan])
+ assert_almost_equal(desired,y,decimal=12)
+
+
+
+class TestMisc(TestCase):
+ #
+ def test_obrientransform(self):
+ "Tests Obrien transform"
+ args = [[5]*5+[6]*11+[7]*9+[8]*3+[9]*2+[10]*2,
+ [6]+[7]*2+[8]*4+[9]*9+[10]*16]
+ result = [5*[3.1828]+11*[0.5591]+9*[0.0344]+3*[1.6086]+2*[5.2817]+2*[11.0538],
+ [10.4352]+2*[4.8599]+4*[1.3836]+9*[0.0061]+16*[0.7277]]
+ assert_almost_equal(np.round(mstats.obrientransform(*args).T,4),
+ result,4)
+ #
+ def test_kstwosamp(self):
+ "Tests the Kolmogorov-Smirnov 2 samples test"
+ x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
+ [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
+ [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
+ [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
+ x = ma.fix_invalid(x).T
+ (winter,spring,summer,fall) = x.T
+ #
+ assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring),4),
+ (0.1818,0.9892))
+ assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'g'),4),
+ (0.1469,0.7734))
+ assert_almost_equal(np.round(mstats.ks_twosamp(winter,spring,'l'),4),
+ (0.1818,0.6744))
+ #
+ def test_friedmanchisq(self):
+ "Tests the Friedman Chi-square test"
+ # No missing values
+ args = ([9.0,9.5,5.0,7.5,9.5,7.5,8.0,7.0,8.5,6.0],
+ [7.0,6.5,7.0,7.5,5.0,8.0,6.0,6.5,7.0,7.0],
+ [6.0,8.0,4.0,6.0,7.0,6.5,6.0,4.0,6.5,3.0])
+ result = mstats.friedmanchisquare(*args)
+ assert_almost_equal(result[0], 10.4737, 4)
+ assert_almost_equal(result[1], 0.005317, 6)
+ # Missing values
+ x = [[nan,nan, 4, 2, 16, 26, 5, 1, 5, 1, 2, 3, 1],
+ [ 4, 3, 5, 3, 2, 7, 3, 1, 1, 2, 3, 5, 3],
+ [ 3, 2, 5, 6, 18, 4, 9, 1, 1,nan, 1, 1,nan],
+ [nan, 6, 11, 4, 17,nan, 6, 1, 1, 2, 5, 1, 1]]
+ x = ma.fix_invalid(x)
+ result = mstats.friedmanchisquare(*x)
+ assert_almost_equal(result[0], 2.0156, 4)
+ assert_almost_equal(result[1], 0.5692, 4)
+
+
+if __name__ == "__main__":
+ nose.run(argv=['', __file__])
\ No newline at end of file
More information about the Scipy-svn
mailing list