[Scipy-svn] r3153 - in trunk/Lib/sandbox/maskedarray: . tests
scipy-svn@scip...
scipy-svn@scip...
Mon Jul 9 10:39:15 CDT 2007
Author: pierregm
Date: 2007-07-09 10:38:26 -0500 (Mon, 09 Jul 2007)
New Revision: 3153
Added:
trunk/Lib/sandbox/maskedarray/morestats.py
trunk/Lib/sandbox/maskedarray/tests/test_morestats.py
Modified:
trunk/Lib/sandbox/maskedarray/core.py
trunk/Lib/sandbox/maskedarray/extras.py
trunk/Lib/sandbox/maskedarray/mrecords.py
trunk/Lib/sandbox/maskedarray/mstats.py
trunk/Lib/sandbox/maskedarray/tests/test_mstats.py
Log:
core : fixed a pb w/ argmin/argmax & subclasses
extras : fixed apply_along_axis to accept functions that return a nD array
stats : added winsorize,trim_both,trim_tail, trimmed_mean, trimmed_stde, stde_median, rsh, idealfourths, plotting_positions
morestats : added hdquantiles, hdquantiles_sd, trimmed_mean_ci, mjci, mquantiles_cimj, median_cihs, compare_median_ms, rank_data
Modified: trunk/Lib/sandbox/maskedarray/core.py
===================================================================
--- trunk/Lib/sandbox/maskedarray/core.py 2007-07-09 13:59:52 UTC (rev 3152)
+++ trunk/Lib/sandbox/maskedarray/core.py 2007-07-09 15:38:26 UTC (rev 3153)
@@ -1731,7 +1731,7 @@
"""
if fill_value is None:
fill_value = minimum_fill_value(self)
- d = self.filled(fill_value)
+ d = self.filled(fill_value).view(ndarray)
return d.argmin(axis)
#........................
def argmax(self, axis=None, fill_value=None):
@@ -1749,7 +1749,7 @@
"""
if fill_value is None:
fill_value = maximum_fill_value(self._data)
- d = self.filled(fill_value)
+ d = self.filled(fill_value).view(ndarray)
return d.argmax(axis)
def sort(self, axis=-1, kind='quicksort', order=None,
Modified: trunk/Lib/sandbox/maskedarray/extras.py
===================================================================
--- trunk/Lib/sandbox/maskedarray/extras.py 2007-07-09 13:59:52 UTC (rev 3152)
+++ trunk/Lib/sandbox/maskedarray/extras.py 2007-07-09 15:38:26 UTC (rev 3153)
@@ -85,9 +85,11 @@
dvar = anom.sum(axis) / (cnt-1)
if axis is None:
return dvar
- return a.__class__(dvar,
- mask=mask_or(a._mask.all(axis), (cnt==1)),
- fill_value=a._fill_value)
+ dvar.__setmask__(mask_or(a._mask.all(axis), (cnt==1)))
+ return dvar
+# return a.__class__(dvar,
+# mask=mask_or(a._mask.all(axis), (cnt==1)),
+# fill_value=a._fill_value)
def stdu(a, axis=None, dtype=None):
"""a.var(axis=None, dtype=None)
@@ -104,8 +106,9 @@
else:
# Should we use umath.sqrt instead ?
return sqrt(dvar)
- return a.__class__(sqrt(dvar._data), mask=dvar._mask,
- fill_value=a._fill_value)
+ return sqrt(dvar)
+# return a.__class__(sqrt(dvar._data), mask=dvar._mask,
+# fill_value=a._fill_value)
MaskedArray.stdu = stdu
MaskedArray.varu = varu
@@ -160,12 +163,22 @@
#####--------------------------------------------------------------------------
#----
#####--------------------------------------------------------------------------
-def apply_along_axis(func1d,axis,arr,*args):
+def flatten_inplace(seq):
+ """Flattens a sequence in place."""
+ k = 0
+ while (k != len(seq)):
+ while hasattr(seq[k],'__iter__'):
+ seq[k:(k+1)] = seq[k]
+ k += 1
+ return seq
+
+
+def apply_along_axis(func1d,axis,arr,*args,**kwargs):
""" Execute func1d(arr[i],*args) where func1d takes 1-D arrays
and arr is an N-d array. i varies so as to apply the function
along the given axis for each 1-d subarray in arr.
"""
- arr = numeric.asanyarray(arr)
+ arr = core.array(arr, copy=False, subok=True)
nd = arr.ndim
if axis < 0:
axis += nd
@@ -179,7 +192,8 @@
i[axis] = slice(None,None)
outshape = numeric.asarray(arr.shape).take(indlist)
i.put(indlist, ind)
- res = func1d(arr[tuple(i.tolist())],*args)
+ j = i.copy()
+ res = func1d(arr[tuple(i.tolist())],*args,**kwargs)
# if res is a number, then we have a smaller output array
asscalar = numeric.isscalar(res)
if not asscalar:
@@ -206,18 +220,23 @@
ind[n] = 0
n -= 1
i.put(indlist,ind)
- res = func1d(arr[tuple(i.tolist())],*args)
+ res = func1d(arr[tuple(i.tolist())],*args,**kwargs)
outarr[tuple(ind)] = res
dtypes.append(asarray(res).dtype)
k += 1
else:
+ res = core.array(res, copy=False, subok=True)
+ j = i.copy()
+ j[axis] = ([slice(None,None)] * res.ndim)
+ j.put(indlist, ind)
Ntot = numeric.product(outshape)
holdshape = outshape
outshape = list(arr.shape)
- outshape[axis] = len(res)
+ outshape[axis] = res.shape
dtypes.append(asarray(res).dtype)
+ outshape = flatten_inplace(outshape)
outarr = zeros(outshape, object_)
- outarr[tuple(i.tolist())] = res
+ outarr[tuple(flatten_inplace(j.tolist()))] = res
k = 1
while k < Ntot:
# increment the index
@@ -228,8 +247,9 @@
ind[n] = 0
n -= 1
i.put(indlist, ind)
- res = func1d(arr[tuple(i.tolist())],*args)
- outarr[tuple(i.tolist())] = res
+ j.put(indlist, ind)
+ res = func1d(arr[tuple(i.tolist())],*args,**kwargs)
+ outarr[tuple(flatten_inplace(j.tolist()))] = res
dtypes.append(asarray(res).dtype)
k += 1
max_dtypes = numeric.dtype(numeric.asarray(dtypes).max())
Added: trunk/Lib/sandbox/maskedarray/morestats.py
===================================================================
--- trunk/Lib/sandbox/maskedarray/morestats.py 2007-07-09 13:59:52 UTC (rev 3152)
+++ trunk/Lib/sandbox/maskedarray/morestats.py 2007-07-09 15:38:26 UTC (rev 3153)
@@ -0,0 +1,351 @@
+"""
+Generic statistics functions, with support to MA.
+
+:author: Pierre GF Gerard-Marchant
+:contact: pierregm_at_uga_edu
+:date: $Date$
+:version: $Id$
+"""
+__author__ = "Pierre GF Gerard-Marchant ($Author$)"
+__version__ = '1.0'
+__revision__ = "$Revision$"
+__date__ = '$Date$'
+
+
+import numpy
+from numpy import bool_, float_, int_, ndarray, \
+ sqrt,\
+ arange, empty,\
+ r_
+from numpy import array as narray
+import numpy.core.numeric as numeric
+from numpy.core.numeric import concatenate
+
+import maskedarray as MA
+from maskedarray.core import masked, nomask, MaskedArray, masked_array
+from maskedarray.extras import apply_along_axis, dot
+from maskedarray.mstats import trim_both, trimmed_stde, mquantiles, mmedian, stde_median
+
+from scipy.stats.distributions import norm, beta, t, binom
+from scipy.stats.morestats import find_repeats
+
+__all__ = ['hdquantiles', 'hdquantiles_sd',
+ 'trimmed_mean_ci', 'mjci', 'rank_data']
+
+
+#####--------------------------------------------------------------------------
+#---- --- Quantiles ---
+#####--------------------------------------------------------------------------
+def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
+ """Computes quantile estimates with the Harrell-Davis method, where the estimates
+ are calculated as a weighted linear combination of order statistics.
+ If var=True, the variance of the estimate is also returned.
+ Depending on var, returns a (p,) array of quantiles or a (2,p) array of quantiles
+ and variances.
+
+:Inputs:
+ data: ndarray
+ Data array.
+ prob: Sequence
+ List of quantiles to compute.
+ axis : integer *[None]*
+ Axis along which to compute the quantiles. If None, use a flattened array.
+ var : boolean *[False]*
+ Whether to return the variance of the estimate.
+
+:Note:
+ The function is restricted to 2D arrays.
+ """
+ def _hd_1D(data,prob,var):
+ "Computes the HD quantiles for a 1D array."
+ xsorted = numpy.sort(data.compressed().view(ndarray))
+ n = len(xsorted)
+ #.........
+ hd = empty((2,len(prob)), float_)
+ if n < 2:
+ hd.flat = numpy.nan
+ return hd
+ #.........
+ v = arange(n+1) / float(n)
+ betacdf = beta.cdf
+ for (i,p) in enumerate(prob):
+ _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
+ w = _w[1:] - _w[:-1]
+ hd_mean = dot(w, xsorted)
+ hd[0,i] = hd_mean
+ #
+ hd[1,i] = dot(w, (xsorted-hd_mean)**2)
+ #
+ hd[0, prob == 0] = xsorted[0]
+ hd[0, prob == 1] = xsorted[-1]
+ if var:
+ hd[1, prob == 0] = hd[1, prob == 1] = numpy.nan
+ return hd
+ return hd[0]
+ # Initialization & checks ---------
+ data = masked_array(data, copy=False, dtype=float_)
+ p = numpy.array(prob, copy=False, ndmin=1)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ result = _hd_1D(data.compressed(), p, var)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ result = apply_along_axis(_hd_1D, axis, data, p, var)
+ #
+ return masked_array(result, mask=numpy.isnan(result))
+
+#..............................................................................
+def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
+ """Computes the standard error of the Harrell-Davis quantile estimates by jackknife.
+
+:Inputs:
+ data: ndarray
+ Data array.
+ prob: Sequence
+ List of quantiles to compute.
+ axis : integer *[None]*
+ Axis along which to compute the quantiles. If None, use a flattened array.
+ var : boolean *[False]*
+ Whether to return the variance of the estimate.
+ stderr : boolean *[False]*
+ Whether to return the standard error of the estimate.
+
+:Note:
+ The function is restricted to 2D arrays.
+ """
+ def _hdsd_1D(data,prob):
+ "Computes the std error for 1D arrays."
+ xsorted = numpy.sort(data.compressed())
+ n = len(xsorted)
+ #.........
+ hdsd = empty(len(prob), float_)
+ if n < 2:
+ hdsd.flat = numpy.nan
+ #.........
+ vv = arange(n) / float(n-1)
+ betacdf = beta.cdf
+ #
+ for (i,p) in enumerate(prob):
+ _w = betacdf(vv, (n+1)*p, (n+1)*(1-p))
+ w = _w[1:] - _w[:-1]
+ mx_ = numpy.fromiter([dot(w,xsorted[r_[range(0,k),
+ range(k+1,n)].astype(int_)])
+ for k in range(n)], dtype=float_)
+ mx_var = numpy.array(mx_.var(), copy=False, ndmin=1) * n / float(n-1)
+ hdsd[i] = float(n-1) * sqrt(numpy.diag(mx_var).diagonal() / float(n))
+ return hdsd
+ # Initialization & checks ---------
+ data = masked_array(data, copy=False, dtype=float_)
+ p = numpy.array(prob, copy=False, ndmin=1)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ result = _hdsd_1D(data.compressed(), p)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ result = apply_along_axis(_hdsd_1D, axis, data, p)
+ #
+ return masked_array(result, mask=numpy.isnan(result)).ravel()
+
+
+#####--------------------------------------------------------------------------
+#---- --- Confidence intervals ---
+#####--------------------------------------------------------------------------
+
+def trimmed_mean_ci(data, proportiontocut=0.2, alpha=0.05, axis=None):
+ """Returns the selected confidence interval of the trimmed mean along the
+ given axis.
+
+:Inputs:
+ data : sequence
+ Input data. The data is transformed to a masked array
+ proportiontocut : float *[0.2]*
+ Proportion of the data to cut from each side of the data .
+ As a result, (2*proportiontocut*n) values are actually trimmed.
+ alpha : float *[0.05]*
+ Confidence level of the intervals
+ axis : integer *[None]*
+ Axis along which to cut.
+ """
+ data = masked_array(data, copy=False)
+ trimmed = trim_both(data, proportiontocut=proportiontocut, axis=axis)
+ tmean = trimmed.mean(axis)
+ tstde = trimmed_stde(data, proportiontocut=proportiontocut, axis=axis)
+ df = trimmed.count(axis) - 1
+ tppf = t.ppf(1-alpha/2.,df)
+ return numpy.array((tmean - tppf*tstde, tmean+tppf*tstde))
+
+#..............................................................................
+def mjci(data, prob=[0.25,0.5,0.75], axis=None):
+ """Returns the Maritz-Jarrett estimators of the standard error of selected
+ experimental quantiles of the data.
+
+:Input:
+ data : sequence
+ Input data.
+ prob : sequence *[0.25,0.5,0.75]*
+ Sequence of quantiles whose standard error must be estimated.
+ axis : integer *[None]*
+ Axis along which to compute the standard error.
+ """
+ def _mjci_1D(data, p):
+ data = data.compressed()
+ sorted = numpy.sort(data)
+ n = data.size
+ prob = (numpy.array(p) * n + 0.5).astype(int_)
+ betacdf = beta.cdf
+ #
+ mj = empty(len(prob), float_)
+ x = arange(1,n+1, dtype=float_) / n
+ y = x - 1./n
+ for (i,m) in enumerate(prob):
+ (m1,m2) = (m-1, n-m)
+ W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
+ C1 = numpy.dot(W,sorted)
+ C2 = numpy.dot(W,sorted**2)
+ mj[i] = sqrt(C2 - C1**2)
+ return mj
+ #
+ data = masked_array(data, copy=False)
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ p = numpy.array(prob, copy=False, ndmin=1)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ return _mjci_1D(data, p)
+ else:
+ return apply_along_axis(_mjci_1D, axis, data, p)
+
+#..............................................................................
+def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
+ """Computes the alpha confidence interval for the selected quantiles of the
+ data, with Maritz-Jarrett estimators.
+
+:Input:
+ data : sequence
+ Input data.
+ prob : sequence *[0.25,0.5,0.75]*
+ Sequence of quantiles whose standard error must be estimated.
+ alpha : float *[0.05]*
+ Confidence degree.
+ axis : integer *[None]*
+ Axis along which to compute the standard error.
+ """
+ alpha = min(alpha, 1-alpha)
+ z = norm.ppf(1-alpha/2.)
+ xq = mquantiles(data, prob, alphap=0, betap=0, axis=axis)
+ smj = mjci(data, prob, axis=axis)
+ return (xq - z * smj, xq + z * smj)
+
+
+#.............................................................................
+def median_cihs(data, alpha=0.05, axis=None):
+ """Computes the alpha-level confidence interval for the median of the data,
+ following the Hettmasperger-Sheather method.
+
+:Inputs:
+ data : sequence
+ Input data. Masked values are discarded. The input should be 1D only
+ alpha : float *[0.05]*
+ Confidence degree.
+ """
+ def _cihs_1D(data, alpha):
+ data = numpy.sort(data.compressed())
+ n = len(data)
+ alpha = min(alpha, 1-alpha)
+ k = int(binom._ppf(alpha/2., n, 0.5))
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
+ if gk < 1-alpha:
+ k -= 1
+ gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
+ gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
+ I = (gk - 1 + alpha)/(gk - gkk)
+ lambd = (n-k) * I / float(k + (n-2*k)*I)
+ lims = (lambd*data[k] + (1-lambd)*data[k-1],
+ lambd*data[n-k-1] + (1-lambd)*data[n-k])
+ return lims
+ data = masked_array(data, copy=False)
+ # Computes quantiles along axis (or globally)
+ if (axis is None):
+ result = _cihs_1D(data.compressed(), p, var)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ result = apply_along_axis(_cihs_1D, axis, data, alpha)
+ #
+ return result
+
+#..............................................................................
+def compare_medians_ms(group_1, group_2, axis=None):
+ """Compares the medians from two independent groups along the given axis.
+ Returns an array of p values.
+ The comparison is performed using the McKean-Schrader estimate of the standard
+ error of the medians.
+
+:Inputs:
+ group_1 : sequence
+ First dataset.
+ group_2 : sequence
+ Second dataset.
+ axis : integer *[None]*
+ Axis along which the medians are estimated. If None, the arrays are flattened.
+ """
+ (med_1, med_2) = (mmedian(group_1, axis=axis), mmedian(group_2, axis=axis))
+ (std_1, std_2) = (stde_median(group_1, axis=axis),
+ stde_median(group_2, axis=axis))
+ W = abs(med_1 - med_2) / sqrt(std_1**2 + std_2**2)
+ return 1 - norm.cdf(W)
+
+
+#####--------------------------------------------------------------------------
+#---- --- Ranking ---
+#####--------------------------------------------------------------------------
+
+#..............................................................................
+def rank_data(data, axis=None, use_missing=False):
+ """Returns the rank (also known as order statistics) of each data point
+ along the given axis.
+ If some values are tied, their rank is averaged.
+ If some values are masked, their rank is set to 0 if use_missing is False, or
+ set to the average rank of the unmasked values if use_missing is True.
+
+:Inputs:
+ data : sequence
+ Input data. The data is transformed to a masked array
+ axis : integer *[None]*
+ Axis along which to perform the ranking. If None, the array is first
+ flattened. An exception is raised if the axis is specified for arrays
+ with a dimension larger than 2
+ use_missing : boolean *[False]*
+ Flag indicating whether the masked values have a rank of 0 (False) or
+ equal to the average rank of the unmasked values (True)
+ """
+ #
+ def _rank1d(data, use_missing=False):
+ n = data.count()
+ rk = numpy.empty(data.size, dtype=float_)
+ idx = data.argsort()
+ rk[idx[:n]] = numpy.arange(1,n+1)
+ #
+ if use_missing:
+ rk[idx[n:]] = (n+1)/2.
+ else:
+ rk[idx[n:]] = 0
+ #
+ repeats = find_repeats(data)
+ for r in repeats[0]:
+ condition = (data==r).filled(False)
+ rk[condition] = rk[condition].mean()
+ return rk
+ #
+ data = masked_array(data, copy=False)
+ if axis is None:
+ if data.ndim > 1:
+ return _rank1d(data.ravel(), use_missing).reshape(data.shape)
+ else:
+ return _rank1d(data, use_missing)
+ else:
+ return apply_along_axis(_rank1d, axis, data, use_missing)
+
+###############################################################################
+if __name__ == '__main__':
+ data = numpy.arange(100).reshape(4,25)
+# tmp = hdquantiles(data, prob=[0.25,0.75,0.5], axis=1, var=False)
+
Property changes on: trunk/Lib/sandbox/maskedarray/morestats.py
___________________________________________________________________
Name: svn:keywords
+ Date
Author
Revision
Id
Modified: trunk/Lib/sandbox/maskedarray/mrecords.py
===================================================================
--- trunk/Lib/sandbox/maskedarray/mrecords.py 2007-07-09 13:59:52 UTC (rev 3152)
+++ trunk/Lib/sandbox/maskedarray/mrecords.py 2007-07-09 15:38:26 UTC (rev 3153)
@@ -675,4 +675,10 @@
mrec[-1] = masked
assert_equal(mrec._mask, [1,1,0,0,1])
+ if 1:
+ x = [(1.,10.,'a'),(2.,20,'b'),(3.14,30,'c'),(5.55,40,'d')]
+ desc = [('ffloat', N.float_), ('fint', N.int_), ('fstr', 'S10')]
+ mr = MaskedRecords(x,dtype=desc)
+ mr[0] = masked
+ mr.ffloat[-1] = masked
\ No newline at end of file
Modified: trunk/Lib/sandbox/maskedarray/mstats.py
===================================================================
--- trunk/Lib/sandbox/maskedarray/mstats.py 2007-07-09 13:59:52 UTC (rev 3152)
+++ trunk/Lib/sandbox/maskedarray/mstats.py 2007-07-09 15:38:26 UTC (rev 3153)
@@ -13,40 +13,187 @@
import numpy
-from numpy import bool_, float_, int_
+from numpy import bool_, float_, int_, \
+ sqrt
from numpy import array as narray
import numpy.core.numeric as numeric
from numpy.core.numeric import concatenate
import maskedarray as MA
-from maskedarray.core import masked, nomask, MaskedArray
-from maskedarray.core import masked_array as marray
+from maskedarray.core import masked, nomask, MaskedArray, masked_array
from maskedarray.extras import apply_along_axis, dot
+__all__ = ['cov','meppf','plotting_positions','meppf','mmedian','mquantiles',
+ 'stde_median','trim_tail','trim_both','trimmed_mean','trimmed_stde',
+ 'winsorize']
-def _quantiles_1D(data,m,p):
- """Returns quantiles for 1D compressed data.
- Used internally by `mquantiles`.
+#####--------------------------------------------------------------------------
+#---- -- Trimming ---
+#####--------------------------------------------------------------------------
+
+def winsorize(data, alpha=0.2):
+ """Returns a Winsorized version of the input array: the (alpha/2.) lowest
+ values are set to the (alpha/2.)th percentile, and the (alpha/2.) highest
+ values are set to the (1-alpha/2.)th percentile
+ Masked values are skipped. The input array is first flattened.
+ """
+ data = masked_array(data, copy=False).ravel()
+ idxsort = data.argsort()
+ (nsize, ncounts) = (data.size, data.count())
+ ntrim = int(alpha * ncounts)
+ (xmin,xmax) = data[idxsort[[ntrim, ncounts-nsize-ntrim-1]]]
+ return masked_array(numpy.clip(data, xmin, xmax), mask=data._mask)
+
+#..............................................................................
+def trim_both(data, proportiontocut=0.2, axis=None):
+ """Trims the data by masking the int(trim*n) smallest and int(trim*n) largest
+ values of data along the given axis, where n is the number of unmasked values.
-:Parameters:
- data : ndarray
- Array to quantize
- m : Sequence
- p : float ndarray
- Quantiles to compute
+:Inputs:
+ data : MaskedArray
+ Data to trim.
+ trim : float *[0.2]*
+ Percentage of trimming. If n is the number of unmasked values before trimming,
+ the number of values after trimming is (1-2*trim)*n.
+ axis : integer *[None]*
+ Axis along which to perform the trimming.
"""
- n = data.count()
- if n == 0:
- return MA.resize(masked, len(p))
- elif n == 1:
- return MA.resize(data,len(p))
- data = data.compressed()
- aleph = (n*p + m)
- k = numpy.floor(aleph.clip(1, n-1)).astype(int_)
- gamma = (aleph-k).clip(0,1)
- y = MA.sort(data)
- return (1.-gamma)*y[(k-1).tolist()] + gamma*y[k.tolist()]
+ #...................
+ def _trim_1D(data, trim):
+ "Private function: return a trimmed 1D array."
+ nsize = data.size
+ ncounts = data.count()
+ ntrim = int(trim * ncounts)
+ idxsort = data.argsort()
+ data[idxsort[:ntrim]] = masked
+ data[idxsort[ncounts-nsize-ntrim:]] = masked
+ return data
+ #...................
+ data = masked_array(data, copy=False, subok=True)
+ data.unshare_mask()
+ if (axis is None):
+ return _trim_1D(data.ravel(), proportiontocut)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_trim_1D, axis, data, proportiontocut)
+#..............................................................................
+def trim_tail(data, proportiontocut=0.2, tail='left', axis=None):
+ """Trims the data by masking int(trim*n) values from ONE tail of the data
+ along the given axis, where n is the number of unmasked values.
+
+:Inputs:
+ data : MaskedArray
+ Data to trim.
+ trim : float *[0.2]*
+ Percentage of trimming. If n is the number of unmasked values before trimming,
+ the number of values after trimming is (1-2*trim)*n.
+ axis : integer *[None]*
+ Axis along which to perform the trimming.
+ """
+ #...................
+ def _trim_1D(data, trim, left):
+ "Private function: return a trimmed 1D array."
+ nsize = data.size
+ ncounts = data.count()
+ ntrim = int(trim * ncounts)
+ idxsort = data.argsort()
+ if left:
+ data[idxsort[:ntrim]] = masked
+ else:
+ data[idxsort[ncounts-nsize-ntrim:]] = masked
+ return data
+ #...................
+ data = masked_array(data, copy=False, subok=True)
+ data.unshare_mask()
+ #
+ if not isinstance(tail, str):
+ raise TypeError("The tail argument should be in ('left','right')")
+ tail = tail.lower()[0]
+ if tail == 'l':
+ left = True
+ elif tail == 'r':
+ left=False
+ else:
+ raise ValueError("The tail argument should be in ('left','right')")
+ #
+ if (axis is None):
+ return _trim_1D(data.ravel(), proportiontocut, left)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_trim_1D, axis, data, proportiontocut, left)
+
+#..............................................................................
+def trimmed_mean(data, proportiontocut=0.2, axis=None):
+ """Returns the trimmed mean of the data along the given axis. Trimming is
+ performed on both ends of the distribution.
+
+:Inputs:
+ data : MaskedArray
+ Data to trim.
+ proportiontocut : float *[0.2]*
+ Proportion of the data to cut from each side of the data .
+ As a result, (2*proportiontocut*n) values are actually trimmed.
+ axis : integer *[None]*
+ Axis along which to perform the trimming.
+ """
+ return trim_both(data, proportiontocut=proportiontocut, axis=axis).mean(axis=axis)
+
+#..............................................................................
+def trimmed_stde(data, proportiontocut=0.2, axis=None):
+ """Returns the standard error of the trimmed mean for the input data,
+ along the given axis. Trimming is performed on both ends of the distribution.
+
+:Inputs:
+ data : MaskedArray
+ Data to trim.
+ proportiontocut : float *[0.2]*
+ Proportion of the data to cut from each side of the data .
+ As a result, (2*proportiontocut*n) values are actually trimmed.
+ axis : integer *[None]*
+ Axis along which to perform the trimming.
+ """
+ #........................
+ def _trimmed_stde_1D(data, trim=0.2):
+ "Returns the standard error of the trimmed mean for a 1D input data."
+ winsorized = winsorize(data)
+ nsize = winsorized.count()
+ winstd = winsorized.stdu()
+ return winstd / ((1-2*trim) * numpy.sqrt(nsize))
+ #........................
+ data = masked_array(data, copy=False, subok=True)
+ data.unshare_mask()
+ if (axis is None):
+ return _trimmed_stde_1D(data.ravel(), proportiontocut)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_trimmed_stde_1D, axis, data, proportiontocut)
+
+#.............................................................................
+def stde_median(data, axis=None):
+ """Returns the McKean-Schrader estimate of the standard error of the sample
+ median along the given axis.
+ """
+ def _stdemed_1D(data):
+ sorted = numpy.sort(data.compressed())
+ n = len(sorted)
+ z = 2.5758293035489004
+ k = int(round((n+1)/2. - z * sqrt(n/4.),0))
+ return ((sorted[n-k] - sorted[k-1])/(2.*z))
+ #
+ data = masked_array(data, copy=False, subok=True)
+ if (axis is None):
+ return _stdemed_1D(data)
+ else:
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_stdemed_1D, axis, data)
+
+
+#####--------------------------------------------------------------------------
+#---- --- Quantiles ---
+#####--------------------------------------------------------------------------
+
+
def mquantiles(data, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None):
"""Computes empirical quantiles for a *1xN* data array.
Samples quantile are defined by:
@@ -83,73 +230,77 @@
Axis along which to compute quantiles. If *None*, uses the whole
(flattened/compressed) dataset.
"""
+ def _quantiles1D(data,m,p):
+ x = numpy.sort(data.compressed())
+ n = len(x)
+ if n == 0:
+ return masked_array(numpy.empty(len(p), dtype=float_), mask=True)
+ elif n == 1:
+ return masked_array(numpy.resize(x, p.shape), mask=nomask)
+ aleph = (n*p + m)
+ k = numpy.floor(aleph.clip(1, n-1)).astype(int_)
+ gamma = (aleph-k).clip(0,1)
+ return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
# Initialization & checks ---------
- data = marray(data, copy=False)
- assert data.ndim <= 2, "Array should be 2D at most !"
+ data = masked_array(data, copy=False)
p = narray(prob, copy=False, ndmin=1)
m = alphap + p*(1.-alphap-betap)
# Computes quantiles along axis (or globally)
if (axis is None):
- return _quantiles_1D(data, m, p)
+ return _quantiles1D(data, m, p)
else:
- return apply_along_axis(_quantiles_1D, axis, data, m, p)
+ assert data.ndim <= 2, "Array should be 2D at most !"
+ return apply_along_axis(_quantiles1D, axis, data, m, p)
-def _median1d(data):
- """Returns the median of a 1D masked array. Used internally by mmedian."""
- datac = data.compressed()
- if datac.size > 0:
- return numpy.median(data.compressed())
- return masked
-
-def _median2d_1(data):
- data = marray(data, subok=True, copy=True)
- if data._mask is nomask:
- return numpy.median(data)
- if data.ndim != 2 :
- raise ValueError("Input array should be 2D!")
- (n,p) = data.shape
- if p < n//3:
- return apply_along_axis(_median1d, 0, data)
- data.sort(axis=0)
- counts = data.count(axis=0)
- midx = (counts//2)
- midx_even = (counts%2==0)
- med = marray(numeric.empty((data.shape[-1],), dtype=data.dtype))
- med[midx_even] = (data[midx-1]+data[midx])/2.
- med[numpy.logical_not(midx_even)] = data[midx]
- if not med._mask.any():
- med._mask = nomask
- return med
-
-def _median2d_2(data):
- return apply_along_axis(_median1d, 0, data)
+def plotting_positions(data, alpha=0.4, beta=0.4):
+ """Returns the plotting positions (or empirical percentile points) for the
+ data.
+ Plotting positions are defined as (i-alpha)/(n-alpha-beta), where:
+ - i is the rank order statistics
+ - n is the number of unmasked values along the given axis
+ - alpha and beta are two parameters.
+ Typical values for alpha and beta are:
+ - (0,1) : *p(k) = k/n* : linear interpolation of cdf (R, type 4)
+ - (.5,.5) : *p(k) = (k-1/2.)/n* : piecewise linear function (R, type 5)
+ - (0,0) : *p(k) = k/(n+1)* : Weibull (R type 6)
+ - (1,1) : *p(k) = (k-1)/(n-1)*. In this case, p(k) = mode[F(x[k])].
+ That's R default (R type 7)
+ - (1/3,1/3): *p(k) = (k-1/3)/(n+1/3)*. Then p(k) ~ median[F(x[k])].
+ The resulting quantile estimates are approximately median-unbiased
+ regardless of the distribution of x. (R type 8)
+ - (3/8,3/8): *p(k) = (k-3/8)/(n+1/4)*. Blom.
+ The resulting quantile estimates are approximately unbiased
+ if x is normally distributed (R type 9)
+ - (.4,.4) : approximately quantile unbiased (Cunnane)
+ - (.35,.35): APL, used with PWM
+ """
+ data = masked_array(data, copy=False).reshape(1,-1)
+ n = data.count()
+ plpos = numpy.empty(data.size, dtype=float_)
+ plpos[n:] = 0
+ plpos[data.argsort()[:n]] = (numpy.arange(1,n+1) - alpha)/(n+1-alpha-beta)
+ return masked_array(plpos, mask=data._mask)
-def mmedian(data):
- """Returns the median of data along the first axis. Missing data are discarded."""
- data = marray(data, subok=True, copy=True)
- if data._mask is nomask:
- return numpy.median(data)
- if data.ndim == 1:
- return _median1d(data)
-# elif data.ndim == 2:
-# (n, p) = data.shape
-# if p < n//3:
-# return apply_along_axis(_median1d, 0, data)
-# data.sort(axis=0)
-# counts = data.count(axis=0)
-# midx = (counts//2)
-# midx_even = (counts%2==0)
-# med = marray(numeric.empty((p,), dtype=data.dtype))
-# med[midx_even] = (data[midx-1]+data[midx])/2.
-# med[numpy.logical_not(midx_even)] = data[midx]
-# if not med._mask.any():
-# med._mask = nomask
-# return med
- return apply_along_axis(_median1d, 0, data)
+meppf = plotting_positions
+
+
+def mmedian(data, axis=None):
+ """Returns the median of data along the given axis. Missing data are discarded."""
+ def _median1D(data):
+ x = numpy.sort(data.compressed())
+ if x.size == 0:
+ return masked
+ return numpy.median(x)
+ data = masked_array(data, subok=True, copy=True)
+ if axis is None:
+ return _median1D(data)
+ else:
+ return apply_along_axis(_median1D, axis, data)
+
def cov(x, y=None, rowvar=True, bias=False, strict=False):
"""
Estimate the covariance matrix.
@@ -197,59 +348,47 @@
return (dot(X, X.T.conj(), strict=False) / fact).squeeze()
-################################################################################
-if __name__ == '__main__':
- from maskedarray.testutils import assert_almost_equal, assert_equal
- import timeit
- import maskedarray
+def idealfourths(data, axis=None):
+ """Returns an estimate of the interquartile range of the data along the given
+ axis, as computed with the ideal fourths.
+ """
+ def _idf(data):
+ x = numpy.sort(data.compressed())
+ n = len(x)
+ (j,h) = divmod(n/4. + 5/12.,1)
+ qlo = (1-h)*x[j] + h*x[j+1]
+ k = n - j
+ qup = (1-h)*x[k] + h*x[k-1]
+ return qup - qlo
+ data = masked_array(data, copy=False)
+ if (axis is None):
+ return _idf(data)
+ else:
+ return apply_along_axis(_idf, axis, data)
- if 1:
- (n,p) = (101,30)
- x = marray(numpy.linspace(-1.,1.,n),)
- x[:10] = x[-10:] = masked
- z = marray(numpy.empty((n,p), dtype=numpy.float_))
- z[:,0] = x[:]
- idx = numpy.arange(len(x))
- for i in range(1,p):
- numpy.random.shuffle(idx)
- z[:,i] = x[idx]
- assert_equal(mmedian(z[:,0]), 0)
- assert_equal(mmedian(z), numpy.zeros((p,)))
-
- x = maskedarray.arange(24).reshape(3,4,2)
- x[x%3==0] = masked
- assert_equal(mmedian(x), [[12,9],[6,15],[12,9],[18,15]])
- x.shape = (4,3,2)
- assert_equal(mmedian(x),[[99,10],[11,99],[13,14]])
- x = maskedarray.arange(24).reshape(4,3,2)
- x[x%5==0] = masked
- assert_equal(mmedian(x), [[12,10],[8,9],[16,17]])
-
-
- """ [[[0 1], [2 3], [4 5]]
- [[6 7], [8 9], [10 11]]
- [[9 13] [14 15] [16 17]]
- [[18 19] [20 21] [22 23]]],
+def rsh(data, points=None):
+ """Evalutates Rosenblatt's shifted histogram estimators for each
+ point of 'points' on the dataset 'data'.
+
+:Inputs:
+ data : sequence
+ Input data. Masked values are discarded.
+ points :
+ Sequence of points where to evaluate Rosenblatt shifted histogram.
+ If None, use the data.
+ """
+ data = masked_array(data, copy=False)
+ if points is None:
+ points = data
+ else:
+ points = numpy.array(points, copy=False, ndmin=1)
+ if data.ndim != 1:
+ raise AttributeError("The input array should be 1D only !")
+ n = data.count()
+ h = 1.2 * idealfourths(data) / n**(1./5)
+ nhi = (data[:,None] <= points[None,:] + h).sum(0)
+ nlo = (data[:,None] < points[None,:] - h).sum(0)
+ return (nhi-nlo) / (2.*n*h)
- [[[-- 1] [2 --] [4 5] [-- 7]]
- [[8 --] [10 11] [-- 13] [14 --]]
- [[16 17] [-- 19] [20 --] [22 23]]],
-
- """
-
- if 0:
- print "GO!"
- med_setup1 = "from __main__ import _median2d_1,z"
- med_setup3 = "from __main__ import mmedian,z"
- med_setup2 = "from __main__ import _median2d_2,z"
- (nrep, nloop) = (3,10)
- med_r1 = timeit.Timer("_median2d_1(z)", med_setup1).repeat(nrep,nloop)
- med_r2 = timeit.Timer("_median2d_2(z)", med_setup2).repeat(nrep,nloop)
- med_r3 = timeit.Timer("mmedian(z)", med_setup3).repeat(nrep,nloop)
- med_r1 = numpy.sort(med_r1)
- med_r2 = numpy.sort(med_r2)
- med_r3 = numpy.sort(med_r3)
- print "median2d_1 : %s" % med_r1
- print "median2d_2 : %s" % med_r2
- print "median : %s" % med_r3
\ No newline at end of file
+
\ No newline at end of file
Added: trunk/Lib/sandbox/maskedarray/tests/test_morestats.py
===================================================================
--- trunk/Lib/sandbox/maskedarray/tests/test_morestats.py 2007-07-09 13:59:52 UTC (rev 3152)
+++ trunk/Lib/sandbox/maskedarray/tests/test_morestats.py 2007-07-09 15:38:26 UTC (rev 3153)
@@ -0,0 +1,115 @@
+# pylint: disable-msg=W0611, W0612, W0511,R0201
+"""Tests suite for maskedArray statistics.
+
+:author: Pierre Gerard-Marchant
+:contact: pierregm_at_uga_dot_edu
+:version: $Id: test_morestats.py 240 2007-07-09 15:36:48Z backtopop $
+"""
+__author__ = "Pierre GF Gerard-Marchant ($Author: backtopop $)"
+__version__ = '1.0'
+__revision__ = "$Revision: 240 $"
+__date__ = '$Date: 2007-07-09 11:36:48 -0400 (Mon, 09 Jul 2007) $'
+
+import numpy
+
+import maskedarray
+from maskedarray import masked, masked_array
+
+import maskedarray.mstats
+from maskedarray.mstats import *
+import maskedarray.morestats
+from maskedarray.morestats import *
+
+import maskedarray.testutils
+from maskedarray.testutils import *
+
+
+class test_misc(NumpyTestCase):
+ #
+ def __init__(self, *args, **kwargs):
+ NumpyTestCase.__init__(self, *args, **kwargs)
+ #
+ def test_mjci(self):
+ "Tests the Marits-Jarrett estimator"
+ data = masked_array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(mjci(data),[55.76819,45.84028,198.8788],5)
+ #
+ def test_trimmedmeanci(self):
+ "Tests the confidence intervals of the trimmed mean."
+ data = masked_array([545,555,558,572,575,576,578,580,
+ 594,605,635,651,653,661,666])
+ assert_almost_equal(trimmed_mean(data,0.2), 596.2, 1)
+ assert_equal(numpy.round(trimmed_mean_ci(data,0.2),1), [561.8, 630.6])
+
+#..............................................................................
+class test_ranking(NumpyTestCase):
+ #
+ def __init__(self, *args, **kwargs):
+ NumpyTestCase.__init__(self, *args, **kwargs)
+ #
+ def test_ranking(self):
+ x = masked_array([0,1,1,1,2,3,4,5,5,6,])
+ assert_almost_equal(rank_data(x),[1,3,3,3,5,6,7,8.5,8.5,10])
+ x[[3,4]] = masked
+ assert_almost_equal(rank_data(x),[1,2.5,2.5,0,0,4,5,6.5,6.5,8])
+ assert_almost_equal(rank_data(x,use_missing=True),
+ [1,2.5,2.5,4.5,4.5,4,5,6.5,6.5,8])
+ x = masked_array([0,1,5,1,2,4,3,5,1,6,])
+ assert_almost_equal(rank_data(x),[1,3,8.5,3,5,7,6,8.5,3,10])
+ x = masked_array([[0,1,1,1,2], [3,4,5,5,6,]])
+ assert_almost_equal(rank_data(x),[[1,3,3,3,5],[6,7,8.5,8.5,10]])
+ assert_almost_equal(rank_data(x,axis=1),[[1,3,3,3,5],[1,2,3.5,3.5,5]])
+ assert_almost_equal(rank_data(x,axis=0),[[1,1,1,1,1],[2,2,2,2,2,]])
+
+#..............................................................................
+class test_quantiles(NumpyTestCase):
+ #
+ def __init__(self, *args, **kwargs):
+ NumpyTestCase.__init__(self, *args, **kwargs)
+ #
+ def test_hdquantiles(self):
+ data = [0.706560797,0.727229578,0.990399276,0.927065621,0.158953014,
+ 0.887764025,0.239407086,0.349638551,0.972791145,0.149789972,
+ 0.936947700,0.132359948,0.046041972,0.641675031,0.945530547,
+ 0.224218684,0.771450991,0.820257774,0.336458052,0.589113496,
+ 0.509736129,0.696838829,0.491323573,0.622767425,0.775189248,
+ 0.641461450,0.118455200,0.773029450,0.319280007,0.752229111,
+ 0.047841438,0.466295911,0.583850781,0.840581845,0.550086491,
+ 0.466470062,0.504765074,0.226855960,0.362641207,0.891620942,
+ 0.127898691,0.490094097,0.044882048,0.041441695,0.317976349,
+ 0.504135618,0.567353033,0.434617473,0.636243375,0.231803616,
+ 0.230154113,0.160011327,0.819464108,0.854706985,0.438809221,
+ 0.487427267,0.786907310,0.408367937,0.405534192,0.250444460,
+ 0.995309248,0.144389588,0.739947527,0.953543606,0.680051621,
+ 0.388382017,0.863530727,0.006514031,0.118007779,0.924024803,
+ 0.384236354,0.893687694,0.626534881,0.473051932,0.750134705,
+ 0.241843555,0.432947602,0.689538104,0.136934797,0.150206859,
+ 0.474335206,0.907775349,0.525869295,0.189184225,0.854284286,
+ 0.831089744,0.251637345,0.587038213,0.254475554,0.237781276,
+ 0.827928620,0.480283781,0.594514455,0.213641488,0.024194386,
+ 0.536668589,0.699497811,0.892804071,0.093835427,0.731107772]
+ #
+ assert_almost_equal(hdquantiles(data,[0., 1.]),
+ [0.006514031, 0.995309248])
+ hdq = hdquantiles(data,[0.25, 0.5, 0.75])
+ assert_almost_equal(hdq, [0.253210762, 0.512847491, 0.762232442,])
+ hdq = hdquantiles_sd(data,[0.25, 0.5, 0.75])
+ assert_almost_equal(hdq, [0.03786954, 0.03805389, 0.03800152,], 4)
+ #
+ data = numpy.array(data).reshape(10,10)
+ hdq = hdquantiles(data,[0.25,0.5,0.75],axis=0)
+ assert_almost_equal(hdq[:,0], hdquantiles(data[:,0],[0.25,0.5,0.75]))
+ assert_almost_equal(hdq[:,-1], hdquantiles(data[:,-1],[0.25,0.5,0.75]))
+ hdq = hdquantiles(data,[0.25,0.5,0.75],axis=0,var=True)
+ assert_almost_equal(hdq[...,0],
+ hdquantiles(data[:,0],[0.25,0.5,0.75],var=True))
+ assert_almost_equal(hdq[...,-1],
+ hdquantiles(data[:,-1],[0.25,0.5,0.75], var=True))
+
+
+###############################################################################
+#------------------------------------------------------------------------------
+if __name__ == "__main__":
+ NumpyTest().run()
+
\ No newline at end of file
Modified: trunk/Lib/sandbox/maskedarray/tests/test_mstats.py
===================================================================
--- trunk/Lib/sandbox/maskedarray/tests/test_mstats.py 2007-07-09 13:59:52 UTC (rev 3152)
+++ trunk/Lib/sandbox/maskedarray/tests/test_mstats.py 2007-07-09 15:38:26 UTC (rev 3153)
@@ -18,7 +18,7 @@
import maskedarray.testutils
from maskedarray.testutils import *
-from maskedarray.mstats import mquantiles, mmedian, cov
+from maskedarray.mstats import *
#..............................................................................
class test_quantiles(NumpyTestCase):
@@ -103,13 +103,56 @@
"Tests median w/ 3D"
x = maskedarray.arange(24).reshape(3,4,2)
x[x%3==0] = masked
- assert_equal(mmedian(x), [[12,9],[6,15],[12,9],[18,15]])
+ assert_equal(mmedian(x,0), [[12,9],[6,15],[12,9],[18,15]])
x.shape = (4,3,2)
- assert_equal(mmedian(x),[[99,10],[11,99],[13,14]])
+ assert_equal(mmedian(x,0),[[99,10],[11,99],[13,14]])
x = maskedarray.arange(24).reshape(4,3,2)
x[x%5==0] = masked
- assert_equal(mmedian(x), [[12,10],[8,9],[16,17]])
-
+ assert_equal(mmedian(x,0), [[12,10],[8,9],[16,17]])
+
+#..............................................................................
+class test_trimming(NumpyTestCase):
+ #
+ def __init__(self, *args, **kwds):
+ NumpyTestCase.__init__(self, *args, **kwds)
+ #
+ def test_trim(self):
+ "Tests trimming."
+ x = maskedarray.arange(100)
+ assert_equal(trim_both(x).count(), 60)
+ assert_equal(trim_tail(x,tail='r').count(), 80)
+ x[50:70] = masked
+ trimx = trim_both(x)
+ assert_equal(trimx.count(), 48)
+ assert_equal(trimx._mask, [1]*16 + [0]*34 + [1]*20 + [0]*14 + [1]*16)
+ x._mask = nomask
+ x.shape = (10,10)
+ assert_equal(trim_both(x).count(), 60)
+ assert_equal(trim_tail(x).count(), 80)
+ #
+ def test_trimmedmean(self):
+ "Tests the trimmed mean."
+ data = masked_array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(trimmed_mean(data,0.1), 343, 0)
+ assert_almost_equal(trimmed_mean(data,0.2), 283, 0)
+ #
+ def test_trimmed_stde(self):
+ "Tests the trimmed mean standard error."
+ data = masked_array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(trimmed_stde(data,0.2), 56.1, 1)
+ #
+ def test_winsorization(self):
+ "Tests the Winsorization of the data."
+ data = masked_array([ 77, 87, 88,114,151,210,219,246,253,262,
+ 296,299,306,376,428,515,666,1310,2611])
+ assert_almost_equal(winsorize(data).varu(), 21551.4, 1)
+ data[5] = masked
+ winsorized = winsorize(data)
+ assert_equal(winsorized.mask, data.mask)
+#..............................................................................
+
class test_misc(NumpyTestCase):
def __init__(self, *args, **kwds):
NumpyTestCase.__init__(self, *args, **kwds)
@@ -123,6 +166,7 @@
assert_equal(c, (x[1].anom()**2).sum()/2.)
c = cov(x)
assert_equal(c[1,0], (x[0].anom()*x[1].anom()).sum())
+
###############################################################################
#------------------------------------------------------------------------------
More information about the Scipy-svn
mailing list