[Scipy-svn] r3377 - trunk/scipy/sandbox/dhuard
scipy-svn@scip...
scipy-svn@scip...
Thu Sep 27 13:37:28 CDT 2007
Author: dhuard
Date: 2007-09-27 13:37:13 -0500 (Thu, 27 Sep 2007)
New Revision: 3377
Added:
trunk/scipy/sandbox/dhuard/histogram.f
trunk/scipy/sandbox/dhuard/histogram.py
trunk/scipy/sandbox/dhuard/test_histogram.py
Log:
added histogram functions.
Added: trunk/scipy/sandbox/dhuard/histogram.f
===================================================================
--- trunk/scipy/sandbox/dhuard/histogram.f 2007-09-27 15:49:05 UTC (rev 3376)
+++ trunk/scipy/sandbox/dhuard/histogram.f 2007-09-27 18:37:13 UTC (rev 3377)
@@ -0,0 +1,348 @@
+C*******************************************************************
+C RETURN THE HISTOGRAM OF ARRAY X, THAT IS, THE NUMBER OF ELEMENTS
+C IN X FALLING INTO EACH BIN.
+C THE BIN ARRAY CONSISTS IN N BINS STARTING AT BIN0 WITH WIDTH DELTA.
+C HISTO H : | LOWER OUTLIERS | 1 | 2 | 3 | ... | N | UPPER OUTLIERS |
+C INDEX i : | 1 | 2 | 3 | 4 | ... | N+1 | N+2 |
+
+ SUBROUTINE FIXED_BINSIZE(X, BIN0, DELTA, N, NX, H)
+
+C PARAMETERS
+C ----------
+C X : ARRAY
+C BIN0 : LEFT BIN EDGE
+C DELTA : BIN WIDTH
+C N : NUMBER OF BINS
+C H : HISTOGRAM
+
+ IMPLICIT NONE
+ INTEGER :: N, NX, i, K
+ DOUBLE PRECISION :: X(NX), BIN0, DELTA
+ INTEGER :: H(N+2), UP, LOW
+
+CF2PY INTEGER INTENT(IN) :: N
+CF2PY INTEGER INTENT(HIDE) :: NX = LEN(X)
+CF2PY DOUBLE PRECISION DIMENSION(NX), INTENT(IN) :: X
+CF2PY DOUBLE PRECISION INTENT(IN) :: BIN0, DELTA
+CF2PY INTEGER DIMENSION(N+2), INTENT(OUT) :: H
+
+
+ DO i=1,N+2
+ H(i) = 0
+ ENDDO
+
+C OUTLIERS INDICES
+ UP = N+2
+ LOW = 1
+
+ DO i=1,NX
+ IF (X(i) >= BIN0) THEN
+ K = INT((X(i)-BIN0)/DELTA)+1
+ IF (K <= N) THEN
+ H(K+1) = H(K+1) + 1
+ ELSE
+ H(UP) = H(UP) + 1
+ ENDIF
+ ELSE
+ H(LOW) = H(LOW) + 1
+ ENDIF
+ ENDDO
+
+ END SUBROUTINE
+
+
+
+C*******************************************************************
+C RETURN THE WEIGHTED HISTOGRAM OF ARRAY X, THAT IS, THE SUM OF THE
+C WEIGHTS OF THE ELEMENTS OF X FALLING INTO EACH BIN.
+C THE BIN ARRAY CONSISTS IN N BINS STARTING AT BIN0 WITH WIDTH DELTA.
+C HISTO H : | LOWER OUTLIERS | 1 | 2 | 3 | ... | N | UPPER OUTLIERS |
+C INDEX i : | 1 | 2 | 3 | 4 | ... | N+1 | N+2 |
+
+ SUBROUTINE WEIGHTED_FIXED_BINSIZE(X, W, BIN0, DELTA, N, NX, H)
+
+C PARAMETERS
+C ----------
+C X : ARRAY
+C W : WEIGHTS
+C BIN0 : LEFT BIN EDGE
+C DELTA : BIN WIDTH
+C N : NUMBER OF BINS
+C H : HISTOGRAM
+
+ IMPLICIT NONE
+ INTEGER :: N, NX, i, K
+ DOUBLE PRECISION :: X(NX), W(NX), BIN0, DELTA, H(N+2)
+ INTEGER :: UP, LOW
+
+CF2PY INTEGER INTENT(IN) :: N
+CF2PY INTEGER INTENT(HIDE) :: NX = LEN(X)
+CF2PY DOUBLE PRECISION DIMENSION(NX), INTENT(IN) :: X, W
+CF2PY DOUBLE PRECISION INTENT(IN) :: BIN0, DELTA
+CF2PY DOUBLE PRECISION DIMENSION(N+2), INTENT(OUT) :: H
+
+
+ DO i=1,N+2
+ H(i) = 0.D0
+ ENDDO
+
+C OUTLIERS INDICES
+ UP = N+2
+ LOW = 1
+
+ DO i=1,NX
+ IF (X(i) >= BIN0) THEN
+ K = INT((X(i)-BIN0)/DELTA)+1
+ IF (K <= N) THEN
+ H(K+1) = H(K+1) + W(i)
+ ELSE
+ H(UP) = H(UP) + W(i)
+ ENDIF
+ ELSE
+ H(LOW) = H(LOW) + W(i)
+ ENDIF
+ ENDDO
+
+ END SUBROUTINE
+
+
+C*****************************************************************************
+C COMPUTE N DIMENSIONAL FLATTENED HISTOGRAM
+
+ SUBROUTINE FIXED_BINSIZE_ND(X, BIN0, DELTA, N, COUNT, NX,D,NC)
+
+C PARAMETERS
+C ----------
+C X : ARRAY (NXD)
+C BIN0 : LEFT BIN EDGES (D)
+C DELTA : BIN WIDTH (D)
+C N : NUMBER OF BINS (D)
+C COUNT : FLATTENED HISTOGRAM (NC)
+C NC : PROD(N(:)+2)
+
+ IMPLICIT NONE
+ INTEGER :: NX, D, NC,N(D), i, j, k, T
+ DOUBLE PRECISION :: X(NX,D), BIN0(D), DELTA(D)
+ INTEGER :: INDEX(NX), ORDER(D), MULT, COUNT(NC)
+
+
+CF2PY DOUBLE PRECISION DIMENSION(NX,D), INTENT(IN) :: X
+CF2PY DOUBLE PRECISION DIMENSION(D) :: BIN0, DELTA
+CF2PY INTEGER INTENT(IN) :: N
+CF2PY INTEGER DIMENSION(NC), INTENT(OUT) :: COUNT
+CF2PY INTEGER INTENT(HIDE) :: NX=SHAPE(X,1)
+CF2PY INTEGER INTENT(HIDE) :: D=SHAPE(X,2)
+
+
+C INITIALIZE INDEX
+ DO i=1, NX
+ INDEX(i) = 0
+ ENDDO
+
+C INITIALIZE COUNT
+ DO i=1,NC
+ COUNT(i) = 0
+ ENDDO
+
+C ORDER THE BIN SIZE ARRAY N(D)
+ CALL QSORTI(ORDER, D, N)
+
+C INITIALIZE THE DIMENSIONAL MULTIPLIER
+ MULT=1
+
+C FIND THE FLATTENED INDEX OF EACH SAMPLE
+ DO j=1, D
+ k = ORDER(j)
+ MULT=MULT*N(k)
+
+ DO i=1, NX
+ IF (X(i,k) >= BIN0(k)) THEN
+ T = INT((X(i, k)-BIN0(k))/DELTA(k))+1
+ IF (T <= N(k)) THEN
+ T = T+1
+ ELSE
+ T = N(k)+2
+ ENDIF
+ ELSE
+ T = 1
+ ENDIF
+
+ INDEX(i) = INDEX(I) + T*MULT
+ ENDDO
+ ENDDO
+
+C COUNT THE NUMBER OF SAMPLES FALLING INTO EACH BIN
+ DO i=1,NX
+ COUNT(INDEX(i)) = COUNT(INDEX(i)) + 1
+ ENDDO
+
+ END SUBROUTINE
+
+
+C From HDK@psuvm.psu.edu Thu Dec 8 15:27:16 MST 1994
+C
+C The following was converted from Algol recursive to Fortran iterative
+C by a colleague at Penn State (a long time ago - Fortran 66, please
+C excuse the GoTo's). The following code also corrects a bug in the
+C Quicksort algorithm published in the ACM (see Algorithm 402, CACM,
+C Sept. 1970, pp 563-567; also you younger folks who weren't born at
+C that time might find interesting the history of the Quicksort
+C algorithm beginning with the original published in CACM, July 1961,
+C pp 321-322, Algorithm 64). Note that the following algorithm sorts
+C integer data; actual data is not moved but sort is affected by sorting
+C a companion index array (see leading comments). The data type being
+C sorted can be changed by changing one line; see comments after
+C declarations and subsequent one regarding comparisons(Fortran
+C 77 takes care of character comparisons of course, so that comment
+C is merely historical from the days when we had to write character
+C compare subprograms, usually in assembler language for a specific
+C mainframe platform at that time). But the following algorithm is
+C good, still one of the best available.
+
+
+ SUBROUTINE QSORTI (ORD,N,A)
+C
+C==============SORTS THE ARRAY A(I),I=1,2,...,N BY PUTTING THE
+C ASCENDING ORDER VECTOR IN ORD. THAT IS ASCENDING ORDERED A
+C IS A(ORD(I)),I=1,2,...,N; DESCENDING ORDER A IS A(ORD(N-I+1)),
+C I=1,2,...,N . THIS SORT RUNS IN TIME PROPORTIONAL TO N LOG N .
+C
+C
+C ACM QUICKSORT - ALGORITHM #402 - IMPLEMENTED IN FORTRAN 66 BY
+C WILLIAM H. VERITY, WHV@PSUVM.PSU.EDU
+C CENTER FOR ACADEMIC COMPUTING
+C THE PENNSYLVANIA STATE UNIVERSITY
+C UNIVERSITY PARK, PA. 16802
+C
+ IMPLICIT INTEGER (A-Z)
+C
+ DIMENSION ORD(N),POPLST(2,20)
+ INTEGER X,XX,Z,ZZ,Y
+C
+C TO SORT DIFFERENT INPUT TYPES, CHANGE THE FOLLOWING
+C SPECIFICATION STATEMENTS; FOR EXAMPLE, FOR FORTRAN CHARACTER
+C USE THE FOLLOWING: CHARACTER *(*) A(N)
+C
+ INTEGER A(N)
+C
+ NDEEP=0
+ U1=N
+ L1=1
+ DO 1 I=1,N
+ 1 ORD(I)=I
+ 2 IF (U1.LE.L1) RETURN
+C
+ 3 L=L1
+ U=U1
+C
+C PART
+C
+ 4 P=L
+ Q=U
+C FOR CHARACTER SORTS, THE FOLLOWING 3 STATEMENTS WOULD BECOME
+C X = ORD(P)
+C Z = ORD(Q)
+C IF (A(X) .LE. A(Z)) GO TO 2
+C
+C WHERE "CLE" IS A LOGICAL FUNCTION WHICH RETURNS "TRUE" IF THE
+C FIRST ARGUMENT IS LESS THAN OR EQUAL TO THE SECOND, BASED ON "LEN"
+C CHARACTERS.
+C
+ X=A(ORD(P))
+ Z=A(ORD(Q))
+ IF (X.LE.Z) GO TO 5
+ Y=X
+ X=Z
+ Z=Y
+ YP=ORD(P)
+ ORD(P)=ORD(Q)
+ ORD(Q)=YP
+ 5 IF (U-L.LE.1) GO TO 15
+ XX=X
+ IX=P
+ ZZ=Z
+ IZ=Q
+C
+C LEFT
+C
+ 6 P=P+1
+ IF (P.GE.Q) GO TO 7
+ X=A(ORD(P))
+ IF (X.GE.XX) GO TO 8
+ GO TO 6
+ 7 P=Q-1
+ GO TO 13
+C
+C RIGHT
+C
+ 8 Q=Q-1
+ IF (Q.LE.P) GO TO 9
+ Z=A(ORD(Q))
+ IF (Z.LE.ZZ) GO TO 10
+ GO TO 8
+ 9 Q=P
+ P=P-1
+ Z=X
+ X=A(ORD(P))
+C
+C DIST
+C
+ 10 IF (X.LE.Z) GO TO 11
+ Y=X
+ X=Z
+ Z=Y
+ IP=ORD(P)
+ ORD(P)=ORD(Q)
+ ORD(Q)=IP
+ 11 IF (X.LE.XX) GO TO 12
+ XX=X
+ IX=P
+ 12 IF (Z.GE.ZZ) GO TO 6
+ ZZ=Z
+ IZ=Q
+ GO TO 6
+C
+C OUT
+C
+ 13 CONTINUE
+ IF (.NOT.(P.NE.IX.AND.X.NE.XX)) GO TO 14
+ IP=ORD(P)
+ ORD(P)=ORD(IX)
+ ORD(IX)=IP
+ 14 CONTINUE
+ IF (.NOT.(Q.NE.IZ.AND.Z.NE.ZZ)) GO TO 15
+ IQ=ORD(Q)
+ ORD(Q)=ORD(IZ)
+ ORD(IZ)=IQ
+ 15 CONTINUE
+ IF (U-Q.LE.P-L) GO TO 16
+ L1=L
+ U1=P-1
+ L=Q+1
+ GO TO 17
+ 16 U1=U
+ L1=Q+1
+ U=P-1
+ 17 CONTINUE
+ IF (U1.LE.L1) GO TO 18
+C
+C START RECURSIVE CALL
+C
+ NDEEP=NDEEP+1
+ POPLST(1,NDEEP)=U
+ POPLST(2,NDEEP)=L
+ GO TO 3
+ 18 IF (U.GT.L) GO TO 4
+C
+C POP BACK UP IN THE RECURSION LIST
+C
+ IF (NDEEP.EQ.0) GO TO 2
+ U=POPLST(1,NDEEP)
+ L=POPLST(2,NDEEP)
+ NDEEP=NDEEP-1
+ GO TO 18
+C
+C END SORT
+C END QSORT
+C
+ END
Added: trunk/scipy/sandbox/dhuard/histogram.py
===================================================================
--- trunk/scipy/sandbox/dhuard/histogram.py 2007-09-27 15:49:05 UTC (rev 3376)
+++ trunk/scipy/sandbox/dhuard/histogram.py 2007-09-27 18:37:13 UTC (rev 3377)
@@ -0,0 +1,272 @@
+import numpy as np
+import subprocess
+
+try:
+ import flib
+except:
+ print 'Building the flib fortran library.'
+ subprocess.call('f2py -c histogram.f -m flib', shell=True)
+ import flib
+
+def histogram(a, bins=10, range=None, normed=False, weights=None, axis=None, strategy=None):
+ """histogram(a, bins=10, range=None, normed=False, weights=None, axis=None)
+ -> H, dict
+
+ Return the distribution of sample.
+
+ :Parameters:
+ - `a` : Array sample.
+ - `bins` : Number of bins, or an array of bin edges, in which case the
+ range is not used. If 'Scott' or 'Freeman' is passed, then
+ the named method is used to find the optimal number of bins.
+ - `range` : Lower and upper bin edges, default: [min, max].
+ - `normed` :Boolean, if False, return the number of samples in each bin,
+ if True, return the density.
+ - `weights` : Sample weights. The weights are normed only if normed is
+ True. Should weights.sum() not equal len(a), the total bin count
+ will not be equal to the number of samples.
+ - `axis` : Specifies the dimension along which the histogram is computed.
+ Defaults to None, which aggregates the entire sample array.
+ - `strategy` : Histogramming method (binsize, searchsorted or digitize).
+
+ :Return:
+ - `H` : The number of samples in each bin.
+ If normed is True, H is a frequency distribution.
+ - dict{ 'edges': The bin edges, including the rightmost edge.
+ 'upper': Upper outliers.
+ 'lower': Lower outliers.
+ 'bincenters': Center of bins.
+ 'strategy': the histogramming method employed.}
+
+ :Examples:
+ >>> x = random.rand(100,10)
+ >>> H, D = histogram(x, bins=10, range=[0,1], normed=True)
+ >>> H2, D = histogram(x, bins=10, range=[0,1], normed=True, axis=0)
+
+ :SeeAlso: histogramnd
+ """
+ weighted = weights is not None
+
+ a = np.asarray(a)
+ if axis is None:
+ a = np.atleast_1d(a.ravel())
+ if weighted:
+ weights = np.atleast_1d(weights.ravel())
+ axis = 0
+
+ # Define the range
+ if range is None:
+ mn, mx = a.min(), a.max()
+ if mn == mx:
+ mn = mn - .5
+ mx = mx + .5
+ range = [mn, mx]
+
+ # Find the optimal number of bins.
+ if bins is None or type(bins) == str:
+ bins = _optimize_binning(a, range, bins)
+
+ # Compute the bin edges if they are not given explicitely.
+ # For the rightmost bin, we want values equal to the right
+ # edge to be counted in the last bin, and not as an outlier.
+ # Hence, we shift the last bin by a tiny amount.
+ if not np.iterable(bins):
+ dr = np.diff(range)/bins*1e-10
+ edges = np.linspace(range[0], range[1]+dr, bins+1, endpoint=True)
+ else:
+ edges = np.asarray(bins, float)
+
+ dedges = np.diff(edges)
+ bincenters = edges[:-1] + dedges/2.
+
+ # Number of bins
+ nbin = len(edges)-1
+
+ # Measure of bin precision.
+ decimal = int(-np.log10(dedges.min())+10)
+
+ # Choose the fastest histogramming method
+ even = (len(set(np.around(dedges, decimal))) == 1)
+ if strategy is None:
+ if even:
+ strategy = 'binsize'
+ else:
+ if nbin > 30: # approximative threshold
+ strategy = 'searchsort'
+ else:
+ strategy = 'digitize'
+ else:
+ if strategy not in ['binsize', 'digitize', 'searchsort']:
+ raise 'Unknown histogramming strategy.', strategy
+ if strategy == 'binsize' and not even:
+ raise 'This binsize strategy cannot be used for uneven bins.'
+
+ # Parameters for the fixed_binsize functions.
+ start = float(edges[0])
+ binwidth = float(dedges[0])
+
+ # Looping to reduce memory usage
+ block = 66600
+ slices = [slice(None)]*a.ndim
+ for i in np.arange(0,len(a),block):
+ slices[axis] = slice(i,i+block)
+ at = a[slices]
+ if weighted:
+ at = np.concatenate((at, weights[slices]), axis)
+ if strategy == 'binsize':
+ count = np.apply_along_axis(_splitinmiddle,axis,at,
+ flib.weighted_fixed_binsize,start,binwidth,nbin)
+ elif strategy == 'searchsort':
+ count = np.apply_along_axis(_splitinmiddle,axis,at, \
+ _histogram_searchsort_weighted, edges)
+ elif strategy == 'digitize':
+ count = np.apply_along_axis(_splitinmiddle,axis,at,\
+ _histogram_digitize,edges,normed)
+ else:
+ if strategy == 'binsize':
+ count = np.apply_along_axis(flib.fixed_binsize,axis,at,start,binwidth,nbin)
+ elif strategy == 'searchsort':
+ count = np.apply_along_axis(_histogram_searchsort,axis,at,edges)
+ elif strategy == 'digitize':
+ count = np.apply_along_axis(_histogram_digitize,axis,at,None,edges,
+ normed)
+
+ if i == 0:
+ total = count
+ else:
+ total += count
+
+ # Outlier count
+ upper = total.take(np.array([-1]), axis)
+ lower = total.take(np.array([0]), axis)
+
+ # Non-outlier count
+ core = a.ndim*[slice(None)]
+ core[axis] = slice(1, -1)
+ hist = total[core]
+
+ if normed:
+ normalize = lambda x: np.atleast_1d(x/(x*dedges).sum())
+ hist = np.apply_along_axis(normalize, axis, hist)
+
+ return hist, {'edges':edges, 'lower':lower, 'upper':upper, \
+ 'bincenters':bincenters, 'strategy':strategy}
+
+
+
+def _histogram_fixed_binsize(a, start, width, n):
+ """histogram_even(a, start, width, n) -> histogram
+
+ Return an histogram where the first bin counts the number of lower
+ outliers and the last bin the number of upper outliers. Works only with
+ fixed width bins.
+
+ :Parameters:
+ a : array
+ Array of samples.
+ start : float
+ Left-most bin edge.
+ width : float
+ Width of the bins. All bins are considered to have the same width.
+ n : int
+ Number of bins.
+
+ :Return:
+ H : array
+ Array containing the number of elements in each bin. H[0] is the number
+ of samples smaller than start and H[-1] the number of samples
+ greater than start + n*width.
+ """
+
+ return flib.fixed_binsize(a, start, width, n)
+
+
+def _histogram_binsize_weighted(a, w, start, width, n):
+ """histogram_even_weighted(a, start, width, n) -> histogram
+
+ Return an histogram where the first bin counts the number of lower
+ outliers and the last bin the number of upper outliers. Works only with
+ fixed width bins.
+
+ :Parameters:
+ a : array
+ Array of samples.
+ w : array
+ Weights of samples.
+ start : float
+ Left-most bin edge.
+ width : float
+ Width of the bins. All bins are considered to have the same width.
+ n : int
+ Number of bins.
+
+ :Return:
+ H : array
+ Array containing the number of elements in each bin. H[0] is the number
+ of samples smaller than start and H[-1] the number of samples
+ greater than start + n*width.
+ """
+ return flib.weighted_fixed_binsize(a, w, start, width, n)
+
+def _histogram_searchsort(a, bins):
+ n = np.sort(a).searchsorted(bins)
+ n = np.concatenate([n, [len(a)]])
+ count = np.concatenate([[n[0]], n[1:]-n[:-1]])
+ return count
+
+def _histogram_searchsort_weighted(a, w, bins):
+ i = np.sort(a).searchsorted(bins)
+ sw = w[np.argsort(a)]
+ i = np.concatenate([i, [len(a)]])
+ n = np.concatenate([[0],sw.cumsum()])[i]
+ count = np.concatenate([[n[0]], n[1:]-n[:-1]])
+ return count
+
+def _splitinmiddle(x, function, *args, **kwds):
+ x1,x2 = np.hsplit(x, 2)
+ return function(x1,x2,*args, **kwds)
+
+def _histogram_digitize(a, w, edges, normed):
+ """Internal routine to compute the 1d weighted histogram for uneven bins.
+ a: sample
+ w: weights
+ edges: bin edges
+ weighted: Means that the weights are appended to array a.
+ Return the bin count or frequency if normed.
+ """
+ weighted = w is not None
+ nbin = edges.shape[0]+1
+ if weighted:
+ count = np.zeros(nbin, dtype=w.dtype)
+ if normed:
+ count = np.zeros(nbin, dtype=float)
+ w = w/w.mean()
+ else:
+ count = np.zeros(nbin, int)
+
+ binindex = np.digitize(a, edges)
+
+ # Count the number of identical indices.
+ flatcount = np.bincount(binindex, w)
+
+ # Place the count in the histogram array.
+ count[:len(flatcount)] = flatcount
+
+ return count
+
+
+def _optimize_binning(x, range, method='Freedman'):
+ """Find the optimal number of bins.
+ Available methods : Freedman, Scott
+ """
+ N = x.shape[0]
+ if method.lower()=='freedman':
+ s=np.sort(x)
+ IQR = s[int(N*.75)] - s[int(N*.25)] # Interquantile range (75% -25%)
+ width = 2* IQR*N**(-1./3)
+
+ elif method.lower()=='scott':
+ width = 3.49 * x.std()* N**(-1./3)
+ else:
+ raise 'Method must be Scott or Freedman', method
+ return int(np.diff(range)/width)
Added: trunk/scipy/sandbox/dhuard/test_histogram.py
===================================================================
--- trunk/scipy/sandbox/dhuard/test_histogram.py 2007-09-27 15:49:05 UTC (rev 3376)
+++ trunk/scipy/sandbox/dhuard/test_histogram.py 2007-09-27 18:37:13 UTC (rev 3377)
@@ -0,0 +1,99 @@
+from numpy.testing import *
+from histogram import _histogram_fixed_binsize, _histogram_digitize,\
+ _histogram_searchsort, histogram,_optimize_binning
+import numpy as np
+from numpy.random import rand
+
+class test_histogram1d_functions(NumpyTestCase):
+ def check_consistency(self):
+ n = 100
+ r = rand(n)*12-1
+ bins = range(11)
+ a = _histogram_fixed_binsize(r, bins[0], bins[1]-bins[0], len(bins)-1)
+ b = _histogram_digitize(r, None, np.array(bins), False)
+ c = _histogram_searchsort(r,bins)
+ assert_array_equal(a,b)
+ assert_array_equal(c,b)
+
+class test_histogram(NumpyTestCase):
+ def check_simple(self):
+ n=100
+ v=rand(n)
+ (a,b)=histogram(v)
+ #check if the sum of the bins equals the number of samples
+ assert_equal(np.sum(a,axis=0),n)
+ #check that the bin counts are evenly spaced when the data is from a linear function
+ (a,b)=histogram(np.linspace(0,10,100))
+ assert_array_equal(a,10)
+ #Check the construction of the bin array
+ a, b = histogram(v, bins=4, range=[.2,.8])
+ assert_array_almost_equal(b['edges'],np.linspace(.2, .8, 5),8)
+ #Check the number of outliers
+ assert_equal((v<.2).sum(), b['lower'])
+ assert_equal((v>.8).sum(),b['upper'])
+ #Check the normalization
+ bins = [0,.5,.75,1]
+ a,b = histogram(v, bins, normed=True)
+ assert_almost_equal((a*np.diff(bins)).sum(), 1)
+
+ def check_axis(self):
+ n,m = 100,20
+ v = rand(n,m)
+ a,b = histogram(v, bins=5)
+ # Check dimension is reduced (axis=None).
+ assert_equal(a.ndim, 1)
+ #Check total number of count is equal to the number of samples.
+ assert_equal(a.sum(), n*m)
+ a,b = histogram(v, bins = 7, axis=0)
+ # Check shape of new array is ok.
+ assert(a.ndim == 2)
+ assert_array_equal(a.shape,[7, m])
+ # Check normalization is consistent
+ a,b = histogram(v, bins = 7, axis=0, normed=True)
+ assert_array_almost_equal((a.T*np.diff(b['edges'])).sum(1), np.ones((m)),5)
+ a,b = histogram(v, bins = 7, axis=1, normed=True)
+ assert_array_equal(a.shape, [n,7])
+ assert_array_almost_equal((a*np.diff(b['edges'])).sum(1), np.ones((n)))
+ # Check results are consistent with 1d estimate
+ a1, b1 = histogram(v[0,:], bins=b['edges'], normed=True)
+ assert_array_almost_equal(a1, a[0,:],7)
+
+ def check_weights(self):
+ # Check weights = constant gives the same answer as no weights.
+ v = rand(100)
+ w = np.ones(100)*5
+ a,b = histogram(v)
+ na,nb = histogram(v, normed=True)
+ wa,wb = histogram(v, weights=w)
+ nwa,nwb = histogram(v, weights=w, normed=True)
+ assert_array_equal(a*5, wa)
+ assert_array_almost_equal(na, nwa,8)
+ # Check weights are properly applied.
+ v = np.linspace(0,10,10)
+ w = np.concatenate((np.zeros(5), np.ones(5)))
+ wa,wb = histogram(v, bins=np.linspace(0,10.01, 11),weights=w)
+ assert_array_almost_equal(wa, w)
+
+ def check_strategies(self):
+ v = rand(100)
+ ae,be = histogram(v, strategy='binsize')
+ ab,bb = histogram(v, strategy='digitize')
+ as,bs = histogram(v, strategy='searchsort')
+ assert_array_equal(ae, ab)
+ assert_array_equal(ae, as)
+
+ w = rand(100)
+ ae,be = histogram(v, weights=w, strategy='binsize')
+ ab,bb = histogram(v, weights=w, strategy='digitize')
+ as,bs = histogram(v, weights=w, strategy='searchsort')
+ assert_array_almost_equal(ae, ab,8)
+ assert_array_almost_equal(ae, as,8)
+
+ def check_automatic_binning(self):
+ v = rand(100)
+ h,b = histogram(v, 'Scott')
+ h,b = histogram(v, 'Freedman')
+
+
+if __name__ == "__main__":
+ NumpyTest().run()
More information about the Scipy-svn
mailing list