[Scipy-svn] r4417 - in trunk/scipy/cluster: . src tests
scipy-svn@scip...
scipy-svn@scip...
Mon Jun 9 00:55:52 CDT 2008
Author: damian.eads
Date: 2008-06-09 00:55:44 -0500 (Mon, 09 Jun 2008)
New Revision: 4417
Added:
trunk/scipy/cluster/distance.py
trunk/scipy/cluster/src/common.h
trunk/scipy/cluster/src/distance.c
trunk/scipy/cluster/src/distance.h
trunk/scipy/cluster/src/distance_wrap.c
Modified:
trunk/scipy/cluster/__init__.py
trunk/scipy/cluster/hierarchy.py
trunk/scipy/cluster/setup.py
trunk/scipy/cluster/src/hierarchy.c
trunk/scipy/cluster/src/hierarchy.h
trunk/scipy/cluster/src/hierarchy_wrap.c
trunk/scipy/cluster/tests/test_hierarchy.py
Log:
Moved distance functions to new module.
Modified: trunk/scipy/cluster/__init__.py
===================================================================
--- trunk/scipy/cluster/__init__.py 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/__init__.py 2008-06-09 05:55:44 UTC (rev 4417)
@@ -4,7 +4,7 @@
from info import __doc__
-__all__ = ['vq', 'hierarchy']
+__all__ = ['vq', 'hierarchy', 'distance']
import vq, hierarchy
from scipy.testing.pkgtester import Tester
Added: trunk/scipy/cluster/distance.py
===================================================================
--- trunk/scipy/cluster/distance.py 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/distance.py 2008-06-09 05:55:44 UTC (rev 4417)
@@ -0,0 +1,839 @@
+"""
+Distance matrix computation from a collection of raw observation vectors
+
+ pdist computes distances between each observation pair.
+
+Distance functions between two vectors u and v
+
+ braycurtis the Bray-Curtis distance.
+ canberra the Canberra distance.
+ chebyshev the Chebyshev distance.
+ cityblock the Manhattan distance.
+ correlation the Correlation distance.
+ cosine the Cosine distance.
+ dice the Dice dissimilarity (boolean).
+ euclidean the Euclidean distance.
+ hamming the Hamming distance (boolean).
+ jaccard the Jaccard distance (boolean).
+ kulsinski the Kulsinski distance (boolean).
+ mahalanobis the Mahalanobis distance.
+ matching the matching dissimilarity (boolean).
+ minkowski the Minkowski distance.
+ rogerstanimoto the Rogers-Tanimoto dissimilarity (boolean).
+ russellrao the Russell-Rao dissimilarity (boolean).
+ seuclidean the normalized Euclidean distance.
+ sokalmichener the Sokal-Michener dissimilarity (boolean).
+ sokalsneath the Sokal-Sneath dissimilarity (boolean).
+ sqeuclidean the squared Euclidean distance.
+ yule the Yule dissimilarity (boolean).
+
+Copyright (C) Damian Eads, 2007-2008. New BSD License.
+
+"""
+
+import numpy as np
+import _distance_wrap
+import types
+
+def _copy_array_if_base_present(a):
+ """
+ Copies the array if its base points to a parent array.
+ """
+ if a.base is not None:
+ return a.copy()
+ elif np.issubsctype(a, np.float32):
+ return array(a, dtype=np.double)
+ else:
+ return a
+
+def _copy_arrays_if_base_present(T):
+ """
+ Accepts a tuple of arrays T. Copies the array T[i] if its base array
+ points to an actual array. Otherwise, the reference is just copied.
+ This is useful if the arrays are being passed to a C function that
+ does not do proper striding.
+ """
+ l = [_copy_array_if_base_present(a) for a in T]
+ return l
+
+def _convert_to_bool(X):
+ if X.dtype != np.bool:
+ X = np.bool_(X)
+ if not X.flags.contiguous:
+ X = X.copy()
+ return X
+
+def _convert_to_double(X):
+ if X.dtype != np.double:
+ X = np.double(X)
+ if not X.flags.contiguous:
+ X = X.copy()
+ return X
+
+def minkowski(u, v, p):
+ """
+ d = minkowski(u, v, p)
+
+ Returns the Minkowski distance between two vectors u and v,
+
+ ||u-v||_p = (\sum {|u_i - v_i|^p})^(1/p).
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ if p < 1:
+ raise ValueError("p must be at least 1")
+ return (abs(u-v)**p).sum() ** (1.0 / p)
+
+def euclidean(u, v):
+ """
+ d = euclidean(u, v)
+
+ Computes the Euclidean distance between two n-vectors u and v, ||u-v||_2
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ q=np.matrix(u-v)
+ return np.sqrt((q*q.T).sum())
+
+def sqeuclidean(u, v):
+ """
+ d = sqeuclidean(u, v)
+
+ Computes the squared Euclidean distance between two n-vectors u and v,
+ (||u-v||_2)^2.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return ((u-v)*(u-v).T).sum()
+
+def cosine(u, v):
+ """
+ d = cosine(u, v)
+
+ Computes the Cosine distance between two n-vectors u and v,
+ (1-uv^T)/(||u||_2 * ||v||_2).
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return (1.0 - (np.dot(u, v.T) / \
+ (np.sqrt(np.dot(u, u.T)) * np.sqrt(np.dot(v, v.T)))))
+
+def correlation(u, v):
+ """
+ d = correlation(u, v)
+
+ Computes the correlation distance between two n-vectors u and v,
+
+ 1 - (u - n|u|_1)(v - n|v|_1)^T
+ --------------------------------- ,
+ |(u - n|u|_1)|_2 |(v - n|v|_1)|^T
+
+ where |*|_1 is the Manhattan norm and n is the common dimensionality
+ of the vectors.
+ """
+ umu = u.mean()
+ vmu = v.mean()
+ um = u - umu
+ vm = v - vmu
+ return 1.0 - (np.dot(um, vm) /
+ (np.sqrt(np.dot(um, um)) \
+ * np.sqrt(np.dot(vm, vm))))
+
+def hamming(u, v):
+ """
+ d = hamming(u, v)
+
+ Computes the Hamming distance between two n-vectors u and v,
+ which is simply the proportion of disagreeing components in u
+ and v. If u and v are boolean vectors, the hamming distance is
+
+ (c_{01} + c_{10}) / n
+
+ where c_{ij} is the number of occurrences of
+
+ u[k] == i and v[k] == j
+
+ for k < n.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return (u != v).mean()
+
+def jaccard(u, v):
+ """
+ d = jaccard(u, v)
+
+ Computes the Jaccard-Needham dissimilarity between two boolean
+ n-vectors u and v, which is
+
+ c_{TF} + c_{FT}
+ ------------------------
+ c_{TT} + c_{FT} + c_{TF}
+
+ where c_{ij} is the number of occurrences of
+
+ u[k] == i and v[k] == j
+
+ for k < n.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return (np.double(np.bitwise_and((u != v),
+ np.bitwise_or(u != 0, v != 0)).sum())
+ / np.double(np.bitwise_or(u != 0, v != 0).sum()))
+
+def kulsinski(u, v):
+ """
+ d = kulsinski(u, v)
+
+ Computes the Kulsinski dissimilarity between two boolean n-vectors
+ u and v, which is
+
+ c_{TF} + c_{FT} - c_{TT} + n
+ ----------------------------
+ c_{FT} + c_{TF} + n
+
+ where c_{ij} is the number of occurrences of
+
+ u[k] == i and v[k] == j
+
+ for k < n.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ n = len(u)
+ (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
+
+ return (ntf + nft - ntt + n) / (ntf + nft + n)
+
+def seuclidean(u, v, V):
+ """
+ d = seuclidean(u, v, V)
+
+ Returns the standardized Euclidean distance between two
+ n-vectors u and v. V is a m-dimensional vector of component
+ variances. It is usually computed among a larger collection vectors.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ V = np.asarray(V)
+ if len(V.shape) != 1 or V.shape[0] != u.shape[0] or u.shape[0] != v.shape[0]:
+ raise TypeError('V must be a 1-D array of the same dimension as u and v.')
+ return np.sqrt(((u-v)**2 / V).sum())
+
+def cityblock(u, v):
+ """
+ d = cityblock(u, v)
+
+ Computes the Manhattan distance between two n-vectors u and v,
+ \sum {u_i-v_i}.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return abs(u-v).sum()
+
+def mahalanobis(u, v, VI):
+ """
+ d = mahalanobis(u, v, VI)
+
+ Computes the Mahalanobis distance between two n-vectors u and v,
+ (u-v)VI(u-v)^T
+ where VI is the inverse covariance matrix.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ VI = np.asarray(VI)
+ return np.sqrt(np.dot(np.dot((u-v),VI),(u-v).T).sum())
+
+def chebyshev(u, v):
+ """
+ d = chebyshev(u, v)
+
+ Computes the Chebyshev distance between two n-vectors u and v,
+ \max {|u_i-v_i|}.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return max(abs(u-v))
+
+def braycurtis(u, v):
+ """
+ d = braycurtis(u, v)
+
+ Computes the Bray-Curtis distance between two n-vectors u and v,
+ \sum{|u_i-v_i|} / \sum{|u_i+v_i|}.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return abs(u-v).sum() / abs(u+v).sum()
+
+def canberra(u, v):
+ """
+ d = canberra(u, v)
+
+ Computes the Canberra distance between two n-vectors u and v,
+ \sum{|u_i-v_i|} / \sum{|u_i|+|v_i}.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ return abs(u-v).sum() / (abs(u).sum() + abs(v).sum())
+
+def _nbool_correspond_all(u, v):
+ if u.dtype != v.dtype:
+ raise TypeError("Arrays being compared must be of the same data type.")
+
+ if u.dtype == np.int or u.dtype == np.float_ or u.dtype == np.double:
+ not_u = 1.0 - u
+ not_v = 1.0 - v
+ nff = (not_u * not_v).sum()
+ nft = (not_u * v).sum()
+ ntf = (u * not_v).sum()
+ ntt = (u * v).sum()
+ elif u.dtype == np.bool:
+ not_u = ~u
+ not_v = ~v
+ nff = (not_u & not_v).sum()
+ nft = (not_u & v).sum()
+ ntf = (u & not_v).sum()
+ ntt = (u & v).sum()
+ else:
+ raise TypeError("Arrays being compared have unknown type.")
+
+ return (nff, nft, ntf, ntt)
+
+def _nbool_correspond_ft_tf(u, v):
+ if u.dtype == np.int or u.dtype == np.float_ or u.dtype == np.double:
+ not_u = 1.0 - u
+ not_v = 1.0 - v
+ nff = (not_u * not_v).sum()
+ nft = (not_u * v).sum()
+ ntf = (u * not_v).sum()
+ ntt = (u * v).sum()
+ else:
+ not_u = ~u
+ not_v = ~v
+ nft = (not_u & v).sum()
+ ntf = (u & not_v).sum()
+ return (nft, ntf)
+
+def yule(u, v):
+ """
+ d = yule(u, v)
+ Computes the Yule dissimilarity between two boolean n-vectors u and v,
+
+ R
+ ---------------------
+ c_{TT} + c_{FF} + R/2
+
+ where c_{ij} is the number of occurrences of
+
+ u[k] == i and v[k] == j
+
+ for k < n, and
+
+ R = 2.0 * (c_{TF} + c_{FT}).
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
+ print nff, nft, ntf, ntt
+ return float(2.0 * ntf * nft) / float(ntt * nff + ntf * nft)
+
+def matching(u, v):
+ """
+ d = matching(u, v)
+
+ Computes the Matching dissimilarity between two boolean n-vectors
+ u and v, which is
+
+ (c_{TF} + c_{FT}) / n
+
+ where c_{ij} is the number of occurrences of
+
+ u[k] == i and v[k] == j
+
+ for k < n.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ (nft, ntf) = _nbool_correspond_ft_tf(u, v)
+ return float(nft + ntf) / float(len(u))
+
+def dice(u, v):
+ """
+ d = dice(u, v)
+
+ Computes the Dice dissimilarity between two boolean n-vectors
+ u and v, which is
+
+ c_{TF} + c_{FT}
+ ----------------------------
+ 2 * c_{TT} + c_{FT} + c_{TF}
+
+ where c_{ij} is the number of occurrences of
+
+ u[k] == i and v[k] == j
+
+ for k < n.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ if u.dtype == np.bool:
+ ntt = (u & v).sum()
+ else:
+ ntt = (u * v).sum()
+ (nft, ntf) = _nbool_correspond_ft_tf(u, v)
+ return float(ntf + nft) / float(2.0 * ntt + ntf + nft)
+
+def rogerstanimoto(u, v):
+ """
+ d = rogerstanimoto(u, v)
+
+ Computes the Rogers-Tanimoto dissimilarity between two boolean
+ n-vectors u and v,
+
+ R
+ -------------------
+ c_{TT} + c_{FF} + R
+
+ where c_{ij} is the number of occurrences of
+
+ u[k] == i and v[k] == j
+
+ for k < n, and
+
+ R = 2.0 * (c_{TF} + c_{FT}).
+
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
+ return float(2.0 * (ntf + nft)) / float(ntt + nff + (2.0 * (ntf + nft)))
+
+def russellrao(u, v):
+ """
+ d = russellrao(u, v)
+
+ Computes the Russell-Rao dissimilarity between two boolean n-vectors
+ u and v, (n - c_{TT}) / n where c_{ij} is the number of occurrences
+ of u[k] == i and v[k] == j for k < n.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ if u.dtype == np.bool:
+ ntt = (u & v).sum()
+ else:
+ ntt = (u * v).sum()
+ return float(len(u) - ntt) / float(len(u))
+
+def sokalmichener(u, v):
+ """
+ d = sokalmichener(u, v)
+
+ Computes the Sokal-Michener dissimilarity between two boolean vectors
+ u and v, 2R / (S + 2R) where c_{ij} is the number of occurrences of
+ u[k] == i and v[k] == j for k < n and R = 2 * (c_{TF} + c{FT}) and
+ S = c_{FF} + c_{TT}.
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ if u.dtype == np.bool:
+ ntt = (u & v).sum()
+ nff = (~u & ~v).sum()
+ else:
+ ntt = (u * v).sum()
+ nff = ((1.0 - u) * (1.0 - v)).sum()
+ (nft, ntf) = _nbool_correspond_ft_tf(u, v)
+ return float(2.0 * (ntf + nft))/float(ntt + nff + 2.0 * (ntf + nft))
+
+def sokalsneath(u, v):
+ """
+ d = sokalsneath(u, v)
+
+ Computes the Sokal-Sneath dissimilarity between two boolean vectors
+ u and v, 2R / (c_{TT} + 2R) where c_{ij} is the number of occurrences
+ of u[k] == i and v[k] == j for k < n and R = 2 * (c_{TF} + c{FT}).
+ """
+ u = np.asarray(u)
+ v = np.asarray(v)
+ if u.dtype == np.bool:
+ ntt = (u & v).sum()
+ else:
+ ntt = (u * v).sum()
+ (nft, ntf) = _nbool_correspond_ft_tf(u, v)
+ return float(2.0 * (ntf + nft))/float(ntt + 2.0 * (ntf + nft))
+
+
+def pdist(X, metric='euclidean', p=2, V=None, VI=None):
+ """ Y = pdist(X, method='euclidean', p=2)
+
+ Computes the distance between m original observations in
+ n-dimensional space. Returns a condensed distance matrix Y.
+ For each i and j (i<j), the metric dist(u=X[i], v=X[j]) is
+ computed and stored in the ij'th entry. See squareform
+ to learn how to retrieve this entry.
+
+ 1. Y = pdist(X)
+
+ Computes the distance between m points using Euclidean distance
+ (2-norm) as the distance metric between the points. The points
+ are arranged as m n-dimensional row vectors in the matrix X.
+
+ 2. Y = pdist(X, 'minkowski', p)
+
+ Computes the distances using the Minkowski distance ||u-v||_p
+ (p-norm) where p>=1.
+
+ 3. Y = pdist(X, 'cityblock')
+
+ Computes the city block or Manhattan distance between the
+ points.
+
+ 4. Y = pdist(X, 'seuclidean', V=None)
+
+ Computes the standardized Euclidean distance. The standardized
+ Euclidean distance between two n-vectors u and v is
+
+ sqrt(\sum {(u_i-v_i)^2 / V[x_i]}).
+
+ V is the variance vector; V[i] is the variance computed over all
+ the i'th components of the points. If not passed, it is
+ automatically computed.
+
+ 5. Y = pdist(X, 'sqeuclidean')
+
+ Computes the squared Euclidean distance ||u-v||_2^2 between
+ the vectors.
+
+ 6. Y = pdist(X, 'cosine')
+
+ Computes the cosine distance between vectors u and v,
+
+ 1 - uv^T
+ -----------
+ |u|_2 |v|_2
+
+ where |*|_2 is the 2 norm of its argument *.
+
+ 7. Y = pdist(X, 'correlation')
+
+ Computes the correlation distance between vectors u and v. This is
+
+ 1 - (u - n|u|_1)(v - n|v|_1)^T
+ --------------------------------- ,
+ |(u - n|u|_1)|_2 |(v - n|v|_1)|^T
+
+ where |*|_1 is the Manhattan (or 1-norm) of its argument *,
+ and n is the common dimensionality of the vectors.
+
+ 8. Y = pdist(X, 'hamming')
+
+ Computes the normalized Hamming distance, or the proportion
+ of those vector elements between two n-vectors u and v which
+ disagree. To save memory, the matrix X can be of type boolean.
+
+ 9. Y = pdist(X, 'jaccard')
+
+ Computes the Jaccard distance between the points. Given two
+ vectors, u and v, the Jaccard distance is the proportion of
+ those elements u_i and v_i that disagree where at least one
+ of them is non-zero.
+
+ 10. Y = pdist(X, 'chebyshev')
+
+ Computes the Chebyshev distance between the points. The
+ Chebyshev distance between two n-vectors u and v is the maximum
+ norm-1 distance between their respective elements. More
+ precisely, the distance is given by
+
+ d(u,v) = max {|u_i-v_i|}.
+
+ 11. Y = pdist(X, 'canberra')
+
+ Computes the Canberra distance between the points. The
+ Canberra distance between two points u and v is
+
+ |u_1-v_1| |u_2-v_2| |u_n-v_n|
+ d(u,v) = ----------- + ----------- + ... + -----------
+ |u_1|+|v_1| |u_2|+|v_2| |u_n|+|v_n|
+
+ 12. Y = pdist(X, 'braycurtis')
+
+ Computes the Bray-Curtis distance between the points. The
+ Bray-Curtis distance between two points u and v is
+
+ |u_1-v_1| + |u_2-v_2| + ... + |u_n-v_n|
+ d(u,v) = ---------------------------------------
+ |u_1+v_1| + |u_2+v_2| + ... + |u_n+v_n|
+
+ 13. Y = pdist(X, 'mahalanobis', VI=None)
+
+ Computes the Mahalanobis distance between the points. The
+ Mahalanobis distance between two points u and v is
+ (u-v)(1/V)(u-v)^T
+ where (1/V) is the inverse covariance. If VI is not None,
+ VI will be used as the inverse covariance matrix.
+
+ 14. Y = pdist(X, 'yule')
+
+ Computes the Yule distance between each pair of boolean
+ vectors. (see yule function documentation)
+
+ 15. Y = pdist(X, 'matching')
+
+ Computes the matching distance between each pair of boolean
+ vectors. (see matching function documentation)
+
+ 16. Y = pdist(X, 'dice')
+
+ Computes the Dice distance between each pair of boolean
+ vectors. (see dice function documentation)
+
+ 17. Y = pdist(X, 'kulsinski')
+
+ Computes the Kulsinski distance between each pair of
+ boolean vectors. (see kulsinski function documentation)
+
+ 17. Y = pdist(X, 'rogerstanimoto')
+
+ Computes the Rogers-Tanimoto distance between each pair of
+ boolean vectors. (see rogerstanimoto function documentation)
+
+ 18. Y = pdist(X, 'russellrao')
+
+ Computes the Russell-Rao distance between each pair of
+ boolean vectors. (see russellrao function documentation)
+
+ 19. Y = pdist(X, 'sokalmichener')
+
+ Computes the Sokal-Michener distance between each pair of
+ boolean vectors. (see sokalmichener function documentation)
+
+ 20. Y = pdist(X, 'sokalsneath')
+
+ Computes the Sokal-Sneath distance between each pair of
+ boolean vectors. (see sokalsneath function documentation)
+
+ 21. Y = pdist(X, f)
+
+ Computes the distance between all pairs of vectors in X
+ using the user supplied 2-arity function f. For example,
+ Euclidean distance between the vectors could be computed
+ as follows,
+
+ dm = pdist(X, (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum())))
+
+ Note that you should avoid passing a reference to one of
+ the distance functions defined in this library. For example,
+
+ dm = pdist(X, sokalsneath)
+
+ would calculate the pair-wise distances between the vectors
+ in X using the Python function sokalsneath. This would result
+ in sokalsneath being called {n \choose 2} times, which is
+ inefficient. Instead, the optimized C version is more
+ efficient, and we call it using the following syntax.
+
+ dm = pdist(X, 'sokalsneath')
+ """
+# 21. Y = pdist(X, 'test_Y')
+#
+# Computes the distance between all pairs of vectors in X
+# using the distance metric Y but with a more succint,
+# verifiable, but less efficient implementation.
+
+
+ X = np.asarray(X)
+
+ #if np.issubsctype(X, np.floating) and not np.issubsctype(X, np.double):
+ # raise TypeError('Floating point arrays must be 64-bit (got %r).' %
+ # (X.dtype.type,))
+
+ # The C code doesn't do striding.
+ [X] = _copy_arrays_if_base_present([_convert_to_double(X)])
+
+ s = X.shape
+
+ if len(s) != 2:
+ raise ValueError('A 2-dimensional array must be passed.');
+
+ m = s[0]
+ n = s[1]
+ dm = np.zeros((m * (m - 1) / 2,), dtype=np.double)
+
+ mtype = type(metric)
+ if mtype is types.FunctionType:
+ k = 0
+ if metric == minkowski:
+ for i in xrange(0, m - 1):
+ for j in xrange(i+1, m):
+ dm[k] = minkowski(X[i, :], X[j, :], p)
+ k = k + 1
+ elif metric == seuclidean:
+ for i in xrange(0, m - 1):
+ for j in xrange(i+1, m):
+ dm[k] = seuclidean(X[i, :], X[j, :], V)
+ k = k + 1
+ elif metric == mahalanobis:
+ for i in xrange(0, m - 1):
+ for j in xrange(i+1, m):
+ dm[k] = mahalanobis(X[i, :], X[j, :], V)
+ k = k + 1
+ else:
+ for i in xrange(0, m - 1):
+ for j in xrange(i+1, m):
+ dm[k] = metric(X[i, :], X[j, :])
+ k = k + 1
+
+ elif mtype is types.StringType:
+ mstr = metric.lower()
+
+ #if X.dtype != np.double and \
+ # (mstr != 'hamming' and mstr != 'jaccard'):
+ # TypeError('A double array must be passed.')
+ if mstr in set(['euclidean', 'euclid', 'eu', 'e']):
+ _distance_wrap.pdist_euclidean_wrap(_convert_to_double(X), dm)
+ elif mstr in set(['sqeuclidean', 'sqe', 'sqeuclid']):
+ _distance_wrap.pdist_euclidean_wrap(_convert_to_double(X), dm)
+ dm = dm ** 2.0
+ elif mstr in set(['cityblock', 'cblock', 'cb', 'c']):
+ _distance_wrap.pdist_city_block_wrap(X, dm)
+ elif mstr in set(['hamming', 'hamm', 'ha', 'h']):
+ if X.dtype == np.bool:
+ _distance_wrap.pdist_hamming_bool_wrap(_convert_to_bool(X), dm)
+ else:
+ _distance_wrap.pdist_hamming_wrap(_convert_to_double(X), dm)
+ elif mstr in set(['jaccard', 'jacc', 'ja', 'j']):
+ if X.dtype == np.bool:
+ _distance_wrap.pdist_jaccard_bool_wrap(_convert_to_bool(X), dm)
+ else:
+ _distance_wrap.pdist_jaccard_wrap(_convert_to_double(X), dm)
+ elif mstr in set(['chebychev', 'chebyshev', 'cheby', 'cheb', 'ch']):
+ _distance_wrap.pdist_chebyshev_wrap(_convert_to_double(X), dm)
+ elif mstr in set(['minkowski', 'mi', 'm']):
+ _distance_wrap.pdist_minkowski_wrap(_convert_to_double(X), dm, p)
+ elif mstr in set(['seuclidean', 'se', 's']):
+ if V is not None:
+ V = np.asarray(V)
+ if type(V) != np.ndarray:
+ raise TypeError('Variance vector V must be a numpy array')
+ if V.dtype != np.double:
+ raise TypeError('Variance vector V must contain doubles.')
+ if len(V.shape) != 1:
+ raise ValueError('Variance vector V must be one-dimensional.')
+ if V.shape[0] != n:
+ raise ValueError('Variance vector V must be of the same dimension as the vectors on which the distances are computed.')
+ # The C code doesn't do striding.
+ [VV] = _copy_arrays_if_base_present([_convert_to_double(V)])
+ else:
+ VV = np.var(X, axis=0, ddof=1)
+ _distance_wrap.pdist_seuclidean_wrap(_convert_to_double(X), VV, dm)
+ # Need to test whether vectorized cosine works better.
+ # Find out: Is there a dot subtraction operator so I can
+ # subtract matrices in a similar way to multiplying them?
+ # Need to get rid of as much unnecessary C code as possible.
+ elif mstr in set(['cosine', 'cos']):
+ norms = np.sqrt(np.sum(X * X, axis=1))
+ _distance_wrap.pdist_cosine_wrap(_convert_to_double(X), dm, norms)
+ elif mstr in set(['old_cosine', 'old_cos']):
+ norms = np.sqrt(np.sum(X * X, axis=1))
+ nV = norms.reshape(m, 1)
+ # The numerator u * v
+ nm = np.dot(X, X.T)
+ # The denom. ||u||*||v||
+ de = np.dot(nV, nV.T);
+ dm = 1.0 - (nm / de)
+ dm[xrange(0,m),xrange(0,m)] = 0.0
+ dm = squareform(dm)
+ elif mstr in set(['correlation', 'co']):
+ X2 = X - X.mean(1)[:,np.newaxis]
+ #X2 = X - np.matlib.repmat(np.mean(X, axis=1).reshape(m, 1), 1, n)
+ norms = np.sqrt(np.sum(X2 * X2, axis=1))
+ _distance_wrap.pdist_cosine_wrap(_convert_to_double(X2), _convert_to_double(dm), _convert_to_double(norms))
+ elif mstr in set(['mahalanobis', 'mahal', 'mah']):
+ if VI is not None:
+ VI = _convert_to_double(np.asarray(VI))
+ if type(VI) != np.ndarray:
+ raise TypeError('VI must be a numpy array.')
+ if VI.dtype != np.double:
+ raise TypeError('The array must contain 64-bit floats.')
+ [VI] = _copy_arrays_if_base_present([VI])
+ else:
+ V = np.cov(X.T)
+ VI = _convert_to_double(np.linalg.inv(V).T.copy())
+ # (u-v)V^(-1)(u-v)^T
+ _distance_wrap.pdist_mahalanobis_wrap(_convert_to_double(X), VI, dm)
+ elif mstr == 'canberra':
+ _distance_wrap.pdist_canberra_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'braycurtis':
+ _distance_wrap.pdist_bray_curtis_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'yule':
+ _distance_wrap.pdist_yule_bool_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'matching':
+ _distance_wrap.pdist_matching_bool_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'kulsinski':
+ _distance_wrap.pdist_kulsinski_bool_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'dice':
+ _distance_wrap.pdist_dice_bool_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'rogerstanimoto':
+ _distance_wrap.pdist_rogerstanimoto_bool_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'russellrao':
+ _distance_wrap.pdist_russellrao_bool_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'sokalmichener':
+ _distance_wrap.pdist_sokalmichener_bool_wrap(_convert_to_bool(X), dm)
+ elif mstr == 'sokalsneath':
+ _distance_wrap.pdist_sokalsneath_bool_wrap(_convert_to_bool(X), dm)
+ elif metric == 'test_euclidean':
+ dm = pdist(X, euclidean)
+ elif metric == 'test_sqeuclidean':
+ if V is None:
+ V = np.var(X, axis=0, ddof=1)
+ else:
+ V = np.asarray(V)
+ dm = pdist(X, lambda u, v: seuclidean(u, v, V))
+ elif metric == 'test_braycurtis':
+ dm = pdist(X, braycurtis)
+ elif metric == 'test_mahalanobis':
+ if VI is None:
+ V = np.cov(X.T)
+ VI = np.linalg.inv(V)
+ else:
+ VI = np.asarray(VI)
+ [VI] = _copy_arrays_if_base_present([VI])
+ # (u-v)V^(-1)(u-v)^T
+ dm = pdist(X, (lambda u, v: mahalanobis(u, v, VI)))
+ elif metric == 'test_cityblock':
+ dm = pdist(X, cityblock)
+ elif metric == 'test_minkowski':
+ dm = pdist(X, minkowski, p)
+ elif metric == 'test_cosine':
+ dm = pdist(X, cosine)
+ elif metric == 'test_correlation':
+ dm = pdist(X, correlation)
+ elif metric == 'test_hamming':
+ dm = pdist(X, hamming)
+ elif metric == 'test_jaccard':
+ dm = pdist(X, jaccard)
+ elif metric == 'test_chebyshev' or metric == 'test_chebychev':
+ dm = pdist(X, chebyshev)
+ elif metric == 'test_yule':
+ dm = pdist(X, yule)
+ elif metric == 'test_matching':
+ dm = pdist(X, matching)
+ elif metric == 'test_dice':
+ dm = pdist(X, dice)
+ elif metric == 'test_kulsinski':
+ dm = pdist(X, kulsinski)
+ elif metric == 'test_rogerstanimoto':
+ dm = pdist(X, rogerstanimoto)
+ elif metric == 'test_russellrao':
+ dm = pdist(X, russellrao)
+ elif metric == 'test_sokalsneath':
+ dm = pdist(X, sokalsneath)
+ elif metric == 'test_sokalmichener':
+ dm = pdist(X, sokalmichener)
+ else:
+ raise ValueError('Unknown Distance Metric: %s' % mstr)
+ else:
+ raise TypeError('2nd argument metric must be a string identifier or a function.')
+ return dm
Modified: trunk/scipy/cluster/hierarchy.py
===================================================================
--- trunk/scipy/cluster/hierarchy.py 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/hierarchy.py 2008-06-09 05:55:44 UTC (rev 4417)
@@ -22,9 +22,6 @@
median the median/WPGMC algorithm. (alias)
ward the Ward/incremental algorithm. (alias)
-Distance matrix computation from a collection of raw observation vectors
-
- pdist computes distances between each observation pair.
squareform converts a sq. D.M. to a condensed one and vice versa.
Statistic computations on hierarchies
@@ -47,30 +44,6 @@
lvlist a left-to-right traversal of the leaves.
totree represents a linkage matrix as a tree object.
-Distance functions between two vectors u and v
-
- braycurtis the Bray-Curtis distance.
- canberra the Canberra distance.
- chebyshev the Chebyshev distance.
- cityblock the Manhattan distance.
- correlation the Correlation distance.
- cosine the Cosine distance.
- dice the Dice dissimilarity (boolean).
- euclidean the Euclidean distance.
- hamming the Hamming distance (boolean).
- jaccard the Jaccard distance (boolean).
- kulsinski the Kulsinski distance (boolean).
- mahalanobis the Mahalanobis distance.
- matching the matching dissimilarity (boolean).
- minkowski the Minkowski distance.
- rogerstanimoto the Rogers-Tanimoto dissimilarity (boolean).
- russellrao the Russell-Rao dissimilarity (boolean).
- seuclidean the normalized Euclidean distance.
- sokalmichener the Sokal-Michener dissimilarity (boolean).
- sokalsneath the Sokal-Sneath dissimilarity (boolean).
- sqeuclidean the squared Euclidean distance.
- yule the Yule dissimilarity (boolean).
-
Predicates
is_valid_dm checks for a valid distance matrix.
@@ -176,6 +149,7 @@
import numpy as np
import _hierarchy_wrap, types
+from distance import pdist
_cpy_non_euclid_methods = {'single': 0, 'complete': 1, 'average': 2,
'weighted': 6}
@@ -191,15 +165,6 @@
def _warning(s):
print ('[WARNING] scipy.cluster: %s' % s)
-def _unbiased_variance(X):
- """
- Computes the unbiased variance of each dimension of a collection of
- observation vectors, represented by a matrix where the rows are the
- observations.
- """
- #n = np.double(X.shape[1])
- return np.var(X, axis=0, ddof=1) # * n / (n - 1.0)
-
def _copy_array_if_base_present(a):
"""
Copies the array if its base points to a parent array.
@@ -788,399 +753,6 @@
else:
raise ValueError('The first argument must be one or two dimensional array. A %d-dimensional array is not permitted' % len(s))
-def minkowski(u, v, p):
- """
- d = minkowski(u, v, p)
-
- Returns the Minkowski distance between two vectors u and v,
-
- ||u-v||_p = (\sum {|u_i - v_i|^p})^(1/p).
- """
- u = np.asarray(u)
- v = np.asarray(v)
- if p < 1:
- raise ValueError("p must be at least 1")
- return (abs(u-v)**p).sum() ** (1.0 / p)
-
-def euclidean(u, v):
- """
- d = euclidean(u, v)
-
- Computes the Euclidean distance between two n-vectors u and v, ||u-v||_2
- """
- u = np.asarray(u)
- v = np.asarray(v)
- q=np.matrix(u-v)
- return np.sqrt((q*q.T).sum())
-
-def sqeuclidean(u, v):
- """
- d = sqeuclidean(u, v)
-
- Computes the squared Euclidean distance between two n-vectors u and v,
- (||u-v||_2)^2.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return ((u-v)*(u-v).T).sum()
-
-def cosine(u, v):
- """
- d = cosine(u, v)
-
- Computes the Cosine distance between two n-vectors u and v,
- (1-uv^T)/(||u||_2 * ||v||_2).
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return (1.0 - (np.dot(u, v.T) / \
- (np.sqrt(np.dot(u, u.T)) * np.sqrt(np.dot(v, v.T)))))
-
-def correlation(u, v):
- """
- d = correlation(u, v)
-
- Computes the correlation distance between two n-vectors u and v,
-
- 1 - (u - n|u|_1)(v - n|v|_1)^T
- --------------------------------- ,
- |(u - n|u|_1)|_2 |(v - n|v|_1)|^T
-
- where |*|_1 is the Manhattan norm and n is the common dimensionality
- of the vectors.
- """
- umu = u.mean()
- vmu = v.mean()
- um = u - umu
- vm = v - vmu
- return 1.0 - (np.dot(um, vm) /
- (np.sqrt(np.dot(um, um)) \
- * np.sqrt(np.dot(vm, vm))))
-
-def hamming(u, v):
- """
- d = hamming(u, v)
-
- Computes the Hamming distance between two n-vectors u and v,
- which is simply the proportion of disagreeing components in u
- and v. If u and v are boolean vectors, the hamming distance is
-
- (c_{01} + c_{10}) / n
-
- where c_{ij} is the number of occurrences of
-
- u[k] == i and v[k] == j
-
- for k < n.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return (u != v).mean()
-
-def jaccard(u, v):
- """
- d = jaccard(u, v)
-
- Computes the Jaccard-Needham dissimilarity between two boolean
- n-vectors u and v, which is
-
- c_{TF} + c_{FT}
- ------------------------
- c_{TT} + c_{FT} + c_{TF}
-
- where c_{ij} is the number of occurrences of
-
- u[k] == i and v[k] == j
-
- for k < n.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return (np.double(np.bitwise_and((u != v),
- np.bitwise_or(u != 0, v != 0)).sum())
- / np.double(np.bitwise_or(u != 0, v != 0).sum()))
-
-def kulsinski(u, v):
- """
- d = kulsinski(u, v)
-
- Computes the Kulsinski dissimilarity between two boolean n-vectors
- u and v, which is
-
- c_{TF} + c_{FT} - c_{TT} + n
- ----------------------------
- c_{FT} + c_{TF} + n
-
- where c_{ij} is the number of occurrences of
-
- u[k] == i and v[k] == j
-
- for k < n.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- n = len(u)
- (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
-
- return (ntf + nft - ntt + n) / (ntf + nft + n)
-
-def seuclidean(u, v, V):
- """
- d = seuclidean(u, v, V)
-
- Returns the standardized Euclidean distance between two
- n-vectors u and v. V is a m-dimensional vector of component
- variances. It is usually computed among a larger collection vectors.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- V = np.asarray(V)
- if len(V.shape) != 1 or V.shape[0] != u.shape[0] or u.shape[0] != v.shape[0]:
- raise TypeError('V must be a 1-D array of the same dimension as u and v.')
- return np.sqrt(((u-v)**2 / V).sum())
-
-def cityblock(u, v):
- """
- d = cityblock(u, v)
-
- Computes the Manhattan distance between two n-vectors u and v,
- \sum {u_i-v_i}.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return abs(u-v).sum()
-
-def mahalanobis(u, v, VI):
- """
- d = mahalanobis(u, v, VI)
-
- Computes the Mahalanobis distance between two n-vectors u and v,
- (u-v)VI(u-v)^T
- where VI is the inverse covariance matrix.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- VI = np.asarray(VI)
- return np.sqrt(np.dot(np.dot((u-v),VI),(u-v).T).sum())
-
-def chebyshev(u, v):
- """
- d = chebyshev(u, v)
-
- Computes the Chebyshev distance between two n-vectors u and v,
- \max {|u_i-v_i|}.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return max(abs(u-v))
-
-def braycurtis(u, v):
- """
- d = braycurtis(u, v)
-
- Computes the Bray-Curtis distance between two n-vectors u and v,
- \sum{|u_i-v_i|} / \sum{|u_i+v_i|}.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return abs(u-v).sum() / abs(u+v).sum()
-
-def canberra(u, v):
- """
- d = canberra(u, v)
-
- Computes the Canberra distance between two n-vectors u and v,
- \sum{|u_i-v_i|} / \sum{|u_i|+|v_i}.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- return abs(u-v).sum() / (abs(u).sum() + abs(v).sum())
-
-def _nbool_correspond_all(u, v):
- if u.dtype != v.dtype:
- raise TypeError("Arrays being compared must be of the same data type.")
-
- if u.dtype == np.int or u.dtype == np.float_ or u.dtype == np.double:
- not_u = 1.0 - u
- not_v = 1.0 - v
- nff = (not_u * not_v).sum()
- nft = (not_u * v).sum()
- ntf = (u * not_v).sum()
- ntt = (u * v).sum()
- elif u.dtype == np.bool:
- not_u = ~u
- not_v = ~v
- nff = (not_u & not_v).sum()
- nft = (not_u & v).sum()
- ntf = (u & not_v).sum()
- ntt = (u & v).sum()
- else:
- raise TypeError("Arrays being compared have unknown type.")
-
- return (nff, nft, ntf, ntt)
-
-def _nbool_correspond_ft_tf(u, v):
- if u.dtype == np.int or u.dtype == np.float_ or u.dtype == np.double:
- not_u = 1.0 - u
- not_v = 1.0 - v
- nff = (not_u * not_v).sum()
- nft = (not_u * v).sum()
- ntf = (u * not_v).sum()
- ntt = (u * v).sum()
- else:
- not_u = ~u
- not_v = ~v
- nft = (not_u & v).sum()
- ntf = (u & not_v).sum()
- return (nft, ntf)
-
-def yule(u, v):
- """
- d = yule(u, v)
- Computes the Yule dissimilarity between two boolean n-vectors u and v,
-
- R
- ---------------------
- c_{TT} + c_{FF} + R/2
-
- where c_{ij} is the number of occurrences of
-
- u[k] == i and v[k] == j
-
- for k < n, and
-
- R = 2.0 * (c_{TF} + c_{FT}).
- """
- u = np.asarray(u)
- v = np.asarray(v)
- (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
- print nff, nft, ntf, ntt
- return float(2.0 * ntf * nft) / float(ntt * nff + ntf * nft)
-
-def matching(u, v):
- """
- d = matching(u, v)
-
- Computes the Matching dissimilarity between two boolean n-vectors
- u and v, which is
-
- (c_{TF} + c_{FT}) / n
-
- where c_{ij} is the number of occurrences of
-
- u[k] == i and v[k] == j
-
- for k < n.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- (nft, ntf) = _nbool_correspond_ft_tf(u, v)
- return float(nft + ntf) / float(len(u))
-
-def dice(u, v):
- """
- d = dice(u, v)
-
- Computes the Dice dissimilarity between two boolean n-vectors
- u and v, which is
-
- c_{TF} + c_{FT}
- ----------------------------
- 2 * c_{TT} + c_{FT} + c_{TF}
-
- where c_{ij} is the number of occurrences of
-
- u[k] == i and v[k] == j
-
- for k < n.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- if u.dtype == np.bool:
- ntt = (u & v).sum()
- else:
- ntt = (u * v).sum()
- (nft, ntf) = _nbool_correspond_ft_tf(u, v)
- return float(ntf + nft) / float(2.0 * ntt + ntf + nft)
-
-def rogerstanimoto(u, v):
- """
- d = rogerstanimoto(u, v)
-
- Computes the Rogers-Tanimoto dissimilarity between two boolean
- n-vectors u and v,
-
- R
- -------------------
- c_{TT} + c_{FF} + R
-
- where c_{ij} is the number of occurrences of
-
- u[k] == i and v[k] == j
-
- for k < n, and
-
- R = 2.0 * (c_{TF} + c_{FT}).
-
- """
- u = np.asarray(u)
- v = np.asarray(v)
- (nff, nft, ntf, ntt) = _nbool_correspond_all(u, v)
- return float(2.0 * (ntf + nft)) / float(ntt + nff + (2.0 * (ntf + nft)))
-
-def russellrao(u, v):
- """
- d = russellrao(u, v)
-
- Computes the Russell-Rao dissimilarity between two boolean n-vectors
- u and v, (n - c_{TT}) / n where c_{ij} is the number of occurrences
- of u[k] == i and v[k] == j for k < n.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- if u.dtype == np.bool:
- ntt = (u & v).sum()
- else:
- ntt = (u * v).sum()
- return float(len(u) - ntt) / float(len(u))
-
-def sokalmichener(u, v):
- """
- d = sokalmichener(u, v)
-
- Computes the Sokal-Michener dissimilarity between two boolean vectors
- u and v, 2R / (S + 2R) where c_{ij} is the number of occurrences of
- u[k] == i and v[k] == j for k < n and R = 2 * (c_{TF} + c{FT}) and
- S = c_{FF} + c_{TT}.
- """
- u = np.asarray(u)
- v = np.asarray(v)
- if u.dtype == np.bool:
- ntt = (u & v).sum()
- nff = (~u & ~v).sum()
- else:
- ntt = (u * v).sum()
- nff = ((1.0 - u) * (1.0 - v)).sum()
- (nft, ntf) = _nbool_correspond_ft_tf(u, v)
- return float(2.0 * (ntf + nft))/float(ntt + nff + 2.0 * (ntf + nft))
-
-def sokalsneath(u, v):
- """
- d = sokalsneath(u, v)
-
- Computes the Sokal-Sneath dissimilarity between two boolean vectors
- u and v, 2R / (c_{TT} + 2R) where c_{ij} is the number of occurrences
- of u[k] == i and v[k] == j for k < n and R = 2 * (c_{TF} + c{FT}).
- """
- u = np.asarray(u)
- v = np.asarray(v)
- if u.dtype == np.bool:
- ntt = (u & v).sum()
- else:
- ntt = (u * v).sum()
- (nft, ntf) = _nbool_correspond_ft_tf(u, v)
- return float(2.0 * (ntf + nft))/float(ntt + 2.0 * (ntf + nft))
-
def _convert_to_bool(X):
if X.dtype != np.bool:
X = np.bool_(X)
@@ -1195,380 +767,6 @@
X = X.copy()
return X
-def pdist(X, metric='euclidean', p=2, V=None, VI=None):
- """ Y = pdist(X, method='euclidean', p=2)
-
- Computes the distance between m original observations in
- n-dimensional space. Returns a condensed distance matrix Y.
- For each i and j (i<j), the metric dist(u=X[i], v=X[j]) is
- computed and stored in the ij'th entry. See squareform
- to learn how to retrieve this entry.
-
- 1. Y = pdist(X)
-
- Computes the distance between m points using Euclidean distance
- (2-norm) as the distance metric between the points. The points
- are arranged as m n-dimensional row vectors in the matrix X.
-
- 2. Y = pdist(X, 'minkowski', p)
-
- Computes the distances using the Minkowski distance ||u-v||_p
- (p-norm) where p>=1.
-
- 3. Y = pdist(X, 'cityblock')
-
- Computes the city block or Manhattan distance between the
- points.
-
- 4. Y = pdist(X, 'seuclidean', V=None)
-
- Computes the standardized Euclidean distance. The standardized
- Euclidean distance between two n-vectors u and v is
-
- sqrt(\sum {(u_i-v_i)^2 / V[x_i]}).
-
- V is the variance vector; V[i] is the variance computed over all
- the i'th components of the points. If not passed, it is
- automatically computed.
-
- 5. Y = pdist(X, 'sqeuclidean')
-
- Computes the squared Euclidean distance ||u-v||_2^2 between
- the vectors.
-
- 6. Y = pdist(X, 'cosine')
-
- Computes the cosine distance between vectors u and v,
-
- 1 - uv^T
- -----------
- |u|_2 |v|_2
-
- where |*|_2 is the 2 norm of its argument *.
-
- 7. Y = pdist(X, 'correlation')
-
- Computes the correlation distance between vectors u and v. This is
-
- 1 - (u - n|u|_1)(v - n|v|_1)^T
- --------------------------------- ,
- |(u - n|u|_1)|_2 |(v - n|v|_1)|^T
-
- where |*|_1 is the Manhattan (or 1-norm) of its argument *,
- and n is the common dimensionality of the vectors.
-
- 8. Y = pdist(X, 'hamming')
-
- Computes the normalized Hamming distance, or the proportion
- of those vector elements between two n-vectors u and v which
- disagree. To save memory, the matrix X can be of type boolean.
-
- 9. Y = pdist(X, 'jaccard')
-
- Computes the Jaccard distance between the points. Given two
- vectors, u and v, the Jaccard distance is the proportion of
- those elements u_i and v_i that disagree where at least one
- of them is non-zero.
-
- 10. Y = pdist(X, 'chebyshev')
-
- Computes the Chebyshev distance between the points. The
- Chebyshev distance between two n-vectors u and v is the maximum
- norm-1 distance between their respective elements. More
- precisely, the distance is given by
-
- d(u,v) = max {|u_i-v_i|}.
-
- 11. Y = pdist(X, 'canberra')
-
- Computes the Canberra distance between the points. The
- Canberra distance between two points u and v is
-
- |u_1-v_1| |u_2-v_2| |u_n-v_n|
- d(u,v) = ----------- + ----------- + ... + -----------
- |u_1|+|v_1| |u_2|+|v_2| |u_n|+|v_n|
-
- 12. Y = pdist(X, 'braycurtis')
-
- Computes the Bray-Curtis distance between the points. The
- Bray-Curtis distance between two points u and v is
-
- |u_1-v_1| + |u_2-v_2| + ... + |u_n-v_n|
- d(u,v) = ---------------------------------------
- |u_1+v_1| + |u_2+v_2| + ... + |u_n+v_n|
-
- 13. Y = pdist(X, 'mahalanobis', VI=None)
-
- Computes the Mahalanobis distance between the points. The
- Mahalanobis distance between two points u and v is
- (u-v)(1/V)(u-v)^T
- where (1/V) is the inverse covariance. If VI is not None,
- VI will be used as the inverse covariance matrix.
-
- 14. Y = pdist(X, 'yule')
-
- Computes the Yule distance between each pair of boolean
- vectors. (see yule function documentation)
-
- 15. Y = pdist(X, 'matching')
-
- Computes the matching distance between each pair of boolean
- vectors. (see matching function documentation)
-
- 16. Y = pdist(X, 'dice')
-
- Computes the Dice distance between each pair of boolean
- vectors. (see dice function documentation)
-
- 17. Y = pdist(X, 'kulsinski')
-
- Computes the Kulsinski distance between each pair of
- boolean vectors. (see kulsinski function documentation)
-
- 17. Y = pdist(X, 'rogerstanimoto')
-
- Computes the Rogers-Tanimoto distance between each pair of
- boolean vectors. (see rogerstanimoto function documentation)
-
- 18. Y = pdist(X, 'russellrao')
-
- Computes the Russell-Rao distance between each pair of
- boolean vectors. (see russellrao function documentation)
-
- 19. Y = pdist(X, 'sokalmichener')
-
- Computes the Sokal-Michener distance between each pair of
- boolean vectors. (see sokalmichener function documentation)
-
- 20. Y = pdist(X, 'sokalsneath')
-
- Computes the Sokal-Sneath distance between each pair of
- boolean vectors. (see sokalsneath function documentation)
-
- 21. Y = pdist(X, f)
-
- Computes the distance between all pairs of vectors in X
- using the user supplied 2-arity function f. For example,
- Euclidean distance between the vectors could be computed
- as follows,
-
- dm = pdist(X, (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum())))
-
- Note that you should avoid passing a reference to one of
- the distance functions defined in this library. For example,
-
- dm = pdist(X, sokalsneath)
-
- would calculate the pair-wise distances between the vectors
- in X using the Python function sokalsneath. This would result
- in sokalsneath being called {n \choose 2} times, which is
- inefficient. Instead, the optimized C version is more
- efficient, and we call it using the following syntax.
-
- dm = pdist(X, 'sokalsneath')
- """
-# 21. Y = pdist(X, 'test_Y')
-#
-# Computes the distance between all pairs of vectors in X
-# using the distance metric Y but with a more succint,
-# verifiable, but less efficient implementation.
-
-
- X = np.asarray(X)
-
- #if np.issubsctype(X, np.floating) and not np.issubsctype(X, np.double):
- # raise TypeError('Floating point arrays must be 64-bit (got %r).' %
- # (X.dtype.type,))
-
- # The C code doesn't do striding.
- [X] = _copy_arrays_if_base_present([_convert_to_double(X)])
-
- s = X.shape
-
- if len(s) != 2:
- raise ValueError('A 2-dimensional array must be passed.');
-
- m = s[0]
- n = s[1]
- dm = np.zeros((m * (m - 1) / 2,), dtype=np.double)
-
- mtype = type(metric)
- if mtype is types.FunctionType:
- k = 0
- if metric == minkowski:
- for i in xrange(0, m - 1):
- for j in xrange(i+1, m):
- dm[k] = minkowski(X[i, :], X[j, :], p)
- k = k + 1
- elif metric == seuclidean:
- for i in xrange(0, m - 1):
- for j in xrange(i+1, m):
- dm[k] = seuclidean(X[i, :], X[j, :], V)
- k = k + 1
- elif metric == mahalanobis:
- for i in xrange(0, m - 1):
- for j in xrange(i+1, m):
- dm[k] = mahalanobis(X[i, :], X[j, :], V)
- k = k + 1
- else:
- for i in xrange(0, m - 1):
- for j in xrange(i+1, m):
- dm[k] = metric(X[i, :], X[j, :])
- k = k + 1
-
- elif mtype is types.StringType:
- mstr = metric.lower()
-
- #if X.dtype != np.double and \
- # (mstr != 'hamming' and mstr != 'jaccard'):
- # TypeError('A double array must be passed.')
- if mstr in set(['euclidean', 'euclid', 'eu', 'e']):
- _hierarchy_wrap.pdist_euclidean_wrap(_convert_to_double(X), dm)
- elif mstr in set(['sqeuclidean', 'sqe', 'sqeuclid']):
- _hierarchy_wrap.pdist_euclidean_wrap(_convert_to_double(X), dm)
- dm = dm ** 2.0
- elif mstr in set(['cityblock', 'cblock', 'cb', 'c']):
- _hierarchy_wrap.pdist_city_block_wrap(X, dm)
- elif mstr in set(['hamming', 'hamm', 'ha', 'h']):
- if X.dtype == np.bool:
- _hierarchy_wrap.pdist_hamming_bool_wrap(_convert_to_bool(X), dm)
- else:
- _hierarchy_wrap.pdist_hamming_wrap(_convert_to_double(X), dm)
- elif mstr in set(['jaccard', 'jacc', 'ja', 'j']):
- if X.dtype == np.bool:
- _hierarchy_wrap.pdist_jaccard_bool_wrap(_convert_to_bool(X), dm)
- else:
- _hierarchy_wrap.pdist_jaccard_wrap(_convert_to_double(X), dm)
- elif mstr in set(['chebychev', 'chebyshev', 'cheby', 'cheb', 'ch']):
- _hierarchy_wrap.pdist_chebyshev_wrap(_convert_to_double(X), dm)
- elif mstr in set(['minkowski', 'mi', 'm']):
- _hierarchy_wrap.pdist_minkowski_wrap(_convert_to_double(X), dm, p)
- elif mstr in set(['seuclidean', 'se', 's']):
- if V is not None:
- V = np.asarray(V)
- if type(V) != np.ndarray:
- raise TypeError('Variance vector V must be a numpy array')
- if V.dtype != np.double:
- raise TypeError('Variance vector V must contain doubles.')
- if len(V.shape) != 1:
- raise ValueError('Variance vector V must be one-dimensional.')
- if V.shape[0] != n:
- raise ValueError('Variance vector V must be of the same dimension as the vectors on which the distances are computed.')
- # The C code doesn't do striding.
- [VV] = _copy_arrays_if_base_present([_convert_to_double(V)])
- else:
- VV = _unbiased_variance(X)
- _hierarchy_wrap.pdist_seuclidean_wrap(_convert_to_double(X), VV, dm)
- # Need to test whether vectorized cosine works better.
- # Find out: Is there a dot subtraction operator so I can
- # subtract matrices in a similar way to multiplying them?
- # Need to get rid of as much unnecessary C code as possible.
- elif mstr in set(['cosine', 'cos']):
- norms = np.sqrt(np.sum(X * X, axis=1))
- _hierarchy_wrap.pdist_cosine_wrap(_convert_to_double(X), dm, norms)
- elif mstr in set(['old_cosine', 'old_cos']):
- norms = np.sqrt(np.sum(X * X, axis=1))
- nV = norms.reshape(m, 1)
- # The numerator u * v
- nm = np.dot(X, X.T)
- # The denom. ||u||*||v||
- de = np.dot(nV, nV.T);
- dm = 1.0 - (nm / de)
- dm[xrange(0,m),xrange(0,m)] = 0.0
- dm = squareform(dm)
- elif mstr in set(['correlation', 'co']):
- X2 = X - X.mean(1)[:,np.newaxis]
- #X2 = X - np.matlib.repmat(np.mean(X, axis=1).reshape(m, 1), 1, n)
- norms = np.sqrt(np.sum(X2 * X2, axis=1))
- _hierarchy_wrap.pdist_cosine_wrap(_convert_to_double(X2), _convert_to_double(dm), _convert_to_double(norms))
- elif mstr in set(['mahalanobis', 'mahal', 'mah']):
- if VI is not None:
- VI = _convert_to_double(np.asarray(VI))
- if type(VI) != np.ndarray:
- raise TypeError('VI must be a numpy array.')
- if VI.dtype != np.double:
- raise TypeError('The array must contain 64-bit floats.')
- [VI] = _copy_arrays_if_base_present([VI])
- else:
- V = np.cov(X.T)
- VI = _convert_to_double(np.linalg.inv(V).T.copy())
- # (u-v)V^(-1)(u-v)^T
- _hierarchy_wrap.pdist_mahalanobis_wrap(_convert_to_double(X), VI, dm)
- elif mstr == 'canberra':
- _hierarchy_wrap.pdist_canberra_wrap(_convert_to_bool(X), dm)
- elif mstr == 'braycurtis':
- _hierarchy_wrap.pdist_bray_curtis_wrap(_convert_to_bool(X), dm)
- elif mstr == 'yule':
- _hierarchy_wrap.pdist_yule_bool_wrap(_convert_to_bool(X), dm)
- elif mstr == 'matching':
- _hierarchy_wrap.pdist_matching_bool_wrap(_convert_to_bool(X), dm)
- elif mstr == 'kulsinski':
- _hierarchy_wrap.pdist_kulsinski_bool_wrap(_convert_to_bool(X), dm)
- elif mstr == 'dice':
- _hierarchy_wrap.pdist_dice_bool_wrap(_convert_to_bool(X), dm)
- elif mstr == 'rogerstanimoto':
- _hierarchy_wrap.pdist_rogerstanimoto_bool_wrap(_convert_to_bool(X), dm)
- elif mstr == 'russellrao':
- _hierarchy_wrap.pdist_russellrao_bool_wrap(_convert_to_bool(X), dm)
- elif mstr == 'sokalmichener':
- _hierarchy_wrap.pdist_sokalmichener_bool_wrap(_convert_to_bool(X), dm)
- elif mstr == 'sokalsneath':
- _hierarchy_wrap.pdist_sokalsneath_bool_wrap(_convert_to_bool(X), dm)
- elif metric == 'test_euclidean':
- dm = pdist(X, euclidean)
- elif metric == 'test_sqeuclidean':
- if V is None:
- V = _unbiased_variance(X)
- else:
- V = np.asarray(V)
- dm = pdist(X, lambda u, v: seuclidean(u, v, V))
- elif metric == 'test_braycurtis':
- dm = pdist(X, braycurtis)
- elif metric == 'test_mahalanobis':
- if VI is None:
- V = np.cov(X.T)
- VI = np.linalg.inv(V)
- else:
- VI = np.asarray(VI)
- [VI] = _copy_arrays_if_base_present([VI])
- # (u-v)V^(-1)(u-v)^T
- dm = pdist(X, (lambda u, v: mahalanobis(u, v, VI)))
- elif metric == 'test_cityblock':
- dm = pdist(X, cityblock)
- elif metric == 'test_minkowski':
- dm = pdist(X, minkowski, p)
- elif metric == 'test_cosine':
- dm = pdist(X, cosine)
- elif metric == 'test_correlation':
- dm = pdist(X, correlation)
- elif metric == 'test_hamming':
- dm = pdist(X, hamming)
- elif metric == 'test_jaccard':
- dm = pdist(X, jaccard)
- elif metric == 'test_chebyshev' or metric == 'test_chebychev':
- dm = pdist(X, chebyshev)
- elif metric == 'test_yule':
- dm = pdist(X, yule)
- elif metric == 'test_matching':
- dm = pdist(X, matching)
- elif metric == 'test_dice':
- dm = pdist(X, dice)
- elif metric == 'test_kulsinski':
- dm = pdist(X, kulsinski)
- elif metric == 'test_rogerstanimoto':
- dm = pdist(X, rogerstanimoto)
- elif metric == 'test_russellrao':
- dm = pdist(X, russellrao)
- elif metric == 'test_sokalsneath':
- dm = pdist(X, sokalsneath)
- elif metric == 'test_sokalmichener':
- dm = pdist(X, sokalmichener)
- else:
- raise ValueError('Unknown Distance Metric: %s' % mstr)
- else:
- raise TypeError('2nd argument metric must be a string identifier or a function.')
- return dm
-
def cophenet(*args, **kwargs):
"""
d = cophenet(Z)
Modified: trunk/scipy/cluster/setup.py
===================================================================
--- trunk/scipy/cluster/setup.py 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/setup.py 2008-06-09 05:55:44 UTC (rev 4417)
@@ -12,6 +12,10 @@
sources=[join('src', 'vq_module.c'), join('src', 'vq.c')],
include_dirs = [get_numpy_include_dirs()])
+ config.add_extension('_distance_wrap',
+ sources=[join('src', 'distance_wrap.c'), join('src', 'distance.c')],
+ include_dirs = [get_numpy_include_dirs()])
+
config.add_extension('_hierarchy_wrap',
sources=[join('src', 'hierarchy_wrap.c'), join('src', 'hierarchy.c')],
include_dirs = [get_numpy_include_dirs()])
Added: trunk/scipy/cluster/src/common.h
===================================================================
--- trunk/scipy/cluster/src/common.h 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/src/common.h 2008-06-09 05:55:44 UTC (rev 4417)
@@ -0,0 +1,70 @@
+/**
+ * common.h
+ *
+ * Author: Damian Eads
+ * Date: September 22, 2007 (moved into new file on June 8, 2008)
+ *
+ * Copyright (c) 2007, 2008, Damian Eads. All rights reserved.
+ * Adapted for incorporation into Scipy, April 9, 2008.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of the author nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _CLUSTER_COMMON_H
+#define _CLUSTER_COMMON_H
+
+#define CPY_MAX(_x, _y) ((_x > _y) ? (_x) : (_y))
+#define CPY_MIN(_x, _y) ((_x < _y) ? (_x) : (_y))
+
+#define NCHOOSE2(_n) ((_n)*(_n-1)/2)
+
+#define CPY_BITS_PER_CHAR (sizeof(unsigned char) * 8)
+#define CPY_FLAG_ARRAY_SIZE_BYTES(num_bits) (CPY_CEIL_DIV((num_bits), \
+ CPY_BITS_PER_CHAR))
+#define CPY_GET_BIT(_xx, i) (((_xx)[(i) / CPY_BITS_PER_CHAR] >> \
+ ((CPY_BITS_PER_CHAR-1) - \
+ ((i) % CPY_BITS_PER_CHAR))) & 0x1)
+#define CPY_SET_BIT(_xx, i) ((_xx)[(i) / CPY_BITS_PER_CHAR] |= \
+ ((0x1) << ((CPY_BITS_PER_CHAR-1) \
+ -((i) % CPY_BITS_PER_CHAR))))
+#define CPY_CLEAR_BIT(_xx, i) ((_xx)[(i) / CPY_BITS_PER_CHAR] &= \
+ ~((0x1) << ((CPY_BITS_PER_CHAR-1) \
+ -((i) % CPY_BITS_PER_CHAR))))
+
+#ifndef CPY_CEIL_DIV
+#define CPY_CEIL_DIV(x, y) ((((double)x)/(double)y) == \
+ ((double)((x)/(y))) ? ((x)/(y)) : ((x)/(y) + 1))
+#endif
+
+
+#ifdef CPY_DEBUG
+#define CPY_DEBUG_MSG(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define CPY_DEBUG_MSG(...)
+#endif
+
+#endif
Added: trunk/scipy/cluster/src/distance.c
===================================================================
--- trunk/scipy/cluster/src/distance.c 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/src/distance.c 2008-06-09 05:55:44 UTC (rev 4417)
@@ -0,0 +1,592 @@
+/**
+ * distance.c
+ *
+ * Author: Damian Eads
+ * Date: September 22, 2007 (moved to new file on June 8, 2008)
+ *
+ * Copyright (c) 2007, 2008, Damian Eads. All rights reserved.
+ * Adapted for incorporation into Scipy, April 9, 2008.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of the author nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include "common.h"
+#include "distance.h"
+
+static inline double euclidean_distance(const double *u, const double *v, int n) {
+ int i = 0;
+ double s = 0.0, d;
+ for (i = 0; i < n; i++) {
+ d = u[i] - v[i];
+ s = s + d * d;
+ }
+ return sqrt(s);
+}
+
+static inline double ess_distance(const double *u, const double *v, int n) {
+ int i = 0;
+ double s = 0.0, d;
+ for (i = 0; i < n; i++) {
+ d = fabs(u[i] - v[i]);
+ s = s + d * d;
+ }
+ return s;
+}
+
+static inline double chebyshev_distance(const double *u, const double *v, int n) {
+ int i = 0;
+ double d, maxv = 0.0;
+ for (i = 0; i < n; i++) {
+ d = fabs(u[i] - v[i]);
+ if (d > maxv) {
+ maxv = d;
+ }
+ }
+ return maxv;
+}
+
+static inline double canberra_distance(const double *u, const double *v, int n) {
+ int i;
+ double s = 0.0;
+ for (i = 0; i < n; i++) {
+ s += (fabs(u[i] - v[i]) / (fabs(u[i]) + fabs(v[i])));
+ }
+ return s;
+}
+
+static inline double bray_curtis_distance(const double *u, const double *v, int n) {
+ int i;
+ double s1 = 0.0, s2 = 0.0;
+ for (i = 0; i < n; i++) {
+ s1 += fabs(u[i] - v[i]);
+ s2 += fabs(u[i] + v[i]);
+ }
+ return s1 / s2;
+}
+
+static inline double mahalanobis_distance(const double *u, const double *v,
+ const double *covinv, double *dimbuf1,
+ double *dimbuf2, int n) {
+ int i, j;
+ double s;
+ const double *covrow = covinv;
+ for (i = 0; i < n; i++) {
+ dimbuf1[i] = u[i] - v[i];
+ }
+ for (i = 0; i < n; i++) {
+ covrow = covinv + (i * n);
+ s = 0.0;
+ for (j = 0; j < n; j++) {
+ s += dimbuf1[j] * covrow[j];
+ }
+ dimbuf2[i] = s;
+ }
+ s = 0.0;
+ for (i = 0; i < n; i++) {
+ s += dimbuf1[i] * dimbuf2[i];
+ }
+ return sqrt(s);
+}
+
+double hamming_distance(const double *u, const double *v, int n) {
+ int i = 0;
+ double s = 0.0;
+ for (i = 0; i < n; i++) {
+ s = s + (u[i] != v[i]);
+ }
+ return s / (double)n;
+}
+
+static inline double hamming_distance_bool(const char *u, const char *v, int n) {
+ int i = 0;
+ double s = 0.0;
+ for (i = 0; i < n; i++) {
+ s = s + (u[i] != v[i]);
+ }
+ return s / (double)n;
+}
+
+static inline double yule_distance_bool(const char *u, const char *v, int n) {
+ int i = 0;
+ int ntt = 0, nff = 0, nft = 0, ntf = 0;
+ for (i = 0; i < n; i++) {
+ ntt += (u[i] && v[i]);
+ ntf += (u[i] && !v[i]);
+ nft += (!u[i] && v[i]);
+ nff += (!u[i] && !v[i]);
+ }
+ return (2.0 * ntf * nft) / (double)(ntt * nff + ntf * nft);
+}
+
+static inline double matching_distance_bool(const char *u, const char *v, int n) {
+ int i = 0;
+ int nft = 0, ntf = 0;
+ for (i = 0; i < n; i++) {
+ ntf += (u[i] && !v[i]);
+ nft += (!u[i] && v[i]);
+ }
+ return (double)(ntf + nft) / (double)(n);
+}
+
+static inline double dice_distance_bool(const char *u, const char *v, int n) {
+ int i = 0;
+ int ntt = 0, nft = 0, ntf = 0;
+ for (i = 0; i < n; i++) {
+ ntt += (u[i] && v[i]);
+ ntf += (u[i] && !v[i]);
+ nft += (!u[i] && v[i]);
+ }
+ return (double)(nft + ntf) / (double)(2.0 * ntt + ntf + nft);
+}
+
+
+static inline double rogerstanimoto_distance_bool(const char *u, const char *v, int n) {
+ int i = 0;
+ int ntt = 0, nff = 0, nft = 0, ntf = 0;
+ for (i = 0; i < n; i++) {
+ ntt += (u[i] && v[i]);
+ ntf += (u[i] && !v[i]);
+ nft += (!u[i] && v[i]);
+ nff += (!u[i] && !v[i]);
+ }
+ return (2.0 * (ntf + nft)) / ((double)ntt + nff + (2.0 * (ntf + nft)));
+}
+
+static inline double russellrao_distance_bool(const char *u, const char *v, int n) {
+ int i = 0;
+ /** int nff = 0, nft = 0, ntf = 0;**/
+ int ntt = 0;
+ for (i = 0; i < n; i++) {
+ /** nff += (!u[i] && !v[i]);
+ ntf += (u[i] && !v[i]);
+ nft += (!u[i] && v[i]);**/
+ ntt += (u[i] && v[i]);
+ }
+ /** return (double)(ntf + nft + nff) / (double)n;**/
+ return (double) (n - ntt) / (double) n;
+}
+
+static inline double kulsinski_distance_bool(const char *u, const char *v, int n) {
+ int _i = 0;
+ int ntt = 0, nft = 0, ntf = 0, nff = 0;
+ for (_i = 0; _i < n; _i++) {
+ ntt += (u[_i] && v[_i]);
+ ntf += (u[_i] && !v[_i]);
+ nft += (!u[_i] && v[_i]);
+ nff += (!u[_i] && !v[_i]);
+ }
+ return ((double)(ntf + nft - ntt + n)) / ((double)(ntf + nft + n));
+}
+
+static inline double sokalsneath_distance_bool(const char *u, const char *v, int n) {
+ int _i = 0;
+ int ntt = 0, nft = 0, ntf = 0;
+ for (_i = 0; _i < n; _i++) {
+ ntt += (u[_i] && v[_i]);
+ ntf += (u[_i] && !v[_i]);
+ nft += (!u[_i] && v[_i]);
+ }
+ return (2.0 * (ntf + nft))/(2.0 * (ntf + nft) + ntt);
+}
+
+static inline double sokalmichener_distance_bool(const char *u, const char *v, int n) {
+ int _i = 0;
+ int ntt = 0, nft = 0, ntf = 0, nff = 0;
+ for (_i = 0; _i < n; _i++) {
+ ntt += (u[_i] && v[_i]);
+ nff += (!u[_i] && !v[_i]);
+ ntf += (u[_i] && !v[_i]);
+ nft += (!u[_i] && v[_i]);
+ }
+ return (2.0 * (ntf + nft))/(2.0 * (ntf + nft) + ntt + nff);
+}
+
+static inline double jaccard_distance(const double *u, const double *v, int n) {
+ int i = 0;
+ double denom = 0.0, num = 0.0;
+ for (i = 0; i < n; i++) {
+ num += (u[i] != v[i]) && ((u[i] != 0.0) || (v[i] != 0.0));
+ denom += (u[i] != 0.0) || (v[i] != 0.0);
+ }
+ return num / denom;
+}
+
+static inline double jaccard_distance_bool(const char *u, const char *v, int n) {
+ int i = 0;
+ double num = 0.0, denom = 0.0;
+ for (i = 0; i < n; i++) {
+ num += (u[i] != v[i]) && ((u[i] != 0) || (v[i] != 0));
+ denom += (u[i] != 0) || (v[i] != 0);
+ }
+ return num / denom;
+}
+
+static inline double dot_product(const double *u, const double *v, int n) {
+ int i;
+ double s = 0.0;
+ for (i = 0; i < n; i++) {
+ s += u[i] * v[i];
+ }
+ return s;
+}
+
+static inline double cosine_distance(const double *u, const double *v, int n,
+ const double nu, const double nv) {
+ return 1.0 - (dot_product(u, v, n) / (nu * nv));
+}
+
+static inline double seuclidean_distance(const double *var,
+ const double *u, const double *v, int n) {
+ int i = 0;
+ double s = 0.0, d;
+ for (i = 0; i < n; i++) {
+ d = u[i] - v[i];
+ s = s + (d * d) / var[i];
+ }
+ return sqrt(s);
+}
+
+static inline double city_block_distance(const double *u, const double *v, int n) {
+ int i = 0;
+ double s = 0.0, d;
+ for (i = 0; i < n; i++) {
+ d = fabs(u[i] - v[i]);
+ s = s + d;
+ }
+ return s;
+}
+
+double minkowski_distance(const double *u, const double *v, int n, double p) {
+ int i = 0;
+ double s = 0.0, d;
+ for (i = 0; i < n; i++) {
+ d = fabs(u[i] - v[i]);
+ s = s + pow(d, p);
+ }
+ return pow(s, 1.0 / p);
+}
+
+void compute_mean_vector(double *res, const double *X, int m, int n) {
+ int i, j;
+ const double *v;
+ for (i = 0; i < n; i++) {
+ res[i] = 0.0;
+ }
+ for (j = 0; j < m; j++) {
+
+ v = X + (j * n);
+ for (i = 0; i < n; i++) {
+ res[i] += v[i];
+ }
+ }
+ for (i = 0; i < n; i++) {
+ res[i] /= (double)m;
+ }
+}
+
+void pdist_euclidean(const double *X, double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = euclidean_distance(u, v, n);
+ }
+ }
+}
+
+void pdist_mahalanobis(const double *X, const double *covinv,
+ double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ double *dimbuf1, *dimbuf2;
+ dimbuf1 = (double*)malloc(sizeof(double) * 2 * n);
+ dimbuf2 = dimbuf1 + n;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = mahalanobis_distance(u, v, covinv, dimbuf1, dimbuf2, n);
+ }
+ }
+ dimbuf2 = 0;
+ free(dimbuf1);
+}
+
+void pdist_bray_curtis(const double *X, double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = bray_curtis_distance(u, v, n);
+ }
+ }
+}
+
+void pdist_canberra(const double *X, double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = canberra_distance(u, v, n);
+ }
+ }
+}
+
+void pdist_hamming(const double *X, double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = hamming_distance(u, v, n);
+ }
+ }
+}
+
+void pdist_hamming_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = hamming_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_jaccard(const double *X, double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = jaccard_distance(u, v, n);
+ }
+ }
+}
+
+void pdist_jaccard_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = jaccard_distance_bool(u, v, n);
+ }
+ }
+}
+
+
+void pdist_chebyshev(const double *X, double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = chebyshev_distance(u, v, n);
+ }
+ }
+}
+
+void pdist_cosine(const double *X, double *dm, int m, int n, const double *norms) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = cosine_distance(u, v, n, norms[i], norms[j]);
+ }
+ }
+}
+
+void pdist_seuclidean(const double *X, const double *var,
+ double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = seuclidean_distance(var, u, v, n);
+ }
+ }
+}
+
+void pdist_city_block(const double *X, double *dm, int m, int n) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = city_block_distance(u, v, n);
+ }
+ }
+}
+
+void pdist_minkowski(const double *X, double *dm, int m, int n, double p) {
+ int i, j;
+ const double *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = minkowski_distance(u, v, n, p);
+ }
+ }
+}
+
+void pdist_yule_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = yule_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_matching_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = matching_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_dice_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = dice_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_rogerstanimoto_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = rogerstanimoto_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_russellrao_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = russellrao_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_kulsinski_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = kulsinski_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_sokalsneath_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = sokalsneath_distance_bool(u, v, n);
+ }
+ }
+}
+
+void pdist_sokalmichener_bool(const char *X, double *dm, int m, int n) {
+ int i, j;
+ const char *u, *v;
+ double *it = dm;
+ for (i = 0; i < m; i++) {
+ for (j = i + 1; j < m; j++, it++) {
+ u = X + (n * i);
+ v = X + (n * j);
+ *it = sokalmichener_distance_bool(u, v, n);
+ }
+ }
+}
Added: trunk/scipy/cluster/src/distance.h
===================================================================
--- trunk/scipy/cluster/src/distance.h 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/src/distance.h 2008-06-09 05:55:44 UTC (rev 4417)
@@ -0,0 +1,64 @@
+/**
+ * distance.h
+ *
+ * Author: Damian Eads
+ * Date: September 22, 2007 (moved to new file on June 8, 2008)
+ * Adapted for incorporation into Scipy, April 9, 2008.
+ *
+ * Copyright (c) 2007, 2008, Damian Eads. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of the author nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _CPY_DISTANCE_H
+#define _CPY_DISTANCE_H
+
+void pdist_euclidean(const double *X, double *dm, int m, int n);
+void pdist_seuclidean(const double *X,
+ const double *var, double *dm, int m, int n);
+void pdist_mahalanobis(const double *X, const double *covinv,
+ double *dm, int m, int n);
+void pdist_bray_curtis(const double *X, double *dm, int m, int n);
+void pdist_canberra(const double *X, double *dm, int m, int n);
+void pdist_hamming(const double *X, double *dm, int m, int n);
+void pdist_hamming_bool(const char *X, double *dm, int m, int n);
+void pdist_city_block(const double *X, double *dm, int m, int n);
+void pdist_cosine(const double *X, double *dm, int m, int n, const double *norms);
+void pdist_chebyshev(const double *X, double *dm, int m, int n);
+void pdist_jaccard(const double *X, double *dm, int m, int n);
+void pdist_jaccard_bool(const char *X, double *dm, int m, int n);
+void pdist_kulsinski_bool(const char *X, double *dm, int m, int n);
+void pdist_minkowski(const double *X, double *dm, int m, int n, double p);
+void pdist_yule_bool(const char *X, double *dm, int m, int n);
+void pdist_matching_bool(const char *X, double *dm, int m, int n);
+void pdist_dice_bool(const char *X, double *dm, int m, int n);
+void pdist_rogerstanimoto_bool(const char *X, double *dm, int m, int n);
+void pdist_russellrao_bool(const char *X, double *dm, int m, int n);
+void pdist_sokalmichener_bool(const char *X, double *dm, int m, int n);
+void pdist_sokalsneath_bool(const char *X, double *dm, int m, int n);
+
+#endif
Added: trunk/scipy/cluster/src/distance_wrap.c
===================================================================
--- trunk/scipy/cluster/src/distance_wrap.c 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/src/distance_wrap.c 2008-06-09 05:55:44 UTC (rev 4417)
@@ -0,0 +1,525 @@
+/**
+ * distance_wrap.c
+ *
+ * Author: Damian Eads
+ * Date: September 22, 2007 (moved to new file on June 8, 2008)
+ * Adapted for incorporation into Scipy, April 9, 2008.
+ *
+ * Copyright (c) 2007, Damian Eads. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of the author nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <math.h>
+#include "distance.h"
+#include "Python.h"
+#include <numpy/arrayobject.h>
+#include <stdio.h>
+
+extern PyObject *pdist_euclidean_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_euclidean(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_canberra_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_canberra(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_bray_curtis_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_bray_curtis(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+
+extern PyObject *pdist_mahalanobis_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *covinv_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ const double *covinv;
+ if (!PyArg_ParseTuple(args, "O!O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &covinv_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ covinv = (const double*)covinv_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_mahalanobis(X, covinv, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+
+extern PyObject *pdist_chebyshev_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_chebyshev(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+
+extern PyObject *pdist_cosine_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_, *norms_;
+ int m, n;
+ double *dm;
+ const double *X, *norms;
+ if (!PyArg_ParseTuple(args, "O!O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_,
+ &PyArray_Type, &norms_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ norms = (const double*)norms_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_cosine(X, dm, m, n, norms);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_seuclidean_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_, *var_;
+ int m, n;
+ double *dm;
+ const double *X, *var;
+ if (!PyArg_ParseTuple(args, "O!O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &var_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (double*)X_->data;
+ dm = (double*)dm_->data;
+ var = (double*)var_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_seuclidean(X, var, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_city_block_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_city_block(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_hamming_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_hamming(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_hamming_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_hamming_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_jaccard_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const double *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_jaccard(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_jaccard_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_jaccard_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+extern PyObject *pdist_minkowski_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm, *X;
+ double p;
+ if (!PyArg_ParseTuple(args, "O!O!d",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_,
+ &p)) {
+ return 0;
+ }
+ else {
+ X = (double*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_minkowski(X, dm, m, n, p);
+ }
+ return Py_BuildValue("d", 0.0);
+}
+
+
+extern PyObject *pdist_yule_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_yule_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+extern PyObject *pdist_matching_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_matching_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+extern PyObject *pdist_dice_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_dice_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+extern PyObject *pdist_rogerstanimoto_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_rogerstanimoto_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+extern PyObject *pdist_russellrao_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_russellrao_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+extern PyObject *pdist_kulsinski_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_kulsinski_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+extern PyObject *pdist_sokalmichener_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_sokalmichener_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+extern PyObject *pdist_sokalsneath_bool_wrap(PyObject *self, PyObject *args) {
+ PyArrayObject *X_, *dm_;
+ int m, n;
+ double *dm;
+ const char *X;
+ if (!PyArg_ParseTuple(args, "O!O!",
+ &PyArray_Type, &X_,
+ &PyArray_Type, &dm_)) {
+ return 0;
+ }
+ else {
+ X = (const char*)X_->data;
+ dm = (double*)dm_->data;
+ m = X_->dimensions[0];
+ n = X_->dimensions[1];
+
+ pdist_sokalsneath_bool(X, dm, m, n);
+ }
+ return Py_BuildValue("");
+}
+
+
+static PyMethodDef _distanceWrapMethods[] = {
+ {"pdist_bray_curtis_wrap", pdist_bray_curtis_wrap, METH_VARARGS},
+ {"pdist_canberra_wrap", pdist_canberra_wrap, METH_VARARGS},
+ {"pdist_chebyshev_wrap", pdist_chebyshev_wrap, METH_VARARGS},
+ {"pdist_city_block_wrap", pdist_city_block_wrap, METH_VARARGS},
+ {"pdist_cosine_wrap", pdist_cosine_wrap, METH_VARARGS},
+ {"pdist_dice_bool_wrap", pdist_dice_bool_wrap, METH_VARARGS},
+ {"pdist_euclidean_wrap", pdist_euclidean_wrap, METH_VARARGS},
+ {"pdist_hamming_wrap", pdist_hamming_wrap, METH_VARARGS},
+ {"pdist_hamming_bool_wrap", pdist_hamming_bool_wrap, METH_VARARGS},
+ {"pdist_jaccard_wrap", pdist_jaccard_wrap, METH_VARARGS},
+ {"pdist_jaccard_bool_wrap", pdist_jaccard_bool_wrap, METH_VARARGS},
+ {"pdist_kulsinski_bool_wrap", pdist_kulsinski_bool_wrap, METH_VARARGS},
+ {"pdist_mahalanobis_wrap", pdist_mahalanobis_wrap, METH_VARARGS},
+ {"pdist_matching_bool_wrap", pdist_matching_bool_wrap, METH_VARARGS},
+ {"pdist_minkowski_wrap", pdist_minkowski_wrap, METH_VARARGS},
+ {"pdist_rogerstanimoto_bool_wrap", pdist_rogerstanimoto_bool_wrap, METH_VARARGS},
+ {"pdist_russellrao_bool_wrap", pdist_russellrao_bool_wrap, METH_VARARGS},
+ {"pdist_seuclidean_wrap", pdist_seuclidean_wrap, METH_VARARGS},
+ {"pdist_sokalmichener_bool_wrap", pdist_sokalmichener_bool_wrap, METH_VARARGS},
+ {"pdist_sokalsneath_bool_wrap", pdist_sokalsneath_bool_wrap, METH_VARARGS},
+ {"pdist_yule_bool_wrap", pdist_yule_bool_wrap, METH_VARARGS},
+ {NULL, NULL} /* Sentinel - marks the end of this structure */
+};
+
+void init_distance_wrap(void) {
+ (void) Py_InitModule("_distance_wrap", _distanceWrapMethods);
+ import_array(); // Must be present for NumPy. Called first after above line.
+}
Modified: trunk/scipy/cluster/src/hierarchy.c
===================================================================
--- trunk/scipy/cluster/src/hierarchy.c 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/src/hierarchy.c 2008-06-09 05:55:44 UTC (rev 4417)
@@ -34,12 +34,11 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#define NCHOOSE2(_n) ((_n)*(_n-1)/2)
+#include "common.h"
+
#define ISCLUSTER(_nd) ((_nd)->id >= n)
#define GETCLUSTER(_id) ((lists + _id - n))
-#define CPY_MAX(_x, _y) ((_x > _y) ? (_x) : (_y))
-#define CPY_MIN(_x, _y) ((_x < _y) ? (_x) : (_y))
/** The number of link stats (for the inconsistency computation) for each
cluster. */
@@ -61,39 +60,15 @@
#define CPY_LIN_DIST 2
#define CPY_LIN_CNT 3
-#define CPY_BITS_PER_CHAR (sizeof(unsigned char) * 8)
-#define CPY_FLAG_ARRAY_SIZE_BYTES(num_bits) (CPY_CEIL_DIV((num_bits), \
- CPY_BITS_PER_CHAR))
-#define CPY_GET_BIT(_xx, i) (((_xx)[(i) / CPY_BITS_PER_CHAR] >> \
- ((CPY_BITS_PER_CHAR-1) - \
- ((i) % CPY_BITS_PER_CHAR))) & 0x1)
-#define CPY_SET_BIT(_xx, i) ((_xx)[(i) / CPY_BITS_PER_CHAR] |= \
- ((0x1) << ((CPY_BITS_PER_CHAR-1) \
- -((i) % CPY_BITS_PER_CHAR))))
-#define CPY_CLEAR_BIT(_xx, i) ((_xx)[(i) / CPY_BITS_PER_CHAR] &= \
- ~((0x1) << ((CPY_BITS_PER_CHAR-1) \
- -((i) % CPY_BITS_PER_CHAR))))
-
-#ifndef CPY_CEIL_DIV
-#define CPY_CEIL_DIV(x, y) ((((double)x)/(double)y) == \
- ((double)((x)/(y))) ? ((x)/(y)) : ((x)/(y) + 1))
-#endif
-
-
-#ifdef CPY_DEBUG
-#define CPY_DEBUG_MSG(...) fprintf(stderr, __VA_ARGS__)
-#else
-#define CPY_DEBUG_MSG(...)
-#endif
-
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "hierarchy.h"
+#include "distance.h"
-double euclidean_distance(const double *u, const double *v, int n) {
+static inline double euclidean_distance(const double *u, const double *v, int n) {
int i = 0;
double s = 0.0, d;
for (i = 0; i < n; i++) {
@@ -103,548 +78,6 @@
return sqrt(s);
}
-double ess_distance(const double *u, const double *v, int n) {
- int i = 0;
- double s = 0.0, d;
- for (i = 0; i < n; i++) {
- d = fabs(u[i] - v[i]);
- s = s + d * d;
- }
- return s;
-}
-
-double chebyshev_distance(const double *u, const double *v, int n) {
- int i = 0;
- double d, maxv = 0.0;
- for (i = 0; i < n; i++) {
- d = fabs(u[i] - v[i]);
- if (d > maxv) {
- maxv = d;
- }
- }
- return maxv;
-}
-
-double canberra_distance(const double *u, const double *v, int n) {
- int i;
- double s = 0.0;
- for (i = 0; i < n; i++) {
- s += (fabs(u[i] - v[i]) / (fabs(u[i]) + fabs(v[i])));
- }
- return s;
-}
-
-double bray_curtis_distance(const double *u, const double *v, int n) {
- int i;
- double s1 = 0.0, s2 = 0.0;
- for (i = 0; i < n; i++) {
- s1 += fabs(u[i] - v[i]);
- s2 += fabs(u[i] + v[i]);
- }
- return s1 / s2;
-}
-
-double mahalanobis_distance(const double *u, const double *v,
- const double *covinv, double *dimbuf1,
- double *dimbuf2, int n) {
- int i, j;
- double s;
- const double *covrow = covinv;
- for (i = 0; i < n; i++) {
- dimbuf1[i] = u[i] - v[i];
- }
- for (i = 0; i < n; i++) {
- covrow = covinv + (i * n);
- s = 0.0;
- for (j = 0; j < n; j++) {
- s += dimbuf1[j] * covrow[j];
- }
- dimbuf2[i] = s;
- }
- s = 0.0;
- for (i = 0; i < n; i++) {
- s += dimbuf1[i] * dimbuf2[i];
- }
- return sqrt(s);
-}
-
-double hamming_distance(const double *u, const double *v, int n) {
- int i = 0;
- double s = 0.0;
- for (i = 0; i < n; i++) {
- s = s + (u[i] != v[i]);
- }
- return s / (double)n;
-}
-
-double hamming_distance_bool(const char *u, const char *v, int n) {
- int i = 0;
- double s = 0.0;
- for (i = 0; i < n; i++) {
- s = s + (u[i] != v[i]);
- }
- return s / (double)n;
-}
-
-double yule_distance_bool(const char *u, const char *v, int n) {
- int i = 0;
- int ntt = 0, nff = 0, nft = 0, ntf = 0;
- for (i = 0; i < n; i++) {
- ntt += (u[i] && v[i]);
- ntf += (u[i] && !v[i]);
- nft += (!u[i] && v[i]);
- nff += (!u[i] && !v[i]);
- }
- return (2.0 * ntf * nft) / (double)(ntt * nff + ntf * nft);
-}
-
-double matching_distance_bool(const char *u, const char *v, int n) {
- int i = 0;
- int nft = 0, ntf = 0;
- for (i = 0; i < n; i++) {
- ntf += (u[i] && !v[i]);
- nft += (!u[i] && v[i]);
- }
- return (double)(ntf + nft) / (double)(n);
-}
-
-double dice_distance_bool(const char *u, const char *v, int n) {
- int i = 0;
- int ntt = 0, nft = 0, ntf = 0;
- for (i = 0; i < n; i++) {
- ntt += (u[i] && v[i]);
- ntf += (u[i] && !v[i]);
- nft += (!u[i] && v[i]);
- }
- return (double)(nft + ntf) / (double)(2.0 * ntt + ntf + nft);
-}
-
-
-double rogerstanimoto_distance_bool(const char *u, const char *v, int n) {
- int i = 0;
- int ntt = 0, nff = 0, nft = 0, ntf = 0;
- for (i = 0; i < n; i++) {
- ntt += (u[i] && v[i]);
- ntf += (u[i] && !v[i]);
- nft += (!u[i] && v[i]);
- nff += (!u[i] && !v[i]);
- }
- return (2.0 * (ntf + nft)) / ((double)ntt + nff + (2.0 * (ntf + nft)));
-}
-
-double russellrao_distance_bool(const char *u, const char *v, int n) {
- int i = 0;
- /** int nff = 0, nft = 0, ntf = 0;**/
- int ntt = 0;
- for (i = 0; i < n; i++) {
- /** nff += (!u[i] && !v[i]);
- ntf += (u[i] && !v[i]);
- nft += (!u[i] && v[i]);**/
- ntt += (u[i] && v[i]);
- }
- /** return (double)(ntf + nft + nff) / (double)n;**/
- return (double) (n - ntt) / (double) n;
-}
-
-static inline double kulsinski_distance_bool(const char *u, const char *v, int n) {
- int _i = 0;
- int ntt = 0, nft = 0, ntf = 0, nff = 0;
- for (_i = 0; _i < n; _i++) {
- ntt += (u[_i] && v[_i]);
- ntf += (u[_i] && !v[_i]);
- nft += (!u[_i] && v[_i]);
- nff += (!u[_i] && !v[_i]);
- }
- return ((double)(ntf + nft - ntt + n)) / ((double)(ntf + nft + n));
-}
-
-static inline double sokalsneath_distance_bool(const char *u, const char *v, int n) {
- int _i = 0;
- int ntt = 0, nft = 0, ntf = 0;
- for (_i = 0; _i < n; _i++) {
- ntt += (u[_i] && v[_i]);
- ntf += (u[_i] && !v[_i]);
- nft += (!u[_i] && v[_i]);
- }
- return (2.0 * (ntf + nft))/(2.0 * (ntf + nft) + ntt);
-}
-
-static inline double sokalmichener_distance_bool(const char *u, const char *v, int n) {
- int _i = 0;
- int ntt = 0, nft = 0, ntf = 0, nff = 0;
- for (_i = 0; _i < n; _i++) {
- ntt += (u[_i] && v[_i]);
- nff += (!u[_i] && !v[_i]);
- ntf += (u[_i] && !v[_i]);
- nft += (!u[_i] && v[_i]);
- }
- return (2.0 * (ntf + nft))/(2.0 * (ntf + nft) + ntt + nff);
-}
-
-double jaccard_distance(const double *u, const double *v, int n) {
- int i = 0;
- double denom = 0.0, num = 0.0;
- for (i = 0; i < n; i++) {
- num += (u[i] != v[i]) && ((u[i] != 0.0) || (v[i] != 0.0));
- denom += (u[i] != 0.0) || (v[i] != 0.0);
- }
- return num / denom;
-}
-
-double jaccard_distance_bool(const char *u, const char *v, int n) {
- int i = 0;
- double num = 0.0, denom = 0.0;
- for (i = 0; i < n; i++) {
- num += (u[i] != v[i]) && ((u[i] != 0) || (v[i] != 0));
- denom += (u[i] != 0) || (v[i] != 0);
- }
- return num / denom;
-}
-
-double dot_product(const double *u, const double *v, int n) {
- int i;
- double s = 0.0;
- for (i = 0; i < n; i++) {
- s += u[i] * v[i];
- }
- return s;
-}
-
-double cosine_distance(const double *u, const double *v, int n,
- const double nu, const double nv) {
- return 1.0 - (dot_product(u, v, n) / (nu * nv));
-}
-
-double seuclidean_distance(const double *var,
- const double *u, const double *v, int n) {
- int i = 0;
- double s = 0.0, d;
- for (i = 0; i < n; i++) {
- d = u[i] - v[i];
- s = s + (d * d) / var[i];
- }
- return sqrt(s);
-}
-
-double city_block_distance(const double *u, const double *v, int n) {
- int i = 0;
- double s = 0.0, d;
- for (i = 0; i < n; i++) {
- d = fabs(u[i] - v[i]);
- s = s + d;
- }
- return s;
-}
-
-double minkowski_distance(const double *u, const double *v, int n, double p) {
- int i = 0;
- double s = 0.0, d;
- for (i = 0; i < n; i++) {
- d = fabs(u[i] - v[i]);
- s = s + pow(d, p);
- }
- return pow(s, 1.0 / p);
-}
-
-void compute_mean_vector(double *res, const double *X, int m, int n) {
- int i, j;
- const double *v;
- for (i = 0; i < n; i++) {
- res[i] = 0.0;
- }
- for (j = 0; j < m; j++) {
-
- v = X + (j * n);
- for (i = 0; i < n; i++) {
- res[i] += v[i];
- }
- }
- for (i = 0; i < n; i++) {
- res[i] /= (double)m;
- }
-}
-
-void pdist_euclidean(const double *X, double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = euclidean_distance(u, v, n);
- }
- }
-}
-
-void pdist_mahalanobis(const double *X, const double *covinv,
- double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- double *dimbuf1, *dimbuf2;
- dimbuf1 = (double*)malloc(sizeof(double) * 2 * n);
- dimbuf2 = dimbuf1 + n;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = mahalanobis_distance(u, v, covinv, dimbuf1, dimbuf2, n);
- }
- }
- dimbuf2 = 0;
- free(dimbuf1);
-}
-
-void pdist_bray_curtis(const double *X, double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = bray_curtis_distance(u, v, n);
- }
- }
-}
-
-void pdist_canberra(const double *X, double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = canberra_distance(u, v, n);
- }
- }
-}
-
-void pdist_hamming(const double *X, double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = hamming_distance(u, v, n);
- }
- }
-}
-
-void pdist_hamming_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = hamming_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_jaccard(const double *X, double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = jaccard_distance(u, v, n);
- }
- }
-}
-
-void pdist_jaccard_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = jaccard_distance_bool(u, v, n);
- }
- }
-}
-
-
-void pdist_chebyshev(const double *X, double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = chebyshev_distance(u, v, n);
- }
- }
-}
-
-void pdist_cosine(const double *X, double *dm, int m, int n, const double *norms) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = cosine_distance(u, v, n, norms[i], norms[j]);
- }
- }
-}
-
-void pdist_seuclidean(const double *X, const double *var,
- double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = seuclidean_distance(var, u, v, n);
- }
- }
-}
-
-void pdist_city_block(const double *X, double *dm, int m, int n) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = city_block_distance(u, v, n);
- }
- }
-}
-
-void pdist_minkowski(const double *X, double *dm, int m, int n, double p) {
- int i, j;
- const double *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = minkowski_distance(u, v, n, p);
- }
- }
-}
-
-void pdist_yule_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = yule_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_matching_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = matching_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_dice_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = dice_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_rogerstanimoto_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = rogerstanimoto_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_russellrao_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = russellrao_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_kulsinski_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = kulsinski_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_sokalsneath_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = sokalsneath_distance_bool(u, v, n);
- }
- }
-}
-
-void pdist_sokalmichener_bool(const char *X, double *dm, int m, int n) {
- int i, j;
- const char *u, *v;
- double *it = dm;
- for (i = 0; i < m; i++) {
- for (j = i + 1; j < m; j++, it++) {
- u = X + (n * i);
- v = X + (n * j);
- *it = sokalmichener_distance_bool(u, v, n);
- }
- }
-}
-
void chopmins(int *ind, int mini, int minj, int np) {
int i;
for (i = mini; i < minj - 1; i++) {
Modified: trunk/scipy/cluster/src/hierarchy.h
===================================================================
--- trunk/scipy/cluster/src/hierarchy.h 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/src/hierarchy.h 2008-06-09 05:55:44 UTC (rev 4417)
@@ -34,8 +34,8 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef _CPY_CLUSTER_H
-#define _CPY_CLUSTER_H
+#ifndef _CPY_HIERARCHY_H
+#define _CPY_HIERARCHY_H
#define CPY_LINKAGE_SINGLE 0
#define CPY_LINKAGE_COMPLETE 1
@@ -89,35 +89,9 @@
void dist_to_squareform_from_vector(double *M, const double *v, int n);
void dist_to_vector_from_squareform(const double *M, double *v, int n);
-void pdist_euclidean(const double *X, double *dm, int m, int n);
-void pdist_seuclidean(const double *X,
- const double *var, double *dm, int m, int n);
-void pdist_mahalanobis(const double *X, const double *covinv,
- double *dm, int m, int n);
-void pdist_bray_curtis(const double *X, double *dm, int m, int n);
-void pdist_canberra(const double *X, double *dm, int m, int n);
-void pdist_hamming(const double *X, double *dm, int m, int n);
-void pdist_hamming_bool(const char *X, double *dm, int m, int n);
-void pdist_city_block(const double *X, double *dm, int m, int n);
-void pdist_cosine(const double *X, double *dm, int m, int n, const double *norms);
-void pdist_chebyshev(const double *X, double *dm, int m, int n);
-void pdist_jaccard(const double *X, double *dm, int m, int n);
-void pdist_jaccard_bool(const char *X, double *dm, int m, int n);
-void pdist_kulsinski_bool(const char *X, double *dm, int m, int n);
-void pdist_minkowski(const double *X, double *dm, int m, int n, double p);
-void pdist_yule_bool(const char *X, double *dm, int m, int n);
-void pdist_matching_bool(const char *X, double *dm, int m, int n);
-void pdist_dice_bool(const char *X, double *dm, int m, int n);
-void pdist_rogerstanimoto_bool(const char *X, double *dm, int m, int n);
-void pdist_russellrao_bool(const char *X, double *dm, int m, int n);
-void pdist_sokalmichener_bool(const char *X, double *dm, int m, int n);
-void pdist_sokalsneath_bool(const char *X, double *dm, int m, int n);
-
void inconsistency_calculation(const double *Z, double *R, int n, int d);
void inconsistency_calculation_alt(const double *Z, double *R, int n, int d);
-double dot_product(const double *u, const double *v, int n);
-
void chopmins(int *ind, int mini, int minj, int np);
void chopmins_ns_i(double *ind, int mini, int np);
void chopmins_ns_ij(double *ind, int mini, int minj, int np);
Modified: trunk/scipy/cluster/src/hierarchy_wrap.c
===================================================================
--- trunk/scipy/cluster/src/hierarchy_wrap.c 2008-06-07 08:13:02 UTC (rev 4416)
+++ trunk/scipy/cluster/src/hierarchy_wrap.c 2008-06-09 05:55:44 UTC (rev 4417)
@@ -332,18 +332,6 @@
return Py_BuildValue("d", 0.0);
}
-extern PyObject *dot_product_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *d1_, *d2_;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &d1_,
- &PyArray_Type, &d2_)) {
- return 0;
- }
- return Py_BuildValue("d", dot_product((const double*)d1_->data,
- (const double*)d2_->data,
- d1_->dimensions[0]));
-}
-
extern PyObject *to_squareform_from_vector_wrap(PyObject *self, PyObject *args) {
PyArrayObject *M_, *v_;
int n;
@@ -382,459 +370,6 @@
return Py_BuildValue("d", 0.0);
}
-extern PyObject *pdist_euclidean_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_euclidean(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_canberra_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_canberra(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_bray_curtis_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_bray_curtis(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-
-extern PyObject *pdist_mahalanobis_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *covinv_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- const double *covinv;
- if (!PyArg_ParseTuple(args, "O!O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &covinv_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- covinv = (const double*)covinv_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_mahalanobis(X, covinv, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-
-extern PyObject *pdist_chebyshev_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_chebyshev(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-
-extern PyObject *pdist_cosine_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_, *norms_;
- int m, n;
- double *dm;
- const double *X, *norms;
- if (!PyArg_ParseTuple(args, "O!O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_,
- &PyArray_Type, &norms_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- norms = (const double*)norms_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_cosine(X, dm, m, n, norms);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_seuclidean_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_, *var_;
- int m, n;
- double *dm;
- const double *X, *var;
- if (!PyArg_ParseTuple(args, "O!O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &var_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (double*)X_->data;
- dm = (double*)dm_->data;
- var = (double*)var_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_seuclidean(X, var, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_city_block_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_city_block(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_hamming_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_hamming(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_hamming_bool_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const char *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const char*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_hamming_bool(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_jaccard_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const double *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_jaccard(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_jaccard_bool_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const char *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const char*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_jaccard_bool(X, dm, m, n);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-extern PyObject *pdist_minkowski_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm, *X;
- double p;
- if (!PyArg_ParseTuple(args, "O!O!d",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_,
- &p)) {
- return 0;
- }
- else {
- X = (double*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_minkowski(X, dm, m, n, p);
- }
- return Py_BuildValue("d", 0.0);
-}
-
-
-extern PyObject *pdist_yule_bool_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const char *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const char*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_yule_bool(X, dm, m, n);
- }
- return Py_BuildValue("");
-}
-
-extern PyObject *pdist_matching_bool_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const char *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const char*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_matching_bool(X, dm, m, n);
- }
- return Py_BuildValue("");
-}
-
-extern PyObject *pdist_dice_bool_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const char *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const char*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_dice_bool(X, dm, m, n);
- }
- return Py_BuildValue("");
-}
-
-extern PyObject *pdist_rogerstanimoto_bool_wrap(PyObject *self, PyObject *args) {
- PyArrayObject *X_, *dm_;
- int m, n;
- double *dm;
- const char *X;
- if (!PyArg_ParseTuple(args, "O!O!",
- &PyArray_Type, &X_,
- &PyArray_Type, &dm_)) {
- return 0;
- }
- else {
- X = (const char*)X_->data;
- dm = (double*)dm_->data;
- m = X_->dimensions[0];
- n = X_->dimensions[1];
-
- pdist_rogerstanimoto_bool(X, dm, m, n);
- }
- return Py_BuildValue("");
-}
-
-extern PyObject *pdist_russellrao_bool_wr