[Scipy-svn] r3138 - trunk/Lib/sandbox/pyem
scipy-svn@scip...
scipy-svn@scip...
Mon Jul 2 05:22:54 CDT 2007
Author: cdavid
Date: 2007-07-02 05:22:48 -0500 (Mon, 02 Jul 2007)
New Revision: 3138
Modified:
trunk/Lib/sandbox/pyem/gauss_mix.py
trunk/Lib/sandbox/pyem/gmm_em.py
trunk/Lib/sandbox/pyem/misc.py
Log:
More clean up
Modified: trunk/Lib/sandbox/pyem/gauss_mix.py
===================================================================
--- trunk/Lib/sandbox/pyem/gauss_mix.py 2007-07-02 09:49:12 UTC (rev 3137)
+++ trunk/Lib/sandbox/pyem/gauss_mix.py 2007-07-02 10:22:48 UTC (rev 3138)
@@ -1,5 +1,5 @@
# /usr/bin/python
-# Last Change: Mon Jul 02 06:00 PM 2007 J
+# Last Change: Mon Jul 02 07:00 PM 2007 J
"""Module implementing GM, a class which represents Gaussian mixtures.
@@ -96,11 +96,11 @@
elif mode == 'full':
self.va = N.zeros((k * d, d))
- self.is_valid = False
+ self.__is_valid = False
if d > 1:
- self.is1d = False
+ self.__is1d = False
else:
- self.is1d = True
+ self.__is1d = True
def set_param(self, weights, mu, sigma):
"""Set parameters of the model.
@@ -147,7 +147,7 @@
self.mu = mu
self.va = sigma
- self.is_valid = True
+ self.__is_valid = True
@classmethod
def fromvalues(cls, weights, mu, sigma):
@@ -200,17 +200,17 @@
:Returns:
samples : ndarray
samples in the format one sample per row (nframes, d)."""
- if not self.is_valid:
+ if not self.__is_valid:
raise GmParamError("""Parameters of the model has not been
set yet, please set them using self.set_param()""")
# State index (ie hidden var)
- S = gen_rand_index(self.w, nframes)
- # standard gaussian
- X = randn(nframes, self.d)
+ sti = gen_rand_index(self.w, nframes)
+ # standard gaussian samples
+ x = randn(nframes, self.d)
if self.mode == 'diag':
- X = self.mu[S, :] + X * N.sqrt(self.va[S, :])
+ x = self.mu[sti, :] + x * N.sqrt(self.va[sti, :])
elif self.mode == 'full':
# Faster:
cho = N.zeros((self.k, self.va.shape[1], self.va.shape[1]))
@@ -220,13 +220,13 @@
cho[i] = lin.cholesky(self.va[i*self.d:i*self.d+self.d, :])
for s in range(self.k):
- tmpind = N.where(S == s)[0]
- X[tmpind] = N.dot(X[tmpind], cho[s].transpose()) + self.mu[s]
+ tmpind = N.where(sti == s)[0]
+ x[tmpind] = N.dot(x[tmpind], cho[s].T) + self.mu[s]
else:
raise GmParamError("cov matrix mode not recognized, "\
"this is a bug !")
- return X
+ return x
def conf_ellipses(self, dim = misc.DEF_VIS_DIM, npoints = misc.DEF_ELL_NP,
level = misc.DEF_LEVEL):
@@ -264,31 +264,31 @@
Will plot samples X draw from the mixture model, and
plot the ellipses of equi-probability from the mean with
default level of confidence."""
- if self.is1d:
+ if self.__is1d:
raise ValueError("This function does not make sense for 1d "
"mixtures.")
- if not self.is_valid:
+ if not self.__is_valid:
raise GmParamError("""Parameters of the model has not been
set yet, please set them using self.set_param()""")
- Xe = []
- Ye = []
+ xe = []
+ ye = []
if self.mode == 'diag':
for i in range(self.k):
xe, ye = D.gauss_ell(self.mu[i, :], self.va[i, :],
dim, npoints, level)
- Xe.append(xe)
- Ye.append(ye)
+ xe.append(xe)
+ ye.append(ye)
elif self.mode == 'full':
for i in range(self.k):
xe, ye = D.gauss_ell(self.mu[i, :],
self.va[i*self.d:i*self.d+self.d, :],
dim, npoints, level)
- Xe.append(xe)
- Ye.append(ye)
+ xe.append(xe)
+ ye.append(ye)
- return Xe, Ye
+ return xe, ye
def check_state(self):
"""Returns true if the parameters of the model are valid.
@@ -296,7 +296,7 @@
For Gaussian mixtures, this means weights summing to 1, and variances
to be positive definite.
"""
- if not self.is_valid:
+ if not self.__is_valid:
raise GmParamError("Parameters of the model has not been"\
"set yet, please set them using self.set_param()")
@@ -383,7 +383,7 @@
else:
return N.sum(D.multiple_gauss_den(x, self.mu, self.va) * self.w, 1)
- def pdf_comp(self, x, comp, log = False):
+ def pdf_comp(self, x, cid, log = False):
"""Computes the pdf of the model at given points, at given component.
:Parameters:
@@ -392,7 +392,7 @@
multi-dimensional sample (eg to estimate the pdf at 100
different points in 10 dimension, data's shape should be (100,
20)).
- comp: int
+ cid: int
the component index.
log : bool
If true, returns the log pdf instead of the pdf.
@@ -401,16 +401,17 @@
y : ndarray
the pdf at points x."""
if self.mode == 'diag':
- va = self.va[c]
+ va = self.va[cid]
elif self.mode == 'full':
- va = self.va[c*self.d:(c+1)*self.d]
+ va = self.va[cid*self.d:(cid+1)*self.d]
else:
raise GmParamError("""var mode %s not supported""" % self.mode)
if log:
- D.gauss_den(x, self.mu[c], va, log = True) + N.log(self.w[c])
+ return D.gauss_den(x, self.mu[cid], va, log = True) \
+ + N.log(self.w[cid])
else:
- return D.multiple_gauss_den(x, self.mu[c], va) * self.w[c]
+ return D.multiple_gauss_den(x, self.mu[cid], va) * self.w[cid]
#=================
# Plotting methods
@@ -442,19 +443,19 @@
:SeeAlso:
conf_ellipses is used to compute the ellipses. Use this if you want
to plot with something else than matplotlib."""
- if self.is1d:
+ if self.__is1d:
raise ValueError("This function does not make sense for 1d "
"mixtures.")
- if not self.is_valid:
+ if not self.__is_valid:
raise GmParamError("""Parameters of the model has not been
set yet, please set them using self.set_param()""")
k = self.k
- Xe, Ye = self.conf_ellipses(dim, npoints, level)
+ xe, ye = self.conf_ellipses(dim, npoints, level)
try:
import pylab as P
- return [P.plot(Xe[i], Ye[i], 'r', label='_nolegend_')[0] for i in
+ return [P.plot(xe[i], ye[i], 'r', label='_nolegend_')[0] for i in
range(k)]
except ImportError:
raise GmParamError("matplotlib not found, cannot plot...")
@@ -479,37 +480,29 @@
- h['gpdf'] is the line for the global pdf
- h['conf'] is a list of filling area
"""
- if not self.is1d:
+ if not self.__is1d:
raise ValueError("This function does not make sense for "\
"mixtures which are not unidimensional")
- # This is not optimized at all, may be slow. Should not be
- # difficult to make much faster, but it is late, and I am lazy
- # XXX separete the computation from the plotting
- if not self.d == 1:
- raise GmParamError("the model is not one dimensional model")
from scipy.stats import norm
- nrm = norm(0, 1)
- pval = N.sqrt(self.va[:, 0]) * nrm.ppf((1+level)/2)
+ pval = N.sqrt(self.va[:, 0]) * norm(0, 1).ppf((1+level)/2)
# Compute reasonable min/max for the normal pdf: [-mc * std, mc * std]
# gives the range we are taking in account for each gaussian
mc = 3
std = N.sqrt(self.va[:, 0])
- m = N.amin(self.mu[:, 0] - mc * std)
- M = N.amax(self.mu[:, 0] + mc * std)
+ mi = N.amin(self.mu[:, 0] - mc * std)
+ ma = N.amax(self.mu[:, 0] + mc * std)
np = 500
- x = N.linspace(m, M, np)
- Yf = N.zeros(np)
- Yt = N.zeros(np)
-
+ x = N.linspace(mi, ma, np)
# Prepare the dic of plot handles to return
ks = ['pdf', 'conf', 'gpdf']
hp = dict((i, []) for i in ks)
# Compute the densities
- y = D.multiple_gauss_den(x[:, N.newaxis], self.mu, self.va, log = True) \
+ y = D.multiple_gauss_den(x[:, N.newaxis], self.mu, self.va, \
+ log = True) \
+ N.log(self.w)
yt = self.pdf(x[:, N.newaxis])
@@ -526,11 +519,11 @@
xc = xc[:, N.where(xc<id2)[0]]
# Compute the graph for filling
- Yf = self.pdf_comp(xc, c)
+ yf = self.pdf_comp(xc, c)
xc = N.concatenate(([xc[0]], xc, [xc[-1]]))
- Yf = N.concatenate(([0], Yf, [0]))
- h = P.fill(xc, Yf,
- facecolor = 'b', alpha = 0.1, label='_nolegend_')
+ yf = N.concatenate(([0], yf, [0]))
+ h = P.fill(xc, yf, facecolor = 'b', alpha = 0.1,
+ label='_nolegend_')
hp['conf'].extend(h)
if gpdf:
h = P.plot(x, yt, 'r:', label='_nolegend_')
@@ -540,7 +533,7 @@
raise GmParamError("matplotlib not found, cannot plot...")
def density_on_grid(self, dim = misc.DEF_VIS_DIM, nx = 50, ny = 50,
- nl = 20, maxlevel = 0.95, V = None):
+ nl = 20, maxlevel = 0.95, v = None):
"""Do all the necessary computation for contour plot of mixture's
density.
@@ -567,9 +560,9 @@
Note
----
X, Y, Z and V are as expected by matplotlib contour function."""
- if self.is1d:
+ if self.__is1d:
raise ValueError("This function does not make sense for 1d "
- "mixtures.")
+ "mixtures.")
# Ok, it is a bit gory. Basically, we want to compute the size of the
# grid. We use conf_ellipse, which will return a couple of points for
@@ -579,40 +572,42 @@
# at a given point, one component is largely dominant for its
# contribution to the pdf).
- Xe, Ye = self.conf_ellipses(level = maxlevel, dim = dim)
- ax = [N.min(Xe), N.max(Xe), N.min(Ye), N.max(Ye)]
+ xe, ye = self.conf_ellipses(level = maxlevel, dim = dim)
+ ax = [N.min(xe), N.max(xe), N.min(ye), N.max(ye)]
w = ax[1] - ax[0]
h = ax[3] - ax[2]
- X, Y, lden = self._densityctr(N.linspace(ax[0]-0.2*w, ax[1]+0.2*w, nx), \
- N.linspace(ax[2]-0.2*h, ax[3]+0.2*h, ny), dim = dim)
+ x, y, lden = self._densityctr(N.linspace(ax[0]-0.2*w,
+ ax[1]+0.2*w, nx),
+ N.linspace(ax[2]-0.2*h,
+ ax[3]+0.2*h, ny),
+ dim = dim)
# XXX: how to find "good" values for level ?
- if V is None:
- V = N.linspace(-5, N.max(lden), nl)
- return X, Y, lden, N.array(V)
+ if v is None:
+ v = N.linspace(-5, N.max(lden), nl)
+ return x, y, lden, N.array(v)
def _densityctr(self, rangex, rangey, dim = misc.DEF_VIS_DIM):
"""Helper function to compute density contours on a grid."""
gr = N.meshgrid(rangex, rangey)
- X = gr[0].flatten()
- Y = gr[1].flatten()
- xdata = N.concatenate((X[:, N.newaxis], Y[:, N.newaxis]), axis = 1)
+ x = gr[0].flatten()
+ y = gr[1].flatten()
+ xdata = N.concatenate((x[:, N.newaxis], y[:, N.newaxis]), axis = 1)
dmu = self.mu[:, dim]
dva = self._get_va(dim)
den = GM.fromvalues(self.w, dmu, dva).pdf(xdata, log = True)
den = den.reshape(len(rangey), len(rangex))
- X = gr[0]
- Y = gr[1]
- return X, Y, den
+ return gr[0], gr[1], den
def _get_va(self, dim):
"""Returns variance limited do 2 dimension in tuple dim."""
assert len(dim) == 2
dim = N.array(dim)
if dim.any() < 0 or dim.any() >= self.d:
- raise ValueError("dim elements should be between 0 and dimension"\
- " of the mixture.")
+ raise ValueError("dim elements should be between 0 and dimension"
+ " of the mixture.")
+
if self.mode == 'diag':
return self.va[:, dim]
elif self.mode == 'full':
@@ -636,7 +631,7 @@
msg += " -> %d dimensions\n" % self.d
msg += " -> %d components\n" % self.k
msg += " -> %s covariance \n" % self.mode
- if self.is_valid:
+ if self.__is_valid:
msg += "Has initial values"""
else:
msg += "Has no initial values yet"""
@@ -694,11 +689,11 @@
if not len(w.shape) == 1:
raise GmParamError('weight should be a rank 1 array')
- if N.fabs(N.sum(w) - 1) > misc._MAX_DBL_DEV:
+ if N.fabs(N.sum(w) - 1) > misc.MAX_DBL_DEV:
raise GmParamError('weight does not sum to 1')
# Check that mean and va have the same number of components
- K = len(w)
+ k = len(w)
if N.ndim(mu) < 2:
msg = "mu should be a K,d matrix, and a row vector if only 1 comp"
@@ -708,10 +703,10 @@
only 1 diag comp"""
raise GmParamError(msg)
- (Km, d) = mu.shape
- (Ka, da) = va.shape
+ (km, d) = mu.shape
+ (ka, da) = va.shape
- if not K == Km:
+ if not k == km:
msg = "not same number of component in mean and weights"
raise GmParamError(msg)
@@ -719,15 +714,15 @@
msg = "not same number of dimensions in mean and variances"
raise GmParamError(msg)
- if Km == Ka:
+ if km == ka:
mode = 'diag'
else:
mode = 'full'
- if not Ka == Km*d:
+ if not ka == km*d:
msg = "not same number of dimensions in mean and variances"
raise GmParamError(msg)
- return K, d, mode
+ return k, d, mode
if __name__ == '__main__':
pass
Modified: trunk/Lib/sandbox/pyem/gmm_em.py
===================================================================
--- trunk/Lib/sandbox/pyem/gmm_em.py 2007-07-02 09:49:12 UTC (rev 3137)
+++ trunk/Lib/sandbox/pyem/gmm_em.py 2007-07-02 10:22:48 UTC (rev 3138)
@@ -1,5 +1,5 @@
# /usr/bin/python
-# Last Change: Mon Jul 02 04:00 PM 2007 J
+# Last Change: Mon Jul 02 07:00 PM 2007 J
"""Module implementing GMM, a class to estimate Gaussian mixture models using
EM, and EM, a class which use GMM instances to estimate models parameters using
@@ -45,6 +45,8 @@
return self.message
class MixtureModel(object):
+ """Class to model mixture """
+ # XXX: Is this really needed ?
pass
class ExpMixtureModel(MixtureModel):
@@ -59,12 +61,11 @@
instanciated object can be sampled, trained by EM. """
def init_kmean(self, data, niter = 5):
""" Init the model with kmean."""
- k = self.gm.k
- d = self.gm.d
- init = data[0:k, :]
+ k = self.gm.k
+ d = self.gm.d
+ init = data[0:k, :]
- # XXX: This is bogus initialization should do better (in kmean or here,
- # do not know yet): should
+ # XXX: This is bogus initialization should do better (in kmean with CV)
(code, label) = kmean(data, init, niter, minit = 'matrix')
w = N.ones(k) / k
@@ -89,10 +90,10 @@
def init_random(self, data):
""" Init the model at random."""
- k = self.gm.k
- d = self.gm.d
- w = N.ones(k) / k
- mu = randn(k, d)
+ k = self.gm.k
+ d = self.gm.d
+ w = N.ones(k) / k
+ mu = randn(k, d)
if self.gm.mode == 'diag':
va = N.fabs(randn(k, d))
else:
@@ -112,18 +113,12 @@
Useful for testing purpose when reproducability is necessary. This does
nothing but checking that the mixture model has valid initial
values."""
- # We have
try:
self.gm.check_state()
except GmParamError, e:
print "Model is not properly initalized, cannot init EM."
- raise "Message was %s" % str(e)
+ raise ValueError("Message was %s" % str(e))
- # TODO:
- # - format of parameters ? For variances, list of variances matrix,
- # keep the current format, have 3d matrices ?
- # - To handle the different modes, we could do something "fancy" such as
- # replacing methods, to avoid checking cases everywhere and unconsistency.
def __init__(self, gm, init = 'kmean'):
"""Initialize a mixture model.
@@ -183,7 +178,8 @@
This is basically the E step of EM for finite mixtures."""
# compute the gaussian pdf
- tgd = densities.multiple_gauss_den(data, self.gm.mu, self.gm.va, log = True)
+ tgd = densities.multiple_gauss_den(data, self.gm.mu,
+ self.gm.va, log = True)
# multiply by the weight
tgd += N.log(self.gm.w)
# Normalize to get a (log) pdf
@@ -420,6 +416,33 @@
self.pval = pval
def train(self, data, model, maxiter = 20, thresh = 1e-5):
+ """Train a model using EM.
+
+ Train a model using data, and stops when the likelihood increase
+ between two consecutive iteration fails behind a threshold, or when the
+ number of iterations > niter, whichever comes first
+
+ :Parameters:
+ data : ndarray
+ contains the observed features, one row is one frame, ie one
+ observation of dimension d
+ model : GMM
+ GMM instance.
+ maxiter : int
+ maximum number of iterations
+ thresh : threshold
+ if the slope of the likelihood falls below this value, the
+ algorithm stops.
+
+ :Returns:
+ likelihood : ndarray
+ one value per iteration.
+
+ Note
+ ----
+ The model is trained, and its parameters updated accordingly, eg the
+ results are put in the GMM instance.
+ """
mode = model.gm.mode
# Build regularizer
@@ -476,7 +499,6 @@
number of point).
diagonal variance version"""
- d = va.shape[1]
k = va.shape[0]
for i in range(k):
@@ -490,8 +512,8 @@
k = va.shape[0] / d
for i in range(k):
- va[i*d:i*d+d,:] *= 1. / (1 + np)
- va[i*d:i*d+d,:] += np / (1. + np) * prior
+ va[i*d:i*d+d, :] *= 1. / (1 + np)
+ va[i*d:i*d+d, :] += np / (1. + np) * prior
if __name__ == "__main__":
pass
Modified: trunk/Lib/sandbox/pyem/misc.py
===================================================================
--- trunk/Lib/sandbox/pyem/misc.py 2007-07-02 09:49:12 UTC (rev 3137)
+++ trunk/Lib/sandbox/pyem/misc.py 2007-07-02 10:22:48 UTC (rev 3138)
@@ -1,4 +1,4 @@
-# Last Change: Mon Jul 02 04:00 PM 2007 J
+# Last Change: Mon Jul 02 06:00 PM 2007 J
#========================================================
# Constants used throughout the module (def args, etc...)
@@ -15,7 +15,7 @@
# max deviation allowed when comparing double (this is actually stupid,
# I should actually use a number of decimals)
-_MAX_DBL_DEV = 1e-10
+MAX_DBL_DEV = 1e-10
## # max conditional number allowed
## _MAX_COND = 1e8
More information about the Scipy-svn
mailing list