From 2b066646d9ce61737a8dcb3d90759020fe753330 Mon Sep 17 00:00:00 2001 From: rdevon Date: Tue, 28 Oct 2014 11:50:06 -0600 Subject: [PATCH 1/5] Moved DBM to sandbox/dbm_v2 and stripped down. dbm_v2 is an attempt to refactor and redo some of the core aspects of DBM. Currently it is a stripped down version of models/dbm to support only: RBM Updown Inference Gibbs sampling BaseCD VariationalCD I have included some tests for RBM CD to get things moving. There are some changes found in other PRs, namely to: dbm_cost.py: refactored a little bit to make it cleaner and get BaseCD working. dbm.py: introduced RBM and a method for DBM to intialize its own chains. and a few other smaller changes. --- pylearn2/sandbox/dbm_v2/__init__.py | 270 ++ pylearn2/sandbox/dbm_v2/dbm.py | 822 ++++ pylearn2/sandbox/dbm_v2/dbm_cost.py | 660 +++ .../sandbox/dbm_v2/inference_procedure.py | 484 ++ pylearn2/sandbox/dbm_v2/ising.py | 1864 ++++++++ pylearn2/sandbox/dbm_v2/layer.py | 4124 +++++++++++++++++ pylearn2/sandbox/dbm_v2/sampling_procedure.py | 210 + pylearn2/sandbox/dbm_v2/test_dbm.py | 1214 +++++ 8 files changed, 9648 insertions(+) create mode 100644 pylearn2/sandbox/dbm_v2/__init__.py create mode 100755 pylearn2/sandbox/dbm_v2/dbm.py create mode 100644 pylearn2/sandbox/dbm_v2/dbm_cost.py create mode 100644 pylearn2/sandbox/dbm_v2/inference_procedure.py create mode 100644 pylearn2/sandbox/dbm_v2/ising.py create mode 100644 pylearn2/sandbox/dbm_v2/layer.py create mode 100644 pylearn2/sandbox/dbm_v2/sampling_procedure.py create mode 100644 pylearn2/sandbox/dbm_v2/test_dbm.py diff --git a/pylearn2/sandbox/dbm_v2/__init__.py b/pylearn2/sandbox/dbm_v2/__init__.py new file mode 100644 index 0000000000..aacd0d2589 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/__init__.py @@ -0,0 +1,270 @@ +""" +This module contains functionality related to deep Boltzmann machines. +They are implemented generically in order to make it easy to support +convolution versions, etc. + +This code was moved piece by piece incrementally over time from Ian's +private research repository, and it is altogether possible that he +broke something or left out a piece while moving it. If you find any +problems please don't hesitate to contact pylearn-dev and we will fix +the problem and add a unit test. +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import logging +import numpy as np +import sys + +from theano.compat.python2x import OrderedDict + +from pylearn2.expr.nnet import inverse_sigmoid_numpy +from pylearn2.blocks import Block +from pylearn2.utils import block_gradient +from pylearn2.utils.rng import make_theano_rng + + +logger = logging.getLogger(__name__) + +logger.debug("DBM changing the recursion limit.") +# We need this to be high enough that the big theano graphs we make +# when unrolling inference don't cause python to complain. +# python intentionally declares stack overflow well before the stack +# segment is actually exceeded. But we can't make this value too big +# either, or we'll get seg faults when the python interpreter really +# does go over the stack segment. +# IG encountered seg faults on eos3 (a machine at LISA labo) when using +# 50000 so for now it is set to 40000. 
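+# (For reference, CPython's default recursion limit is only 1000, which is
+# far too small for the graphs built when unrolling inference.)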
+# I think the actual safe recursion limit can't be predicted in advance +# because you don't know how big of a stack frame each function will +# make, so there is not really a "correct" way to do this. Really the +# python interpreter should provide an option to raise the error +# precisely when you're going to exceed the stack segment. +sys.setrecursionlimit(40000) + + +def init_sigmoid_bias_from_marginals(dataset, use_y = False): + """ + Returns b such that sigmoid(b) has the same marginals as the + data. Assumes dataset contains a design matrix. If use_y is + true, sigmoid(b) will have the same marginals as the targets, + rather than the features. + + Parameters + ---------- + dataset : WRITEME + use_y : WRITEME + """ + if use_y: + X = dataset.y + else: + X = dataset.get_design_matrix() + return init_sigmoid_bias_from_array(X) + +def init_sigmoid_bias_from_array(arr): + """ + .. todo:: + + WRITEME + """ + X = arr + if not (X.max() == 1): + raise ValueError("Expected design matrix to consist entirely " + "of 0s and 1s, but maximum value is "+str(X.max())) + if X.min() != 0.: + raise ValueError("Expected design matrix to consist entirely of " + "0s and 1s, but minimum value is "+str(X.min())) + # removed this check so we can initialize the marginals + # with a dataset of bernoulli params + # assert not np.any( (X > 0.) * (X < 1.) ) + + mean = X.mean(axis=0) + + mean = np.clip(mean, 1e-7, 1-1e-7) + + init_bias = inverse_sigmoid_numpy(mean) + + return init_bias + + +class DBMSampler(Block): + """ + A Block used to sample from the last layer of a DBM with one hidden layer. + + Parameters + ---------- + dbm : WRITEME + """ + def __init__(self, dbm): + super(DBMSampler, self).__init__() + self.theano_rng = make_theano_rng(None, 2012+10+14, which_method="binomial") + self.dbm = dbm + assert len(self.dbm.hidden_layers) == 1 + + def __call__(self, inputs): + """ + .. todo:: + + WRITEME + """ + space = self.dbm.get_input_space() + num_examples = space.batch_size(inputs) + + last_layer = self.dbm.get_all_layers()[-1] + layer_to_chains = self.dbm.make_layer_to_symbolic_state( + num_examples, self.theano_rng) + # The examples are used to initialize the visible layer's chains + layer_to_chains[self.dbm.visible_layer] = inputs + + layer_to_clamp = OrderedDict([(self.dbm.visible_layer, True)]) + layer_to_chains = self.dbm.mcmc_steps(layer_to_chains, self.theano_rng, + layer_to_clamp=layer_to_clamp, + num_steps=1) + + rval = layer_to_chains[last_layer] + rval = last_layer.upward_state(rval) + + return rval + + def get_input_space(self): + """ + .. todo:: + + WRITEME + """ + return self.dbm.get_input_space() + + def get_output_space(self): + """ + .. todo:: + + WRITEME + """ + return self.dbm.get_output_space() + + +def stitch_rbms(batch_size, rbm_list, niter, inference_procedure=None, + targets=False): + """ + Returns a DBM initialized with pre-trained RBMs, with weights and biases + initialized according to R. Salakhutdinov's policy. + + This method assumes the RBMs were trained normally. It divides the first + and last hidden layer's weights by two and initialized a hidden layer's + biases as the mean of its biases and the biases of the visible layer of the + RBM above it. + """ + assert len(rbm_list) > 1 + + # For intermediary hidden layers, there are two set of biases to choose + # from: those from the hidden layer of the given RBM, and those from + # the visible layer of the RBM above it. As in R. Salakhutdinov's code, + # we handle this by computing the mean of those two sets of biases. 
+ for this_rbm, above_rbm in zip(rbm_list[:-1], rbm_list[1:]): + hidden_layer = this_rbm.hidden_layers[0] + visible_layer = above_rbm.visible_layer + new_biases = 0.5 * (hidden_layer.get_biases() + + visible_layer.get_biases()) + hidden_layer.set_biases(new_biases) + + visible_layer = rbm_list[0].visible_layer + visible_layer.dbm = None + + hidden_layers = [] + + for rbm in rbm_list: + # Make sure all DBM have only one hidden layer, except for the last + # one, which can have an optional target layer + if rbm == rbm_list[-1]: + if targets: + assert len(rbm.hidden_layers) == 2 + else: + assert len(rbm.hidden_layers) == 1 + else: + assert len(rbm.hidden_layers) == 1 + + hidden_layers = hidden_layers + rbm.hidden_layers + + for hidden_layer in hidden_layers: + hidden_layer.dbm = None + + # Divide first and last hidden layer's weights by two, as described + # in R. Salakhutdinov's paper (equivalent to training with RBMs with + # doubled weights) + first_hidden_layer = hidden_layers[-1] + if targets: + last_hidden_layer = hidden_layers[-2] + else: + last_hidden_layer = hidden_layers[-1] + first_hidden_layer.set_weights(0.5 * first_hidden_layer.get_weights()) + last_hidden_layer.set_weights(0.5 * last_hidden_layer.get_weights()) + + return DBM(batch_size, visible_layer, hidden_layers, niter, + inference_procedure) + + +def flatten(l): + """ + Turns a nested graph of lists/tuples/other objects + into a list of objects. + + Parameters + ---------- + l : WRITEME + + Returns + ------- + WRITEME + """ + if isinstance(l, (list, tuple)): + rval = [] + for elem in l: + if isinstance(elem, (list, tuple)): + rval.extend(flatten(elem)) + else: + rval.append(elem) + else: + return [l] + return rval + +def block(l): + """ + .. todo:: + + WRITEME + """ + new = [] + for elem in l: + if isinstance(elem, (list, tuple)): + new.append(block(elem)) + else: + new.append(block_gradient(elem)) + if isinstance(l, tuple): + return tuple(new) + return new + + +# Make known modules inside this package +# this needs to come after e.g. 
flatten(), since DBM depends on flatten() +from pylearn2.models.dbm.dbm import DBM +from pylearn2.models.dbm.inference_procedure import BiasInit +from pylearn2.models.dbm.inference_procedure import InferenceProcedure +from pylearn2.models.dbm.inference_procedure import MoreConsistent +from pylearn2.models.dbm.inference_procedure import MoreConsistent2 +from pylearn2.models.dbm.inference_procedure import SuperWeightDoubling +from pylearn2.models.dbm.inference_procedure import WeightDoubling +from pylearn2.models.dbm.layer import BinaryVector +from pylearn2.models.dbm.layer import BinaryVectorMaxPool +from pylearn2.models.dbm.layer import BVMP_Gaussian +from pylearn2.models.dbm.layer import CompositeLayer +from pylearn2.models.dbm.layer import ConvMaxPool +from pylearn2.models.dbm.layer import ConvC01B_MaxPool +from pylearn2.models.dbm.layer import GaussianVisLayer +from pylearn2.models.dbm.layer import HiddenLayer +from pylearn2.models.dbm.layer import Layer +from pylearn2.models.dbm.layer import VisibleLayer +from pylearn2.models.dbm.layer import Softmax +from pylearn2.models.dbm.sampling_procedure import SamplingProcedure diff --git a/pylearn2/sandbox/dbm_v2/dbm.py b/pylearn2/sandbox/dbm_v2/dbm.py new file mode 100755 index 0000000000..07613417fe --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/dbm.py @@ -0,0 +1,822 @@ +""" +The main DBM class +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin", "Devon Hjelm"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import functools +import logging +import numpy as np +import warnings + +from theano import tensor as T, config +from theano.compat import OrderedDict +from theano.sandbox.rng_mrg import MRG_RandomStreams + +from pylearn2.models import Model +from pylearn2.sandbox.dbm_v2 import flatten +from pylearn2.sandbox.dbm_v2.inference_procedure import UpDown +from pylearn2.sandbox.dbm_v2.sampling_procedure import GibbsEvenOdd +from pylearn2.sandbox.dbm_v2.layer import Softmax +from pylearn2.utils import safe_zip, safe_izip +from pylearn2.utils.rng import make_np_rng + + +logger = logging.getLogger(__name__) + + +class DBM(Model): + """ + A deep Boltzmann machine. + + See "Deep Boltzmann Machines" by Ruslan Salakhutdinov and Geoffrey Hinton + for details. + + Parameters + ---------- + batch_size : int + The batch size the model should use. Some convolutional + LinearTransforms require a compile-time hardcoded batch size, + otherwise this would not be part of the model specification. + visible_layer : dbm.VisibleLayer + The visible layer of the DBM. + hidden_layers : list of dbm.HiddenLayer + The hidden layers. A list of HiddenLayer objects. The first + layer in the list is connected to the visible layer. + niter : int + Number of mean field iterations for variational inference + for the positive phase. 
+ sampling_procedure : WRITEME + inference_procedure : WRITEME + """ + + def __init__(self, batch_size, visible_layer, hidden_layers, niter, + sampling_procedure=None, inference_procedure=None): + self.__dict__.update(locals()) + del self.self + assert len(hidden_layers) >= 1 + + if len(hidden_layers) > 1 and niter <= 1: + raise ValueError("with more than one hidden layer, niter needs to " + "be greater than 1; otherwise mean field won't " + "work properly.") + + self.setup_rng() + self.layer_names = set() + self.visible_layer.set_dbm(self) + for layer in hidden_layers: + assert layer.get_dbm() is None + layer.set_dbm(self) + assert layer.layer_name not in self.layer_names + self.layer_names.add(layer.layer_name) + self._update_layer_input_spaces() + self.force_batch_size = batch_size + self.freeze_set = set([]) + if inference_procedure is None: + self.setup_inference_procedure() + self.inference_procedure.set_dbm(self) + if sampling_procedure is None: + self.setup_sampling_procedure() + self.sampling_procedure.set_dbm(self) + + def get_all_layers(self): + """ + Returns all layers of the DBM in order of visible, hidden. + """ + return [self.visible_layer] + self.hidden_layers + + def energy(self, V, hidden): + """ + Point energy of the DBM. + Calculated from the states of each unit. + + Parameters + ---------- + V : tensor_like + Theano batch of visible unit observations (must be SAMPLES, not + mean field parameters) + hidden : list + List, one element per hidden layer, of batches of samples (must + be SAMPLES, not mean field parameters) + + Returns + ------- + rval : tensor_like + Vector containing the energy of each sample + + Notes + ----- + Applying this function to non-sample theano variables is not guaranteed + to give you an expected energy in general, so don't use this that way. + """ + + terms = [] + + terms.append(self.visible_layer.expected_energy_term(state=V, + average=False)) + + # This condition could be relaxed, but current code assumes it + assert len(self.hidden_layers) > 0 + + terms.append(self.hidden_layers[0].expected_energy_term( + state_below=self.visible_layer.upward_state(V), + state=hidden[0], average_below=False, average=False)) + + for i in xrange(1, len(self.hidden_layers)): + layer = self.hidden_layers[i] + samples_below = hidden[i-1] + layer_below = self.hidden_layers[i-1] + samples_below = layer_below.upward_state(samples_below) + samples = hidden[i] + terms.append(layer.expected_energy_term(state_below=samples_below, + state=samples, average_below=False, average=False)) + + assert len(terms) > 0 + + rval = reduce(lambda x, y: x + y, terms) + + assert rval.ndim == 1 + return rval + + def mf(self, *args, **kwargs): + """ + Mean field inference of model. + + Performs the inference procedure on the model. + + Parameters + ---------- + *args: TODO + **kwargs: TODO + """ + + self.setup_inference_procedure() + return self.inference_procedure.mf(*args, **kwargs) + + def expected_energy(self, V, mf_hidden): + """ + Expected energy of the DBM given a visible vector and the MF updates. + + Parameters + ---------- + V : tensor_like + Theano batch of visible unit observations (must be SAMPLES, not + mean field parameters: the random variables in the expectation + are the hiddens only) + mf_hidden : list + List, one element per hidden layer, of batches of variational + parameters (must be VARIATIONAL PARAMETERS, not samples. 
Layers + with analytically determined variance parameters for their mean + field parameters will use those to integrate over the variational + distribution, so it's not generally the same thing as measuring + the energy at a point.) + + Returns + ------- + rval : tensor_like + Vector containing the expected energy of each example under the + corresponding variational distribution. + """ + + self.visible_layer.space.validate(V) + assert isinstance(mf_hidden, (list, tuple)) + assert len(mf_hidden) == len(self.hidden_layers) + + terms = [] + + terms.append(self.visible_layer.expected_energy_term(state=V, + average=False)) + + # This condition could be relaxed, but current code assumes it + assert len(self.hidden_layers) > 0 + + terms.append(self.hidden_layers[0].expected_energy_term( + state_below=self.visible_layer.upward_state(V), + average_below=False, state=mf_hidden[0], average=True)) + + for i in xrange(1, len(self.hidden_layers)): + layer = self.hidden_layers[i] + layer_below = self.hidden_layers[i-1] + mf_below = mf_hidden[i-1] + mf_below = layer_below.upward_state(mf_below) + mf = mf_hidden[i] + terms.append(layer.expected_energy_term(state_below=mf_below, + state=mf, average_below=True, average=True)) + + assert len(terms) > 0 + + rval = reduce(lambda x, y: x + y, terms) + + assert rval.ndim == 1 + return rval + + def setup_rng(self): + """ + Function to set up the random number generator. + """ + self.rng = make_np_rng(None, [2012, 10, 17], which_method="uniform") + + def setup_inference_procedure(self): + """ + Sets up the inference procedure for the DBM. + """ + if not hasattr(self, 'inference_procedure') or \ + self.inference_procedure is None: + if len(self.hidden_layers) == 1: + self.inference_procedure = UpDown() + else: + #self.inference_procedure = WeightDoubling() + self.inference_procedure = UpDown() + self.inference_procedure.set_dbm(self) + + if len(self.hidden_layers) == 1: + try: + self.inference_procedure.is_rbm_compatible() + except NotImplementedError: + warnings.warn("Inference procedure %r may have unexpected" + "behavior when used with one hidden layer (RBM)." + "See models/dbn/inference_procedure.py for" + "details." % type(self.inference_procedure)) + + def setup_sampling_procedure(self): + """ + Sets up the sampling procedure. + Defaults to GibbsEvenOdd + """ + if not hasattr(self, 'sampling_procedure') or \ + self.sampling_procedure is None: + self.sampling_procedure = GibbsEvenOdd() + self.sampling_procedure.set_dbm(self) + + def get_output_space(self): + """ + Returns the output space of the top hidden layer. + """ + return self.hidden_layers[-1].get_output_space() + + def _update_layer_input_spaces(self): + """ + Tells each layer what its input space should be. + + Notes + ----- + This usually resets the layer's parameters! + """ + visible_layer = self.visible_layer + hidden_layers = self.hidden_layers + + self.hidden_layers[0].set_input_space(visible_layer.space) + for i in xrange(1, len(hidden_layers)): + hidden_layers[i].set_input_space( + hidden_layers[i-1].get_output_space()) + + for layer in self.get_all_layers(): + layer.finalize_initialization() + + def add_layers(self, layers): + """ + Add new layers on top of the existing hidden layers + + Parameters + ---------- + layers : dbm.HiddenLayer + Layer to add to DBM. 
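+            (In practice this is a list of HiddenLayer objects; each one is
+            stacked on top of the current top hidden layer, in order.)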
+ """ + + # Patch old pickle files + if not hasattr(self, 'rng'): + self.setup_rng() + + hidden_layers = self.hidden_layers + assert len(hidden_layers) > 0 + for layer in layers: + assert layer.get_dbm() is None + layer.set_dbm(self) + layer.set_input_space(hidden_layers[-1].get_output_space()) + hidden_layers.append(layer) + assert layer.layer_name not in self.layer_names + self.layer_names.add(layer.layer_name) + + def freeze(self, parameter_set): + """ + Freezes the set of parameters. + + Parameters + ---------- + parameter_set: WRITEME + """ + # patch old pickle files + if not hasattr(self, 'freeze_set'): + self.freeze_set = set([]) + + self.freeze_set = self.freeze_set.union(parameter_set) + + def get_params(self): + """ + Returns the parameters of the DBM. + """ + + rval = [] + for param in self.visible_layer.get_params(): + assert param.name is not None + rval = self.visible_layer.get_params() + for layer in self.hidden_layers: + for param in layer.get_params(): + if param.name is None: + raise ValueError("All of your parameters should have " + "names, but one of " + layer.layer_name + + "'s doesn't") + layer_params = layer.get_params() + assert not isinstance(layer_params, set) + for param in layer_params: + if param not in rval: + rval.append(param) + + # Patch pickle files that predate the freeze_set feature + if not hasattr(self, 'freeze_set'): + self.freeze_set = set([]) + + rval = [elem for elem in rval if elem not in self.freeze_set] + + assert all([elem.name is not None for elem in rval]) + + return rval + + def set_batch_size(self, batch_size): + """ + Sets the batch size of the DBM. + + Parameters + ---------- + batch_size: int + The batch size + """ + self.batch_size = batch_size + self.force_batch_size = batch_size + + for layer in self.hidden_layers: + layer.set_batch_size(batch_size) + + if not hasattr(self, 'inference_procedure'): + self.setup_inference_procedure() + self.inference_procedure.set_batch_size(batch_size) + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + self.visible_layer.modify_updates(updates) + for layer in self.hidden_layers: + layer.modify_updates(updates) + + def get_input_space(self): + """ + Returns the input space of the visible layer. + """ + return self.visible_layer.space + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + params = self.get_params() + + for layer in self.hidden_layers + [self.visible_layer]: + contrib = layer.get_lr_scalers() + + # No two layers can contend to scale a parameter + assert not any([key in rval for key in contrib]) + # Don't try to scale anything that's not a parameter + assert all([key in params for key in contrib]) + + rval.update(contrib) + assert all([isinstance(val, float) for val in rval.values()]) + + return rval + + def get_weights(self): + """ + Returns the weights of the bottom hidden layer. + """ + + return self.hidden_layers[0].get_weights() + + def get_weights_view_shape(self): + """ + Returns shape of weight view. + """ + return self.hidden_layers[0].get_weights_view_shape() + + def get_weights_format(self): + """ + Returns the format of the weights as that of the bottom hidden layer. + """ + return self.hidden_layers[0].get_weights_format() + + def get_weights_topo(self): + """ + Returns the topologically formatted version of the weights. + Uses the bottom hidden layer. 
+ """ + return self.hidden_layers[0].get_weights_topo() + + def make_layer_to_state(self, num_examples, rng=None): + """ + Makes and returns a dictionary mapping layers to states. + + By states, we mean here a real assignment, not a mean field + state. For example, for a layer containing binary random + variables, the state will be a shared variable containing + values in {0,1}, not [0,1]. The visible layer will be included. + + Uses a dictionary so it is easy to unambiguously index a layer + without needing to remember rules like vis layer = 0, hiddens + start at 1, etc. + + Parameters + ---------- + num_examples : int + WRITEME + rng : WRITEME + """ + + # Make a list of all layers + layers = [self.visible_layer] + self.hidden_layers + + if rng is None: + rng = self.rng + + states = [layer.make_state(num_examples, rng) for layer in layers] + + zipped = safe_zip(layers, states) + + def recurse_check(layer, state): + if isinstance(state, (list, tuple)): + for elem in state: + recurse_check(layer, elem) + else: + val = state.get_value() + m = val.shape[0] + if m != num_examples: + raise ValueError(layer.layer_name + " gave state with " + + str(m) + " examples in some component." + "We requested " + str(num_examples)) + + for layer, state in zipped: + recurse_check(layer, state) + + rval = OrderedDict(zipped) + + return rval + + def make_layer_to_symbolic_state(self, num_examples, rng=None): + """ + .. todo:: + + Explain the difference with `make_layer_to_state` + + Makes and returns a dictionary mapping layers to states. + + By states, we mean here a real assignment, not a mean field + state. For example, for a layer containing binary random + variables, the state will be a shared variable containing + values in {0,1}, not [0,1]. The visible layer will be included. + + Uses a dictionary so it is easy to unambiguously index a layer + without needing to remember rules like vis layer = 0, hiddens + start at 1, etc. + + Parameters + ---------- + num_examples : int + WRITEME + rng : WRITEME + """ + + # Make a list of all layers + layers = [self.visible_layer] + self.hidden_layers + + assert rng is not None + + states = [layer.make_symbolic_state(num_examples, rng) + for layer in layers] + + zipped = safe_zip(layers, states) + + rval = OrderedDict(zipped) + + return rval + + def mcmc_steps(self, layer_to_state, theano_rng, layer_to_clamp=None, + num_steps=1): + """ + Perform Markov chain Monte Carlo. + + Note: this is due to be removed, though it might still be useful. + + Parameters + ---------- + layer_to_state: dictionary of k, v pairs dbm.layer.Layer, tensor-like + Dictionary of layers and their corresponding state. + theano_rng: WRITEME + layer_to_clamp: dictionary of k, v pairs dbm.layer.Layer, bool + Dictionary of layers and a boolean indicating clamping. + num_steps: int + Number of steps in sampling procedure. + """ + + warnings.warn("DBM.mcmc_steps is deprecated. You should instead " + + "call DBM.sampling_procedure.sample, which defaults " + + "to what DBM.mcmc_steps used to do. This method will " + + "be removed on or after July 31, 2014.") + return self.sampling_procedure.sample(layer_to_state, theano_rng, + layer_to_clamp, num_steps) + + def get_sampling_updates(self, layer_to_state, theano_rng, + layer_to_clamp=None, num_steps=1, + return_layer_to_updated=False): + """ + This method is for getting an updates dictionary for a theano function. + + It thus implies that the samples are represented as shared variables. 
+ If you want an expression for a sampling step applied to arbitrary + theano variables, use the 'mcmc_steps' method. This is a wrapper around + that method. + + Parameters + ---------- + layer_to_state : dict + Dictionary mapping the SuperDBM_Layer instances contained in + self to shared variables representing batches of samples of them. + (you can allocate one by calling self.make_layer_to_state) + theano_rng : MRG_RandomStreams + WRITEME + layer_to_clamp : dict, optional + Dictionary mapping layers to bools. If a layer is not in the + dictionary, defaults to False. True indicates that this layer + should be clamped, so we are sampling from a conditional + distribution rather than the joint distribution + num_steps : int, optional + WRITEME + return_layer_to_updated : bool, optional + WRITEME + + Returns + ------- + rval : dict + Dictionary mapping each shared variable to an expression to + update it. Repeatedly applying these updates does MCMC sampling. + + Notes + ----- + The specific sampling schedule used by default is to sample all of the + even-idexed layers of model.hidden_layers, then the visible layer and + all the odd-indexed layers. + """ + + updated = self.sampling_procedure.sample(layer_to_state, theano_rng, + layer_to_clamp, num_steps) + + rval = OrderedDict() + + def add_updates(old, new): + if isinstance(old, (list, tuple)): + for old_elem, new_elem in safe_izip(old, new): + add_updates(old_elem, new_elem) + else: + rval[old] = new + + # Validate layer_to_clamp / make sure layer_to_clamp is a fully + # populated dictionary + if layer_to_clamp is None: + layer_to_clamp = OrderedDict() + + for key in layer_to_clamp: + assert key is self.visible_layer or key in self.hidden_layers + + for layer in [self.visible_layer] + self.hidden_layers: + if layer not in layer_to_clamp: + layer_to_clamp[layer] = False + + # Translate update expressions into theano updates + for layer in layer_to_state: + old = layer_to_state[layer] + new = updated[layer] + if layer_to_clamp[layer]: + assert new is old + else: + add_updates(old, new) + + assert isinstance(self.hidden_layers, list) + + if return_layer_to_updated: + return rval, updated + + return rval + + def get_monitoring_channels(self, data): + """ + Returns the monitor channels of the DBM. + + This is done through the visible and all of the hidden layers of DBM. + + Parameters + ---------- + data: tensor-like + Data from which to evaluate model. + """ + space, source = self.get_monitoring_data_specs() + space.validate(data) + X = data + history = self.mf(X, return_history=True) + q = history[-1] + + rval = OrderedDict() + + ch = self.visible_layer.get_monitoring_channels() + for key in ch: + rval['vis_' + key] = ch[key] + + for state, layer in safe_zip(q, self.hidden_layers): + ch = layer.get_monitoring_channels() + for key in ch: + rval[layer.layer_name + '_' + key] = ch[key] + ch = layer.get_monitoring_channels_from_state(state) + for key in ch: + rval['mf_' + layer.layer_name + '_' + key] = ch[key] + if len(history) > 1: + prev_q = history[-2] + + flat_q = flatten(q) + flat_prev_q = flatten(prev_q) + + mx = None + for new, old in safe_zip(flat_q, flat_prev_q): + cur_mx = abs(new - old).max() + if new is old: + logger.error('{0} is {1}'.format(new, old)) + assert False + if mx is None: + mx = cur_mx + else: + mx = T.maximum(mx, cur_mx) + + rval['max_var_param_diff'] = mx + + for layer, new, old in safe_zip(self.hidden_layers, + q, prev_q): + sum_diff = 0. 
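+                # Accumulate the total absolute change in this layer's
+                # variational parameters between the last two mean field
+                # iterations; it is normalized below by batch size and
+                # total state dimension.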
+ for sub_new, sub_old in safe_zip(flatten(new), flatten(old)): + sum_diff += abs(sub_new - sub_old).sum() + denom = self.batch_size * \ + layer.get_total_state_space().get_total_dimension() + denom = np.cast[config.floatX](denom) + rval['mean_'+layer.layer_name+'_var_param_diff'] = \ + sum_diff / denom + + X_hat = self.reconstruct(X) + reconstruction_cost = self.visible_layer.recons_cost(X, X_hat) + rval['reconstruction_cost'] = reconstruction_cost + + return rval + + def get_monitoring_data_specs(self): + """ + Get the data_specs describing the data for get_monitoring_channel. + + This implementation returns specification corresponding to unlabeled + inputs. + """ + return (self.get_input_space(), self.get_input_source()) + + def get_test_batch_size(self): + """ + Returns the batch size of the model. + """ + return self.batch_size + + def reconstruct(self, V): + """ + Reconstructs an input using inpainting method. + + Parameters + ---------- + V: tensor-like + Input sample. + + Returns + ------- + recons: tensor-like + Reconstruction of V. + """ + + H = self.mf(V)[0] + + downward_state = self.hidden_layers[0].downward_state(H) + + recons = self.visible_layer.inpaint_update( + layer_above=self.hidden_layers[0], + state_above=downward_state, + drop_mask=None, V=None) + + return recons + + def do_inpainting(self, *args, **kwargs): + """ + Perform inpainting on model. + + Inpainting is defined by the inference procedure. + + Parameters + ---------- + *args: WRITEME + **kwargs: WRITEME + """ + self.setup_inference_procedure() + return self.inference_procedure.do_inpainting(*args, **kwargs) + + def initialize_chains(self, X, Y, theano_rng): + """ + Function to initialize chains for model when performing the neg phase. + TODO: implement in cost functions. + + Parameters + ---------- + X: tensor-like + The data. If none, then persistent (TODO) + Y: tensor-like + Labels. + theano_rng: WRITEME + + Returns + ------ + layer_to_chains: OrderedDict + """ + + if X is None: + raise NotImplementedError("Persistent chains not implemented yet.") + + # Initializing to data + layer_to_clamp = OrderedDict([(self.visible_layer, True)]) + layer_to_chains = self.make_layer_to_symbolic_state(1, theano_rng) + + # initialized the visible layer to data + layer_to_chains[self.visible_layer] = X + + # if supervised, also clamp targets + if Y is not None and self.supervised: + # note: if the Y layer changes to something without linear energy, + # we'll need to make the expected energy clamp Y in the positive + # phase + target_layer = self.hidden_layers[-1] + assert isinstance(target_layer, Softmax) + layer_to_clamp[target_layer] = True + layer_to_chains[target_layer] = Y + + # Note that we replace layer_to_chains with a dict mapping to the new + # state of the chains + # We first initialize the chain by clamping the visible layer and the + # target layer (if it exists) + layer_to_chains = self.sampling_procedure.sample( + layer_to_chains, + theano_rng, + layer_to_clamp=layer_to_clamp, + num_steps=1) + return layer_to_chains + + +class RBM(DBM): + """ + A restricted Boltzmann machine. + + The special case of a DBM with only one hidden layer designed to keep + things simple for researchers interested only in a single layer of + latent variables and DBN. + + Parameters + ---------- + batch_size : int + The batch size the model should use. Some convolutional + LinearTransforms require a compile-time hardcoded batch size, + otherwise this would not be part of the model specification. 
+ visible_layer : DBM.VisibleLayer + The visible layer of the DBM. + hidden_layers : List of DBM.HiddenLayer + The hidden layers. A list of HiddenLayer objects. The first + layer in the list is connected to the visible layer. + niter : int + Number of mean field iterations for variational inference + for the positive phase. + """ + def __init__(self, batch_size, visible_layer, hidden_layer, niter): + self.__dict__.update(locals()) + del self.self + super(RBM, self).__init__(batch_size, visible_layer, [hidden_layer], + niter, + inference_procedure=UpDown(), + sampling_procedure=GibbsEvenOdd()) \ No newline at end of file diff --git a/pylearn2/sandbox/dbm_v2/dbm_cost.py b/pylearn2/sandbox/dbm_v2/dbm_cost.py new file mode 100644 index 0000000000..7925788e3b --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/dbm_cost.py @@ -0,0 +1,660 @@ +""" +This module contains cost functions to use with deep Boltzmann machines +(pylearn2.models.dbm). +""" + +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin", "Devon Hjelm"] +__copyright__ = "Copyright 2012, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import numpy as np +import logging +import warnings + +from theano.compat.python2x import OrderedDict +from theano import config +from theano.sandbox.rng_mrg import MRG_RandomStreams +RandomStreams = MRG_RandomStreams +from theano import tensor as T + +import pylearn2 +from pylearn2.costs.cost import Cost +from pylearn2.costs.cost import ( + FixedVarDescr, DefaultDataSpecsMixin, NullDataSpecsMixin +) + +from pylearn2.sandbox.dbm_v2 import dbm +from pylearn2.sandbox.dbm_v2.layer import BinaryVectorMaxPool +from pylearn2.sandbox.dbm_v2 import flatten +from pylearn2.sandbox.dbm_v2.layer import BinaryVector +from pylearn2.sandbox.dbm_v2.layer import Softmax + +from pylearn2 import utils +from pylearn2.utils import make_name +from pylearn2.utils import safe_izip +from pylearn2.utils import safe_zip +from pylearn2.utils import sharedX +from pylearn2.utils.rng import make_theano_rng + + +logger = logging.getLogger(__name__) + + +# Positive phase methods + +def positive_phase(model, X, Y, num_gibbs_steps=1, supervised=False, + theano_rng=None, method="VARIATIONAL"): + """ + Wrapper function for positive phase. + Method is controled by switch string "method". + + Parameters + ---------- + X: input observables + Y: supervised observables + num_gibbs_steps: number of gibbs steps for sampling method + theano_rng for sampling method + method: method for positive phase: VARIATIONAL or SAMPLING. + """ + + if method == "VARIATIONAL": + return variational_positive_phase(model, X, Y, + supervised=supervised) + elif method == "SAMPLING": + return sampling_positive_phase(model, X, Y, + supervised=supervised, + num_gibbs_steps=num_gibbs_steps, + theano_rng=theano_rng) + else: raise ValueError("Available methods for positive phase are VARIATIONAL and SAMPLING") + +def variational_positive_phase(model, X, Y, supervised): + """ + .. todo:: + + WRITEME + """ + if supervised: + assert Y is not None + # note: if the Y layer changes to something without linear energy, + # we'll need to make the expected energy clamp Y in the positive + # phase + assert isinstance(model.hidden_layers[-1], Softmax) + + q = model.mf(X, Y) + + """ + Use the non-negativity of the KL divergence to construct a lower + bound on the log likelihood. 
We can drop all terms that are + constant with respect to the model parameters: + + log P(v) = L(v, q) + KL(q || P(h|v)) + L(v, q) = log P(v) - KL(q || P(h|v)) + L(v, q) = log P(v) - sum_h q(h) log q(h) + q(h) log P(h | v) + L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const + L(v, q) = log P(v) + sum_h q(h) log P(h, v) + - sum_h q(h) log P(v) + const + L(v, q) = sum_h q(h) log P(h, v) + const + L(v, q) = sum_h q(h) -E(h, v) - log Z + const + + so the cost we want to minimize is + expected_energy + log Z + const + + + Note: for the RBM, this bound is exact, since the KL divergence + goes to 0. + """ + + variational_params = flatten(q) + + # The gradients of the expected energy under q are easy, we can just + # do that in theano + expected_energy_q = model.expected_energy(X, q).mean() + params = list(model.get_params()) + gradients = OrderedDict( + safe_zip(params, T.grad(expected_energy_q, + params, + consider_constant=variational_params, + disconnected_inputs='ignore'))) + return gradients + +def sampling_positive_phase(model, X, Y, supervised, num_gibbs_steps, theano_rng): + """ + .. todo:: + + WRITEME + """ + assert num_gibbs_steps is not None + assert theano_rng is not None + # If there's only one hidden layer, there's no point in sampling. + if len(model.hidden_layers) == 1: num_gibbs_steps = 1 + layer_to_clamp = OrderedDict([(model.visible_layer, True)]) + layer_to_pos_samples = OrderedDict([(model.visible_layer, X)]) + if supervised: + # note: if the Y layer changes to something without linear energy, + # we'll need to make the expected energy clamp Y in the + # positive phase + assert isinstance(model.hidden_layers[-1], Softmax) + layer_to_clamp[model.hidden_layers[-1]] = True + layer_to_pos_samples[model.hidden_layers[-1]] = Y + hid = model.hidden_layers[:-1] + else: + assert Y is None + hid = model.hidden_layers + + for layer in hid: + mf_state = layer.init_mf_state() + + def recurse_zeros(x): + if isinstance(x, tuple): + return tuple([recurse_zeros(e) for e in x]) + return x.zeros_like() + layer_to_pos_samples[layer] = recurse_zeros(mf_state) + + layer_to_pos_samples = model.sampling_procedure.sample( + layer_to_state=layer_to_pos_samples, + layer_to_clamp=layer_to_clamp, + num_steps=num_gibbs_steps, + theano_rng=theano_rng) + q = [layer_to_pos_samples[layer] for layer in model.hidden_layers] + + pos_samples = flatten(q) + + # The gradients of the expected energy under q are easy, we can just + # do that in theano + expected_energy_q = model.energy(X, q).mean() + params = list(model.get_params()) + gradients = OrderedDict( + safe_zip(params, T.grad(expected_energy_q, params, + consider_constant=pos_samples, + disconnected_inputs='ignore'))) + return gradients + +# Negative phase methods + +def negative_phase(model, layer_to_chains, method="STANDARD"): + """ + Wrapper function for negative phase. + + Parameters + ---------- + model: a dbm model. + layer_to_chains: dicitonary of layer chains for sampling. + method: standard or toronto + """ + + if method == "STANDARD": + return standard_negative_phase(model, layer_to_chains) + elif method == "TORONTO": + return toronto_negative_phase(model, layer_to_chains) + else: raise ValueError("Available methods for negative phase are STANDARD and TORONTO") + +def standard_negative_phase(model, layer_to_chains): + """ + .. todo:: + + WRITEME + + TODO:reduce variance of negative phase by + integrating out the even-numbered layers. 
The + Rao-Blackwellize method can do this for you when + expected gradient = gradient of expectation, but + doing this in general is trickier. + """ + params = list(model.get_params()) + + # layer_to_chains = model.rao_blackwellize(layer_to_chains) + expected_energy_p = model.energy( + layer_to_chains[model.visible_layer], + [layer_to_chains[layer] for layer in model.hidden_layers]).mean() + + samples = flatten(layer_to_chains.values()) + for i, sample in enumerate(samples): + if sample.name is None: + sample.name = 'sample_'+str(i) + + neg_phase_grads = OrderedDict( + safe_zip(params, T.grad(-expected_energy_p, params, + consider_constant=samples, + disconnected_inputs='ignore'))) + return neg_phase_grads + +def toronto_negative_phase(model, layer_to_chains): + """ + .. todo:: + + WRITEME + """ + # Ruslan Salakhutdinov's undocumented negative phase from + # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m + # IG copied it here without fully understanding it, so it + # only applies to exactly the same model structure as + # in that code. + + assert isinstance(model.visible_layer, BinaryVector) + assert isinstance(model.hidden_layers[0], BinaryVectorMaxPool) + assert model.hidden_layers[0].pool_size == 1 + assert isinstance(model.hidden_layers[1], BinaryVectorMaxPool) + assert model.hidden_layers[1].pool_size == 1 + assert isinstance(model.hidden_layers[2], Softmax) + assert len(model.hidden_layers) == 3 + + params = list(model.get_params()) + + V_samples = layer_to_chains[model.visible_layer] + H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for + layer in model.hidden_layers] + + H1_mf = model.hidden_layers[0].mf_update( + state_below=model.visible_layer.upward_state(V_samples), + state_above=model.hidden_layers[1].downward_state(H2_samples), + layer_above=model.hidden_layers[1]) + Y_mf = model.hidden_layers[2].mf_update( + state_below=model.hidden_layers[1].upward_state(H2_samples)) + H2_mf = model.hidden_layers[1].mf_update( + state_below=model.hidden_layers[0].upward_state(H1_mf), + state_above=model.hidden_layers[2].downward_state(Y_mf), + layer_above=model.hidden_layers[2]) + + expected_energy_p = model.energy( + V_samples, [H1_mf, H2_mf, Y_samples]).mean() + + constants = flatten([V_samples, H1_mf, H2_mf, Y_samples]) + + neg_phase_grads = OrderedDict( + safe_zip(params, T.grad(-expected_energy_p, params, + consider_constant=constants))) + return neg_phase_grads + + +class BaseCD(DefaultDataSpecsMixin, Cost): + """ + Parameters + ---------- + num_chains : int + The number of negative chains to use with PCD / SML. + WRITEME : how is this meant to be used with CD? Do you just need to + set it to be equal to the batch size? If so: TODO, get rid of this + redundant aspect of the interface. + num_gibbs_steps : int + The number of Gibbs steps to use in the negative phase. (i.e., if + you want to use CD-k or PCD-k, this is "k"). + supervised : bool + If True, requests class labels and models the joint distrbution over + features and labels. + toronto_neg : bool + If True, use a bit of mean field in the negative phase. + Ruslan Salakhutdinov's matlab code does this. + theano_rng : MRG_RandomStreams, optional + If specified, uses this object to generate all random numbers. + Otherwise, makes its own random number generator. 
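+    positive_method : str, optional
+        Which positive phase gradient estimator to use: "SAMPLING"
+        (the default) or "VARIATIONAL".
+    negative_method : str, optional
+        Which negative phase gradient estimator to use: "STANDARD"
+        (the default) or "TORONTO". Passing `toronto_neg=True` overrides
+        this setting with "TORONTO".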
+ """ + + def __init__(self, num_chains=1, num_gibbs_steps=1, supervised=False, + toronto_neg=False, theano_rng=None, + positive_method = "SAMPLING", negative_method = "STANDARD"): + self.__dict__.update(locals()) + del self.self + + self.theano_rng = make_theano_rng(theano_rng, 2012+10+14, which_method="binomial") + assert supervised in [True, False] + if toronto_neg: + self.negative_method = "TORONTO" + + def expr(self, model, data): + """ + The partition function makes this intractable. + """ + self.get_data_specs(model)[0].validate(data) + + return None + + def _get_positive_phase(self, model, X, Y=None): + """ + Get positive phase. + """ + return positive_phase(model, X, Y, supervised=self.supervised, + method=self.positive_method, + num_gibbs_steps=self.num_gibbs_steps, + theano_rng=self.theano_rng), OrderedDict() + + def _get_negative_phase(self, model, X, Y=None): + """ + .. todo:: + + WRITEME + + d/d theta log Z = (d/d theta Z) / Z + = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z + = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z + = - sum_h sum_v P(v,h) d/d theta E(v,h) + """ + layer_to_chains = model.initialize_chains(X, Y, self.theano_rng) + updates, layer_to_chains = model.get_sampling_updates(layer_to_chains, + self.theano_rng, + num_steps=self.num_gibbs_steps, + return_layer_to_updated=True) + + neg_phase_grads = negative_phase(model, layer_to_chains, method=self.negative_method) + + return neg_phase_grads, updates + + def get_gradients(self, model, data, persistent=False): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + if self.supervised: + X, Y = data + assert Y is not None + else: + X = data + Y = None + + pos_phase_grads, pos_updates = self._get_positive_phase(model, X, Y) + neg_phase_grads, neg_updates = self._get_negative_phase(model, X, Y) + + updates = OrderedDict() + if persistent: + for key, val in pos_updates.items(): + updates[key] = val + for key, val in neg_updates.items(): + updates[key] = val + + gradients = OrderedDict() + for param in list(pos_phase_grads.keys()): + gradients[param] = neg_phase_grads[param] + pos_phase_grads[param] + return gradients, updates + + def get_monitoring_channels(self, model, data): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + rval = OrderedDict() + + if self.supervised: + X, Y = data + else: + X = data + Y = None + + history = model.mf(X, return_history = True) + q = history[-1] + + if self.supervised: + assert len(data) == 2 + Y_hat = q[-1] + true = T.argmax(Y, axis=1) + pred = T.argmax(Y_hat, axis=1) + + #true = Print('true')(true) + #pred = Print('pred')(pred) + + wrong = T.neq(true, pred) + err = T.cast(wrong.mean(), X.dtype) + rval['misclass'] = err + + if len(model.hidden_layers) > 1: + q = model.mf(X, Y=Y) + pen = model.hidden_layers[-2].upward_state(q[-2]) + Y_recons = model.hidden_layers[-1].mf_update(state_below=pen) + pred = T.argmax(Y_recons, axis=1) + wrong = T.neq(true, pred) + + rval['recons_misclass'] = T.cast(wrong.mean(), X.dtype) + + return rval + + +class VariationalCD(BaseCD): + """ + An intractable cost representing the negative log likelihood of a DBM. + The gradient of this bound is computed using a markov chain initialized + with the training example. + + Source: Hinton, G. 
Training Products of Experts by Minimizing + Contrastive Divergence + """ + + def __init__(self, num_gibbs_steps=2, supervised=False, + toronto_neg=False, theano_rng=None): + super(VariationalCD, self).__init__(num_gibbs_steps, + supervised=supervised, + toronto_neg=toronto_neg, + positive_method="VARIATIONAL", + negative_method="STANDARD") + + + +class MF_L1_ActCost(DefaultDataSpecsMixin, Cost): + """ + L1 activation cost on the mean field parameters. + + Adds a cost of: + + coeff * max( abs(mean_activation - target) - eps, 0) + + averaged over units + + for each layer. + + """ + + def __init__(self, targets, coeffs, eps, supervised): + """ + targets: a list, one element per layer, specifying the activation + each layer should be encouraged to have + each element may also be a list depending on the + structure of the layer. + See each layer's get_l1_act_cost for a specification of + what the state should be. + coeffs: a list, one element per layer, specifying the coefficient + to put on the L1 activation cost for each layer + supervised: If true, runs mean field on both X and Y, penalizing + the layers in between only + """ + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, ** kwargs): + + if self.supervised: + X, Y = data + H_hat = model.mf(X, Y= Y) + else: + X = data + H_hat = model.mf(X) + + hidden_layers = model.hidden_layers + if self.supervised: + hidden_layers = hidden_layers[:-1] + H_hat = H_hat[:-1] + + layer_costs = [] + for layer, mf_state, targets, coeffs, eps in \ + safe_zip(hidden_layers, H_hat, self.targets, self.coeffs, + self.eps): + cost = None + try: + cost = layer.get_l1_act_cost(mf_state, targets, coeffs, eps) + except NotImplementedError: + assert isinstance(coeffs, float) and coeffs == 0. + assert cost is None # if this gets triggered, there might + # have been a bug, where costs from lower layers got + # applied to higher layers that don't implement the cost + cost = None + if cost is not None: + layer_costs.append(cost) + + + assert T.scalar() != 0. # make sure theano semantics do what I want + layer_costs = [cost_ for cost_ in layer_costs if cost_ != 0.] + + if len(layer_costs) == 0: + return T.as_tensor_variable(0.) + else: + total_cost = reduce(lambda x, y: x + y, layer_costs) + total_cost.name = 'MF_L1_ActCost' + + assert total_cost.ndim == 0 + + return total_cost + +class MF_L2_ActCost(DefaultDataSpecsMixin, Cost): + """ + An L2 penalty on the amount that the hidden unit mean field parameters + deviate from desired target values. + + TODO: write up parameters list + """ + + def __init__(self, targets, coeffs, supervised=False): + targets = fix(targets) + coeffs = fix(coeffs) + + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, return_locals=False, **kwargs): + """ + .. todo:: + + WRITEME + + If returns locals is True, returns (objective, locals()) + Note that this means adding / removing / changing the value of + local variables is an interface change. 
+ In particular, TorontoSparsity depends on "terms" and "H_hat" + """ + self.get_data_specs(model)[0].validate(data) + if self.supervised: + (X, Y) = data + else: + X = data + Y = None + + H_hat = model.mf(X, Y=Y) + + terms = [] + + hidden_layers = model.hidden_layers + #if self.supervised: + # hidden_layers = hidden_layers[:-1] + + for layer, mf_state, targets, coeffs in \ + safe_zip(hidden_layers, H_hat, self.targets, self.coeffs): + try: + cost = layer.get_l2_act_cost(mf_state, targets, coeffs) + except NotImplementedError: + if isinstance(coeffs, float) and coeffs == 0.: + cost = 0. + else: + raise + terms.append(cost) + + + objective = sum(terms) + + if return_locals: + return objective, locals() + return objective + + +class L2WeightDecay(NullDataSpecsMixin, Cost): + """ + A Cost that applies the following cost function: + + coeff * sum(sqr(weights)) + for each set of weights. + + Parameters + ---------- + coeffs : list + One element per layer, specifying the coefficient + to put on the L1 activation cost for each layer. + Each element may in turn be a list, ie, for CompositeLayers. + """ + + def __init__(self, coeffs): + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, ** kwargs): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + layer_costs = [ layer.get_weight_decay(coeff) + for layer, coeff in safe_izip(model.hidden_layers, self.coeffs) ] + + assert T.scalar() != 0. # make sure theano semantics do what I want + layer_costs = [ cost for cost in layer_costs if cost != 0.] + + if len(layer_costs) == 0: + rval = T.as_tensor_variable(0.) + rval.name = '0_weight_decay' + return rval + else: + total_cost = reduce(lambda x, y: x + y, layer_costs) + total_cost.name = 'DBM_WeightDecay' + + assert total_cost.ndim == 0 + + total_cost.name = 'weight_decay' + + return total_cost + + +class L1WeightDecay(NullDataSpecsMixin, Cost): + """ + A Cost that applies the following cost function: + + coeff * sum(abs(weights)) + for each set of weights. + + Parameters + ---------- + coeffs : list + One element per layer, specifying the coefficient + to put on the L1 activation cost for each layer. + Each element may in turn be a list, ie, for CompositeLayers. + """ + + def __init__(self, coeffs): + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, ** kwargs): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + layer_costs = [ layer.get_l1_weight_decay(coeff) + for layer, coeff in safe_izip(model.hidden_layers, self.coeffs) ] + + assert T.scalar() != 0. # make sure theano semantics do what I want + layer_costs = [ cost for cost in layer_costs if cost != 0.] + + if len(layer_costs) == 0: + rval = T.as_tensor_variable(0.) + rval.name = '0_l1_weight_decay' + return rval + else: + total_cost = reduce(lambda x, y: x + y, layer_costs) + total_cost.name = 'DBM_L1WeightDecay' + + assert total_cost.ndim == 0 + + total_cost.name = 'l1_weight_decay' + + return total_cost \ No newline at end of file diff --git a/pylearn2/sandbox/dbm_v2/inference_procedure.py b/pylearn2/sandbox/dbm_v2/inference_procedure.py new file mode 100644 index 0000000000..d809ee12e4 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/inference_procedure.py @@ -0,0 +1,484 @@ +""" +Various InferenceProcedures for use with the DBM class. 
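+
+The sandbox DBM currently defaults to the UpDown procedure defined in this
+module, for both the single-hidden-layer (RBM) case and deeper models.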
+""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin", "Devon Hjelm"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import functools +import logging + +from theano import gof +import theano.tensor as T +import theano +from theano.gof.op import get_debug_values + +from pylearn2.models.dbm import block, flatten +from pylearn2.models.dbm.layer import Softmax +from pylearn2.utils import safe_izip, block_gradient, safe_zip + + +logger = logging.getLogger(__name__) + + +class InferenceProcedure(object): + + """ + A class representing a procedure for performing mean field inference in a + DBM. + Different subclasses can implement different specific procedures, such as + updating the layers in different orders, or using different strategies to + initialize the mean field expectations. + """ + + def set_dbm(self, dbm): + """ + Associates the InferenceProcedure with a specific DBM. + + Parameters + ---------- + dbm : pylearn2.models.dbm.DBM instance + The model to perform inference in. + """ + self.dbm = dbm + + def mf(self, V, Y=None, return_history=False, niter=None, block_grad=None): + """ + Perform mean field inference. Subclasses must implement. + + Parameters + ---------- + V : Input space batch + The values of the input features modeled by the DBM. + Y : (Optional) Target space batch + The values of the labels modeled by the DBM. Must be omitted + if the DBM does not model labels. If the DBM does model + labels, they may be included to perform inference over the + hidden layers only, or included to perform inference over the + labels. + return_history : (Optional) bool + Default: False + If True, returns the full sequence of mean field updates. + niter : (Optional) int + block_grad : (Optional) int + Default: None + If not None, blocks the gradient after `block_grad` + iterations, so that only the last `niter` - `block_grad` + iterations need to be stored when using the backpropagation + algorithm. + + Returns + ------- + result : list + If not `return_history` (default), a list with one element + per inferred layer, containing the full mean field state + of that layer. + Otherwise, a list of such lists, with the outer list + containing one element for each step of inference. + """ + raise NotImplementedError(str(type(self)) + " does not implement mf.") + + def set_batch_size(self, batch_size): + """ + If the inference procedure is dependent on a batch size at all, makes + the necessary internal configurations to work with that batch size. + + Parameters + ---------- + batch_size : int + The number of examples in the batch + """ + # Default implementation is no-op, because default procedure does + # not depend on the batch size. + + def multi_infer(self, V, return_history=False, niter=None, + block_grad=None): + """ + Inference using "the multi-inference trick." See + "Multi-prediction deep Boltzmann machines", Goodfellow et al 2013. + + Subclasses may implement this method, however it is not needed for + any training algorithm, and only expected to work at evaluation + time if the model was trained with multi-prediction training. 
+ + Parameters + ---------- + V : input space batch + return_history : bool + If True, returns the complete history of the mean field + iterations, rather than just the final values + niter : int + The number of mean field iterations to run + block_grad : int + If not None, block the gradient after this number of iterations + + Returns + ------- + result : list + A list of mean field states, or if return_history is True, a + list of such lists with one element per mean field iteration + """ + + raise NotImplementedError(str(type(self)) + " does not implement" + " multi_infer.") + + def do_inpainting(self, V, Y=None, drop_mask=None, drop_mask_Y=None, + return_history=False, noise=False, niter=None, + block_grad=None): + """ + Does the inference required for multi-prediction training. + + If you use this method in your research work, please cite: + + Multi-prediction deep Boltzmann machines. Ian J. Goodfellow, + Mehdi Mirza, Aaron Courville, and Yoshua Bengio. NIPS 2013. + + + Gives the mean field expression for units masked out by drop_mask. + Uses self.niter mean field updates. + + Comes in two variants, unsupervised and supervised: + + * unsupervised: Y and drop_mask_Y are not passed to the method. The + method produces V_hat, an inpainted version of V + * supervised: Y and drop_mask_Y are passed to the method. The method + produces V_hat and Y_hat + + Parameters + ---------- + V : tensor_like + Theano batch in `model.input_space` + Y : tensor_like + Theano batch in `model.output_space`, i.e. in the output space of + the last hidden layer. (It's not really a hidden layer anymore, + but oh well. It's convenient to code it this way because the + labels are sort of "on top" of everything else.) *** Y is always + assumed to be a matrix of one-hot category labels. *** + drop_mask : tensor_like + Theano batch in `model.input_space`. Should be all binary, with + 1s indicating that the corresponding element of X should be + "dropped", i.e. hidden from the algorithm and filled in as part + of the inpainting process + drop_mask_Y : tensor_like + Theano vector. Since we assume Y is a one-hot matrix, each row is + a single categorical variable. `drop_mask_Y` is a binary mask + specifying which *rows* to drop. + return_history : bool, optional + WRITEME + noise : bool, optional + WRITEME + niter : int, optional + WRITEME + block_grad : WRITEME + + Returns + ------- + WRITEME + """ + + raise NotImplementedError(str(type(self)) + " does not implement " + "do_inpainting.") + + def is_rbm_compatible(self): + """ + Checks whether inference procedure is compatible with an RBM. + + A restricted Boltzmann machine (RBM) is a deep Boltzmann machine (DBM) + with exactly one hidden layer. Inference of the posterior is exactly + equivalent to one mean field update of the hidden units given the data. + An rbm compatible inference procedure should: + 1) calculate the posterior of the hidden units from the data as + defined by the joint probability P(v,h) = 1/Z e^E(v,h), where E(.) is + the energy over the graph and Z is the marginal. + 2) not involve cross terms between hidden units. + 3) not double or replicate weights. + 4) use exactly one mean field step. 
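+
+        An exact single pass is possible because, in an RBM, the posterior
+        P(h|v) factorizes over the hidden units, so one bottom-up mean
+        field update already gives the exact posterior.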
+ """ + + raise NotImplementedError(str(type(self)) + " does not implement " + "is_rbm_compatible.") + + +class UpDown(InferenceProcedure): + + """ + An InferenceProcedure that initializes the mean field parameters + based on the biases in the model, then alternates between updating + each of the layers bottom-to-top + and updating each of the layers top-to-bottom. + """ + + @functools.wraps(InferenceProcedure.mf) + def mf(self, V, Y=None, return_history=False, niter=None, block_grad=None): + """ + .. todo:: + + WRITEME + """ + + dbm = self.dbm + + assert Y not in [True, False, 0, 1] + assert return_history in [True, False, 0, 1] + + if Y is not None: + dbm.hidden_layers[-1].get_output_space().validate(Y) + + if niter is None: + niter = dbm.niter + + H_hat = [None] + [layer.init_mf_state() + for layer in dbm.hidden_layers[1:]] + + # Make corrections for if we're also running inference on Y + if Y is not None: + # Last layer is clamped to Y + H_hat[-1] = Y + + history = [list(H_hat)] + + # we only need recurrent inference if there are multiple layers + assert (niter > 1) == (len(dbm.hidden_layers) > 1) + + for i in xrange(niter): + # Determine whether to go up or down on this iteration + if i % 2 == 0: + start = 0 + stop = len(H_hat) + inc = 1 + else: + start = len(H_hat) - 1 + stop = -1 + inc = -1 + # Do the mean field updates + for j in xrange(start, stop, inc): + if j == 0: + state_below = dbm.visible_layer.upward_state(V) + else: + state_below = dbm.hidden_layers[ + j - 1].upward_state(H_hat[j - 1]) + if j == len(H_hat) - 1: + state_above = None + layer_above = None + else: + state_above = dbm.hidden_layers[ + j + 1].downward_state(H_hat[j + 1]) + layer_above = dbm.hidden_layers[j + 1] + H_hat[j] = dbm.hidden_layers[j].mf_update( + state_below=state_below, + state_above=state_above, + layer_above=layer_above) + if Y is not None: + H_hat[-1] = Y + + if Y is not None: + H_hat[-1] = Y + + if block_grad == i + 1: + H_hat = block(H_hat) + + history.append(list(H_hat)) + # end for mf iter + + # Run some checks on the output + for layer, state in safe_izip(dbm.hidden_layers, H_hat): + upward_state = layer.upward_state(state) + layer.get_output_space().validate(upward_state) + if Y is not None: + assert all([elem[-1] is Y for elem in history]) + assert H_hat[-1] is Y + + if return_history: + return history + else: + return H_hat + + def do_inpainting(self, V, Y=None, drop_mask=None, drop_mask_Y=None, + return_history=False, noise=False, niter=None, + block_grad=None): + """ + .. todo:: + + WRITEME properly + + Gives the mean field expression for units masked out by drop_mask. + Uses self.niter mean field updates. + + Comes in two variants, unsupervised and supervised: + + * unsupervised: Y and drop_mask_Y are not passed to the method. The + method produces V_hat, an inpainted version of V. + * supervised: Y and drop_mask_Y are passed to the method. The method + produces V_hat and Y_hat. + + If you use this method in your research work, please cite: + + Multi-prediction deep Boltzmann machines. Ian J. Goodfellow, + Mehdi Mirza, Aaron Courville, and Yoshua Bengio. NIPS 2013. + + + Parameters + ---------- + V : tensor_like + Theano batch in `model.input_space` + Y : tensor_like + Theano batch in model.output_space, ie, in the output space of + the last hidden layer (it's not really a hidden layer anymore, + but oh well. It's convenient to code it this way because the + labels are sort of "on top" of everything else). *** Y is always + assumed to be a matrix of one-hot category labels. 
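The sweep order used by `UpDown.mf` above alternates between bottom-to-top and top-to-bottom passes over the hidden layers; a small illustration of the index pattern it generates (three hidden layers assumed only for the example):

n_layers = 3
orders = []
for i in range(4):
    if i % 2 == 0:
        start, stop, inc = 0, n_layers, 1         # even iterations go up
    else:
        start, stop, inc = n_layers - 1, -1, -1   # odd iterations go down
    orders.append(list(range(start, stop, inc)))
# orders == [[0, 1, 2], [2, 1, 0], [0, 1, 2], [2, 1, 0]]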
*** + drop_mask : tensor_like + A theano batch in `model.input_space`. Should be all binary, with + 1s indicating that the corresponding element of X should be + "dropped", ie, hidden from the algorithm and filled in as part of + the inpainting process + drop_mask_Y : tensor_like + Theano vector. Since we assume Y is a one-hot matrix, each row is + a single categorical variable. `drop_mask_Y` is a binary mask + specifying which *rows* to drop. + """ + + if Y is not None: + assert isinstance(self.hidden_layers[-1], Softmax) + + model = self.dbm + + """TODO: Should add unit test that calling this with a batch of + different inputs should yield the same output for each + if noise is False and drop_mask is all 1s""" + + if niter is None: + niter = model.niter + + assert drop_mask is not None + assert return_history in [True, False] + assert noise in [True, False] + if Y is None: + if drop_mask_Y is not None: + raise ValueError("do_inpainting got drop_mask_Y but not Y.") + else: + if drop_mask_Y is None: + raise ValueError("do_inpainting got Y but not drop_mask_Y.") + + if Y is not None: + assert isinstance(model.hidden_layers[-1], Softmax) + if drop_mask_Y.ndim != 1: + raise ValueError("do_inpainting assumes Y is a matrix of" + "one-hot labels," + "so each example is only one variable. " + "drop_mask_Y should " + "therefore be a vector, but we got " + "something with ndim " + + str(drop_mask_Y.ndim)) + drop_mask_Y = drop_mask_Y.dimshuffle(0, 'x') + + orig_V = V + orig_drop_mask = drop_mask + + history = [] + + V_hat, V_hat_unmasked = model.visible_layer.init_inpainting_state( + V, drop_mask, noise, return_unmasked=True) + assert V_hat_unmasked.ndim > 1 + + H_hat = [None] + [layer.init_mf_state() + for layer in model.hidden_layers[1:]] + + if Y is not None: + Y_hat_unmasked = model.hidden_layers[ + -1].init_inpainting_state(Y, noise) + Y_hat = drop_mask_Y * Y_hat_unmasked + (1 - drop_mask_Y) * Y + H_hat[-1] = Y_hat + + def update_history(): + assert V_hat_unmasked.ndim > 1 + d = {'V_hat': V_hat, 'H_hat': H_hat, + 'V_hat_unmasked': V_hat_unmasked} + if Y is not None: + d['Y_hat_unmasked'] = Y_hat_unmasked + d['Y_hat'] = H_hat[-1] + history.append(d) + + update_history() + + for i in xrange(niter): + + if i % 2 == 0: + start = 0 + stop = len(H_hat) + inc = 1 + if i > 0: + # Don't start by updating V_hat on iteration 0 or + # this will throw out the noise + V_hat, V_hat_unmasked = model.visible_layer.inpaint_update( + state_above=model.hidden_layers[0].downward_state( + H_hat[0]), + layer_above=model.hidden_layers[0], + V=V, + drop_mask=drop_mask, return_unmasked=True) + V_hat.name = 'V_hat[%d](V_hat = %s)' % (i, V_hat.name) + else: + start = len(H_hat) - 1 + stop = -1 + inc = -1 + for j in xrange(start, stop, inc): + if j == 0: + state_below = model.visible_layer.upward_state(V_hat) + else: + state_below = model.hidden_layers[ + j - 1].upward_state(H_hat[j - 1]) + if j == len(H_hat) - 1: + state_above = None + layer_above = None + else: + state_above = model.hidden_layers[ + j + 1].downward_state(H_hat[j + 1]) + layer_above = model.hidden_layers[j + 1] + H_hat[j] = model.hidden_layers[j].mf_update( + state_below=state_below, + state_above=state_above, + layer_above=layer_above) + if Y is not None and j == len(model.hidden_layers) - 1: + Y_hat_unmasked = H_hat[j] + H_hat[j] = drop_mask_Y * H_hat[j] + (1 - drop_mask_Y) * Y + + if i % 2 == 1: + V_hat, V_hat_unmasked = model.visible_layer.inpaint_update( + state_above=model.hidden_layers[0].downward_state( + H_hat[0]), + 
layer_above=model.hidden_layers[0], + V=V, + drop_mask=drop_mask, return_unmasked=True) + V_hat.name = 'V_hat[%d](V_hat = %s)' % (i, V_hat.name) + + if block_grad == i + 1: + V_hat = block_gradient(V_hat) + V_hat_unmasked = block_gradient(V_hat_unmasked) + H_hat = block(H_hat) + update_history() + # end for i + + # debugging, make sure V didn't get changed in this function + assert V is orig_V + assert drop_mask is orig_drop_mask + + Y_hat = H_hat[-1] + + assert V in theano.gof.graph.ancestors([V_hat]) + if Y is not None: + assert V in theano.gof.graph.ancestors([Y_hat]) + + if return_history: + return history + else: + if Y is not None: + return V_hat, Y_hat + return V_hat + + def is_rbm_compatible(self): + """ + Is implemented as UpDown is RBM compatible. + """ + return \ No newline at end of file diff --git a/pylearn2/sandbox/dbm_v2/ising.py b/pylearn2/sandbox/dbm_v2/ising.py new file mode 100644 index 0000000000..085f5b068a --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/ising.py @@ -0,0 +1,1864 @@ +""" +Implementation of a densely connected Ising model in the +pylearn2.models.dbm framework + +Notes +----- +If :math:`h` can be -1 or 1, and + +.. math:: + + p(h) = \exp(T\dot z \dot h), + +then the expected value of :math:`h` is given by + +.. math:: + + \\tanh(T \dot z), + +and the probability that :math:`h` is 1 is given by + +.. math:: + + \sigma(2T \dot z) +""" + +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import numpy as np + +from theano.compat.python2x import OrderedDict + +from theano import function +from theano.gof.op import get_debug_values +from theano.compile.sharedvalue import SharedVariable +import theano.tensor as T +import warnings + +from pylearn2.expr.nnet import sigmoid_numpy +from pylearn2.linear.matrixmul import MatrixMul +from pylearn2.models.dbm import init_sigmoid_bias_from_array +from pylearn2.models.dbm.layer import HiddenLayer, VisibleLayer +from pylearn2.space import Conv2DSpace +from pylearn2.space import VectorSpace +from pylearn2.utils import sharedX +from pylearn2.utils.rng import make_theano_rng + + +def init_tanh_bias_from_marginals(dataset, use_y=False): + """ + .. todo:: + + WRITEME + """ + if use_y: + X = dataset.y + else: + X = dataset.get_design_matrix() + if not (X.max() == 1): + raise ValueError("Expected design matrix to consist entirely " + "of 0s and 1s, but maximum value is "+str(X.max())) + assert X.min() == -1. + + mean = X.mean(axis=0) + + mean = np.clip(mean, 1e-7, 1-1e-7) + + init_bias = np.arctanh(mean) + + return init_bias + + +class IsingVisible(VisibleLayer): + """ + A DBM visible layer consisting of random variables living + in a `VectorSpace`, with values in {-1, 1}. + + Implements the energy function term :math:`-\mathbf{b}^T \mathbf{h}`. 
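A quick numeric check of the two identities from the module docstring above (the expectation tanh(beta * z) and the probability sigmoid(2 * beta * z) describe the same {-1, 1} distribution), together with the {-1, 1} sampling trick that `sample` and `make_state` use. The values of beta and z are arbitrary choices for the example:

import numpy as np

sigmoid = lambda x: 1. / (1. + np.exp(-x))
beta, z = 1.5, np.linspace(-2., 2., 5)

expected_h = np.tanh(beta * z)        # E[h] for h in {-1, 1}
p_h_is_one = sigmoid(2. * beta * z)   # P(h = 1)
assert np.allclose(expected_h, 2. * p_h_is_one - 1.)

# draw {-1, 1} samples the same way make_state does
rng = np.random.RandomState(0)
driver = rng.uniform(0., 1., size=z.shape)
sample = 2. * (driver < p_h_is_one) - 1.   # each entry is -1. or 1.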
+ + Parameters + ---------- + nvis : int + The dimension of the space + beta : theano shared variable + Shared variable representing a multiplicative factor of the + energy function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered as a + learned parameter + bias_from_marginals : `pylearn2.datasets.dataset.Dataset`, optional + A dataset whose marginals are used to initialize the visible + biases + """ + + def __init__(self, nvis, beta, learn_beta=False, bias_from_marginals=None): + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared variable.") + self.__dict__.update(locals()) + del self.self + # Don't serialize the dataset + del self.bias_from_marginals + + self.space = VectorSpace(nvis) + self.input_space = self.space + + origin = self.space.get_origin() + + if bias_from_marginals is None: + init_bias = np.zeros((nvis,)) + else: + init_bias = init_tanh_bias_from_marginals(bias_from_marginals) + + self.bias = sharedX(init_bias, 'visible_bias') + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.bias.get_value() + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.bias.set_value(biases) + if recenter: + assert self.center + self.offset.set_value(sigmoid_numpy(self.bias.get_value())) + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + rval = [self.bias] + if self.learn_beta: + rval.append(self.beta) + return rval + + def mf_update(self, state_above, layer_above): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above) + + bias = self.bias + + z = msg + bias + + rval = T.tanh(self.beta * z) + + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + assert state_below is None + + msg = layer_above.downward_message(state_above) + + bias = self.bias + + z = msg + bias + + phi = T.nnet.sigmoid(2. * self.beta * z) + + rval = theano_rng.binomial(size=phi.shape, p=phi, dtype=phi.dtype, n=1) + + return rval * 2. - 1. + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.nvis)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.bias.get_value()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.b) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def expected_energy_term(self, state, average, state_below=None, + average_below=None): + """ + .. todo:: + + WRITEME + """ + assert state_below is None + assert average_below is None + assert average in [True, False] + self.space.validate(state) + + # Energy function is linear so it doesn't matter if we're averaging + # or not + rval = -(self.beta * T.dot(state, self.bias)) + + assert rval.ndim == 1 + + return rval + + +class IsingHidden(HiddenLayer): + """ + A hidden layer with :math:`\mathbf{h}` being a vector in {-1, 1}, + implementing the energy function term + + .. 
math:: + + -\mathbf{v}^T \mathbf{W}\mathbf{h} -\mathbf{b}^T \mathbf{h} + + where :math:`\mathbf{W}` and :math:`\mathbf{b}` are parameters of this + layer, and :math:`\mathbf{v}` is the upward state of the layer below. + + Parameters + ---------- + dim : WRITEME + layer_name : WRITEME + beta : theano shared variable + Shared variable representing a multiplicative factor of the energy + function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered as a + learned parameter + irange : WRITEME + sparse_init : WRITEME + sparse_stdev : WRITEME + include_prob : float, optional + Probability of including a weight element in the set of weights + initialized to U(-irange, irange). If not included it is + initialized to 0. + init_bias : WRITEME + W_lr_scale : WRITEME + b_lr_scale : WRITEME + max_col_norm : WRITEME + """ + + def __init__(self, + dim, + layer_name, + beta, + learn_beta=False, + irange=None, + sparse_init=None, + sparse_stdev=1., + include_prob=1.0, + init_bias=0., + W_lr_scale=None, + b_lr_scale=None, + max_col_norm=None): + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared variable.") + self.__dict__.update(locals()) + del self.self + + self.b = sharedX(np.zeros((self.dim,)) + init_bias, + name=layer_name + '_b') + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + rval = OrderedDict() + + if self.W_lr_scale is not None: + W, = self.transformer.get_params() + rval[W] = self.W_lr_scale + + if self.b_lr_scale is not None: + rval[self.b] = self.b_lr_scale + + return rval + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME properly + + Notes + ----- + Note: this resets parameters! + """ + + self.input_space = space + + if isinstance(space, VectorSpace): + self.requires_reformat = False + self.input_dim = space.dim + else: + self.requires_reformat = True + self.input_dim = space.get_total_dimension() + self.desired_space = VectorSpace(self.input_dim) + + self.output_space = VectorSpace(self.dim) + + rng = self.dbm.rng + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange, self.irange, + (self.input_dim, self.dim)) * \ + (rng.uniform(0., 1., (self.input_dim, self.dim)) + < self.include_prob) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.dim)) + W *= self.sparse_stdev + + W = sharedX(W) + W.name = self.layer_name + '_W' + + self.transformer = MatrixMul(W) + + W, = self.transformer.get_params() + assert W.name is not None + + def _modify_updates(self, updates): + """ + .. todo:: + + WRITEME + """ + + if self.max_col_norm is not None: + W, = self.transformer.get_params() + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return VectorSpace(self.dim) + + def get_params(self): + """ + .. 
todo:: + + WRITEME + """ + assert self.b.name is not None + W, = self.transformer.get_params() + assert W.name is not None + rval = self.transformer.get_params() + assert not isinstance(rval, set) + rval = list(rval) + assert self.b not in rval + rval.append(self.b) + if self.learn_beta: + rval.append(self.beta) + return rval + + def get_weight_decay(self, coeff): + """ + .. todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + W, = self.transformer.get_params() + return coeff * T.sqr(W).sum() + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if self.requires_reformat: + # This is not really an unimplemented case. + # We actually don't know how to format the weights + # in design space. We got the data in topo space + # and we don't have access to the dataset + raise NotImplementedError() + W, = self.transformer.get_params() + return W.get_value() + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + W, = self.transformer.get_params() + W.set_value(weights) + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + if self.pool_size != 1: + raise NotImplementedError() + self.offset.set_value(sigmoid_numpy(self.b.get_value())) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + + W, = self.transformer.get_params() + + W = W.T + + W = W.reshape( + (self.detector_layer_dim, self.input_space.shape[0], + self.input_space.shape[1], self.input_space.nchannels) + ) + + W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) + + return function([], W)() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + + W, = self.transformer.get_params() + + assert W.ndim == 2 + + sq_W = T.sqr(W) + + row_norms = T.sqrt(sq_W.sum(axis=1)) + col_norms = T.sqrt(sq_W.sum(axis=0)) + + return OrderedDict([ + ('row_norms_min', row_norms.min()), + ('row_norms_mean', row_norms.mean()), + ('row_norms_max', row_norms.max()), + ('col_norms_min', col_norms.min()), + ('col_norms_mean', col_norms.mean()), + ('col_norms_max', col_norms.max()), + ]) + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P = state + + rval = OrderedDict() + + vars_and_prefixes = [(P, '')] + + for var, prefix in vars_and_prefixes: + v_max = var.max(axis=0) + v_min = var.min(axis=0) + v_mean = var.mean(axis=0) + v_range = v_max - v_min + + # max_x.mean_u is "the mean over *u*nits of the max over + # e*x*amples". 
The x and u are included in the name because + # otherwise its hard to remember which axis is which when reading + # the monitor I use inner.outer rather than outer_of_inner or + # something like that because I want mean_x.* to appear next to + # each other in the alphabetical list, as these are commonly + # plotted together + for key, val in [ + ('max_x.max_u', v_max.max()), + ('max_x.mean_u', v_max.mean()), + ('max_x.min_u', v_max.min()), + ('min_x.max_u', v_min.max()), + ('min_x.mean_u', v_min.mean()), + ('min_x.min_u', v_min.min()), + ('range_x.max_u', v_range.max()), + ('range_x.mean_u', v_range.mean()), + ('range_x.min_u', v_range.min()), + ('mean_x.max_u', v_mean.max()), + ('mean_x.mean_u', v_mean.mean()), + ('mean_x.min_u', v_mean.min()), + ]: + rval[prefix+key] = val + + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to " + + "None so that it may appear after layer_above " + + "/ state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, + self.desired_space) + + z = self.transformer.lmul(state_below) + self.b + + if msg is not None: + z = z + msg + + on_prob = T.nnet.sigmoid(2. * self.beta * z) + + samples = theano_rng.binomial(p=on_prob, n=1, size=on_prob.shape, + dtype=on_prob.dtype) * 2. - 1. + + return samples + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + rval = self.transformer.lmul_T(downward_state) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, + self.dim).astype(self.b.dtype) + \ + self.b.dimshuffle('x', 0) + rval = T.tanh(self.beta * z) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.dim)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.b.get_value()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.b) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + # state = Print('h_state', attrs=['min', 'max'])(state) + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + # Energy function is linear so it doesn't matter if we're averaging or + # not. 
Specifically, our terms are -u^T W d - b^T d where u is the + # upward state of layer below and d is the downward state of this layer + + bias_term = T.dot(state, self.b) + weights_term = (self.transformer.lmul(state_below) * state).sum(axis=1) + + rval = -bias_term - weights_term + + rval *= self.beta + + assert rval.ndim == 1 + + return rval + + def linear_feed_forward_approximation(self, state_below): + """ + .. todo:: + + WRITEME properly + + Used to implement TorontoSparsity. Unclear exactly what properties of + it are important or how to implement it for other layers. + + Properties it must have: + output is same kind of data structure (ie, tuple of theano + 2-tensors) as mf_update + + Properties it probably should have for other layer types: + An infinitesimal change in state_below or the parameters should + cause the same sign of change in the output of + linear_feed_forward_approximation and in mf_update + + Should not have any non-linearities that cause the gradient to + shrink + + Should disregard top-down feedback + """ + + z = self.beta * (self.transformer.lmul(state_below) + self.b) + + if self.pool_size != 1: + # Should probably implement sum pooling for the non-pooled version, + # but in reality it's not totally clear what the right answer is + raise NotImplementedError() + + return z, z + + def mf_update(self, state_below, state_above, layer_above=None, + double_weights=False, iter_name=None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_' + layer_above.layer_name + '_to_' + \ + self.layer_name + '[' + iter_name + ']' + else: + msg = None + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.b + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + if msg is not None: + z = z + msg + h = T.tanh(self.beta * z) + + return h + + +class BoltzmannIsingVisible(VisibleLayer): + """ + An IsingVisible whose parameters are defined in Boltzmann machine space. + + Notes + ----- + All parameter noise/clipping is handled by BoltzmannIsingHidden. + + .. 
todo:: + + WRITEME properly + + Parameters + ---------- + nvis : int + Number of visible units + beta : theano shared variable + Shared variable representing a multiplicative factor of the energy + function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered + as a learned parameter + bias_from_marginals : `pylearn2.datasets.dataset.Dataset`, optional + A dataset whose marginals are used to initialize the visible + biases + sampling_b_stdev : WRITEME + min_ising_b : WRITEME + max_ising_b : WRITEME + """ + + def __init__(self, nvis, beta, learn_beta=False, bias_from_marginals=None, + sampling_b_stdev=None, min_ising_b=None, max_ising_b=None): + + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared " + + "variable.") + self.__dict__.update(locals()) + del self.self + # Don't serialize the dataset + del self.bias_from_marginals + + self.space = VectorSpace(nvis) + self.input_space = self.space + + if bias_from_marginals is None: + init_bias = np.zeros((nvis,)) + else: + # data is in [-1, 1], but want biases for a sigmoid + init_bias = \ + init_sigmoid_bias_from_array(bias_from_marginals.X / 2. + 0.5) + # init_bias = + self.boltzmann_bias = sharedX(init_bias, 'visible_bias') + + self.resample_fn = None + + def finalize_initialization(self): + """ + .. todo:: + + WRITEME + """ + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.layer_above.dbm.batch_size, self.nvis))) + + updates = OrderedDict() + updates[self.boltzmann_bias] = self.boltzmann_bias + updates[self.layer_above.W] = self.layer_above.W + self.enforce_constraints() + + def _modify_updates(self, updates): + """ + .. todo:: + + WRITEME + """ + beta = self.beta + if beta in updates: + updated_beta = updates[beta] + updates[beta] = T.clip(updated_beta, 1., 1000.) + + if any(constraint is not None for constraint in [self.min_ising_b, + self.max_ising_b]): + bmn = self.min_ising_b + if bmn is None: + bmn = - 1e6 + bmx = self.max_ising_b + if bmx is None: + bmx = 1e6 + wmn_above = self.layer_above.min_ising_W + if wmn_above is None: + wmn_above = - 1e6 + wmx_above = self.layer_above.max_ising_W + if wmx_above is None: + wmx_above = 1e6 + + b = updates[self.boltzmann_bias] + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W_above.sum(axis=1) + ising_b = T.clip(ising_b, bmn, bmx) + + ising_W_above = 0.25 * W_above + ising_W_above = T.clip(ising_W_above, wmn_above, wmx_above) + bhn = 2. * (ising_b - ising_W_above.sum(axis=1)) + + updates[self.boltzmann_bias] = bhn + + if self.noisy_sampling_b is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + b = updates[self.boltzmann_bias] + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W_above.sum(axis=1) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + def resample_bias_noise(self, batch_size_changed=False): + """ + .. 
todo:: + + WRITEME + """ + if batch_size_changed: + self.resample_fn = None + + if self.resample_fn is None: + updates = OrderedDict() + + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.dbm.batch_size, self.nvis))) + + if self.noisy_sampling_b is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + b = self.boltzmann_bias + W_above = self.layer_above.W + ising_b = 0.5 * b + 0.25 * W_above.sum(axis=1) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + self.resample_fn = function([], updates=updates) + + self.resample_fn() + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingVisible.get_biases returns the " + + "BOLTZMANN biases, is that what we want?") + return self.boltzmann_bias.get_value() + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + assert False # not really sure what this should do for this layer + + def ising_bias(self, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + if for_sampling and self.layer_above.sampling_b_stdev is not None: + return self.noisy_sampling_b + return \ + 0.5 * self.boltzmann_bias + 0.25 * self.layer_above.W.sum(axis=1) + + def ising_bias_numpy(self): + """ + .. todo:: + + WRITEME + """ + return 0.5 * self.boltzmann_bias.get_value() + \ + 0.25 * self.layer_above.W.get_value().sum(axis=1) + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + rval = [self.boltzmann_bias] + if self.learn_beta: + rval.append(self.beta) + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + assert state_below is None + + msg = layer_above.downward_message(state_above, for_sampling=True) + + bias = self.ising_bias(for_sampling=True) + + z = msg + bias + + phi = T.nnet.sigmoid(2. * self.beta * z) + + rval = theano_rng.binomial(size=phi.shape, p=phi, dtype=phi.dtype, n=1) + + return rval * 2. - 1. + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.nvis)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.ising_bias_numpy()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.ising_bias()) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def mf_update(self, state_above, layer_above): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above, for_sampling=True) + + bias = self.ising_bias(for_sampling=True) + + z = msg + bias + + rval = T.tanh(self.beta * z) + + return rval + + def expected_energy_term(self, state, average, state_below=None, + average_below=None): + """ + .. 
todo:: + + WRITEME + """ + + # state = Print('v_state', attrs=['min', 'max'])(state) + + assert state_below is None + assert average_below is None + assert average in [True, False] + self.space.validate(state) + + # Energy function is linear so it doesn't matter if we're averaging + # or not + rval = -(self.beta * T.dot(state, self.ising_bias())) + + assert rval.ndim == 1 + + return rval + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + ising_b = self.ising_bias() + + rval['ising_b_min'] = ising_b.min() + rval['ising_b_max'] = ising_b.max() + rval['beta'] = self.beta + + if hasattr(self, 'noisy_sampling_b'): + rval['noisy_sampling_b_min'] = self.noisy_sampling_b.min() + rval['noisy_sampling_b_max'] = self.noisy_sampling_b.max() + + return rval + + +class BoltzmannIsingHidden(HiddenLayer): + """ + An IsingHidden whose parameters are defined in Boltzmann machine space. + + .. todo:: + + WRITEME properly + + Parameters + ---------- + dim : WRITEME + layer_name : WRITEME + layer_below : WRITEME + beta : theano shared variable + Shared variable representing a multiplicative factor of the energy + function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered as a + learned parameter + irange : WRITEME + sparse_init : WRITEME + sparse_stdev : WRITEME + include_prob : WRITEME + init_bias : WRITEME + W_lr_scale : WRITEME + b_lr_scale : WRITEME + beta_lr_scale : WRITEME + max_col_norm : WRITEME + min_ising_b : WRITEME + max_ising_b : WRITEME + min_ising_W : WRITEME + max_ising_W : WRITEME + sampling_W_stdev : WRITEME + sampling_b_stdev : WRITEME + """ + def __init__(self, + dim, + layer_name, + layer_below, + beta, + learn_beta=False, + irange=None, + sparse_init=None, + sparse_stdev=1., + include_prob=1.0, + init_bias=0., + W_lr_scale=None, + b_lr_scale=None, + beta_lr_scale=None, + max_col_norm=None, + min_ising_b=None, + max_ising_b=None, + min_ising_W=None, + max_ising_W=None, + sampling_W_stdev=None, + sampling_b_stdev=None): + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared variable.") + self.__dict__.update(locals()) + del self.self + + layer_below.layer_above = self + self.layer_above = None + self.resample_fn = None + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + if not hasattr(self, 'beta_lr_scale'): + self.beta_lr_scale = None + + rval = OrderedDict() + + if self.W_lr_scale is not None: + W = self.W + rval[W] = self.W_lr_scale + + if self.b_lr_scale is not None: + rval[self.boltzmann_b] = self.b_lr_scale + + if self.beta_lr_scale is not None: + rval[self.beta] = self.beta_lr_scale + + return rval + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME properly + + Note: this resets parameters! 
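The Boltzmann-to-Ising reparameterization that `ising_bias`, `ising_weights` and `_modify_updates` rely on comes from the change of variables u = (s + 1) / 2 between {0, 1} and {-1, 1} units. A numpy sketch for a single visible/hidden pair with no layer above (shapes and the energy check are illustrative assumptions):

import numpy as np

rng = np.random.RandomState(0)
nvis, nhid = 5, 3
b = rng.randn(nvis)          # Boltzmann visible bias (units in {0, 1})
c = rng.randn(nhid)          # Boltzmann hidden bias
W = rng.randn(nvis, nhid)    # Boltzmann weights

# Collecting terms after substituting u = (s + 1) / 2 gives the same
# formulas as ising_bias() / ising_weights() for a two-layer model.
ising_b_vis = 0.5 * b + 0.25 * W.sum(axis=1)
ising_b_hid = 0.5 * c + 0.25 * W.sum(axis=0)
ising_W = 0.25 * W

# The two energy functions agree up to a state-independent constant.
u = rng.binomial(1, 0.5, nvis).astype(float)
h = rng.binomial(1, 0.5, nhid).astype(float)
s, t = 2. * u - 1., 2. * h - 1.
e_boltzmann = -u.dot(b) - h.dot(c) - u.dot(W).dot(h)
e_ising = -s.dot(ising_b_vis) - t.dot(ising_b_hid) - s.dot(ising_W).dot(t)
const = -0.5 * b.sum() - 0.5 * c.sum() - 0.25 * W.sum()
assert np.isclose(e_boltzmann, e_ising + const)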
+ """ + + self.input_space = space + + if isinstance(space, VectorSpace): + self.requires_reformat = False + self.input_dim = space.dim + else: + self.requires_reformat = True + self.input_dim = space.get_total_dimension() + self.desired_space = VectorSpace(self.input_dim) + + self.output_space = VectorSpace(self.dim) + + rng = self.dbm.rng + + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange, self.irange, + (self.input_dim, self.dim)) * \ + (rng.uniform(0., 1., (self.input_dim, self.dim)) + < self.include_prob) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.dim)) + W *= self.sparse_stdev + W = sharedX(W) + W.name = self.layer_name + '_W' + self.W = W + + self.boltzmann_b = sharedX(np.zeros((self.dim,)) + self.init_bias, + name=self.layer_name + '_b') + + def finalize_initialization(self): + """ + .. todo:: + + WRITEME + """ + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.dbm.batch_size, self.dim))) + if self.sampling_W_stdev is not None: + self.noisy_sampling_W = \ + sharedX(np.zeros((self.input_dim, self.dim)), + 'noisy_sampling_W') + + updates = OrderedDict() + updates[self.boltzmann_b] = self.boltzmann_b + updates[self.W] = self.W + if self.layer_above is not None: + updates[self.layer_above.W] = self.layer_above.W + self.enforce_constraints() + + def _modify_updates(self, updates): + """ + .. todo:: + + WRITEME + """ + beta = self.beta + if beta in updates: + updated_beta = updates[beta] + updates[beta] = T.clip(updated_beta, 1., 1000.) + + if self.max_col_norm is not None: + W = self.W + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + if any(constraint is not None for constraint in [self.min_ising_b, + self.max_ising_b, + self.min_ising_W, + self.max_ising_W]): + bmn = self.min_ising_b + if bmn is None: + bmn = - 1e6 + bmx = self.max_ising_b + if bmx is None: + bmx = 1e6 + wmn = self.min_ising_W + if wmn is None: + wmn = - 1e6 + wmx = self.max_ising_W + if wmx is None: + wmx = 1e6 + if self.layer_above is not None: + wmn_above = self.layer_above.min_ising_W + if wmn_above is None: + wmn_above = - 1e6 + wmx_above = self.layer_above.max_ising_W + if wmx_above is None: + wmx_above = 1e6 + + W = updates[self.W] + ising_W = 0.25 * W + ising_W = T.clip(ising_W, wmn, wmx) + + b = updates[self.boltzmann_b] + if self.layer_above is not None: + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) \ + + 0.25 * W_above.sum(axis=1) + else: + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) + ising_b = T.clip(ising_b, bmn, bmx) + + if self.layer_above is not None: + ising_W_above = 0.25 * W_above + ising_W_above = T.clip(ising_W_above, wmn_above, wmx_above) + bhn = 2. * (ising_b - ising_W.sum(axis=0) + - ising_W_above.sum(axis=1)) + else: + bhn = 2. * (ising_b - ising_W.sum(axis=0)) + Wn = 4. 
* ising_W + + updates[self.W] = Wn + updates[self.boltzmann_b] = bhn + + if self.noisy_sampling_W is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + W = updates[self.W] + ising_W = 0.25 * W + + noisy_sampling_W = \ + theano_rng.normal(avg=ising_W, std=self.sampling_W_stdev, + size=ising_W.shape, dtype=ising_W.dtype) + updates[self.noisy_sampling_W] = noisy_sampling_W + + b = updates[self.boltzmann_b] + if self.layer_above is not None: + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) \ + + 0.25 * W_above.sum(axis=1) + else: + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + def resample_bias_noise(self, batch_size_changed=False): + """ + .. todo:: + + WRITEME + """ + if batch_size_changed: + self.resample_fn = None + + if self.resample_fn is None: + updates = OrderedDict() + + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.dbm.batch_size, self.dim))) + + if self.noisy_sampling_b is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + b = self.boltzmann_b + if self.layer_above is not None: + W_above = self.layer_above.W + ising_b = 0.5 * b + 0.25 * self.W.sum(axis=0) \ + + 0.25 * W_above.sum(axis=1) + else: + ising_b = 0.5 * b + 0.25 * self.W.sum(axis=0) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + self.resample_fn = function([], updates=updates) + + self.resample_fn() + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return VectorSpace(self.dim) + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.boltzmann_b.name is not None + W = self.W + assert W.name is not None + rval = [W] + assert not isinstance(rval, set) + rval = list(rval) + assert self.boltzmann_b not in rval + rval.append(self.boltzmann_b) + if self.learn_beta: + rval.append(self.beta) + return rval + + def ising_weights(self, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'sampling_W_stdev'): + self.sampling_W_stdev = None + if for_sampling and self.sampling_W_stdev is not None: + return self.noisy_sampling_W + return 0.25 * self.W + + def ising_b(self, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'sampling_b_stdev'): + self.sampling_b_stdev = None + if for_sampling and self.sampling_b_stdev is not None: + return self.noisy_sampling_b + else: + if self.layer_above is not None: + return 0.5 * self.boltzmann_b + \ + 0.25 * self.W.sum(axis=0) + \ + 0.25 * self.layer_above.W.sum(axis=1) + else: + return 0.5 * self.boltzmann_b + 0.25 * self.W.sum(axis=0) + + def ising_b_numpy(self): + """ + .. todo:: + + WRITEME + """ + if self.layer_above is not None: + return 0.5 * self.boltzmann_b.get_value() + \ + 0.25 * self.W.get_value().sum(axis=0) + \ + 0.25 * self.layer_above.W.get_value().sum(axis=1) + else: + return 0.5 * self.boltzmann_b.get_value() + \ + 0.25 * self.W.get_value().sum(axis=0) + + def get_weight_decay(self, coeff): + """ + .. 
todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + W = self.W + return coeff * T.sqr(W).sum() + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.get_weights returns the " + + "BOLTZMANN weights, is that what we want?") + W = self.W + return W.get_value() + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.set_weights sets the BOLTZMANN " + + "weights, is that what we want?") + W = self.W + W.set_value(weights) + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.boltzmann_b.set_value(biases) + assert not recenter # not really sure what this should do if True + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.get_biases returns the " + + "BOLTZMANN biases, is that what we want?") + return self.boltzmann_b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.get_weights_topo returns the " + + "BOLTZMANN weights, is that what we want?") + + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + + W = self.W + + W = W.T + + W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], + self.input_space.shape[1], self.input_space.nchannels)) + + W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) + + return function([], W)() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + + W = self.W + + assert W.ndim == 2 + + sq_W = T.sqr(W) + + row_norms = T.sqrt(sq_W.sum(axis=1)) + col_norms = T.sqrt(sq_W.sum(axis=0)) + + rval = OrderedDict([ + ('boltzmann_row_norms_min', row_norms.min()), + ('boltzmann_row_norms_mean', row_norms.mean()), + ('boltzmann_row_norms_max', row_norms.max()), + ('boltzmann_col_norms_min', col_norms.min()), + ('boltzmann_col_norms_mean', col_norms.mean()), + ('boltzmann_col_norms_max', col_norms.max()), + ]) + + ising_W = self.ising_weights() + + rval['ising_W_min'] = ising_W.min() + rval['ising_W_max'] = ising_W.max() + + ising_b = self.ising_b() + + rval['ising_b_min'] = ising_b.min() + rval['ising_b_max'] = ising_b.max() + + if hasattr(self, 'noisy_sampling_W'): + rval['noisy_sampling_W_min'] = self.noisy_sampling_W.min() + rval['noisy_sampling_W_max'] = self.noisy_sampling_W.max() + rval['noisy_sampling_b_min'] = self.noisy_sampling_b.min() + rval['noisy_sampling_b_max'] = self.noisy_sampling_b.max() + + return rval + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P = state + + rval = OrderedDict() + + vars_and_prefixes = [(P, '')] + + for var, prefix in vars_and_prefixes: + v_max = var.max(axis=0) + v_min = var.min(axis=0) + v_mean = var.mean(axis=0) + v_range = v_max - v_min + + # max_x.mean_u is "the mean over *u*nits of the max over + # e*x*amples". 
The x and u are included in the name because + # otherwise its hard to remember which axis is which when reading + # the monitor I use inner.outer rather than outer_of_inner or + # something like that because I want mean_x.* to appear next to + # each other in the alphabetical list, as these are commonly + # plotted together + for key, val in [ + ('max_x.max_u', v_max.max()), + ('max_x.mean_u', v_max.mean()), + ('max_x.min_u', v_max.min()), + ('min_x.max_u', v_min.max()), + ('min_x.mean_u', v_min.mean()), + ('min_x.min_u', v_min.min()), + ('range_x.max_u', v_range.max()), + ('range_x.mean_u', v_range.mean()), + ('range_x.min_u', v_range.min()), + ('mean_x.max_u', v_mean.max()), + ('mean_x.mean_u', v_mean.mean()), + ('mean_x.min_u', v_mean.min()) + ]: + rval[prefix+key] = val + + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to " + + "None so that it may appear after layer_above " + + "/ state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above, for_sampling=True) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, + self.desired_space) + + z = T.dot(state_below, self.ising_weights(for_sampling=True)) + \ + self.ising_b(for_sampling=True) + + if msg is not None: + z = z + msg + + on_prob = T.nnet.sigmoid(2. * self.beta * z) + + samples = theano_rng.binomial(p=on_prob, n=1, size=on_prob.shape, + dtype=on_prob.dtype) * 2. - 1. + + return samples + + def downward_message(self, downward_state, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + rval = T.dot(downward_state, + self.ising_weights(for_sampling=for_sampling).T) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, + self.dim).astype(self.boltzmann_b.dtype) + \ + self.ising_b().dimshuffle('x', 0) + rval = T.tanh(self.beta * z) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.dim)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.ising_b_numpy()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.ising_b()) + rval = theano_rng.binomial(size=(num_examples, self.dim), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. 
todo:: + + WRITEME + """ + + # state = Print('h_state', attrs=['min', 'max'])(state) + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + # Energy function is linear so it doesn't matter if we're averaging or + # not. Specifically, our terms are -u^T W d - b^T d where u is the + # upward state of layer below and d is the downward state of this layer + + bias_term = T.dot(state, self.ising_b()) + weights_term = \ + (T.dot(state_below, self.ising_weights()) * state).sum(axis=1) + + rval = -bias_term - weights_term + + rval *= self.beta + + assert rval.ndim == 1 + + return rval + + def linear_feed_forward_approximation(self, state_below): + """ + .. todo:: + + WRITEME properly + + Used to implement TorontoSparsity. Unclear exactly what properties of + it are important or how to implement it for other layers. + + Properties it must have: + output is same kind of data structure (ie, tuple of theano + 2-tensors) as mf_update + + Properties it probably should have for other layer types: + An infinitesimal change in state_below or the parameters should + cause the same sign of change in the output of + linear_feed_forward_approximation and in mf_update + + Should not have any non-linearities that cause the gradient to + shrink + + Should disregard top-down feedback + """ + + z = self.beta * (T.dot(state_below, self.ising_weights()) + self.ising_b()) + + return z + + def mf_update(self, state_below, state_above, layer_above=None, + double_weights=False, iter_name=None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_' + layer_above.layer_name + '_to_' + \ + self.layer_name + '[' + iter_name+']' + else: + msg = None + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = T.dot(state_below, self.ising_weights()) + self.ising_b() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + if msg is not None: + z = z + msg + h = T.tanh(self.beta * z) + + return h + + def get_l2_act_cost(self, state, target, coeff): + """ + .. 
todo:: + + WRITEME + """ + avg = state.mean(axis=0) + diff = avg - target + return coeff * T.sqr(diff).mean() diff --git a/pylearn2/sandbox/dbm_v2/layer.py b/pylearn2/sandbox/dbm_v2/layer.py new file mode 100644 index 0000000000..5bc2c50b13 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/layer.py @@ -0,0 +1,4124 @@ +""" +Common DBM Layer classes +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import functools +import logging +import numpy as np +import time +import warnings + +from theano import tensor as T, function, config +import theano +from theano.compat import OrderedDict +from theano.gof.op import get_debug_values +from theano.printing import Print + +from pylearn2.expr.nnet import sigmoid_numpy +from pylearn2.expr.probabilistic_max_pooling import max_pool_channels, max_pool_b01c, max_pool, max_pool_c01b +from pylearn2.linear.conv2d import make_random_conv2D, make_sparse_random_conv2D +from pylearn2.linear.conv2d_c01b import setup_detector_layer_c01b +from pylearn2.linear.matrixmul import MatrixMul +from pylearn2.models import Model +from pylearn2.sandbox.dbm_v2 import init_sigmoid_bias_from_marginals +from pylearn2.space import VectorSpace, CompositeSpace, Conv2DSpace, Space +from pylearn2.utils import is_block_gradient +from pylearn2.utils import sharedX, safe_zip, py_integer_types, block_gradient +from pylearn2.utils.exc import reraise_as +from pylearn2.utils.rng import make_theano_rng +from pylearn2.utils import safe_union + + +logger = logging.getLogger(__name__) + + +class Layer(Model): + """ + Abstract class. + A layer of a DBM. + May only belong to one DBM. + + Each layer has a state ("total state") that can be split into + the piece that is visible to the layer above ("upward state") + and the piece that is visible to the layer below ("downward state"). + (Since visible layers don't have a downward state, the downward_state + method only appears in the DBM_HiddenLayer subclass) + + For simple layers, all three of these are the same thing. + """ + + def get_dbm(self): + """ + Returns the DBM that this layer belongs to, or None + if it has not been assigned to a DBM yet. + """ + + if hasattr(self, 'dbm'): + return self.dbm + + return None + + def set_dbm(self, dbm): + """ + Assigns this layer to a DBM. + + Parameters + ---------- + dbm : WRITEME + """ + assert self.get_dbm() is None + self.dbm = dbm + + def get_total_state_space(self): + """ + Returns the Space that the layer's total state lives in. + """ + raise NotImplementedError(str(type(self))+" does not implement " +\ + "get_total_state_space()") + + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + return OrderedDict() + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + return OrderedDict() + + def upward_state(self, total_state): + """ + Takes total_state and turns it into the state that layer_above should + see when computing P( layer_above | this_layer). + + So far this has two uses: + + * If this layer consists of a detector sub-layer h that is pooled + into a pooling layer p, then total_state = (p,h) but layer_above + should only see p. + * If the conditional P( layer_above | this_layer) depends on + parameters of this_layer, sometimes you can play games with + the state to avoid needing the layers to communicate. 
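To make the pooled-layer case in the first bullet above concrete, a layer whose total state is the pair (p, h) exposes only the pooling piece p to the layer above. A toy sketch (the function name and shapes are made up for the example; real layers implement this as their `upward_state` method):

import numpy as np

def pooled_upward_state(total_state):
    # total_state is (p, h); only the pooling units p are visible to the
    # layer above, as described in the first case above.
    p, h = total_state
    return p

p = np.ones((2, 4))    # pooling-unit state
h = np.ones((2, 8))    # detector-unit state
assert pooled_upward_state((p, h)) is p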
So far + the only instance of this usage is when the visible layer + is N( Wh, beta). This makes the hidden layer be + sigmoid( v beta W + b). Rather than having the hidden layer + explicitly know about beta, we can just pass v beta as + the upward state. + + Parameters + ---------- + total_state : WRITEME + + Notes + ----- + This method should work both for computing sampling updates + and for computing mean field updates. So far I haven't encountered + a case where it needs to do different things for those two + contexts. + """ + return total_state + + def make_state(self, num_examples, numpy_rng): + """ + Returns a shared variable containing an actual state (not a mean field + state) for this variable. + + Parameters + ---------- + num_examples : WRITEME + numpy_rng : WRITEME + + Returns + ------- + WRITEME + """ + + raise NotImplementedError("%s doesn't implement make_state" % + type(self)) + + def make_symbolic_state(self, num_examples, theano_rng): + """ + Returns a theano symbolic variable containing an actual state (not a + mean field state) for this variable. + + Parameters + ---------- + num_examples : WRITEME + numpy_rng : WRITEME + + Returns + ------- + WRITEME + """ + + raise NotImplementedError("%s doesn't implement make_symbolic_state" % + type(self)) + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + Returns an expression for samples of this layer's state, conditioned on + the layers above and below Should be valid as an update to the shared + variable returned by self.make_state + + Parameters + ---------- + state_below : WRITEME + Corresponds to layer_below.upward_state(full_state_below), + where full_state_below is the same kind of object as you get + out of layer_below.make_state + state_above : WRITEME + Corresponds to layer_above.downward_state(full_state_above) + + theano_rng : WRITEME + An MRG_RandomStreams instance + + Returns + ------- + WRITEME + + Notes + ----- + This can return multiple expressions if this layer's total state + consists of more than one shared variable. + """ + + if hasattr(self, 'get_sampling_updates'): + raise AssertionError("Looks like "+str(type(self))+" needs to rename get_sampling_updates to sample.") + + raise NotImplementedError("%s doesn't implement sample" % + type(self)) + + def expected_energy_term(self, state, + average, + state_below, + average_below): + """ + Returns a term of the expected energy of the entire model. + This term should correspond to the expected value of terms + of the energy function that: + + - involve this layer only + - if there is a layer below, include terms that involve both this layer + and the layer below + + Do not include terms that involve the layer below only. + Do not include any terms that involve the layer above, if it + exists, in any way (the interface doesn't let you see the layer + above anyway). + + Parameters + ---------- + state_below : WRITEME + Upward state of the layer below. + state : WRITEME + Total state of this layer + average_below : bool + If True, the layer below is one of the variables to integrate + over in the expectation, and state_below gives its variational + parameters. 
If False, that layer is to be held constant and + state_below gives a set of assignments to it average: like + average_below, but for 'state' rather than 'state_below' + + Returns + ------- + rval : tensor_like + A 1D theano tensor giving the expected energy term for each example + """ + raise NotImplementedError(str(type(self))+" does not implement expected_energy_term.") + + def finalize_initialization(self): + """ + Some layers' initialization depends on layer above being initialized, + which is why this method is called after `set_input_space` has been + called. + """ + pass + + +class VisibleLayer(Layer): + """ + Abstract class. + A layer of a DBM that may be used as a visible layer. + Currently, all implemented layer classes may be either visible + or hidden but not both. It may be worth making classes that can + play both roles though. This would allow getting rid of the BinaryVector + class. + """ + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return self.get_input_space() + + +class HiddenLayer(Layer): + """ + Abstract class. + A layer of a DBM that may be used as a hidden layer. + """ + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_stdev_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_stdev_rewards") + + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_range_rewards") + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_l1_act_cost") + + def get_l2_act_cost(self, state, target, coeff): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_l2_act_cost") + + +class BinaryVector(VisibleLayer): + """ + A DBM visible layer consisting of binary random variables living + in a VectorSpace. + + Parameters + ---------- + nvis : int + Dimension of the space + bias_from_marginals : pylearn2.datasets.dataset.Dataset + Dataset, whose marginals are used to initialize the visible biases + center : bool + WRITEME + copies : int + WRITEME + """ + def __init__(self, + nvis, + bias_from_marginals = None, + center = False, + copies = 1, learn_init_inpainting_state = False): + + self.__dict__.update(locals()) + del self.self + # Don't serialize the dataset + del self.bias_from_marginals + + self.space = VectorSpace(nvis) + self.input_space = self.space + + origin = self.space.get_origin() + + if bias_from_marginals is None: + init_bias = np.zeros((nvis,)) + else: + init_bias = init_sigmoid_bias_from_marginals(bias_from_marginals) + + self.bias = sharedX(init_bias, 'visible_bias') + + if center: + self.offset = sharedX(sigmoid_numpy(init_bias)) + + def get_biases(self): + """ + Returns + ------- + biases : ndarray + The numpy value of the biases + """ + return self.bias.get_value() + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.bias.set_value(biases) + if recenter: + assert self.center + self.offset.set_value(sigmoid_numpy(self.bias.get_value())) + + def upward_state(self, total_state): + """ + .. 
todo:: + + WRITEME + """ + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + rval = total_state - self.offset + else: + rval = total_state + + if not hasattr(self, 'copies'): + self.copies = 1 + + return rval * self.copies + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + return [self.bias] + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + + assert state_below is None + if self.copies != 1: + raise NotImplementedError() + + msg = layer_above.downward_message(state_above) + + bias = self.bias + + z = msg + bias + + phi = T.nnet.sigmoid(z) + + rval = theano_rng.binomial(size = phi.shape, p = phi, dtype = phi.dtype, + n = 1 ) + + return rval + + def mf_update(self, state_above, layer_above): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above) + mu = self.bias + + z = msg + mu + + rval = T.nnet.sigmoid(z) + + return rval + + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'copies'): + self.copies = 1 + if self.copies != 1: + raise NotImplementedError() + driver = numpy_rng.uniform(0.,1., (num_examples, self.nvis)) + mean = sigmoid_numpy(self.bias.get_value()) + sample = driver < mean + + rval = sharedX(sample, name = 'v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'copies'): + self.copies = 1 + if self.copies != 1: + raise NotImplementedError() + mean = T.nnet.sigmoid(self.bias) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean, + dtype=theano.config.floatX) + + return rval + + def expected_energy_term(self, state, average, state_below = None, average_below = None): + """ + .. todo:: + + WRITEME + """ + + if self.center: + state = state - self.offset + + assert state_below is None + assert average_below is None + assert average in [True, False] + self.space.validate(state) + + # Energy function is linear so it doesn't matter if we're averaging or not + rval = -T.dot(state, self.bias) + + assert rval.ndim == 1 + + return rval * self.copies + + def init_inpainting_state(self, V, drop_mask, noise = False, return_unmasked = False): + """ + .. 
todo:: + + WRITEME + """ + assert drop_mask is None or drop_mask.ndim > 1 + + unmasked = T.nnet.sigmoid(self.bias.dimshuffle('x',0)) + # this condition is needed later if unmasked is used as V_hat + assert unmasked.ndim == 2 + # this condition is also needed later if unmasked is used as V_hat + assert hasattr(unmasked.owner.op, 'scalar_op') + if drop_mask is not None: + masked_mean = unmasked * drop_mask + else: + masked_mean = unmasked + if not hasattr(self, 'learn_init_inpainting_state'): + self.learn_init_inpainting_state = 0 + if not self.learn_init_inpainting_state: + masked_mean = block_gradient(masked_mean) + masked_mean.name = 'masked_mean' + + if noise: + theano_rng = theano.sandbox.rng_mrg.MRG_RandomStreams(42) + # we want a set of random mean field parameters, not binary samples + unmasked = T.nnet.sigmoid(theano_rng.normal(avg = 0., + std = 1., size = masked_mean.shape, + dtype = masked_mean.dtype)) + masked_mean = unmasked * drop_mask + masked_mean.name = 'masked_noise' + + if drop_mask is None: + rval = masked_mean + else: + masked_V = V * (1-drop_mask) + rval = masked_mean + masked_V + rval.name = 'init_inpainting_state' + + if return_unmasked: + assert unmasked.ndim > 1 + return rval, unmasked + + return rval + + + def inpaint_update(self, state_above, layer_above, drop_mask = None, V = None, return_unmasked = False): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above) + mu = self.bias + + z = msg + mu + z.name = 'inpainting_z_[unknown_iter]' + + unmasked = T.nnet.sigmoid(z) + + if drop_mask is not None: + rval = drop_mask * unmasked + (1-drop_mask) * V + else: + rval = unmasked + + rval.name = 'inpainted_V[unknown_iter]' + + if return_unmasked: + owner = unmasked.owner + assert owner is not None + op = owner.op + assert hasattr(op, 'scalar_op') + assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid) + return rval, unmasked + + return rval + + + def recons_cost(self, V, V_hat_unmasked, drop_mask = None, use_sum=False): + """ + .. todo:: + + WRITEME + """ + if use_sum: + raise NotImplementedError() + + V_hat = V_hat_unmasked + + assert hasattr(V_hat, 'owner') + owner = V_hat.owner + assert owner is not None + op = owner.op + block_grad = False + if is_block_gradient(op): + assert isinstance(op.scalar_op, theano.scalar.Identity) + block_grad = True + real, = owner.inputs + owner = real.owner + op = owner.op + + if not hasattr(op, 'scalar_op'): + raise ValueError("Expected V_hat_unmasked to be generated by an Elemwise op, got "+str(op)+" of type "+str(type(op))) + assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid) + z ,= owner.inputs + if block_grad: + z = block_gradient(z) + + if V.ndim != V_hat.ndim: + raise ValueError("V and V_hat_unmasked should have same ndim, but are %d and %d." % (V.ndim, V_hat.ndim)) + unmasked_cost = V * T.nnet.softplus(-z) + (1 - V) * T.nnet.softplus(z) + assert unmasked_cost.ndim == V_hat.ndim + + if drop_mask is None: + masked_cost = unmasked_cost + else: + masked_cost = drop_mask * unmasked_cost + + return masked_cost.mean() + +class BinaryVectorMaxPool(HiddenLayer): + """ + A hidden layer that does max-pooling on binary vectors. + It has two sublayers, the detector layer and the pooling + layer. The detector layer is its downward state and the pooling + layer is its upward state. 
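The pooling math this layer relies on (via max_pool_channels in mf_update and sample further down) treats each pool as a softmax over its detector units plus an extra "all off" state; the pooling unit is then the probability that at least one detector in its pool is on. A rough numpy sketch of that computation, offered as an illustration only and not as the pylearn2 implementation:

    import numpy as np

    def toy_pool_channels(z, pool_size):
        # z: (batch, detector_dim) pre-activations; returns (pool probs, detector probs)
        batch, dim = z.shape
        z = z.reshape(batch, dim // pool_size, pool_size)
        # softmax over the "all off" state and the pool's detector units
        states = np.concatenate([np.zeros(z.shape[:2] + (1,)), z], axis=2)
        states = np.exp(states - states.max(axis=2, keepdims=True))
        dist = states / states.sum(axis=2, keepdims=True)
        h = dist[:, :, 1:].reshape(batch, dim)   # P(detector unit is on)
        p = 1. - dist[:, :, 0]                   # P(some unit in the pool is on)
        return p, h

    p, h = toy_pool_channels(np.random.RandomState(0).randn(5, 6), pool_size=3)
    assert p.shape == (5, 2) and h.shape == (5, 6)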
+ + Parameters + ---------- + detector_layer_dim : WRITEME + pool_size : WRITEME + layer_name : WRITEME + irange : WRITEME + sparse_init : WRITEME + sparse_stdev : WRITEME + include_prob : , optional + Probability of including a weight element in the set of weights + initialized to U(-irange, irange). If not included it is + initialized to 0. + init_bias : WRITEME + W_lr_scale : WRITEME + b_lr_scale : WRITEME + center : WRITEME + mask_weights : WRITEME + max_col_norm : WRITEME + copies : WRITEME + """ + # TODO: this layer uses (pooled, detector) as its total state, + # which can be confusing when listing all the states in + # the network left to right. Change this and + # pylearn2.expr.probabilistic_max_pooling to use + # (detector, pooled) + + def __init__(self, + detector_layer_dim, + pool_size, + layer_name, + irange = None, + sparse_init = None, + sparse_stdev = 1., + include_prob = 1.0, + init_bias = 0., + W_lr_scale = None, + b_lr_scale = None, + center = False, + mask_weights = None, + max_col_norm = None, + copies = 1): + self.__dict__.update(locals()) + del self.self + + self.b = sharedX( np.zeros((self.detector_layer_dim,)) + init_bias, name = layer_name + '_b') + + if self.center: + if self.pool_size != 1: + raise NotImplementedError() + self.offset = sharedX(sigmoid_numpy(self.b.get_value())) + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + rval = OrderedDict() + + if self.W_lr_scale is not None: + W, = self.transformer.get_params() + rval[W] = self.W_lr_scale + + if self.b_lr_scale is not None: + rval[self.b] = self.b_lr_scale + + return rval + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + + Notes + ----- + This resets parameters! + """ + + self.input_space = space + + if isinstance(space, VectorSpace): + self.requires_reformat = False + self.input_dim = space.dim + else: + self.requires_reformat = True + self.input_dim = space.get_total_dimension() + self.desired_space = VectorSpace(self.input_dim) + + + if not (self.detector_layer_dim % self.pool_size == 0): + raise ValueError("detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" % + (self.detector_layer_dim, self.pool_size, self.detector_layer_dim % self.pool_size)) + + self.h_space = VectorSpace(self.detector_layer_dim) + self.pool_layer_dim = self.detector_layer_dim / self.pool_size + self.output_space = VectorSpace(self.pool_layer_dim) + + rng = self.dbm.rng + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange, + self.irange, + (self.input_dim, self.detector_layer_dim)) * \ + (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim)) + < self.include_prob) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.detector_layer_dim)) + def mask_rejects(idx, i): + if self.mask_weights is None: + return False + return self.mask_weights[idx, i] == 0. 
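# Note on the sparse_init branch continued just below: each detector column
# receives exactly `sparse_init` nonzero N(0, 1) entries, with the row index
# resampled whenever it lands on a position that is already used or rejected
# by `mask_rejects`; the whole matrix is then scaled by `sparse_stdev`.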
+ for i in xrange(self.detector_layer_dim): + assert self.sparse_init <= self.input_dim + for j in xrange(self.sparse_init): + idx = rng.randint(0, self.input_dim) + while W[idx, i] != 0 or mask_rejects(idx, i): + idx = rng.randint(0, self.input_dim) + W[idx, i] = rng.randn() + W *= self.sparse_stdev + + W = sharedX(W) + W.name = self.layer_name + '_W' + + self.transformer = MatrixMul(W) + + W ,= self.transformer.get_params() + assert W.name is not None + + if self.mask_weights is not None: + expected_shape = (self.input_dim, self.detector_layer_dim) + if expected_shape != self.mask_weights.shape: + raise ValueError("Expected mask with shape "+str(expected_shape)+" but got "+str(self.mask_weights.shape)) + self.mask = sharedX(self.mask_weights) + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + + # Patch old pickle files + if not hasattr(self, 'mask_weights'): + self.mask_weights = None + if not hasattr(self, 'max_col_norm'): + self.max_col_norm = None + + if self.mask_weights is not None: + W ,= self.transformer.get_params() + if W in updates: + updates[W] = updates[W] * self.mask + + if self.max_col_norm is not None: + W, = self.transformer.get_params() + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return CompositeSpace((self.output_space, self.h_space)) + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.b.name is not None + W ,= self.transformer.get_params() + assert W.name is not None + rval = self.transformer.get_params() + assert not isinstance(rval, set) + rval = list(rval) + assert self.b not in rval + rval.append(self.b) + return rval + + def get_weight_decay(self, coeff): + """ + .. todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + W ,= self.transformer.get_params() + return coeff * T.sqr(W).sum() + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if self.requires_reformat: + # This is not really an unimplemented case. + # We actually don't know how to format the weights + # in design space. We got the data in topo space + # and we don't have access to the dataset + raise NotImplementedError() + W ,= self.transformer.get_params() + return W.get_value() + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + W, = self.transformer.get_params() + W.set_value(weights) + + def set_biases(self, biases, recenter = False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + if self.pool_size != 1: + raise NotImplementedError() + self.offset.set_value(sigmoid_numpy(self.b.get_value())) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def get_weights_view_shape(self): + """ + .. todo:: + + WRITEME + """ + total = self.detector_layer_dim + cols = self.pool_size + if cols == 1: + # Let the PatchViewer decidew how to arrange the units + # when they're not pooled + raise NotImplementedError() + # When they are pooled, make each pooling unit have one row + rows = total / cols + return rows, cols + + + def get_weights_topo(self): + """ + .. 
todo:: + + WRITEME + """ + + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + + W ,= self.transformer.get_params() + + W = W.T + + W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], + self.input_space.shape[1], self.input_space.num_channels)) + + W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) + + return function([], W)() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + self.h_space.validate(h) + self.output_space.validate(p) + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + return p - self.offset + + if not hasattr(self, 'copies'): + self.copies = 1 + + return p * self.copies + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + return h - self.offset + + return h * self.copies + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + + W ,= self.transformer.get_params() + + assert W.ndim == 2 + + sq_W = T.sqr(W) + + row_norms = T.sqrt(sq_W.sum(axis=1)) + col_norms = T.sqrt(sq_W.sum(axis=0)) + + return OrderedDict([ + ('row_norms_min' , row_norms.min()), + ('row_norms_mean' , row_norms.mean()), + ('row_norms_max' , row_norms.max()), + ('col_norms_min' , col_norms.min()), + ('col_norms_mean' , col_norms.mean()), + ('col_norms_max' , col_norms.max()), + ]) + + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P, H = state + + rval = OrderedDict() + + if self.pool_size == 1: + vars_and_prefixes = [ (P,'') ] + else: + vars_and_prefixes = [ (P, 'p_'), (H, 'h_') ] + + for var, prefix in vars_and_prefixes: + v_max = var.max(axis=0) + v_min = var.min(axis=0) + v_mean = var.mean(axis=0) + v_range = v_max - v_min + + # max_x.mean_u is "the mean over *u*nits of the max over e*x*amples" + # The x and u are included in the name because otherwise its hard + # to remember which axis is which when reading the monitor + # I use inner.outer rather than outer_of_inner or something like that + # because I want mean_x.* to appear next to each other in the alphabetical + # list, as these are commonly plotted together + for key, val in [ + ('max_x.max_u', v_max.max()), + ('max_x.mean_u', v_max.mean()), + ('max_x.min_u', v_max.min()), + ('min_x.max_u', v_min.max()), + ('min_x.mean_u', v_min.mean()), + ('min_x.min_u', v_min.min()), + ('range_x.max_u', v_range.max()), + ('range_x.mean_u', v_range.mean()), + ('range_x.min_u', v_range.min()), + ('mean_x.max_u', v_mean.max()), + ('mean_x.mean_u', v_mean.mean()), + ('mean_x.min_u', v_mean.min()) + ]: + rval[prefix+key] = val + + return rval + + def get_stdev_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. 
+ + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if isinstance(coeffs, str): + coeffs = float(coeffs) + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + assert all([isinstance(elem, float) for elem in [c]]) + if c == 0.: + continue + mn = s.mean(axis=0) + dev = s - mn + stdev = T.sqrt(T.sqr(dev).mean(axis=0)) + rval += (0.5 - stdev).mean()*c + + return rval + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if isinstance(coeffs, str): + coeffs = float(coeffs) + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + assert all([isinstance(elem, float) for elem in [c]]) + if c == 0.: + continue + mx = s.max(axis=0) + assert hasattr(mx.owner.op, 'grad') + assert mx.ndim == 1 + mn = s.min(axis=0) + assert hasattr(mn.owner.op, 'grad') + assert mn.ndim == 1 + r = mx - mn + rval += (1 - r).mean()*c + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps = None): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if not isinstance(target, float): + raise TypeError("BinaryVectorMaxPool.get_l1_act_cost expected target of type float " + \ + " but an instance named "+self.layer_name + " got target "+str(target) + " of type "+str(type(target))) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + _, state = state + state = [state] + target = [target] + coeff = [coeff] + if eps is None: + eps = [0.] + else: + eps = [eps] + else: + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + if eps is None: + eps = [0., 0.] + if target[1] > target[0]: + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c, e in safe_zip(state, target, coeff, eps): + assert all([isinstance(elem, float) or hasattr(elem, 'dtype') for elem in [t, c, e]]) + if c == 0.: + continue + m = s.mean(axis=0) + assert m.ndim == 1 + rval += T.maximum(abs(m-t)-e,0.).mean()*c + + return rval + + def get_l2_act_cost(self, state, target, coeff): + """ + .. todo:: + + WRITEME + """ + rval = 0. 
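For reference, the per-unit sparsity penalty computed by get_l1_act_cost above reduces, for a single (state, target, coeff, eps) group, to coeff * mean(max(|mean_activation - target| - eps, 0)). A small numpy sketch with made-up values:

    import numpy as np

    acts = np.random.RandomState(0).uniform(size=(100, 50))   # batch x units
    target, coeff, eps = 0.1, 0.001, 0.02
    mean_act = acts.mean(axis=0)                               # per-unit mean activation
    penalty = coeff * np.maximum(np.abs(mean_act - target) - eps, 0.).mean()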
+ + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if not isinstance(target, float): + raise TypeError("BinaryVectorMaxPool.get_l1_act_cost expected target of type float " + \ + " but an instance named "+self.layer_name + " got target "+str(target) + " of type "+str(type(target))) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + _, state = state + state = [state] + target = [target] + coeff = [coeff] + else: + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + if target[1] > target[0]: + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c in safe_zip(state, target, coeff): + assert all([isinstance(elem, float) or hasattr(elem, 'dtype') for elem in [t, c]]) + if c == 0.: + continue + m = s.mean(axis=0) + assert m.ndim == 1 + rval += T.square(m-t).mean()*c + + return rval + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + if self.copies != 1: + raise NotImplementedError() + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + z = self.transformer.lmul(state_below) + self.b + p, h, p_sample, h_sample = max_pool_channels(z, + self.pool_size, msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + self.h_space.validate(downward_state) + rval = self.transformer.lmul_T(downward_state) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval * self.copies + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.b.dtype) + \ + self.b.dimshuffle('x', 0) + rval = max_pool_channels(z = z, + pool_size = self.pool_size) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + """ Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + + if not hasattr(self, 'copies'): + self.copies = 1 + + if self.copies != 1: + raise NotImplementedError() + + + empty_input = self.h_space.get_origin_batch(num_examples) + empty_output = self.output_space.get_origin_batch(num_examples) + + h_state = sharedX(empty_input) + p_state = sharedX(empty_output) + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), which_method="binomial") + + default_z = T.zeros_like(h_state) + self.b + + p_exp, h_exp, p_sample, h_sample = max_pool_channels( + z = default_z, + pool_size = self.pool_size, + theano_rng = theano_rng) + + assert h_sample.dtype == default_z.dtype + + f = function([], updates = [ + (p_state , p_sample), + (h_state , h_sample) + ]) + + f() + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. 
todo:: + + WRITEME + """ + """ + Returns a theano symbolic variable containing an actual state + (not a mean field state) for this variable. + """ + + if not hasattr(self, 'copies'): + self.copies = 1 + + if self.copies != 1: + raise NotImplementedError() + + default_z = T.alloc(self.b, num_examples, self.detector_layer_dim) + + p_exp, h_exp, p_sample, h_sample = max_pool_channels(z=default_z, + pool_size=self.pool_size, + theano_rng=theano_rng) + + assert h_sample.dtype == default_z.dtype + + return p_sample, h_sample + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + # Don't need to do anything special for centering, upward_state / downward state + # make it all just work + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = T.dot(downward_state, self.b) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval * self.copies + + def linear_feed_forward_approximation(self, state_below): + """ + Used to implement TorontoSparsity. Unclear exactly what properties of + it are important or how to implement it for other layers. + + Properties it must have: output is same kind of data structure (ie, + tuple of theano 2-tensors) as mf_update. + + Properties it probably should have for other layer types: an + infinitesimal change in state_below or the parameters should cause the + same sign of change in the output of linear_feed_forward_approximation + and in mf_update + + Should not have any non-linearities that cause the gradient to shrink + + Should disregard top-down feedback + + Parameters + ---------- + state_below : WRITEME + """ + + z = self.transformer.lmul(state_below) + self.b + + if self.pool_size != 1: + # Should probably implement sum pooling for the non-pooled version, + # but in reality it's not totally clear what the right answer is + raise NotImplementedError() + + return z, z + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. 
todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.b + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = max_pool_channels(z, self.pool_size, msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + + +class Softmax(HiddenLayer): + """ + A layer representing a single softmax distribution of a + set of discrete categories. + + Parameters + ---------- + n_classes : int + The number of discrete categories. + layer_name : str + The name of the layer. + irange : float + If not None, initialze the weights in U(-irange, irange) + sparse_init : int + If not None, initialize `sparse_init` weights per column + to N(0, sparse_istdev^2) + sparse_istdev : float + see above + W_lr_scale : float + Scale the learning rate on the weights by this amount + b_lr_scale : float + Scale the learning rate on the biases by this amount + max_col_norm : float + If not None, constrain the columns of the weight matrix + to have at most this L2 norm + copies : int + Make this many copies of the random variables, all sharing + the same weights. This allows the undirected model to + behave as if it has asymmetric connections. + center : bool + If True, use Gregoire Montavon's centering trick. + learn_init_inpainting_state : bool + If True, and using inpainting-based methods (MP-DBM), learn + a parameter controlling the initial value of the mean field + state for this layer. 
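As a rough usage sketch, a label layer might be constructed as something like Softmax(n_classes=10, layer_name='y', irange=0.01) (all values made up). The numpy snippet below mirrors what mf_update and sample further down compute for such a layer: a softmax over state_below . W + b, and a one-hot multinomial draw from it. Shapes and the initialization scale are assumptions.

    import numpy as np

    rng = np.random.RandomState(0)
    state_below = rng.uniform(size=(5, 20))                     # 5 examples, 20 inputs
    W, b = rng.uniform(-0.01, 0.01, (20, 10)), np.zeros(10)     # 10 classes

    z = state_below.dot(W) + b
    z -= z.max(axis=1, keepdims=True)                           # numerical stability
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)        # cf. mf_update
    samples = np.array([rng.multinomial(1, row) for row in p])  # cf. sample
    assert (samples.sum(axis=1) == 1).all()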
+ """ + + presynaptic_name = "presynaptic_Y_hat" + + def __init__(self, n_classes, layer_name, irange = None, + sparse_init = None, sparse_istdev = 1., W_lr_scale = None, + b_lr_scale = None, + max_col_norm = None, + copies = 1, center = False, + learn_init_inpainting_state = True): + if isinstance(W_lr_scale, str): + W_lr_scale = float(W_lr_scale) + + self.__dict__.update(locals()) + del self.self + + assert isinstance(n_classes, py_integer_types) + + self.output_space = VectorSpace(n_classes) + self.b = sharedX( np.zeros((n_classes,)), name = 'softmax_b') + + if self.center: + b = self.b.get_value() + self.offset = sharedX(np.exp(b) / np.exp(b).sum()) + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + + if not hasattr(self, 'max_col_norm'): + self.max_col_norm = None + + if self.max_col_norm is not None: + W = self.W + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + @functools.wraps(Model.get_lr_scalers) + def get_lr_scalers(self): + + rval = OrderedDict() + + # Patch old pickle files + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if self.W_lr_scale is not None: + assert isinstance(self.W_lr_scale, float) + rval[self.W] = self.W_lr_scale + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + if self.b_lr_scale is not None: + assert isinstance(self.b_lr_scale, float) + rval[self.b] = self.b_lr_scale + + return rval + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return self.output_space + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + mx = state.max(axis=1) + + return OrderedDict([ + ('mean_max_class' , mx.mean()), + ('max_max_class' , mx.max()), + ('min_max_class' , mx.min()) + ]) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + self.input_space = space + + if not isinstance(space, Space): + raise TypeError("Expected Space, got "+ + str(space)+" of type "+str(type(space))) + + self.input_dim = space.get_total_dimension() + self.needs_reformat = not isinstance(space, VectorSpace) + + self.desired_space = VectorSpace(self.input_dim) + + if not self.needs_reformat: + assert self.desired_space == self.input_space + + rng = self.dbm.rng + + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes)) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.n_classes)) + for i in xrange(self.n_classes): + for j in xrange(self.sparse_init): + idx = rng.randint(0, self.input_dim) + while W[idx, i] != 0.: + idx = rng.randint(0, self.input_dim) + W[idx, i] = rng.randn() * self.sparse_istdev + + self.W = sharedX(W, 'softmax_W' ) + + self._params = [ self.b, self.W ] + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + desired = self.W.get_value().T + ipt = self.desired_space.format_as(desired, self.input_space) + rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) + return rval + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if not isinstance(self.input_space, VectorSpace): + raise NotImplementedError() + + return self.W.get_value() + + def set_weights(self, weights): + """ + .. 
todo:: + + WRITEME + """ + self.W.set_value(weights) + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + self.offset.set_value( (np.exp(biases) / np.exp(biases).sum()).astype(self.offset.dtype)) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + + if self.copies != 1: + raise NotImplementedError("need to draw self.copies samples and average them together.") + + if state_above is not None: + # If you implement this case, also add a unit test for it. + # Or at least add a warning that it is not tested. + raise NotImplementedError() + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") + + self.input_space.validate(state_below) + + # patch old pickle files + if not hasattr(self, 'needs_reformat'): + self.needs_reformat = self.needs_reshape + del self.needs_reshape + + if self.needs_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + self.desired_space.validate(state_below) + + + z = T.dot(state_below, self.W) + self.b + h_exp = T.nnet.softmax(z) + h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype) + + return h_sample + + def mf_update(self, state_below, state_above = None, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + if state_above is not None: + raise NotImplementedError() + + if double_weights: + raise NotImplementedError() + + self.input_space.validate(state_below) + + # patch old pickle files + if not hasattr(self, 'needs_reformat'): + self.needs_reformat = self.needs_reshape + del self.needs_reshape + + if self.needs_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + for value in get_debug_values(state_below): + if value.shape[0] != self.dbm.batch_size: + raise ValueError("state_below should have batch size "+str(self.dbm.batch_size)+" but has "+str(value.shape[0])) + + self.desired_space.validate(state_below) + + assert self.W.ndim == 2 + assert state_below.ndim == 2 + + b = self.b + + Z = T.dot(state_below, self.W) + b + + rval = T.nnet.softmax(Z) + + for value in get_debug_values(rval): + assert value.shape[0] == self.dbm.batch_size + + return rval + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'copies'): + self.copies = 1 + + rval = T.dot(downward_state, self.W.T) * self.copies + + rval = self.desired_space.format_as(rval, self.input_space) + + return rval + + def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale): + """ + The cost of reconstructing `Y` as `Y_hat`. Specifically, + the negative log probability. + + This cost is for use with multi-prediction training. + + Parameters + ---------- + Y : target space batch + The data labels + Y_hat_unmasked : target space batch + The output of this layer's `mf_update`; the predicted + values of `Y`. Even though the model is only predicting + the dropped values, we take predictions for all the + values here. 
+ drop_mask_Y : 1-D theano tensor + A batch of 0s/1s, with 1s indicating that variables + have been dropped, and should be included in the + reconstruction cost. One indicator per example in the + batch, since each example in this layer only has one + random variable in it. + scale : float + Multiply the cost by this amount. + We need to do this because the visible layer also goes into + the cost. We use the mean over units and examples, so that + the scale of the cost doesn't change too much with batch + size or example size. + We need to multiply this cost by scale to make sure that + it is put on the same scale as the reconstruction cost + for the visible units. ie, scale should be 1/nvis + """ + + + Y_hat = Y_hat_unmasked + assert hasattr(Y_hat, 'owner') + owner = Y_hat.owner + assert owner is not None + op = owner.op + if isinstance(op, Print): + assert len(owner.inputs) == 1 + Y_hat, = owner.inputs + owner = Y_hat.owner + op = owner.op + assert isinstance(op, T.nnet.Softmax) + z ,= owner.inputs + assert z.ndim == 2 + + z = z - z.max(axis=1).dimshuffle(0, 'x') + log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x')) + # we use sum and not mean because this is really one variable per row + log_prob_of = (Y * log_prob).sum(axis=1) + masked = log_prob_of * drop_mask_Y + assert masked.ndim == 1 + + rval = masked.mean() * scale * self.copies + + return - rval + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + rval = T.nnet.softmax(self.b.dimshuffle('x', 0)) + T.alloc(0., self.dbm.batch_size, self.n_classes).astype(config.floatX) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + """ Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + + if self.copies != 1: + raise NotImplementedError("need to make self.copies samples and average them together.") + + t1 = time.time() + + empty_input = self.output_space.get_origin_batch(num_examples) + h_state = sharedX(empty_input) + + default_z = T.zeros_like(h_state) + self.b + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + h_exp = T.nnet.softmax(default_z) + + h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype) + + h_state = sharedX( self.output_space.get_origin_batch( + num_examples)) + + + t2 = time.time() + + f = function([], updates = [( + h_state , h_sample + )]) + + t3 = time.time() + + f() + + t4 = time.time() + + logger.info('{0}.make_state took {1}'.format(self, t4-t1)) + logger.info('\tcompose time: {0}'.format(t2-t1)) + logger.info('\tcompile time: {0}'.format(t3-t2)) + logger.info('\texecute time: {0}'.format(t4-t3)) + + h_state.name = 'softmax_sample_shared' + + return h_state + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + """ + Returns a symbolic variable containing an actual state + (not a mean field state) for this variable. + """ + + if self.copies != 1: + raise NotImplementedError("need to make self.copies samples and average them together.") + + default_z = T.alloc(self.b, num_examples, self.n_classes) + + h_exp = T.nnet.softmax(default_z) + + h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype) + + return h_sample + + def get_weight_decay(self, coeff): + """ + .. 
todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + return coeff * T.sqr(self.W).sum() + + def upward_state(self, state): + """ + .. todo:: + + WRITEME + """ + if self.center: + return state - self.offset + return state + + def downward_state(self, state): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'center'): + self.center = False + if self.center: + """TODO: write a unit test verifying that inference or sampling + below a centered Softmax layer works""" + return state - self.offset + return state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + if self.center: + state = state - self.offset + + self.input_space.validate(state_below) + if self.needs_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + self.desired_space.validate(state_below) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = T.dot(state, self.b) + weights_term = (T.dot(state_below, self.W) * state).sum(axis=1) + + rval = -bias_term - weights_term + + rval *= self.copies + + assert rval.ndim == 1 + + return rval + + def init_inpainting_state(self, Y, noise): + """ + .. todo:: + + WRITEME + """ + if noise: + theano_rng = make_theano_rng(None, 2012+10+30, which_method="binomial") + return T.nnet.softmax(theano_rng.normal(avg=0., size=Y.shape, std=1., dtype='float32')) + rval = T.nnet.softmax(self.b) + if not hasattr(self, 'learn_init_inpainting_state'): + self.learn_init_inpainting_state = 1 + if not self.learn_init_inpainting_state: + rval = block_gradient(rval) + return rval + + def install_presynaptic_outputs(self, outputs_dict, batch_size): + """ + .. todo:: + + WRITEME + """ + + assert self.presynaptic_name not in outputs_dict + outputs_dict[self.presynaptic_name] = self.output_space.make_shared_batch(batch_size, self.presynaptic_name) + + +class GaussianVisLayer(VisibleLayer): + """ + Implements a visible layer that is conditionally gaussian with + diagonal variance. The layer lives in a Conv2DSpace. 
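A minimal numpy sketch of the conditional this layer implements (see sample and expected_energy_term below): given the top-down message msg from the layer above, each visible unit is Gaussian with mean mu + msg and precision beta, and the layer's energy contribution is 0.5 * beta * (v - mu)^2 summed over the non-batch axes. Shapes and values here are made up.

    import numpy as np

    rng = np.random.RandomState(0)
    nvis = 4
    mu = np.zeros(nvis)
    beta = 2. * np.ones(nvis)                                   # per-unit precision
    msg = rng.randn(3, nvis)                                    # top-down message, batch of 3

    v = rng.normal(loc=msg + mu, scale=1. / np.sqrt(beta))      # cf. sample()
    energy_term = 0.5 * (beta * (v - mu) ** 2).sum(axis=1)      # cf. expected_energy_term()
    assert energy_term.shape == (3,)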
+ + Parameters + ---------- + rows, cols, channels : WRITEME + the shape of the space + learn_init_inpainting : bool, optional + WRITEME + nvis : WRITEME + init_beta : WRITEME + the initial value of the precision parameter + min_beta : WRITEME + clip beta so it is at least this big (default 1) + init_mu : WRITEME + the initial value of the mean parameter + tie_beta : WRITEME + None or a string specifying how to tie beta 'locations' = tie beta + across locations, ie beta should be a vector with one elem per channel + tie_mu : WRITEME + None or a string specifying how to tie mu 'locations' = tie mu across + locations, ie mu should be a vector with one elem per channel + bias_from_marginals : WRITEME + beta_lr_scale : WRITEME + axes : tuple + WRITEME + """ + def __init__(self, + rows = None, + cols = None, + learn_init_inpainting_state=True, + channels = None, + nvis = None, + init_beta = 1., + min_beta = 1., + init_mu = None, + tie_beta = None, + tie_mu = None, + bias_from_marginals = None, + beta_lr_scale = 'by_sharing', + axes = ('b', 0, 1, 'c')): + + warnings.warn("GaussianVisLayer math very faith based, need to finish working through gaussian.lyx") + + self.__dict__.update(locals()) + del self.self + + if bias_from_marginals is not None: + del self.bias_from_marginals + if self.nvis is None: + raise NotImplementedError() + assert init_mu is None + init_mu = bias_from_marginals.X.mean(axis=0) + + if init_mu is None: + init_mu = 0. + if nvis is None: + assert rows is not None + assert cols is not None + assert channels is not None + self.space = Conv2DSpace(shape=[rows,cols], num_channels=channels, axes=axes) + # To make GaussianVisLayer compatible with any axis ordering + self.batch_axis=list(axes).index('b') + self.axes_to_sum = range(len(axes)) + self.axes_to_sum.remove(self.batch_axis) + else: + assert rows is None + assert cols is None + assert channels is None + self.space = VectorSpace(nvis) + self.axes_to_sum = 1 + self.batch_axis = None + self.input_space = self.space + + origin = self.space.get_origin() + + beta_origin = origin.copy() + assert tie_beta in [ None, 'locations'] + if tie_beta == 'locations': + assert nvis is None + beta_origin = np.zeros((self.space.num_channels,)) + self.beta = sharedX(beta_origin + init_beta,name = 'beta') + assert self.beta.ndim == beta_origin.ndim + + mu_origin = origin.copy() + assert tie_mu in [None, 'locations'] + if tie_mu == 'locations': + assert nvis is None + mu_origin = np.zeros((self.space.num_channels,)) + self.mu = sharedX( mu_origin + init_mu, name = 'mu') + assert self.mu.ndim == mu_origin.ndim + + + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + rval['beta_min'] = self.beta.min() + rval['beta_mean'] = self.beta.mean() + rval['beta_max'] = self.beta.max() + + return rval + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + if self.mu is None: + return [self.beta] + return [self.beta, self.mu] + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + if self.nvis is None: + rows, cols = self.space.shape + num_loc = float(rows * cols) + + assert self.tie_beta in [None, 'locations'] + if self.beta_lr_scale == 'by_sharing': + if self.tie_beta == 'locations': + assert self.nvis is None + rval[self.beta] = 1. 
/ num_loc + elif self.beta_lr_scale == None: + pass + else: + rval[self.beta] = self.beta_lr_scale + + assert self.tie_mu in [None, 'locations'] + if self.tie_mu == 'locations': + warn = True + assert self.nvis is None + rval[self.mu] = 1./num_loc + logger.warning("mu lr_scaler hardcoded to 1/sharing") + + return rval + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + if self.beta in updates: + updated_beta = updates[self.beta] + updates[self.beta] = T.clip(updated_beta, + self.min_beta,1e6) + + def set_biases(self, bias): + """ + Set mean parameter + + Parameters + ---------- + bias: WRITEME + Vector of size nvis + """ + self.mu = sharedX(bias, name = 'mu') + + def broadcasted_mu(self): + """ + Returns mu, broadcasted to have the same shape as a batch of data + """ + + if self.tie_mu == 'locations': + def f(x): + if x == 'c': + return 0 + return 'x' + axes = [f(ax) for ax in self.axes] + rval = self.mu.dimshuffle(*axes) + else: + assert self.tie_mu is None + if self.nvis is None: + axes = [0, 1, 2] + axes.insert(self.axes.index('b'), 'x') + rval = self.mu.dimshuffle(*axes) + else: + rval = self.mu.dimshuffle('x', 0) + + self.input_space.validate(rval) + + return rval + + def broadcasted_beta(self): + """ + Returns beta, broadcasted to have the same shape as a batch of data + """ + return self.broadcast_beta(self.beta) + + def broadcast_beta(self, beta): + """ + .. todo:: + + WRITEME + """ + """ + Returns beta, broadcasted to have the same shape as a batch of data + """ + + if self.tie_beta == 'locations': + def f(x): + if x == 'c': + return 0 + return 'x' + axes = [f(ax) for ax in self.axes] + rval = beta.dimshuffle(*axes) + else: + assert self.tie_beta is None + if self.nvis is None: + axes = [0, 1, 2] + axes.insert(self.axes.index('b'), 'x') + rval = beta.dimshuffle(*axes) + else: + rval = beta.dimshuffle('x', 0) + + self.input_space.validate(rval) + + return rval + + def init_inpainting_state(self, V, drop_mask, noise = False, return_unmasked = False): + """ + .. todo:: + + WRITEME + """ + + """for Vv, drop_mask_v in get_debug_values(V, drop_mask): + assert Vv.ndim == 4 + assert drop_mask_v.ndim in [3,4] + for i in xrange(drop_mask.ndim): + if Vv.shape[i] != drop_mask_v.shape[i]: + print Vv.shape + print drop_mask_v.shape + assert False + """ + + unmasked = self.broadcasted_mu() + + if drop_mask is None: + assert not noise + assert not return_unmasked + return unmasked + masked_mu = unmasked * drop_mask + if not hasattr(self, 'learn_init_inpainting_state'): + self.learn_init_inpainting_state = True + if not self.learn_init_inpainting_state: + masked_mu = block_gradient(masked_mu) + masked_mu.name = 'masked_mu' + + if noise: + theano_rng = make_theano_rng(None, 42, which_method="binomial") + unmasked = theano_rng.normal(avg = 0., + std = 1., size = masked_mu.shape, + dtype = masked_mu.dtype) + masked_mu = unmasked * drop_mask + masked_mu.name = 'masked_noise' + + + masked_V = V * (1-drop_mask) + rval = masked_mu + masked_V + rval.name = 'init_inpainting_state' + + if return_unmasked: + return rval, unmasked + return rval + + + def expected_energy_term(self, state, average, state_below = None, average_below = None): + """ + .. 
todo:: + + WRITEME + """ + assert state_below is None + assert average_below is None + self.space.validate(state) + if average: + raise NotImplementedError(str(type(self))+" doesn't support integrating out variational parameters yet.") + else: + rval = 0.5 * (self.beta * T.sqr(state - self.mu)).sum(axis=self.axes_to_sum) + assert rval.ndim == 1 + return rval + + + def inpaint_update(self, state_above, layer_above, drop_mask = None, V = None, + return_unmasked = False): + """ + .. todo:: + + WRITEME + """ + + msg = layer_above.downward_message(state_above) + mu = self.broadcasted_mu() + + z = msg + mu + z.name = 'inpainting_z_[unknown_iter]' + + if drop_mask is not None: + rval = drop_mask * z + (1-drop_mask) * V + else: + rval = z + + rval.name = 'inpainted_V[unknown_iter]' + + if return_unmasked: + return rval, z + + return rval + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + assert state_below is None + msg = layer_above.downward_message(state_above) + mu = self.mu + + z = msg + mu + rval = theano_rng.normal(size = z.shape, avg = z, dtype = z.dtype, + std = 1. / T.sqrt(self.beta)) + return rval + + def recons_cost(self, V, V_hat_unmasked, drop_mask = None, use_sum=False): + """ + .. todo:: + + WRITEME + """ + + return self._recons_cost(V=V, V_hat_unmasked=V_hat_unmasked, drop_mask=drop_mask, use_sum=use_sum, beta=self.beta) + + + def _recons_cost(self, V, V_hat_unmasked, beta, drop_mask=None, use_sum=False): + """ + .. todo:: + + WRITEME + """ + V_hat = V_hat_unmasked + + assert V.ndim == V_hat.ndim + beta = self.broadcasted_beta() + unmasked_cost = 0.5 * beta * T.sqr(V-V_hat) - 0.5*T.log(beta / (2*np.pi)) + assert unmasked_cost.ndim == V_hat.ndim + + if drop_mask is None: + masked_cost = unmasked_cost + else: + masked_cost = drop_mask * unmasked_cost + + if use_sum: + return masked_cost.mean(axis=0).sum() + + return masked_cost.mean() + + return masked_cost.mean() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + if self.nvis is None and total_state.ndim != 4: + raise ValueError("total_state should have 4 dimensions, has "+str(total_state.ndim)) + assert total_state is not None + V = total_state + self.input_space.validate(V) + upward_state = (V - self.broadcasted_mu()) * self.broadcasted_beta() + return upward_state + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + + shape = [num_examples] + + if self.nvis is None: + rows, cols = self.space.shape + channels = self.space.num_channels + shape.append(rows) + shape.append(cols) + shape.append(channels) + else: + shape.append(self.nvis) + + sample = numpy_rng.randn(*shape) + + sample *= 1./np.sqrt(self.beta.get_value()) + sample += self.mu.get_value() + rval = sharedX(sample, name = 'v_sample_shared') + + return rval + + def install_presynaptic_outputs(self, outputs_dict, batch_size): + """ + .. todo:: + + WRITEME + """ + + outputs_dict['output_V_weighted_pred_sum'] = self.space.make_shared_batch(batch_size) + + def ensemble_prediction(self, symbolic, outputs_dict, ensemble): + """ + .. todo:: + + WRITEME + """ + """ + Output a symbolic expression for V_hat_unmasked based on taking the + geometric mean over the ensemble and renormalizing. + n - 1 members of the ensemble have modified outputs_dict and the nth + gives its prediction in "symbolic". The parameters for the nth one + are currently loaded in the model. 
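The upward_state method defined above is the concrete case of the trick described in Layer.upward_state: by sending (v - mu) * beta upward, a binary layer above can keep computing its usual sigmoid(state_below . W + b) without ever seeing beta. A small numpy illustration with made-up shapes:

    import numpy as np

    def sigmoid(x):
        return 1. / (1. + np.exp(-x))

    rng = np.random.RandomState(0)
    v = rng.randn(3, 4)                          # batch of visible states
    mu, beta = np.zeros(4), np.full(4, 2.)
    W, b = rng.randn(4, 6), np.zeros(6)

    upward = (v - mu) * beta                     # what this layer passes upward
    h_mean = sigmoid(upward.dot(W) + b)          # the layer above stays beta-agnostic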
+ """ + + weighted_pred_sum = outputs_dict['output_V_weighted_pred_sum'] \ + + self.broadcasted_beta() * symbolic + + beta_sum = sum(ensemble.get_ensemble_variants(self.beta)) + + unmasked_V_hat = weighted_pred_sum / self.broadcast_beta(beta_sum) + + return unmasked_V_hat + + def ensemble_recons_cost(self, V, V_hat_unmasked, drop_mask=None, + use_sum=False, ensemble=None): + """ + .. todo:: + + WRITEME + """ + + beta = sum(ensemble.get_ensemble_variants(self.beta)) / ensemble.num_copies + + return self._recons_cost(V=V, V_hat_unmasked=V_hat_unmasked, beta=beta, drop_mask=drop_mask, + use_sum=use_sum) + + +class ConvMaxPool(HiddenLayer): + """ + .. todo:: + + WRITEME + """ + + def __init__(self, + output_channels, + kernel_rows, + kernel_cols, + pool_rows, + pool_cols, + layer_name, + center = False, + irange = None, + sparse_init = None, + scale_by_sharing = True, + init_bias = 0., + border_mode = 'valid', + output_axes = ('b', 'c', 0, 1)): + self.__dict__.update(locals()) + del self.self + + assert (irange is None) != (sparse_init is None) + + self.b = sharedX( np.zeros((output_channels,)) + init_bias, name = layer_name + '_b') + assert border_mode in ['full','valid'] + + def broadcasted_bias(self): + """ + .. todo:: + + WRITEME + """ + + assert self.b.ndim == 1 + + shuffle = [ 'x' ] * 4 + shuffle[self.output_axes.index('c')] = 0 + + return self.b.dimshuffle(*shuffle) + + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return CompositeSpace((self.h_space, self.output_space)) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + """ Note: this resets parameters!""" + if not isinstance(space, Conv2DSpace): + raise TypeError("ConvMaxPool can only act on a Conv2DSpace, but received " + + str(type(space))+" as input.") + self.input_space = space + self.input_rows, self.input_cols = space.shape + self.input_channels = space.num_channels + + if self.border_mode == 'valid': + self.h_rows = self.input_rows - self.kernel_rows + 1 + self.h_cols = self.input_cols - self.kernel_cols + 1 + else: + assert self.border_mode == 'full' + self.h_rows = self.input_rows + self.kernel_rows - 1 + self.h_cols = self.input_cols + self.kernel_cols - 1 + + + if not( self.h_rows % self.pool_rows == 0): + raise ValueError("h_rows = %d, pool_rows = %d. 
Should be divisible but remainder is %d" % + (self.h_rows, self.pool_rows, self.h_rows % self.pool_rows)) + assert self.h_cols % self.pool_cols == 0 + + self.h_space = Conv2DSpace(shape = (self.h_rows, self.h_cols), num_channels = self.output_channels, + axes = self.output_axes) + self.output_space = Conv2DSpace(shape = (self.h_rows / self.pool_rows, + self.h_cols / self.pool_cols), + num_channels = self.output_channels, + axes = self.output_axes) + + logger.info('{0}: detector shape: {1} ' + 'pool shape: {2}'.format(self.layer_name, + self.h_space.shape, + self.output_space.shape)) + + if tuple(self.output_axes) == ('b', 0, 1, 'c'): + self.max_pool = max_pool_b01c + elif tuple(self.output_axes) == ('b', 'c', 0, 1): + self.max_pool = max_pool + else: + raise NotImplementedError() + + if self.irange is not None: + self.transformer = make_random_conv2D(self.irange, input_space = space, + output_space = self.h_space, kernel_shape = (self.kernel_rows, self.kernel_cols), + batch_size = self.dbm.batch_size, border_mode = self.border_mode, rng = self.dbm.rng) + else: + self.transformer = make_sparse_random_conv2D(self.sparse_init, input_space = space, + output_space = self.h_space, kernel_shape = (self.kernel_rows, self.kernel_cols), + batch_size = self.dbm.batch_size, border_mode = self.border_mode, rng = self.dbm.rng) + self.transformer._filters.name = self.layer_name + '_W' + + + W ,= self.transformer.get_params() + assert W.name is not None + + if self.center: + p_ofs, h_ofs = self.init_mf_state() + self.p_offset = sharedX(self.output_space.get_origin(), 'p_offset') + self.h_offset = sharedX(self.h_space.get_origin(), 'h_offset') + f = function([], updates={self.p_offset: p_ofs[0,:,:,:], self.h_offset: h_ofs[0,:,:,:]}) + f() + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.b.name is not None + W ,= self.transformer.get_params() + assert W.name is not None + + return [ W, self.b] + + def state_to_b01c(self, state): + """ + .. todo:: + + WRITEME + """ + + if tuple(self.output_axes) == ('b',0,1,'c'): + return state + return [ Conv2DSpace.convert(elem, self.output_axes, ('b', 0, 1, 'c')) + for elem in state ] + + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + if c == 0.: + continue + # Range over everything but the channel index + # theano can only take gradient through max if the max is over 1 axis or all axes + # so I manually unroll the max for the case I use here + assert self.h_space.axes == ('b', 'c', 0, 1) + assert self.output_space.axes == ('b', 'c', 0, 1) + mx = s.max(axis=3).max(axis=2).max(axis=0) + assert hasattr(mx.owner.op, 'grad') + mn = s.min(axis=3).max(axis=2).max(axis=0) + assert hasattr(mn.owner.op, 'grad') + assert mx.ndim == 1 + assert mn.ndim == 1 + r = mx - mn + rval += (1. - r).mean() * c + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME + """ + """ + + target: if pools contain more than one element, should be a list with + two elements. the first element is for the pooling units and + the second for the detector units. + + """ + rval = 0. 
+ + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(target, float) + assert isinstance(coeff, float) + _, state = state + state = [state] + target = [target] + coeff = [coeff] + if eps is None: + eps = 0. + eps = [eps] + else: + if eps is None: + eps = [0., 0.] + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + p_target, h_target = target + if h_target > p_target and (coeff[0] != 0. and coeff[1] != 0.): + # note that, within each group, E[p] is the sum of E[h] + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c, e in safe_zip(state, target, coeff, eps): + if c == 0.: + continue + # Average over everything but the channel index + m = s.mean(axis= [ ax for ax in range(4) if self.output_axes[ax] != 'c' ]) + assert m.ndim == 1 + rval += T.maximum(abs(m-t)-e,0.).mean()*c + + return rval + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + if self.scale_by_sharing: + # scale each learning rate by 1 / # times param is reused + h_rows, h_cols = self.h_space.shape + num_h = float(h_rows * h_cols) + return OrderedDict([(self.transformer._filters, 1./num_h), + (self.b, 1. / num_h)]) + else: + return OrderedDict() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return p + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return h + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P, H = state + + if tuple(self.output_axes) == ('b',0,1,'c'): + p_max = P.max(axis=(0,1,2)) + p_min = P.min(axis=(0,1,2)) + p_mean = P.mean(axis=(0,1,2)) + else: + assert tuple(self.output_axes) == ('b','c',0,1) + p_max = P.max(axis=(0,2,3)) + p_min = P.min(axis=(0,2,3)) + p_mean = P.mean(axis=(0,2,3)) + p_range = p_max - p_min + + rval = { + 'p_max_max' : p_max.max(), + 'p_max_mean' : p_max.mean(), + 'p_max_min' : p_max.min(), + 'p_min_max' : p_min.max(), + 'p_min_mean' : p_min.mean(), + 'p_min_max' : p_min.max(), + 'p_range_max' : p_range.max(), + 'p_range_mean' : p_range.mean(), + 'p_range_min' : p_range.min(), + 'p_mean_max' : p_mean.max(), + 'p_mean_mean' : p_mean.mean(), + 'p_mean_min' : p_mean.min() + } + + return rval + + def get_weight_decay(self, coeffs): + """ + .. todo:: + + WRITEME + """ + W , = self.transformer.get_params() + return coeffs * T.sqr(W).sum() + + + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. 
todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if not hasattr(state_below, 'ndim'): + raise TypeError("state_below should be a TensorType, got " + + str(state_below) + " of type " + str(type(state_below))) + if state_below.ndim != 4: + raise ValueError("state_below should have ndim 4, has "+str(state_below.ndim)) + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = self.max_pool(z, (self.pool_rows, self.pool_cols), msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + if state_above is not None: + msg = layer_above.downward_message(state_above) + try: + self.output_space.validate(msg) + except TypeError, e: + reraise_as(TypeError(str(type(layer_above))+".downward_message gave something that was not the right type: "+str(e))) + else: + msg = None + + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + p, h, p_sample, h_sample = self.max_pool(z, + (self.pool_rows, self.pool_cols), msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + self.h_space.validate(downward_state) + return self.transformer.lmul_T(downward_state) + + def set_batch_size(self, batch_size): + """ + .. todo:: + + WRITEME + """ + self.transformer.set_batch_size(batch_size) + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + outp, inp, rows, cols = range(4) + raw = self.transformer._filters.get_value() + + return np.transpose(raw,(outp,rows,cols,inp)) + + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + default_z = self.broadcasted_bias() + shape = { + 'b': self.dbm.batch_size, + 0: self.h_space.shape[0], + 1: self.h_space.shape[1], + 'c': self.h_space.num_channels + } + # work around theano bug with broadcasted stuff + default_z += T.alloc(*([0.]+[shape[elem] for elem in self.h_space.axes])).astype(default_z.dtype) + assert default_z.ndim == 4 + + p, h = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols)) + + return p, h + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + """ Returns a shared variable containing an actual state + (not a mean field state) for this variable. 
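+        The sample is drawn by filling the detector input with the bias
+        alone, running one stochastic max-pooling step with a fresh
+        theano_rng, and writing the result into the (p_state, h_state)
+        shared variables through a compiled theano function.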
+ """ + + t1 = time.time() + + empty_input = self.h_space.get_origin_batch(self.dbm.batch_size) + h_state = sharedX(empty_input) + + default_z = T.zeros_like(h_state) + self.broadcasted_bias() + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + p_exp, h_exp, p_sample, h_sample = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols), + theano_rng = theano_rng) + + p_state = sharedX( self.output_space.get_origin_batch( + self.dbm.batch_size)) + + + t2 = time.time() + + f = function([], updates = [ + (p_state, p_sample), + (h_state, h_sample) + ]) + + t3 = time.time() + + f() + + t4 = time.time() + + logger.info('{0}.make_state took'.format(self, t4-t1)) + logger.info('\tcompose time: {0}'.format(t2-t1)) + logger.info('\tcompile time: {0}'.format(t3-t2)) + logger.info('\texecute time: {0}'.format(t4-t3)) + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = (downward_state * self.broadcasted_bias()).sum(axis=(1,2,3)) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=(1,2,3)) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval + + +class ConvC01B_MaxPool(HiddenLayer): + """ + .. todo:: + + WRITEME + """ + + def __init__(self, + output_channels, + kernel_shape, + pool_rows, + pool_cols, + layer_name, + center = False, + irange = None, + sparse_init = None, + scale_by_sharing = True, + init_bias = 0., + pad = 0, + partial_sum = 1): + self.__dict__.update(locals()) + del self.self + + assert (irange is None) != (sparse_init is None) + self.output_axes = ('c', 0, 1, 'b') + self.detector_channels = output_channels + self.tied_b = 1 + + def broadcasted_bias(self): + """ + .. todo:: + + WRITEME + """ + + if self.b.ndim != 1: + raise NotImplementedError() + + shuffle = [ 'x' ] * 4 + shuffle[self.output_axes.index('c')] = 0 + + return self.b.dimshuffle(*shuffle) + + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return CompositeSpace((self.h_space, self.output_space)) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + """ Note: this resets parameters!""" + + setup_detector_layer_c01b(layer=self, + input_space=space, rng=self.dbm.rng, + irange=self.irange) + + if not tuple(space.axes) == ('c', 0, 1, 'b'): + raise AssertionError("You're not using c01b inputs. Ian is enforcing c01b inputs while developing his pipeline to make sure it runs at maximal speed. If you really don't want to use c01b inputs, you can remove this check and things should work. 
If they don't work it's only because they're not tested.") + if self.dummy_channels != 0: + raise NotImplementedError(str(type(self))+" does not support adding dummy channels for cuda-convnet compatibility yet, you must implement that feature or use inputs with <=3 channels or a multiple of 4 channels") + + self.input_rows = self.input_space.shape[0] + self.input_cols = self.input_space.shape[1] + self.h_rows = self.detector_space.shape[0] + self.h_cols = self.detector_space.shape[1] + + if not(self.h_rows % self.pool_rows == 0): + raise ValueError(self.layer_name + ": h_rows = %d, pool_rows = %d. Should be divisible but remainder is %d" % + (self.h_rows, self.pool_rows, self.h_rows % self.pool_rows)) + assert self.h_cols % self.pool_cols == 0 + + self.h_space = Conv2DSpace(shape = (self.h_rows, self.h_cols), num_channels = self.output_channels, + axes = self.output_axes) + self.output_space = Conv2DSpace(shape = (self.h_rows / self.pool_rows, + self.h_cols / self.pool_cols), + num_channels = self.output_channels, + axes = self.output_axes) + + logger.info('{0} : detector shape: {1} ' + 'pool shape: {2}'.format(self.layer_name, + self.h_space.shape, + self.output_space.shape)) + + assert tuple(self.output_axes) == ('c', 0, 1, 'b') + self.max_pool = max_pool_c01b + + if self.center: + p_ofs, h_ofs = self.init_mf_state() + self.p_offset = sharedX(self.output_space.get_origin(), 'p_offset') + self.h_offset = sharedX(self.h_space.get_origin(), 'h_offset') + f = function([], updates={self.p_offset: p_ofs[:,:,:,0], self.h_offset: h_ofs[:,:,:,0]}) + f() + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.b.name is not None + W ,= self.transformer.get_params() + assert W.name is not None + + return [ W, self.b] + + def state_to_b01c(self, state): + """ + .. todo:: + + WRITEME + """ + + if tuple(self.output_axes) == ('b',0,1,'c'): + return state + return [ Conv2DSpace.convert(elem, self.output_axes, ('b', 0, 1, 'c')) + for elem in state ] + + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + if c == 0.: + continue + # Range over everything but the channel index + # theano can only take gradient through max if the max is over 1 axis or all axes + # so I manually unroll the max for the case I use here + assert self.h_space.axes == ('b', 'c', 0, 1) + assert self.output_space.axes == ('b', 'c', 0, 1) + mx = s.max(axis=3).max(axis=2).max(axis=0) + assert hasattr(mx.owner.op, 'grad') + mn = s.min(axis=3).max(axis=2).max(axis=0) + assert hasattr(mn.owner.op, 'grad') + assert mx.ndim == 1 + assert mn.ndim == 1 + r = mx - mn + rval += (1. - r).mean() * c + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME properly + + Parameters + ---------- + state : WRITEME + target : WRITEME + if pools contain more than one element, should be a list + with two elements. the first element is for the pooling + units and the second for the detector units. + coeff : WRITEME + eps : WRITEME + """ + rval = 0. 
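+        # Same penalty as ConvMaxPool.get_l1_act_cost: activations are
+        # averaged over all axes except the channel axis, and each group adds
+        # coeff * mean(max(|mean_activation - target| - eps, 0)) to the cost.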
+ + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(target, float) + assert isinstance(coeff, float) + _, state = state + state = [state] + target = [target] + coeff = [coeff] + if eps is None: + eps = 0. + eps = [eps] + else: + if eps is None: + eps = [0., 0.] + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + p_target, h_target = target + if h_target > p_target and (coeff[0] != 0. and coeff[1] != 0.): + # note that, within each group, E[p] is the sum of E[h] + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c, e in safe_zip(state, target, coeff, eps): + if c == 0.: + continue + # Average over everything but the channel index + m = s.mean(axis= [ ax for ax in range(4) if self.output_axes[ax] != 'c' ]) + assert m.ndim == 1 + rval += T.maximum(abs(m-t)-e,0.).mean()*c + + return rval + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + rval = OrderedDict() + + if self.scale_by_sharing: + # scale each learning rate by 1 / # times param is reused + h_rows, h_cols = self.h_space.shape + num_h = float(h_rows * h_cols) + rval[self.transformer._filters] = 1. /num_h + rval[self.b] = 1. / num_h + + return rval + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return p + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return h + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P, H = state + + axes = tuple([i for i, ax in enumerate(self.output_axes) if ax != 'c']) + p_max = P.max(axis=(0,1,2)) + p_min = P.min(axis=(0,1,2)) + p_mean = P.mean(axis=(0,1,2)) + + p_range = p_max - p_min + + rval = { + 'p_max_max' : p_max.max(), + 'p_max_mean' : p_max.mean(), + 'p_max_min' : p_max.min(), + 'p_min_max' : p_min.max(), + 'p_min_mean' : p_min.mean(), + 'p_min_max' : p_min.max(), + 'p_range_max' : p_range.max(), + 'p_range_mean' : p_range.mean(), + 'p_range_min' : p_range.min(), + 'p_mean_max' : p_mean.max(), + 'p_mean_mean' : p_mean.mean(), + 'p_mean_min' : p_mean.min() + } + + return rval + + def get_weight_decay(self, coeffs): + """ + .. todo:: + + WRITEME + """ + W , = self.transformer.get_params() + return coeffs * T.sqr(W).sum() + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if not hasattr(state_below, 'ndim'): + raise TypeError("state_below should be a TensorType, got " + + str(state_below) + " of type " + str(type(state_below))) + if state_below.ndim != 4: + raise ValueError("state_below should have ndim 4, has "+str(state_below.ndim)) + + if double_weights: + state_below = 2. 
* state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = self.max_pool(z, (self.pool_rows, self.pool_cols), msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError("Need to update for C01B") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + try: + self.output_space.validate(msg) + except TypeError, e: + reraise_as(TypeError(str(type(layer_above))+".downward_message gave something that was not the right type: "+str(e))) + else: + msg = None + + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + p, h, p_sample, h_sample = self.max_pool(z, + (self.pool_rows, self.pool_cols), msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + self.h_space.validate(downward_state) + return self.transformer.lmul_T(downward_state) + + def set_batch_size(self, batch_size): + """ + .. todo:: + + WRITEME + """ + self.transformer.set_batch_size(batch_size) + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + return self.transformer.get_weights_topo() + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + default_z = self.broadcasted_bias() + shape = { + 'b': self.dbm.batch_size, + 0: self.h_space.shape[0], + 1: self.h_space.shape[1], + 'c': self.h_space.num_channels + } + # work around theano bug with broadcasted stuff + default_z += T.alloc(*([0.]+[shape[elem] for elem in self.h_space.axes])).astype(default_z.dtype) + assert default_z.ndim == 4 + + p, h = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols)) + + return p, h + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + raise NotImplementedError("Need to update for C01B") + + t1 = time.time() + + empty_input = self.h_space.get_origin_batch(self.dbm.batch_size) + h_state = sharedX(empty_input) + + default_z = T.zeros_like(h_state) + self.broadcasted_bias() + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + p_exp, h_exp, p_sample, h_sample = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols), + theano_rng = theano_rng) + + p_state = sharedX( self.output_space.get_origin_batch( + self.dbm.batch_size)) + + + t2 = time.time() + + f = function([], updates = [ + (p_state, p_sample), + (h_state, h_sample) + ]) + + t3 = time.time() + + f() + + t4 = time.time() + + logger.info('{0}.make_state took {1}'.format(self, t4-t1)) + logger.info('\tcompose time: {0}'.format(t2-t1)) + logger.info('\tcompile time: {0}'.format(t3-t2)) + logger.info('\texecute time: {0}'.format(t4-t3)) + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. 
todo:: + + WRITEME + """ + + raise NotImplementedError("Need to update for C01B") + self.input_space.validate(state_below) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = (downward_state * self.broadcasted_bias()).sum(axis=(1,2,3)) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=(1,2,3)) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval + + +class BVMP_Gaussian(BinaryVectorMaxPool): + """ + Like BinaryVectorMaxPool, but must have GaussianVisLayer + as its input. Uses its beta to bias the hidden units appropriately. + See gaussian.lyx + + beta is *not* considered a parameter of this layer, it's just an + external factor influencing how this layer behaves. + Gradient can still flow to beta, but it will only be included in + the parameters list if some class other than this layer includes it. + + .. todo:: + + WRITEME : parameter list + """ + + def __init__(self, + input_layer, + detector_layer_dim, + pool_size, + layer_name, + irange = None, + sparse_init = None, + sparse_stdev = 1., + include_prob = 1.0, + init_bias = 0., + W_lr_scale = None, + b_lr_scale = None, + center = False, + mask_weights = None, + max_col_norm = None, + copies = 1): + warnings.warn("BVMP_Gaussian math is very faith-based, need to complete gaussian.lyx") + + args = locals() + + del args['input_layer'] + del args['self'] + super(BVMP_Gaussian, self).__init__(**args) + self.input_layer = input_layer + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if self.requires_reformat: + # This is not really an unimplemented case. + # We actually don't know how to format the weights + # in design space. We got the data in topo space + # and we don't have access to the dataset + raise NotImplementedError() + W ,= self.transformer.get_params() + W = W.get_value() + + x = raw_input("multiply by beta?") + if x == 'y': + beta = self.input_layer.beta.get_value() + return (W.T * beta).T + assert x == 'n' + return W + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError("beta would make get_weights for visualization not correspond to set_weights") + W, = self.transformer.get_params() + W.set_value(weights) + + def set_biases(self, biases, recenter = False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + if self.pool_size != 1: + raise NotImplementedError() + self.offset.set_value(sigmoid_numpy(self.b.get_value())) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() - self.beta_bias().eval() + + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. 
todo:: + + WRITEME + """ + raise NotImplementedError("need to account for beta") + if self.copies != 1: + raise NotImplementedError() + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + z = self.transformer.lmul(state_below) + self.b + p, h, p_sample, h_sample = max_pool_channels(z, + self.pool_size, msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + rval = self.transformer.lmul_T(downward_state) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval * self.copies + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.b.dtype) + \ + self.b.dimshuffle('x', 0) + self.beta_bias() + rval = max_pool_channels(z = z, + pool_size = self.pool_size) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + raise NotImplementedError("need to account for beta") + + if not hasattr(self, 'copies'): + self.copies = 1 + + if self.copies != 1: + raise NotImplementedError() + + + empty_input = self.h_space.get_origin_batch(num_examples) + empty_output = self.output_space.get_origin_batch(num_examples) + + h_state = sharedX(empty_input) + p_state = sharedX(empty_output) + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + default_z = T.zeros_like(h_state) + self.b + + p_exp, h_exp, p_sample, h_sample = max_pool_channels( + z = default_z, + pool_size = self.pool_size, + theano_rng = theano_rng) + + assert h_sample.dtype == default_z.dtype + + f = function([], updates = [ + (p_state , p_sample), + (h_state , h_sample) + ]) + + f() + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. 
todo:: + + WRITEME + """ + raise NotImplementedError("need to account for beta, and maybe some oether stuff") + + # Don't need to do anything special for centering, upward_state / downward state + # make it all just work + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = T.dot(downward_state, self.b) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval * self.copies + + def linear_feed_forward_approximation(self, state_below): + """ + .. todo:: + + WRITEME properly + + Used to implement TorontoSparsity. Unclear exactly what properties of it are + important or how to implement it for other layers. + + Properties it must have: + output is same kind of data structure (ie, tuple of theano 2-tensors) + as mf_update + + Properties it probably should have for other layer types: + An infinitesimal change in state_below or the parameters should cause the same sign of change + in the output of linear_feed_forward_approximation and in mf_update + + Should not have any non-linearities that cause the gradient to shrink + + Should disregard top-down feedback + """ + raise NotImplementedError("need to account for beta") + + z = self.transformer.lmul(state_below) + self.b + + if self.pool_size != 1: + # Should probably implement sum pooling for the non-pooled version, + # but in reality it's not totally clear what the right answer is + raise NotImplementedError() + + return z, z + + def beta_bias(self): + """ + .. todo:: + + WRITEME + """ + W, = self.transformer.get_params() + beta = self.input_layer.beta + assert beta.ndim == 1 + return - 0.5 * T.dot(beta, T.sqr(W)) + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if double_weights: + state_below = 2. 
* state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.b + self.beta_bias() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = max_pool_channels(z, self.pool_size, msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + +class CompositeLayer(HiddenLayer): + """ + A Layer constructing by aligning several other Layer + objects side by side + + Parameters + ---------- + components : WRITEME + A list of layers that are combined to form this layer + inputs_to_components : None or dict mapping int to list of int + Should be None unless the input space is a CompositeSpace + If inputs_to_components[i] contains j, it means input i will + be given as input to component j. + If an input dodes not appear in the dictionary, it will be given + to all components. + + This field allows one CompositeLayer to have another as input + without forcing each component to connect to all members + of the CompositeLayer below. For example, you might want to + have both densely connected and convolutional units in all + layers, but a convolutional unit is incapable of taking a + non-topological input space. + """ + + + def __init__(self, layer_name, components, inputs_to_components = None): + self.layer_name = layer_name + + self.components = list(components) + assert isinstance(components, list) + for component in components: + assert isinstance(component, HiddenLayer) + self.num_components = len(components) + self.components = list(components) + + if inputs_to_components is None: + self.inputs_to_components = None + else: + if not isinstance(inputs_to_components, dict): + raise TypeError("CompositeLayer expected inputs_to_components to be a dict, got "+str(type(inputs_to_components))) + self.inputs_to_components = OrderedDict() + for key in inputs_to_components: + assert isinstance(key, int) + assert key >= 0 + value = inputs_to_components[key] + assert isinstance(value, list) + assert all([isinstance(elem, int) for elem in value]) + assert min(value) >= 0 + assert max(value) < self.num_components + self.inputs_to_components[key] = list(value) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + self.input_space = space + + if not isinstance(space, CompositeSpace): + assert self.inputs_to_components is None + self.routing_needed = False + else: + if self.inputs_to_components is None: + self.routing_needed = False + else: + self.routing_needed = True + assert max(self.inputs_to_components) < space.num_components + # Invert the dictionary + self.components_to_inputs = OrderedDict() + for i in xrange(self.num_components): + inputs = [] + for j in xrange(space.num_components): + if i in self.inputs_to_components[j]: + inputs.append(i) + if len(inputs) < space.num_components: + self.components_to_inputs[i] = inputs + + for i, component in enumerate(self.components): + if self.routing_needed and i in self.components_to_inputs: + cur_space = space.restrict(self.components_to_inputs[i]) + else: + cur_space = space + + component.set_input_space(cur_space) + + self.output_space = CompositeSpace([ component.get_output_space() for component in self.components ]) + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + return tuple(component.make_state(num_examples, numpy_rng) for + component in self.components) + + def get_total_state_space(self): + """ + .. 
todo:: + + WRITEME + """ + return CompositeSpace([component.get_total_state_space() for component in self.components]) + + def set_batch_size(self, batch_size): + """ + .. todo:: + + WRITEME + """ + for component in self.components: + component.set_batch_size(batch_size) + + def set_dbm(self, dbm): + """ + .. todo:: + + WRITEME + """ + for component in self.components: + component.set_dbm(dbm) + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + rval = [] + + for i, component in enumerate(self.components): + if self.routing_needed and i in self.components_to_inputs: + cur_state_below =self.input_space.restrict_batch(state_below, self.components_to_inputs[i]) + else: + cur_state_below = state_below + + class RoutingLayer(object): + def __init__(self, idx, layer): + self.__dict__.update(locals()) + del self.self + self.layer_name = 'route_'+str(idx)+'_'+layer.layer_name + + def downward_message(self, state): + return self.layer.downward_message(state)[self.idx] + + if layer_above is not None: + cur_layer_above = RoutingLayer(i, layer_above) + else: + cur_layer_above = None + + mf_update = component.mf_update(state_below = cur_state_below, + state_above = state_above, + layer_above = cur_layer_above, + double_weights = double_weights, + iter_name = iter_name) + + rval.append(mf_update) + + return tuple(rval) + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + return tuple([component.init_mf_state() for component in self.components]) + + + def get_weight_decay(self, coeffs): + """ + .. todo:: + + WRITEME + """ + return sum([component.get_weight_decay(coeff) for component, coeff + in safe_zip(self.components, coeffs)]) + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return tuple([component.upward_state(elem) + for component, elem in + safe_zip(self.components, total_state)]) + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return tuple([component.downward_state(elem) + for component, elem in + safe_zip(self.components, total_state)]) + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + if isinstance(self.input_space, CompositeSpace): + num_input_components = self.input_space.num_components + else: + num_input_components = 1 + + rval = [ None ] * num_input_components + + def add(x, y): + if x is None: + return y + if y is None: + return x + return x + y + + for i, packed in enumerate(safe_zip(self.components, downward_state)): + component, state = packed + if self.routing_needed and i in self.components_to_inputs: + input_idx = self.components_to_inputs[i] + else: + input_idx = range(num_input_components) + + partial_message = component.downward_message(state) + + if len(input_idx) == 1: + partial_message = [ partial_message ] + + assert len(input_idx) == len(partial_message) + + for idx, msg in safe_zip(input_idx, partial_message): + rval[idx] = add(rval[idx], msg) + + if len(rval) == 1: + rval = rval[0] + else: + rval = tuple(rval) + + self.input_space.validate(rval) + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME + """ + return sum([ comp.get_l1_act_cost(s, t, c, e) \ + for comp, s, t, c, e in safe_zip(self.components, state, target, coeff, eps)]) + + def get_range_rewards(self, state, coeffs): + """ + .. 
todo:: + + WRITEME + """ + return sum([comp.get_range_rewards(s, c) + for comp, s, c in safe_zip(self.components, state, coeffs)]) + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + return reduce(lambda x, y: safe_union(x, y), + [component.get_params() for component in self.components]) + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + logger.info('Get topological weights for which layer?') + for i, component in enumerate(self.components): + logger.info('{0} {1}'.format(i, component.layer_name)) + x = raw_input() + return self.components[int(x)].get_weights_topo() + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + for layer, s in safe_zip(self.components, state): + d = layer.get_monitoring_channels_from_state(s) + for key in d: + rval[layer.layer_name+'_'+key] = d[key] + + return rval + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + rval = [] + + for i, component in enumerate(self.components): + if self.routing_needed and i in self.components_to_inputs: + cur_state_below =self.input_space.restrict_batch(state_below, self.components_to_inputs[i]) + else: + cur_state_below = state_below + + class RoutingLayer(object): + def __init__(self, idx, layer): + self.__dict__.update(locals()) + del self.self + self.layer_name = 'route_'+str(idx)+'_'+layer.layer_name + + def downward_message(self, state): + return self.layer.downward_message(state)[self.idx] + + if layer_above is not None: + cur_layer_above = RoutingLayer(i, layer_above) + else: + cur_layer_above = None + + sample = component.sample(state_below = cur_state_below, + state_above = state_above, + layer_above = cur_layer_above, + theano_rng = theano_rng) + + rval.append(sample) + + return tuple(rval) diff --git a/pylearn2/sandbox/dbm_v2/sampling_procedure.py b/pylearn2/sandbox/dbm_v2/sampling_procedure.py new file mode 100644 index 0000000000..134f94a37d --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/sampling_procedure.py @@ -0,0 +1,210 @@ +""" +.. todo:: + + WRITEME +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +from theano.compat import OrderedDict +from pylearn2.utils import py_integer_types + + +class SamplingProcedure(object): + """ + Procedure for sampling from a DBM. + """ + + def set_dbm(self, dbm): + """ + .. todo:: + + WRITEME + """ + self.dbm = dbm + + def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, + num_steps=1): + """ + Samples from self.dbm using `layer_to_state` as starting values. + + Parameters + ---------- + layer_to_state : dict + Maps the DBM's Layer instances to theano variables representing + batches of samples of them. + theano_rng : theano.sandbox.rng_mrg.MRG_RandomStreams + WRITEME + layer_to_clamp : dict, optional + Maps Layers to bools. If a layer is not in the dictionary, + defaults to False. True indicates that this layer should be + clamped, so we are sampling from a conditional distribution + rather than the joint distribution. + num_steps : int, optional + WRITEME + + Returns + ------- + layer_to_updated_state : dict + Maps the DBM's Layer instances to theano variables representing + batches of updated samples of them. 
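+
+        Notes
+        -----
+        A rough usage sketch (assuming ``dbm`` is an already-built DBM, a
+        batch size of 100, and ``MRG_RandomStreams`` imported from
+        ``theano.sandbox.rng_mrg``)::
+
+            procedure = GibbsEvenOdd()
+            procedure.set_dbm(dbm)
+            theano_rng = MRG_RandomStreams(2014)
+            layer_to_state = dbm.make_layer_to_state(100)
+            layer_to_updated = procedure.sample(
+                layer_to_state, theano_rng,
+                layer_to_clamp={dbm.visible_layer: True},
+                num_steps=5)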
+ """ + raise NotImplementedError(str(type(self))+" does not implement " + + "sample.") + + +class GibbsEvenOdd(SamplingProcedure): + """ + The specific sampling schedule used to sample all of the even-idexed + layers of model.hidden_layers, then the visible layer and all the + odd-indexed layers. + """ + + def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, + num_steps=1): + """ + .. todo:: + + WRITEME + """ + # Validate num_steps + assert isinstance(num_steps, py_integer_types) + assert num_steps > 0 + + # Implement the num_steps > 1 case by repeatedly calling the + # num_steps == 1 case + if num_steps != 1: + for i in xrange(num_steps): + layer_to_state = self.sample(layer_to_state, theano_rng, + layer_to_clamp, num_steps=1) + return layer_to_state + + # The rest of the function is the num_steps = 1 case + # Current code assumes this, though we could certainly relax this + # constraint + assert len(self.dbm.hidden_layers) > 0 + + # Validate layer_to_clamp / make sure layer_to_clamp is a fully + # populated dictionary + if layer_to_clamp is None: + layer_to_clamp = OrderedDict() + + for key in layer_to_clamp: + assert (key is self.dbm.visible_layer or + key in self.dbm.hidden_layers) + + for layer in [self.dbm.visible_layer] + self.dbm.hidden_layers: + if layer not in layer_to_clamp: + layer_to_clamp[layer] = False + + # Assemble the return value + layer_to_updated = OrderedDict() + + for i, this_layer in list(enumerate(self.dbm.hidden_layers))[::2]: + # Iteration i does the Gibbs step for hidden_layers[i] + + # Get the sampled state of the layer below so we can condition + # on it in our Gibbs update + if i == 0: + layer_below = self.dbm.visible_layer + else: + layer_below = self.dbm.hidden_layers[i-1] + state_below = layer_to_state[layer_below] + state_below = layer_below.upward_state(state_below) + + # Get the sampled state of the layer above so we can condition + # on it in our Gibbs step + if i + 1 < len(self.dbm.hidden_layers): + layer_above = self.dbm.hidden_layers[i + 1] + state_above = layer_to_state[layer_above] + state_above = layer_above.downward_state(state_above) + else: + state_above = None + layer_above = None + + if layer_to_clamp[this_layer]: + this_state = layer_to_state[this_layer] + this_sample = this_state + else: + # Compute the Gibbs sampling update + # Sample the state of this layer conditioned + # on its Markov blanket (the layer above and + # layer below) + this_sample = this_layer.sample(state_below=state_below, + state_above=state_above, + layer_above=layer_above, + theano_rng=theano_rng) + + layer_to_updated[this_layer] = this_sample + + #Sample the visible layer + vis_state = layer_to_state[self.dbm.visible_layer] + if layer_to_clamp[self.dbm.visible_layer]: + vis_sample = vis_state + else: + first_hid = self.dbm.hidden_layers[0] + state_above = layer_to_updated[first_hid] + state_above = first_hid.downward_state(state_above) + + vis_sample = self.dbm.visible_layer.sample(state_above=state_above, + layer_above=first_hid, + theano_rng=theano_rng) + layer_to_updated[self.dbm.visible_layer] = vis_sample + + # Sample the odd-numbered layers + for i, this_layer in list(enumerate(self.dbm.hidden_layers))[1::2]: + + # Get the sampled state of the layer below so we can condition + # on it in our Gibbs update + layer_below = self.dbm.hidden_layers[i-1] + + # We want to sample from each conditional distribution + # ***sequentially*** so we must use the updated version + # of the state for the layers whose updates we have + # calculcated already, in 
layer_to_updated. + # If we used the original value from + # layer_to_state + # then we would sample from each conditional + # ***simultaneously*** which does not implement MCMC + # sampling. + state_below = layer_to_updated[layer_below] + + state_below = layer_below.upward_state(state_below) + + # Get the sampled state of the layer above so we can condition + # on it in our Gibbs step + if i + 1 < len(self.dbm.hidden_layers): + layer_above = self.dbm.hidden_layers[i + 1] + state_above = layer_to_updated[layer_above] + state_above = layer_above.downward_state(state_above) + else: + state_above = None + layer_above = None + + if layer_to_clamp[this_layer]: + this_state = layer_to_state[this_layer] + this_sample = this_state + else: + # Compute the Gibbs sampling update + # Sample the state of this layer conditioned + # on its Markov blanket (the layer above and + # layer below) + this_sample = this_layer.sample(state_below=state_below, + state_above=state_above, + layer_above=layer_above, + theano_rng=theano_rng) + + layer_to_updated[this_layer] = this_sample + + # Check that all layers were updated + assert all([layer in layer_to_updated for layer in layer_to_state]) + # Check that we didn't accidentally treat any other object as a layer + assert all([layer in layer_to_state for layer in layer_to_updated]) + # Check that clamping worked + assert all([(layer_to_state[layer] is layer_to_updated[layer]) == + layer_to_clamp[layer] for layer in layer_to_state]) + + return layer_to_updated diff --git a/pylearn2/sandbox/dbm_v2/test_dbm.py b/pylearn2/sandbox/dbm_v2/test_dbm.py new file mode 100644 index 0000000000..d5f5abe646 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/test_dbm.py @@ -0,0 +1,1214 @@ +from pylearn2.sandbox.dbm_v2.dbm import DBM +from pylearn2.sandbox.dbm_v2.dbm import RBM +from pylearn2.sandbox.dbm_v2.layer import BinaryVector, BinaryVectorMaxPool, Softmax, GaussianVisLayer + +__authors__ = ["Ian Goodfellow", "Devon Hjelm"] +__copyright__ = "Copyright 2012, Universite de Montreal" +__credits__ = ["Ian Goodfellow", "Devon Hjelm"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import numpy as np +import random +assert hasattr(np, 'exp') + +from theano import config +from theano import function +from theano import printing +from theano import tensor as T +from theano.sandbox.rng_mrg import MRG_RandomStreams + +from pylearn2.expr.basic import is_binary +from pylearn2.expr.nnet import inverse_sigmoid_numpy +from pylearn2.sandbox.dbm_v2.dbm_cost import VariationalCD +from pylearn2.sandbox.dbm_v2.dbm_cost import BaseCD +import pylearn2.testing.datasets as datasets +from pylearn2.space import VectorSpace +from pylearn2.utils import sharedX +from pylearn2.utils import safe_zip +from pylearn2.utils.data_specs import DataSpecsMapping + + +class DummyLayer(object): + """ + A layer that we build for the test that just uses a state + as its downward message. + """ + + def downward_state(self, state): + return state + + def downward_message(self, state): + return state + + +class DummyDBM(object): + """ + A dummy DBM for some of the tests below. + """ + def __init__(self, rng): + self.rng = rng + + +class TestBinaryVector: + """ + Testing class for DBM BinaryVector. 
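+
+    The tests below set the layer biases to inverse_sigmoid(mean) and then
+    use check_samples to verify that make_state and sample produce binary
+    matrices whose empirical means converge to ``mean`` within tolerance.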
+ """ + def setUp(self): + pass + @staticmethod + def check_samples(value, expected_shape, expected_mean, tol): + """ + Tests that a matrix of binary samples (observations in rows, variables + in columns) + 1) Has the right shape + 2) Is binary + 3) Converges to the right mean + """ + assert value.shape == expected_shape + assert is_binary(value) + mean = value.mean(axis=0) + max_error = np.abs(mean-expected_mean).max() + print 'Actual mean:' + print mean + print 'Expected mean:' + print expected_mean + print 'Maximal error:', max_error + if max_error > tol: + raise ValueError("Samples don't seem to have the right mean.") + + def test_make_state(self): + # Verifies that BinaryVector.make_state creates + # a shared variable whose value passes check_samples + + n = 5 + num_samples = 1000 + tol = .04 + + layer = BinaryVector(nvis = n) + + rng = np.random.RandomState([2012,11,1]) + + mean = rng.uniform(1e-6, 1. - 1e-6, (n,)) + + z = inverse_sigmoid_numpy(mean) + + layer.set_biases(z.astype(config.floatX)) + + init_state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + + value = init_state.get_value() + + TestBinaryVector.check_samples(value, (num_samples, n), mean, tol) + + def test_sample(self): + # Verifies that BinaryVector.sample returns an expression + # whose value passes check_samples + + assert hasattr(np, 'exp') + + n = 5 + num_samples = 1000 + tol = .04 + + vis = BinaryVector(nvis=n) + hid = DummyLayer() + + rng = np.random.RandomState([2012,11,1,259]) + + mean = rng.uniform(1e-6, 1. - 1e-6, (n,)) + + ofs = rng.randn(n) + + vis.set_biases(ofs.astype(config.floatX)) + + z = inverse_sigmoid_numpy(mean) - ofs + + z_var = sharedX(np.zeros((num_samples, n)) + z) + + theano_rng = MRG_RandomStreams(2012+11+1) + + sample = vis.sample(state_above=z_var, layer_above=hid, + theano_rng=theano_rng) + + sample = sample.eval() + + TestBinaryVector.check_samples(sample, (num_samples, n), mean, tol) + + +class TestGaussianVisLayer: + + def setUp(self): + pass + + @staticmethod + def check_samples(value, nsamples, nvis, rows, cols, channels, expected_mean, tol): + """ + Tests that a matrix of Gaussian samples (observations in rows, variables + in columns) + 1) Has the right shape + 2) Is not binary + 3) Converges to the right mean + + """ + if nvis: + expected_shape = (nsamples, nvis) + else: + expected_shape = (nsamples,rows,cols,channels) + assert value.shape == expected_shape + assert not is_binary(value) + mean = value.mean(axis=0) + max_error = np.abs(mean-expected_mean).max() + print 'Actual mean:' + print mean + print 'Expected mean:' + print expected_mean + print 'Maximal error:', max_error + print 'Tolerable variance:', tol + if max_error > tol: + raise ValueError("Samples don't seem to have the right mean.") + else: + print 'Mean is within expected range' + + def test_make_state(self, n=5, rows=None, cols=None, channels=None, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.make_state. + Verified that GaussianVisLayer creates a shared variable whose value passes check_samples. + In this case the layer lives in a VectorSpace. + + """ + beta = 1/tol # precision parameter + assert (n is None and (rows is not None and cols is not None and channels is not None)) or\ + (n is not None and (rows == cols == channels == None)),\ + "n must be None or rows, cols, and channels must be None" + + rng = np.random.RandomState([2012,11,1]) + if n is not None: + layer = GaussianVisLayer(nvis = n, init_beta=beta) + mean = rng.uniform(1e-6, 1. 
- 1e-6, (n,)) + else: + # axes for batch, rows, cols, channels, can be given in any order + axes = ['b', 0, 1, 'c'] + random.shuffle(axes) + axes = tuple(axes) + layer = GaussianVisLayer(rows=rows, cols=cols, channels=channels, + init_beta=beta, axes=axes) + mean = rng.uniform(1e-6, 1. - 1e-6, (rows, cols, channels)) + + z = mean + layer.set_biases(z.astype(config.floatX)) + init_state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + value = init_state.get_value() + TestGaussianVisLayer.check_samples(value, num_samples, n, rows, cols, channels, mean, tol) + + def test_make_state_conv(self, n=None, rows=3, cols=3, channels=3, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.make_state. + Verifies that GaussianVisLayer.make_state creates a shared variable + whose value passes check_samples. In this case the layer lives in a Conv2DSpace. + + Parameters: + ---------- + n: detector layer dimension. + num_samples: number of samples or observations over each dimension. + tol: tolerace in comparisons + rows: number of rows in convolutional detector. Must be None if n is not None + cols: number of cols in convolutional detector. Must be None if n is not None + channels: number of channels in convolutional detector. Must be None if n is not None + """ + self.test_make_state(n, rows, cols, channels, num_samples, tol) + + def test_sample(self, n=5, rows=None, cols=None, channels=None, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.sample returns an expression whose value passes check_samples. + In this case the layer lives in a VectorSpace. + + Parameters: + ----------- + n: detector layer dimension. + num_samples: number of samples or observations over each dimension. + tol: tolerace in comparisons + rows: number of rows in convolutional detector. Must be None if n is not None + cols: number of cols in convolutional detector. Must be None if n is not None + channels: number of channels in convolutional detector. Must be None if n is not None + """ + assert hasattr(np, 'exp') + + beta = 1/tol # precision parameter + assert (n is None and (rows is not None and cols is not None and channels is not None)) or\ + (n is not None and (rows == cols == channels == None)),\ + "n must be None or rows, cols, and channels must be None" + + rng = np.random.RandomState([2012,11,1,259]) + if n is not None: + vis = GaussianVisLayer(nvis=n, init_beta=beta) + mean = rng.uniform(1e-6, 1. - 1e-6, (n,)) + ofs = rng.randn(n) + else: + # axes for batch, rows, cols, channels, can be given in any order + axes = ['b', 0, 1, 'c'] + random.shuffle(axes) + axes = tuple(axes) + vis = GaussianVisLayer(nvis=None,rows=rows, cols=cols, + channels=channels, init_beta=beta, axes=axes) + mean = rng.uniform(1e-6, 1. - 1e-6, (rows, cols, channels)) + ofs = rng.randn(rows,cols,channels) + + hid = DummyLayer() + vis.set_biases(ofs.astype(config.floatX)) + z=mean -ofs # linear activation function + + if n is not None: + z_var = sharedX(np.zeros((num_samples, n)) + z) + else: + z_var = sharedX(np.zeros((num_samples, rows, cols, channels)) + z) + + theano_rng = MRG_RandomStreams(2012+11+1) + sample = vis.sample(state_above=z_var, layer_above=hid, + theano_rng=theano_rng) + sample = sample.eval() + TestGaussianVisLayer.check_samples(sample, num_samples, n, rows, cols, channels, mean, tol) + + def test_sample_conv(self, n=None, rows=3, cols=3, channels=3, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.sample returns an expression whose value passes check_samples. 
+ In this case the layer lives in a Conv2DSpace. + + Parameters: + ----------- + n: detector layer dimension. Set to None for convolutional. + num_samples: number of samples or observations over each dimension. + tol: tolerace in comparisons + rows: number of rows in convolutional detector. Must be None if n is not None + cols: number of cols in convolutional detector. Must be None if n is not None + channels: number of channels in convolutional detector. Must be None if n is not None + """ + self.test_sample(n, rows, cols, channels, num_samples, tol) + + +def check_bvmp_samples(value, num_samples, n, pool_size, mean, tol): + """ + bvmp=BinaryVectorMaxPool + value: a tuple giving (pooled batch, detector batch) (all made with same params) + num_samples: number of samples there should be in the batch + n: detector layer dimension + pool_size: size of each pool region + mean: (expected value of pool unit, expected value of detector units) + tol: amount the emprical mean is allowed to deviate from the analytical expectation + + checks that: + 1) all values are binary + 2) detector layer units are mutually exclusive + 3) pooled unit is max of the detector units + 4) correct number of samples is present + 5) variables are of the right shapes + 6) samples converge to the right expected value + """ + + pv, hv = value + + assert n % pool_size == 0 + num_pools = n // pool_size + + assert pv.ndim == 2 + assert pv.shape[0] == num_samples + assert pv.shape[1] == num_pools + + assert hv.ndim == 2 + assert hv.shape[0] == num_samples + assert hv.shape[1] == n + + assert is_binary(pv) + assert is_binary(hv) + + for i in xrange(num_pools): + sub_p = pv[:,i] + assert sub_p.shape == (num_samples,) + sub_h = hv[:,i*pool_size:(i+1)*pool_size] + assert sub_h.shape == (num_samples, pool_size) + if not np.all(sub_p == sub_h.max(axis=1)): + for j in xrange(num_samples): + print sub_p[j], sub_h[j,:] + assert sub_p[j] == sub_h[j,:] + assert False + assert np.max(sub_h.sum(axis=1)) == 1 + + p, h = mean + assert p.ndim == 1 + assert h.ndim == 1 + emp_p = pv.mean(axis=0) + emp_h = hv.mean(axis=0) + + max_diff = np.abs(p - emp_p).max() + if max_diff > tol: + print 'expected value of pooling units: ',p + print 'empirical expectation: ',emp_p + print 'maximum difference: ',max_diff + raise ValueError("Pooling unit samples have an unlikely mean.") + max_diff = np.abs(h - emp_h).max() + if max_diff > tol: + assert False + +def test_bvmp_make_state(): + + # Verifies that BinaryVector.make_state creates + # a shared variable whose value passes check_binary_samples + + num_pools = 3 + num_samples = 1000 + tol = .04 + rng = np.random.RandomState([2012,11,1,9]) + # pool_size=1 is an important corner case + for pool_size in [1, 2, 5]: + n = num_pools * pool_size + + layer = BinaryVectorMaxPool( + detector_layer_dim=n, + layer_name='h', + irange=1., + pool_size=pool_size) + + # This is just to placate mf_update below + input_space = VectorSpace(1) + class DummyDBM(object): + def __init__(self): + self.rng = rng + layer.set_dbm(DummyDBM()) + layer.set_input_space(input_space) + + layer.set_biases(rng.uniform(-pool_size, 1., (n,)).astype(config.floatX)) + + # To find the mean of the samples, we use mean field with an input of 0 + mean = layer.mf_update( + state_below=T.alloc(0., 1, 1), + state_above=None, + layer_above=None) + + mean = function([], mean)() + + mean = [ mn[0,:] for mn in mean ] + + state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + + value = [elem.get_value() for elem in state] + + 
check_bvmp_samples(value, num_samples, n, pool_size, mean, tol) + + +def make_random_basic_binary_dbm( + rng, + pool_size_1, + num_vis = None, + num_pool_1 = None, + num_pool_2 = None, + pool_size_2 = None, + center = False + ): + """ + Makes a DBM with BinaryVector for the visible layer, + and two hidden layers of type BinaryVectorMaxPool. + The weights and biases are initialized randomly with + somewhat large values (i.e., not what you'd want to + use for learning) + + rng: A numpy RandomState. + pool_size_1: The size of the pools to use in the first + layer. + """ + + if num_vis is None: + num_vis = rng.randint(1,11) + if num_pool_1 is None: + num_pool_1 = rng.randint(1,11) + if num_pool_2 is None: + num_pool_2 = rng.randint(1,11) + if pool_size_2 is None: + pool_size_2 = rng.randint(1,6) + + num_h1 = num_pool_1 * pool_size_1 + num_h2 = num_pool_2 * pool_size_2 + + v = BinaryVector(num_vis, center=center) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX), recenter=center) + + h1 = BinaryVectorMaxPool( + detector_layer_dim = num_h1, + pool_size = pool_size_1, + layer_name = 'h1', + center = center, + irange = 1.) + h1.set_biases(rng.uniform(-1., 1., (num_h1,)).astype(config.floatX), recenter=center) + + h2 = BinaryVectorMaxPool( + center = center, + detector_layer_dim = num_h2, + pool_size = pool_size_2, + layer_name = 'h2', + irange = 1.) + h2.set_biases(rng.uniform(-1., 1., (num_h2,)).astype(config.floatX), recenter=center) + + dbm = DBM(visible_layer = v, + hidden_layers = [h1, h2], + batch_size = 1, + niter = 50) + + return dbm + + +def test_bvmp_mf_energy_consistent(): + + # A test of the BinaryVectorMaxPool class + # Verifies that the mean field update is consistent with + # the energy function + + # Specifically, in a DBM consisting of (v, h1, h2), the + # lack of intra-layer connections means that + # P(h1|v, h2) is factorial so mf_update tells us the true + # conditional. 
+ # We also know P(h1[i] | h1[-i], v) + # = P(h, v) / P(h[-i], v) + # = P(h, v) / sum_h[i] P(h, v) + # = exp(-E(h, v)) / sum_h[i] exp(-E(h, v)) + # So we can check that computing P(h[i] | v) with both + # methods works the same way + + rng = np.random.RandomState([2012,11,1,613]) + + def do_test(pool_size_1): + + # Make DBM and read out its pieces + dbm = make_random_basic_binary_dbm( + rng = rng, + pool_size_1 = pool_size_1, + ) + + v = dbm.visible_layer + h1, h2 = dbm.hidden_layers + + num_p = h1.get_output_space().dim + + # Choose which unit we will test + p_idx = rng.randint(num_p) + + # Randomly pick a v, h1[-p_idx], and h2 to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + h1_state = layer_to_state[h1] + h2_state = layer_to_state[h2] + + # Debugging checks + num_h = h1.detector_layer_dim + assert num_p * pool_size_1 == num_h + pv, hv = h1_state + assert pv.get_value().shape == (1, num_p) + assert hv.get_value().shape == (1, num_h) + + # Infer P(h1[i] | h2, v) using mean field + expected_p, expected_h = h1.mf_update( + state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2) + + expected_p = expected_p[0, p_idx] + expected_h = expected_h[0, p_idx * pool_size : (p_idx + 1) * pool_size] + + expected_p, expected_h = function([], [expected_p, expected_h])() + + # Infer P(h1[i] | h2, v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [h1_state, h2_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + p_state, h_state = h1_state + + def compute_unnormalized_prob(which_detector): + write_h = np.zeros((pool_size_1,)) + if which_detector is None: + write_p = 0. + else: + write_p = 1. + write_h[which_detector] = 1. + + h_value = h_state.get_value() + p_value = p_state.get_value() + + h_value[0, p_idx * pool_size : (p_idx + 1) * pool_size] = write_h + p_value[0, p_idx] = write_p + + h_state.set_value(h_value) + p_state.set_value(p_value) + + return unnormalized_prob() + + off_prob = compute_unnormalized_prob(None) + on_probs = [compute_unnormalized_prob(idx) for idx in xrange(pool_size)] + denom = off_prob + sum(on_probs) + off_prob /= denom + on_probs = [on_prob / denom for on_prob in on_probs] + assert np.allclose(1., off_prob + sum(on_probs)) + + # np.asarray(on_probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((pool_size_1,)) + for i in xrange(pool_size_1): + wtf_numpy[i] = on_probs[i] + on_probs = wtf_numpy + + # Check that they match + if not np.allclose(expected_p, 1. - off_prob): + print 'mean field expectation of p:',expected_p + print 'expectation of p based on enumerating energy function values:',1. - off_prob + print 'pool_size_1:',pool_size_1 + + assert False + if not np.allclose(expected_h, on_probs): + print 'mean field expectation of h:',expected_h + print 'expectation of h based on enumerating energy function values:',on_probs + assert False + + # 1 is an important corner case + # We must also run with a larger number to test the general case + for pool_size in [1, 2, 5]: + do_test(pool_size) + + +def test_bvmp_mf_energy_consistent_center(): + """ + A test of the BinaryVectorMaxPool class + Verifies that the mean field update is consistent with + the energy function when using Gregoire Montavon's centering + trick. 
+ + Specifically, in a DBM consisting of (v, h1, h2), the + lack of intra-layer connections means that + P(h1|v, h2) is factorial so mf_update tells us the true + conditional. + We also know P(h1[i] | h1[-i], v) + = P(h, v) / P(h[-i], v) + = P(h, v) / sum_h[i] P(h, v) + = exp(-E(h, v)) / sum_h[i] exp(-E(h, v)) + So we can check that computing P(h[i] | v) with both + methods works the same way + + :return: + """ + rng = np.random.RandomState([2012,11,1,613]) + + def do_test(pool_size_1): + + # Make DBM and read out its pieces + dbm = make_random_basic_binary_dbm( + rng = rng, + pool_size_1 = pool_size_1, + pool_size_2 = 1, # centering is only updated for pool size 1 + center = True + ) + + v = dbm.visible_layer + h1, h2 = dbm.hidden_layers + + num_p = h1.get_output_space().dim + + # Choose which unit we will test + p_idx = rng.randint(num_p) + + # Randomly pick a v, h1[-p_idx], and h2 to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + h1_state = layer_to_state[h1] + h2_state = layer_to_state[h2] + + # Debugging checks + num_h = h1.detector_layer_dim + assert num_p * pool_size_1 == num_h + pv, hv = h1_state + assert pv.get_value().shape == (1, num_p) + assert hv.get_value().shape == (1, num_h) + + # Infer P(h1[i] | h2, v) using mean field + expected_p, expected_h = h1.mf_update( + state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2) + + expected_p = expected_p[0, p_idx] + expected_h = expected_h[0, p_idx * pool_size_1 : (p_idx + 1) * pool_size_1] + + expected_p, expected_h = function([], [expected_p, expected_h])() + + # Infer P(h1[i] | h2, v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [h1_state, h2_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + p_state, h_state = h1_state + + def compute_unnormalized_prob(which_detector): + write_h = np.zeros((pool_size_1,)) + if which_detector is None: + write_p = 0. + else: + write_p = 1. + write_h[which_detector] = 1. + + h_value = h_state.get_value() + p_value = p_state.get_value() + + h_value[0, p_idx * pool_size_1 : (p_idx + 1) * pool_size_1] = write_h + p_value[0, p_idx] = write_p + + h_state.set_value(h_value) + p_state.set_value(p_value) + + return unnormalized_prob() + + off_prob = compute_unnormalized_prob(None) + on_probs = [compute_unnormalized_prob(idx) for idx in xrange(pool_size_1)] + denom = off_prob + sum(on_probs) + off_prob /= denom + on_probs = [on_prob / denom for on_prob in on_probs] + assert np.allclose(1., off_prob + sum(on_probs)) + + # np.asarray(on_probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((pool_size_1,)) + for i in xrange(pool_size_1): + wtf_numpy[i] = on_probs[i] + on_probs = wtf_numpy + + # Check that they match + if not np.allclose(expected_p, 1. - off_prob): + print 'mean field expectation of p:',expected_p + print 'expectation of p based on enumerating energy function values:',1. 
- off_prob + print 'pool_size_1:',pool_size_1 + + assert False + if not np.allclose(expected_h, on_probs): + print 'mean field expectation of h:',expected_h + print 'expectation of h based on enumerating energy function values:',on_probs + assert False + + # 1 is the only pool size for which centering is implemented + do_test(1) + +def test_bvmp_mf_sample_consistent(): + + # A test of the BinaryVectorMaxPool class + # Verifies that the mean field update is consistent with + # the sampling function + + # Specifically, in a DBM consisting of (v, h1, h2), the + # lack of intra-layer connections means that + # P(h1|v, h2) is factorial so mf_update tells us the true + # conditional. + # We can thus use mf_update to compute the expected value + # of a sample of h1 from v and h2, and check that samples + # drawn using the layer's sample method convert to that + # value. + + rng = np.random.RandomState([2012,11,1,1016]) + theano_rng = MRG_RandomStreams(2012+11+1+1036) + num_samples = 1000 + tol = .042 + + def do_test(pool_size_1): + + # Make DBM and read out its pieces + dbm = make_random_basic_binary_dbm( + rng = rng, + pool_size_1 = pool_size_1, + ) + + v = dbm.visible_layer + h1, h2 = dbm.hidden_layers + + num_p = h1.get_output_space().dim + + # Choose which unit we will test + p_idx = rng.randint(num_p) + + # Randomly pick a v, h1[-p_idx], and h2 to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + h1_state = layer_to_state[h1] + h2_state = layer_to_state[h2] + + # Debugging checks + num_h = h1.detector_layer_dim + assert num_p * pool_size_1 == num_h + pv, hv = h1_state + assert pv.get_value().shape == (1, num_p) + assert hv.get_value().shape == (1, num_h) + + # Infer P(h1[i] | h2, v) using mean field + expected_p, expected_h = h1.mf_update( + state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2) + + expected_p = expected_p[0, :] + expected_h = expected_h[0, :] + + expected_p, expected_h = function([], [expected_p, expected_h])() + + # copy all the states out into a batch size of num_samples + cause_copy = sharedX(np.zeros((num_samples,))).dimshuffle(0,'x') + v_state = v_state[0,:] + cause_copy + p, h = h1_state + h1_state = (p[0,:] + cause_copy, h[0,:] + cause_copy) + p, h = h2_state + h2_state = (p[0,:] + cause_copy, h[0,:] + cause_copy) + + h1_samples = h1.sample(state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2, theano_rng = theano_rng) + + h1_samples = function([], h1_samples)() + + + check_bvmp_samples(h1_samples, num_samples, num_h, pool_size, (expected_p, expected_h), tol) + + + # 1 is an important corner case + # We must also run with a larger number to test the general case + for pool_size in [1, 2, 5]: + do_test(pool_size) + +def check_multinomial_samples(value, expected_shape, expected_mean, tol): + """ + Tests that a matrix of multinomial samples (observations in rows, variables + in columns) + 1) Has the right shape + 2) Is binary + 3) Has one 1 per row + 4) Converges to the right mean + """ + assert value.shape == expected_shape + assert is_binary(value) + assert np.all(value.sum(axis=1) == 1) + mean = value.mean(axis=0) + max_error = np.abs(mean-expected_mean).max() + if max_error > tol: + print 'Actual mean:' + print mean + print 'Expected mean:' + print expected_mean + print 'Maximal error:', max_error + raise ValueError("Samples don't seem to have the right mean.") + +def 
test_softmax_make_state(): + + # Verifies that BinaryVector.make_state creates + # a shared variable whose value passes check_multinomial_samples + + n = 5 + num_samples = 1000 + tol = .04 + + layer = Softmax(n_classes = n, layer_name = 'y') + + rng = np.random.RandomState([2012, 11, 1, 11]) + + z = 3 * rng.randn(n) + + mean = np.exp(z) + mean /= mean.sum() + + layer.set_biases(z.astype(config.floatX)) + + state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + + value = state.get_value() + + check_multinomial_samples(value, (num_samples, n), mean, tol) + +def test_softmax_mf_energy_consistent(): + + # A test of the Softmax class + # Verifies that the mean field update is consistent with + # the energy function + + # Since a Softmax layer contains only one random variable + # (with n_classes possible values) the mean field assumption + # does not impose any restriction so mf_update simply gives + # the true expected value of h given v. + # We also know P(h | v) + # = P(h, v) / P( v) + # = P(h, v) / sum_h P(h, v) + # = exp(-E(h, v)) / sum_h exp(-E(h, v)) + # So we can check that computing P(h | v) with both + # methods works the same way + + rng = np.random.RandomState([2012,11,1,1131]) + + # Make DBM + num_vis = rng.randint(1,11) + n_classes = rng.randint(1, 11) + + v = BinaryVector(num_vis) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX)) + + y = Softmax( + n_classes = n_classes, + layer_name = 'y', + irange = 1.) + y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX)) + + dbm = DBM(visible_layer = v, + hidden_layers = [y], + batch_size = 1, + niter = 50) + + # Randomly pick a v to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + y_state = layer_to_state[y] + + # Infer P(y | v) using mean field + expected_y = y.mf_update( + state_below = v.upward_state(v_state)) + + expected_y = expected_y[0, :] + + expected_y = expected_y.eval() + + # Infer P(y | v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [y_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + def compute_unnormalized_prob(which): + write_y = np.zeros((n_classes,)) + write_y[which] = 1. + + y_value = y_state.get_value() + + y_value[0, :] = write_y + + y_state.set_value(y_value) + + return unnormalized_prob() + + probs = [compute_unnormalized_prob(idx) for idx in xrange(n_classes)] + denom = sum(probs) + probs = [on_prob / denom for on_prob in probs] + + # np.asarray(probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((n_classes,)) + for i in xrange(n_classes): + wtf_numpy[i] = probs[i] + probs = wtf_numpy + + if not np.allclose(expected_y, probs): + print 'mean field expectation of h:',expected_y + print 'expectation of h based on enumerating energy function values:',probs + assert False + +def test_softmax_mf_energy_consistent_centering(): + + # A test of the Softmax class + # Verifies that the mean field update is consistent with + # the energy function when using the centering trick + + # Since a Softmax layer contains only one random variable + # (with n_classes possible values) the mean field assumption + # does not impose any restriction so mf_update simply gives + # the true expected value of h given v. 
+ # We also know P(h | v) + # = P(h, v) / P( v) + # = P(h, v) / sum_h P(h, v) + # = exp(-E(h, v)) / sum_h exp(-E(h, v)) + # So we can check that computing P(h | v) with both + # methods works the same way + + rng = np.random.RandomState([2012,11,1,1131]) + + # Make DBM + num_vis = rng.randint(1,11) + n_classes = rng.randint(1, 11) + + v = BinaryVector(num_vis, center=True) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX), recenter=True) + + y = Softmax( + n_classes = n_classes, + layer_name = 'y', + irange = 1., center=True) + y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX), recenter=True) + + dbm = DBM(visible_layer = v, + hidden_layers = [y], + batch_size = 1, + niter = 50) + + # Randomly pick a v to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + y_state = layer_to_state[y] + + # Infer P(y | v) using mean field + expected_y = y.mf_update( + state_below = v.upward_state(v_state)) + + expected_y = expected_y[0, :] + + expected_y = expected_y.eval() + + # Infer P(y | v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [y_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + def compute_unnormalized_prob(which): + write_y = np.zeros((n_classes,)) + write_y[which] = 1. + + y_value = y_state.get_value() + + y_value[0, :] = write_y + + y_state.set_value(y_value) + + return unnormalized_prob() + + probs = [compute_unnormalized_prob(idx) for idx in xrange(n_classes)] + denom = sum(probs) + probs = [on_prob / denom for on_prob in probs] + + # np.asarray(probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((n_classes,)) + for i in xrange(n_classes): + wtf_numpy[i] = probs[i] + probs = wtf_numpy + + if not np.allclose(expected_y, probs): + print 'mean field expectation of h:',expected_y + print 'expectation of h based on enumerating energy function values:',probs + assert False + +def test_softmax_mf_sample_consistent(): + + # A test of the Softmax class + # Verifies that the mean field update is consistent with + # the sampling function + + # Since a Softmax layer contains only one random variable + # (with n_classes possible values) the mean field assumption + # does not impose any restriction so mf_update simply gives + # the true expected value of h given v. + # We can thus use mf_update to compute the expected value + # of a sample of y conditioned on v, and check that samples + # drawn using the layer's sample method convert to that + # value. + + rng = np.random.RandomState([2012,11,1,1154]) + theano_rng = MRG_RandomStreams(2012+11+1+1154) + num_samples = 1000 + tol = .042 + + # Make DBM + num_vis = rng.randint(1,11) + n_classes = rng.randint(1, 11) + + v = BinaryVector(num_vis) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX)) + + y = Softmax( + n_classes = n_classes, + layer_name = 'y', + irange = 1.) 
+ y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX)) + + dbm = DBM(visible_layer = v, + hidden_layers = [y], + batch_size = 1, + niter = 50) + + # Randomly pick a v to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + y_state = layer_to_state[y] + + # Infer P(y | v) using mean field + expected_y = y.mf_update( + state_below = v.upward_state(v_state)) + + expected_y = expected_y[0, :] + + expected_y = expected_y.eval() + + # copy all the states out into a batch size of num_samples + cause_copy = sharedX(np.zeros((num_samples,))).dimshuffle(0,'x') + v_state = v_state[0,:] + cause_copy + y_state = y_state[0,:] + cause_copy + + y_samples = y.sample(state_below = v.upward_state(v_state), theano_rng=theano_rng) + + y_samples = function([], y_samples)() + + check_multinomial_samples(y_samples, (num_samples, n_classes), expected_y, tol) + + +def test_make_symbolic_state(): + # Tests whether the returned p_sample and h_sample have the right + # dimensions + num_examples = 40 + theano_rng = MRG_RandomStreams(2012+11+1) + + visible_layer = BinaryVector(nvis=100) + rval = visible_layer.make_symbolic_state(num_examples=num_examples, + theano_rng=theano_rng) + + hidden_layer = BinaryVectorMaxPool(detector_layer_dim=500, + pool_size=1, + layer_name='h', + irange=0.05, + init_bias=-2.0) + p_sample, h_sample = hidden_layer.make_symbolic_state(num_examples=num_examples, + theano_rng=theano_rng) + + softmax_layer = Softmax(n_classes=10, layer_name='s', irange=0.05) + h_sample_s = softmax_layer.make_symbolic_state(num_examples=num_examples, + theano_rng=theano_rng) + + required_shapes = [(40, 100), (40, 500), (40, 500), (40, 10)] + f = function(inputs=[], + outputs=[rval, p_sample, h_sample, h_sample_s]) + + for s, r in zip(f(), required_shapes): + assert s.shape == r + + + +def check_gradients(expected_grad, actual_grad, corr_tol=0.8, mean_tol=0.05): + corr = np.corrcoef(expected_grad.flatten(), actual_grad.flatten())[0,1] + assert corr >= corr_tol,\ + ("Correlation did not pass: (%.2f > %.2f)\n" % (corr_tol, corr)) +\ + ("Expected:\n %r\n" % expected_grad) +\ + ("Actual:\n %r" % actual_grad) + assert abs(np.mean(expected_grad) - np.mean(actual_grad)) < mean_tol,\ + "Mean did not pass (%.2f expected vs %.2f actual)" %\ + (np.mean(expected_grad), np.mean(actual_grad)) + +def make_rbm(num_visible, num_hidden, batch_size, center=False, rng=None): + if rng is None: + rng = np.random.RandomState([2014,10,7]) + + visible_layer = BinaryVector(nvis=num_visible) + visible_layer.set_biases(rng.uniform(-1., 1., (num_visible,)).astype(config.floatX)) + hidden_layer = BinaryVectorMaxPool(detector_layer_dim=num_hidden, + pool_size=1, + layer_name='h', + irange=0.05, + init_bias=-2.0, + center=center) + hidden_layer.set_biases(rng.uniform(-1., 1., (num_hidden,)).astype(config.floatX), recenter=center) + model = RBM(visible_layer=visible_layer, + hidden_layer=hidden_layer, + batch_size=batch_size, niter=1) + + return model + +class Test_CD(object): + """ + Class to test contrastive divergence. 
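+
+    As a rough summary of what the static checks below verify, the CD-1
+    gradient estimates being compared against are (for a cost that is
+    minimized):
+
+        dW   ~ -E_data[ v h^T ] + E_model[ v h^T ]
+        db_v ~ -E_data[ v ]     + E_model[ v ]
+        db_h ~ -E_data[ h ]     + E_model[ h ]
+
+    where the data-side expectations use P(h0 | x) from mf_update, and the
+    model-side expectations are approximated with a single Gibbs step
+    (x -> h0 -> v1 -> P(h1 | v1)).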
+ """ + + @staticmethod + def check_rbm_pos_phase(rbm, cost, X, tol=0.8): + + pos_grads, updates = cost._get_positive_phase(rbm, X) + + visible_layer = rbm.visible_layer + hidden_layer = rbm.hidden_layers[0] + P_H0_given_X = hidden_layer.mf_update(state_below=visible_layer.upward_state(X), + state_above=None, layer_above=None)[1] + + dW_pos_exp = -1 * np.dot(X.eval().T, P_H0_given_X.eval()) / rbm.batch_size + dW_pos_act = pos_grads[hidden_layer.transformer.get_params()[0]].eval() + check_gradients(dW_pos_exp, dW_pos_act, corr_tol=tol) + + dvb_pos_exp = -np.mean(X.eval(), axis=0) + dvb_pos_act = pos_grads[visible_layer.bias].eval() + check_gradients(dvb_pos_exp, dvb_pos_act, corr_tol=tol) + + dvh_pos_exp = -np.mean(P_H0_given_X.eval(), axis=0) + dvh_pos_act = pos_grads[hidden_layer.b].eval() + check_gradients(dvh_pos_exp, dvh_pos_act, corr_tol=tol) + + return pos_grads, updates + + @staticmethod + def check_rbm_neg_phase(rbm, cost, X, theano_rng=None, tol=0.85): + + assert theano_rng is not None + + neg_grads, updates = cost._get_negative_phase(rbm, X) + + visible_layer = rbm.visible_layer + hidden_layer = rbm.hidden_layers[0] + + P_H0_given_X = hidden_layer.mf_update(state_below = visible_layer.upward_state(X), + state_above=None, layer_above=None)[1] + H0 = hidden_layer.sample(state_below=visible_layer.upward_state(X), + state_above=None, layer_above=None, + theano_rng=theano_rng)[1] + V1 = visible_layer.sample(state_above=H0, layer_above=hidden_layer, + theano_rng=theano_rng) + P_H1_given_V1 = hidden_layer.mf_update(state_below=visible_layer.upward_state(V1), + state_above=None, layer_above=None)[1] + dW_neg_act = neg_grads[hidden_layer.transformer.get_params()[0]].eval() + dW_neg_exp = np.dot(V1.eval().T, P_H1_given_V1.eval()) / rbm.batch_size + check_gradients(dW_neg_exp, dW_neg_act, corr_tol=tol) + + dvb_neg_exp = np.mean(V1.eval(), axis=0) + dvb_neg_act = neg_grads[visible_layer.bias].eval() + check_gradients(dvb_neg_exp, dvb_neg_act, corr_tol=tol) + + dvh_neg_exp = np.mean(P_H1_given_V1.eval(), axis=0) + dvh_neg_act = neg_grads[hidden_layer.b].eval() + check_gradients(dvh_neg_exp, dvh_neg_act, corr_tol=tol) + + return neg_grads, updates + + def test_rbm(self, num_visible=100, num_hidden=50, batch_size=5000, variational=False): + rng = np.random.RandomState([2014,10,7]) + theano_rng = MRG_RandomStreams(2024+30+9) + + # Set up the RBM (One hidden layer DBM) + rbm = make_rbm(num_visible, num_hidden, batch_size, rng=rng) + + if variational: + cost = VariationalCD(num_gibbs_steps=1) + else: + cost = BaseCD(num_gibbs_steps=1) + + # Set the data + X = sharedX(rng.randn(batch_size, num_visible)) + # Get the gradients from the cost function + grads, updates = cost.get_gradients(rbm, X) + Test_CD.check_rbm_pos_phase(rbm, cost, X) + Test_CD.check_rbm_neg_phase(rbm, cost, X, theano_rng=theano_rng) + + def test_rbm_varational(self, num_visible=100, num_hidden=50, batch_size=200): + self.test_rbm(num_visible, num_hidden, batch_size, variational=True) From 81f958567272f72b7b4d0ce803a390e216e433bd Mon Sep 17 00:00:00 2001 From: rdevon Date: Tue, 28 Oct 2014 11:50:06 -0600 Subject: [PATCH 2/5] Moved DBM to sandbox/dbm_v2 and stripped down. dbm_v2 is an attempt to refactor and redo some of the core aspects of DBM. Currently it is a stripped down version of models/dbm to support only: RBM Updown Inference Gibbs sampling BaseCD VariationalCD I have included some tests for RBM CD to get things moving. 
There are some changes found in other PRs, namely to: dbm_cost.py: refactored a little bit to make it cleaner and get BaseCD working. dbm.py: introduced RBM and a method for DBM to intialize its own chains. and a few other smaller changes. --- pylearn2/sandbox/dbm_v2/__init__.py | 270 ++ pylearn2/sandbox/dbm_v2/dbm.py | 822 ++++ pylearn2/sandbox/dbm_v2/dbm_cost.py | 660 +++ .../sandbox/dbm_v2/inference_procedure.py | 484 ++ pylearn2/sandbox/dbm_v2/ising.py | 1864 ++++++++ pylearn2/sandbox/dbm_v2/layer.py | 4124 +++++++++++++++++ pylearn2/sandbox/dbm_v2/sampling_procedure.py | 210 + pylearn2/sandbox/dbm_v2/test_dbm.py | 1214 +++++ 8 files changed, 9648 insertions(+) create mode 100644 pylearn2/sandbox/dbm_v2/__init__.py create mode 100755 pylearn2/sandbox/dbm_v2/dbm.py create mode 100644 pylearn2/sandbox/dbm_v2/dbm_cost.py create mode 100644 pylearn2/sandbox/dbm_v2/inference_procedure.py create mode 100644 pylearn2/sandbox/dbm_v2/ising.py create mode 100644 pylearn2/sandbox/dbm_v2/layer.py create mode 100644 pylearn2/sandbox/dbm_v2/sampling_procedure.py create mode 100644 pylearn2/sandbox/dbm_v2/test_dbm.py diff --git a/pylearn2/sandbox/dbm_v2/__init__.py b/pylearn2/sandbox/dbm_v2/__init__.py new file mode 100644 index 0000000000..aacd0d2589 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/__init__.py @@ -0,0 +1,270 @@ +""" +This module contains functionality related to deep Boltzmann machines. +They are implemented generically in order to make it easy to support +convolution versions, etc. + +This code was moved piece by piece incrementally over time from Ian's +private research repository, and it is altogether possible that he +broke something or left out a piece while moving it. If you find any +problems please don't hesitate to contact pylearn-dev and we will fix +the problem and add a unit test. +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import logging +import numpy as np +import sys + +from theano.compat.python2x import OrderedDict + +from pylearn2.expr.nnet import inverse_sigmoid_numpy +from pylearn2.blocks import Block +from pylearn2.utils import block_gradient +from pylearn2.utils.rng import make_theano_rng + + +logger = logging.getLogger(__name__) + +logger.debug("DBM changing the recursion limit.") +# We need this to be high enough that the big theano graphs we make +# when unrolling inference don't cause python to complain. +# python intentionally declares stack overflow well before the stack +# segment is actually exceeded. But we can't make this value too big +# either, or we'll get seg faults when the python interpreter really +# does go over the stack segment. +# IG encountered seg faults on eos3 (a machine at LISA labo) when using +# 50000 so for now it is set to 40000. +# I think the actual safe recursion limit can't be predicted in advance +# because you don't know how big of a stack frame each function will +# make, so there is not really a "correct" way to do this. Really the +# python interpreter should provide an option to raise the error +# precisely when you're going to exceed the stack segment. +sys.setrecursionlimit(40000) + + +def init_sigmoid_bias_from_marginals(dataset, use_y = False): + """ + Returns b such that sigmoid(b) has the same marginals as the + data. Assumes dataset contains a design matrix. 
If use_y is + true, sigmoid(b) will have the same marginals as the targets, + rather than the features. + + Parameters + ---------- + dataset : WRITEME + use_y : WRITEME + """ + if use_y: + X = dataset.y + else: + X = dataset.get_design_matrix() + return init_sigmoid_bias_from_array(X) + +def init_sigmoid_bias_from_array(arr): + """ + .. todo:: + + WRITEME + """ + X = arr + if not (X.max() == 1): + raise ValueError("Expected design matrix to consist entirely " + "of 0s and 1s, but maximum value is "+str(X.max())) + if X.min() != 0.: + raise ValueError("Expected design matrix to consist entirely of " + "0s and 1s, but minimum value is "+str(X.min())) + # removed this check so we can initialize the marginals + # with a dataset of bernoulli params + # assert not np.any( (X > 0.) * (X < 1.) ) + + mean = X.mean(axis=0) + + mean = np.clip(mean, 1e-7, 1-1e-7) + + init_bias = inverse_sigmoid_numpy(mean) + + return init_bias + + +class DBMSampler(Block): + """ + A Block used to sample from the last layer of a DBM with one hidden layer. + + Parameters + ---------- + dbm : WRITEME + """ + def __init__(self, dbm): + super(DBMSampler, self).__init__() + self.theano_rng = make_theano_rng(None, 2012+10+14, which_method="binomial") + self.dbm = dbm + assert len(self.dbm.hidden_layers) == 1 + + def __call__(self, inputs): + """ + .. todo:: + + WRITEME + """ + space = self.dbm.get_input_space() + num_examples = space.batch_size(inputs) + + last_layer = self.dbm.get_all_layers()[-1] + layer_to_chains = self.dbm.make_layer_to_symbolic_state( + num_examples, self.theano_rng) + # The examples are used to initialize the visible layer's chains + layer_to_chains[self.dbm.visible_layer] = inputs + + layer_to_clamp = OrderedDict([(self.dbm.visible_layer, True)]) + layer_to_chains = self.dbm.mcmc_steps(layer_to_chains, self.theano_rng, + layer_to_clamp=layer_to_clamp, + num_steps=1) + + rval = layer_to_chains[last_layer] + rval = last_layer.upward_state(rval) + + return rval + + def get_input_space(self): + """ + .. todo:: + + WRITEME + """ + return self.dbm.get_input_space() + + def get_output_space(self): + """ + .. todo:: + + WRITEME + """ + return self.dbm.get_output_space() + + +def stitch_rbms(batch_size, rbm_list, niter, inference_procedure=None, + targets=False): + """ + Returns a DBM initialized with pre-trained RBMs, with weights and biases + initialized according to R. Salakhutdinov's policy. + + This method assumes the RBMs were trained normally. It divides the first + and last hidden layer's weights by two and initialized a hidden layer's + biases as the mean of its biases and the biases of the visible layer of the + RBM above it. + """ + assert len(rbm_list) > 1 + + # For intermediary hidden layers, there are two set of biases to choose + # from: those from the hidden layer of the given RBM, and those from + # the visible layer of the RBM above it. As in R. Salakhutdinov's code, + # we handle this by computing the mean of those two sets of biases. 
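+    # For example (illustrative numbers only): if a hidden layer's biases are
+    # [0.2, -0.4] and the visible biases of the RBM above it are [0.0, 0.2],
+    # the stitched DBM gives that layer biases of [0.1, -0.1].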
+ for this_rbm, above_rbm in zip(rbm_list[:-1], rbm_list[1:]): + hidden_layer = this_rbm.hidden_layers[0] + visible_layer = above_rbm.visible_layer + new_biases = 0.5 * (hidden_layer.get_biases() + + visible_layer.get_biases()) + hidden_layer.set_biases(new_biases) + + visible_layer = rbm_list[0].visible_layer + visible_layer.dbm = None + + hidden_layers = [] + + for rbm in rbm_list: + # Make sure all DBM have only one hidden layer, except for the last + # one, which can have an optional target layer + if rbm == rbm_list[-1]: + if targets: + assert len(rbm.hidden_layers) == 2 + else: + assert len(rbm.hidden_layers) == 1 + else: + assert len(rbm.hidden_layers) == 1 + + hidden_layers = hidden_layers + rbm.hidden_layers + + for hidden_layer in hidden_layers: + hidden_layer.dbm = None + + # Divide first and last hidden layer's weights by two, as described + # in R. Salakhutdinov's paper (equivalent to training with RBMs with + # doubled weights) + first_hidden_layer = hidden_layers[-1] + if targets: + last_hidden_layer = hidden_layers[-2] + else: + last_hidden_layer = hidden_layers[-1] + first_hidden_layer.set_weights(0.5 * first_hidden_layer.get_weights()) + last_hidden_layer.set_weights(0.5 * last_hidden_layer.get_weights()) + + return DBM(batch_size, visible_layer, hidden_layers, niter, + inference_procedure) + + +def flatten(l): + """ + Turns a nested graph of lists/tuples/other objects + into a list of objects. + + Parameters + ---------- + l : WRITEME + + Returns + ------- + WRITEME + """ + if isinstance(l, (list, tuple)): + rval = [] + for elem in l: + if isinstance(elem, (list, tuple)): + rval.extend(flatten(elem)) + else: + rval.append(elem) + else: + return [l] + return rval + +def block(l): + """ + .. todo:: + + WRITEME + """ + new = [] + for elem in l: + if isinstance(elem, (list, tuple)): + new.append(block(elem)) + else: + new.append(block_gradient(elem)) + if isinstance(l, tuple): + return tuple(new) + return new + + +# Make known modules inside this package +# this needs to come after e.g. 
flatten(), since DBM depends on flatten() +from pylearn2.models.dbm.dbm import DBM +from pylearn2.models.dbm.inference_procedure import BiasInit +from pylearn2.models.dbm.inference_procedure import InferenceProcedure +from pylearn2.models.dbm.inference_procedure import MoreConsistent +from pylearn2.models.dbm.inference_procedure import MoreConsistent2 +from pylearn2.models.dbm.inference_procedure import SuperWeightDoubling +from pylearn2.models.dbm.inference_procedure import WeightDoubling +from pylearn2.models.dbm.layer import BinaryVector +from pylearn2.models.dbm.layer import BinaryVectorMaxPool +from pylearn2.models.dbm.layer import BVMP_Gaussian +from pylearn2.models.dbm.layer import CompositeLayer +from pylearn2.models.dbm.layer import ConvMaxPool +from pylearn2.models.dbm.layer import ConvC01B_MaxPool +from pylearn2.models.dbm.layer import GaussianVisLayer +from pylearn2.models.dbm.layer import HiddenLayer +from pylearn2.models.dbm.layer import Layer +from pylearn2.models.dbm.layer import VisibleLayer +from pylearn2.models.dbm.layer import Softmax +from pylearn2.models.dbm.sampling_procedure import SamplingProcedure diff --git a/pylearn2/sandbox/dbm_v2/dbm.py b/pylearn2/sandbox/dbm_v2/dbm.py new file mode 100755 index 0000000000..07613417fe --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/dbm.py @@ -0,0 +1,822 @@ +""" +The main DBM class +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin", "Devon Hjelm"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import functools +import logging +import numpy as np +import warnings + +from theano import tensor as T, config +from theano.compat import OrderedDict +from theano.sandbox.rng_mrg import MRG_RandomStreams + +from pylearn2.models import Model +from pylearn2.sandbox.dbm_v2 import flatten +from pylearn2.sandbox.dbm_v2.inference_procedure import UpDown +from pylearn2.sandbox.dbm_v2.sampling_procedure import GibbsEvenOdd +from pylearn2.sandbox.dbm_v2.layer import Softmax +from pylearn2.utils import safe_zip, safe_izip +from pylearn2.utils.rng import make_np_rng + + +logger = logging.getLogger(__name__) + + +class DBM(Model): + """ + A deep Boltzmann machine. + + See "Deep Boltzmann Machines" by Ruslan Salakhutdinov and Geoffrey Hinton + for details. + + Parameters + ---------- + batch_size : int + The batch size the model should use. Some convolutional + LinearTransforms require a compile-time hardcoded batch size, + otherwise this would not be part of the model specification. + visible_layer : dbm.VisibleLayer + The visible layer of the DBM. + hidden_layers : list of dbm.HiddenLayer + The hidden layers. A list of HiddenLayer objects. The first + layer in the list is connected to the visible layer. + niter : int + Number of mean field iterations for variational inference + for the positive phase. 
+ sampling_procedure : WRITEME + inference_procedure : WRITEME + """ + + def __init__(self, batch_size, visible_layer, hidden_layers, niter, + sampling_procedure=None, inference_procedure=None): + self.__dict__.update(locals()) + del self.self + assert len(hidden_layers) >= 1 + + if len(hidden_layers) > 1 and niter <= 1: + raise ValueError("with more than one hidden layer, niter needs to " + "be greater than 1; otherwise mean field won't " + "work properly.") + + self.setup_rng() + self.layer_names = set() + self.visible_layer.set_dbm(self) + for layer in hidden_layers: + assert layer.get_dbm() is None + layer.set_dbm(self) + assert layer.layer_name not in self.layer_names + self.layer_names.add(layer.layer_name) + self._update_layer_input_spaces() + self.force_batch_size = batch_size + self.freeze_set = set([]) + if inference_procedure is None: + self.setup_inference_procedure() + self.inference_procedure.set_dbm(self) + if sampling_procedure is None: + self.setup_sampling_procedure() + self.sampling_procedure.set_dbm(self) + + def get_all_layers(self): + """ + Returns all layers of the DBM in order of visible, hidden. + """ + return [self.visible_layer] + self.hidden_layers + + def energy(self, V, hidden): + """ + Point energy of the DBM. + Calculated from the states of each unit. + + Parameters + ---------- + V : tensor_like + Theano batch of visible unit observations (must be SAMPLES, not + mean field parameters) + hidden : list + List, one element per hidden layer, of batches of samples (must + be SAMPLES, not mean field parameters) + + Returns + ------- + rval : tensor_like + Vector containing the energy of each sample + + Notes + ----- + Applying this function to non-sample theano variables is not guaranteed + to give you an expected energy in general, so don't use this that way. + """ + + terms = [] + + terms.append(self.visible_layer.expected_energy_term(state=V, + average=False)) + + # This condition could be relaxed, but current code assumes it + assert len(self.hidden_layers) > 0 + + terms.append(self.hidden_layers[0].expected_energy_term( + state_below=self.visible_layer.upward_state(V), + state=hidden[0], average_below=False, average=False)) + + for i in xrange(1, len(self.hidden_layers)): + layer = self.hidden_layers[i] + samples_below = hidden[i-1] + layer_below = self.hidden_layers[i-1] + samples_below = layer_below.upward_state(samples_below) + samples = hidden[i] + terms.append(layer.expected_energy_term(state_below=samples_below, + state=samples, average_below=False, average=False)) + + assert len(terms) > 0 + + rval = reduce(lambda x, y: x + y, terms) + + assert rval.ndim == 1 + return rval + + def mf(self, *args, **kwargs): + """ + Mean field inference of model. + + Performs the inference procedure on the model. + + Parameters + ---------- + *args: TODO + **kwargs: TODO + """ + + self.setup_inference_procedure() + return self.inference_procedure.mf(*args, **kwargs) + + def expected_energy(self, V, mf_hidden): + """ + Expected energy of the DBM given a visible vector and the MF updates. + + Parameters + ---------- + V : tensor_like + Theano batch of visible unit observations (must be SAMPLES, not + mean field parameters: the random variables in the expectation + are the hiddens only) + mf_hidden : list + List, one element per hidden layer, of batches of variational + parameters (must be VARIATIONAL PARAMETERS, not samples. 
Layers + with analytically determined variance parameters for their mean + field parameters will use those to integrate over the variational + distribution, so it's not generally the same thing as measuring + the energy at a point.) + + Returns + ------- + rval : tensor_like + Vector containing the expected energy of each example under the + corresponding variational distribution. + """ + + self.visible_layer.space.validate(V) + assert isinstance(mf_hidden, (list, tuple)) + assert len(mf_hidden) == len(self.hidden_layers) + + terms = [] + + terms.append(self.visible_layer.expected_energy_term(state=V, + average=False)) + + # This condition could be relaxed, but current code assumes it + assert len(self.hidden_layers) > 0 + + terms.append(self.hidden_layers[0].expected_energy_term( + state_below=self.visible_layer.upward_state(V), + average_below=False, state=mf_hidden[0], average=True)) + + for i in xrange(1, len(self.hidden_layers)): + layer = self.hidden_layers[i] + layer_below = self.hidden_layers[i-1] + mf_below = mf_hidden[i-1] + mf_below = layer_below.upward_state(mf_below) + mf = mf_hidden[i] + terms.append(layer.expected_energy_term(state_below=mf_below, + state=mf, average_below=True, average=True)) + + assert len(terms) > 0 + + rval = reduce(lambda x, y: x + y, terms) + + assert rval.ndim == 1 + return rval + + def setup_rng(self): + """ + Function to set up the random number generator. + """ + self.rng = make_np_rng(None, [2012, 10, 17], which_method="uniform") + + def setup_inference_procedure(self): + """ + Sets up the inference procedure for the DBM. + """ + if not hasattr(self, 'inference_procedure') or \ + self.inference_procedure is None: + if len(self.hidden_layers) == 1: + self.inference_procedure = UpDown() + else: + #self.inference_procedure = WeightDoubling() + self.inference_procedure = UpDown() + self.inference_procedure.set_dbm(self) + + if len(self.hidden_layers) == 1: + try: + self.inference_procedure.is_rbm_compatible() + except NotImplementedError: + warnings.warn("Inference procedure %r may have unexpected" + "behavior when used with one hidden layer (RBM)." + "See models/dbn/inference_procedure.py for" + "details." % type(self.inference_procedure)) + + def setup_sampling_procedure(self): + """ + Sets up the sampling procedure. + Defaults to GibbsEvenOdd + """ + if not hasattr(self, 'sampling_procedure') or \ + self.sampling_procedure is None: + self.sampling_procedure = GibbsEvenOdd() + self.sampling_procedure.set_dbm(self) + + def get_output_space(self): + """ + Returns the output space of the top hidden layer. + """ + return self.hidden_layers[-1].get_output_space() + + def _update_layer_input_spaces(self): + """ + Tells each layer what its input space should be. + + Notes + ----- + This usually resets the layer's parameters! + """ + visible_layer = self.visible_layer + hidden_layers = self.hidden_layers + + self.hidden_layers[0].set_input_space(visible_layer.space) + for i in xrange(1, len(hidden_layers)): + hidden_layers[i].set_input_space( + hidden_layers[i-1].get_output_space()) + + for layer in self.get_all_layers(): + layer.finalize_initialization() + + def add_layers(self, layers): + """ + Add new layers on top of the existing hidden layers + + Parameters + ---------- + layers : dbm.HiddenLayer + Layer to add to DBM. 
+ """ + + # Patch old pickle files + if not hasattr(self, 'rng'): + self.setup_rng() + + hidden_layers = self.hidden_layers + assert len(hidden_layers) > 0 + for layer in layers: + assert layer.get_dbm() is None + layer.set_dbm(self) + layer.set_input_space(hidden_layers[-1].get_output_space()) + hidden_layers.append(layer) + assert layer.layer_name not in self.layer_names + self.layer_names.add(layer.layer_name) + + def freeze(self, parameter_set): + """ + Freezes the set of parameters. + + Parameters + ---------- + parameter_set: WRITEME + """ + # patch old pickle files + if not hasattr(self, 'freeze_set'): + self.freeze_set = set([]) + + self.freeze_set = self.freeze_set.union(parameter_set) + + def get_params(self): + """ + Returns the parameters of the DBM. + """ + + rval = [] + for param in self.visible_layer.get_params(): + assert param.name is not None + rval = self.visible_layer.get_params() + for layer in self.hidden_layers: + for param in layer.get_params(): + if param.name is None: + raise ValueError("All of your parameters should have " + "names, but one of " + layer.layer_name + + "'s doesn't") + layer_params = layer.get_params() + assert not isinstance(layer_params, set) + for param in layer_params: + if param not in rval: + rval.append(param) + + # Patch pickle files that predate the freeze_set feature + if not hasattr(self, 'freeze_set'): + self.freeze_set = set([]) + + rval = [elem for elem in rval if elem not in self.freeze_set] + + assert all([elem.name is not None for elem in rval]) + + return rval + + def set_batch_size(self, batch_size): + """ + Sets the batch size of the DBM. + + Parameters + ---------- + batch_size: int + The batch size + """ + self.batch_size = batch_size + self.force_batch_size = batch_size + + for layer in self.hidden_layers: + layer.set_batch_size(batch_size) + + if not hasattr(self, 'inference_procedure'): + self.setup_inference_procedure() + self.inference_procedure.set_batch_size(batch_size) + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + self.visible_layer.modify_updates(updates) + for layer in self.hidden_layers: + layer.modify_updates(updates) + + def get_input_space(self): + """ + Returns the input space of the visible layer. + """ + return self.visible_layer.space + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + params = self.get_params() + + for layer in self.hidden_layers + [self.visible_layer]: + contrib = layer.get_lr_scalers() + + # No two layers can contend to scale a parameter + assert not any([key in rval for key in contrib]) + # Don't try to scale anything that's not a parameter + assert all([key in params for key in contrib]) + + rval.update(contrib) + assert all([isinstance(val, float) for val in rval.values()]) + + return rval + + def get_weights(self): + """ + Returns the weights of the bottom hidden layer. + """ + + return self.hidden_layers[0].get_weights() + + def get_weights_view_shape(self): + """ + Returns shape of weight view. + """ + return self.hidden_layers[0].get_weights_view_shape() + + def get_weights_format(self): + """ + Returns the format of the weights as that of the bottom hidden layer. + """ + return self.hidden_layers[0].get_weights_format() + + def get_weights_topo(self): + """ + Returns the topologically formatted version of the weights. + Uses the bottom hidden layer. 
+ """ + return self.hidden_layers[0].get_weights_topo() + + def make_layer_to_state(self, num_examples, rng=None): + """ + Makes and returns a dictionary mapping layers to states. + + By states, we mean here a real assignment, not a mean field + state. For example, for a layer containing binary random + variables, the state will be a shared variable containing + values in {0,1}, not [0,1]. The visible layer will be included. + + Uses a dictionary so it is easy to unambiguously index a layer + without needing to remember rules like vis layer = 0, hiddens + start at 1, etc. + + Parameters + ---------- + num_examples : int + WRITEME + rng : WRITEME + """ + + # Make a list of all layers + layers = [self.visible_layer] + self.hidden_layers + + if rng is None: + rng = self.rng + + states = [layer.make_state(num_examples, rng) for layer in layers] + + zipped = safe_zip(layers, states) + + def recurse_check(layer, state): + if isinstance(state, (list, tuple)): + for elem in state: + recurse_check(layer, elem) + else: + val = state.get_value() + m = val.shape[0] + if m != num_examples: + raise ValueError(layer.layer_name + " gave state with " + + str(m) + " examples in some component." + "We requested " + str(num_examples)) + + for layer, state in zipped: + recurse_check(layer, state) + + rval = OrderedDict(zipped) + + return rval + + def make_layer_to_symbolic_state(self, num_examples, rng=None): + """ + .. todo:: + + Explain the difference with `make_layer_to_state` + + Makes and returns a dictionary mapping layers to states. + + By states, we mean here a real assignment, not a mean field + state. For example, for a layer containing binary random + variables, the state will be a shared variable containing + values in {0,1}, not [0,1]. The visible layer will be included. + + Uses a dictionary so it is easy to unambiguously index a layer + without needing to remember rules like vis layer = 0, hiddens + start at 1, etc. + + Parameters + ---------- + num_examples : int + WRITEME + rng : WRITEME + """ + + # Make a list of all layers + layers = [self.visible_layer] + self.hidden_layers + + assert rng is not None + + states = [layer.make_symbolic_state(num_examples, rng) + for layer in layers] + + zipped = safe_zip(layers, states) + + rval = OrderedDict(zipped) + + return rval + + def mcmc_steps(self, layer_to_state, theano_rng, layer_to_clamp=None, + num_steps=1): + """ + Perform Markov chain Monte Carlo. + + Note: this is due to be removed, though it might still be useful. + + Parameters + ---------- + layer_to_state: dictionary of k, v pairs dbm.layer.Layer, tensor-like + Dictionary of layers and their corresponding state. + theano_rng: WRITEME + layer_to_clamp: dictionary of k, v pairs dbm.layer.Layer, bool + Dictionary of layers and a boolean indicating clamping. + num_steps: int + Number of steps in sampling procedure. + """ + + warnings.warn("DBM.mcmc_steps is deprecated. You should instead " + + "call DBM.sampling_procedure.sample, which defaults " + + "to what DBM.mcmc_steps used to do. This method will " + + "be removed on or after July 31, 2014.") + return self.sampling_procedure.sample(layer_to_state, theano_rng, + layer_to_clamp, num_steps) + + def get_sampling_updates(self, layer_to_state, theano_rng, + layer_to_clamp=None, num_steps=1, + return_layer_to_updated=False): + """ + This method is for getting an updates dictionary for a theano function. + + It thus implies that the samples are represented as shared variables. 
+ If you want an expression for a sampling step applied to arbitrary + theano variables, use the 'mcmc_steps' method. This is a wrapper around + that method. + + Parameters + ---------- + layer_to_state : dict + Dictionary mapping the SuperDBM_Layer instances contained in + self to shared variables representing batches of samples of them. + (you can allocate one by calling self.make_layer_to_state) + theano_rng : MRG_RandomStreams + WRITEME + layer_to_clamp : dict, optional + Dictionary mapping layers to bools. If a layer is not in the + dictionary, defaults to False. True indicates that this layer + should be clamped, so we are sampling from a conditional + distribution rather than the joint distribution + num_steps : int, optional + WRITEME + return_layer_to_updated : bool, optional + WRITEME + + Returns + ------- + rval : dict + Dictionary mapping each shared variable to an expression to + update it. Repeatedly applying these updates does MCMC sampling. + + Notes + ----- + The specific sampling schedule used by default is to sample all of the + even-idexed layers of model.hidden_layers, then the visible layer and + all the odd-indexed layers. + """ + + updated = self.sampling_procedure.sample(layer_to_state, theano_rng, + layer_to_clamp, num_steps) + + rval = OrderedDict() + + def add_updates(old, new): + if isinstance(old, (list, tuple)): + for old_elem, new_elem in safe_izip(old, new): + add_updates(old_elem, new_elem) + else: + rval[old] = new + + # Validate layer_to_clamp / make sure layer_to_clamp is a fully + # populated dictionary + if layer_to_clamp is None: + layer_to_clamp = OrderedDict() + + for key in layer_to_clamp: + assert key is self.visible_layer or key in self.hidden_layers + + for layer in [self.visible_layer] + self.hidden_layers: + if layer not in layer_to_clamp: + layer_to_clamp[layer] = False + + # Translate update expressions into theano updates + for layer in layer_to_state: + old = layer_to_state[layer] + new = updated[layer] + if layer_to_clamp[layer]: + assert new is old + else: + add_updates(old, new) + + assert isinstance(self.hidden_layers, list) + + if return_layer_to_updated: + return rval, updated + + return rval + + def get_monitoring_channels(self, data): + """ + Returns the monitor channels of the DBM. + + This is done through the visible and all of the hidden layers of DBM. + + Parameters + ---------- + data: tensor-like + Data from which to evaluate model. + """ + space, source = self.get_monitoring_data_specs() + space.validate(data) + X = data + history = self.mf(X, return_history=True) + q = history[-1] + + rval = OrderedDict() + + ch = self.visible_layer.get_monitoring_channels() + for key in ch: + rval['vis_' + key] = ch[key] + + for state, layer in safe_zip(q, self.hidden_layers): + ch = layer.get_monitoring_channels() + for key in ch: + rval[layer.layer_name + '_' + key] = ch[key] + ch = layer.get_monitoring_channels_from_state(state) + for key in ch: + rval['mf_' + layer.layer_name + '_' + key] = ch[key] + if len(history) > 1: + prev_q = history[-2] + + flat_q = flatten(q) + flat_prev_q = flatten(prev_q) + + mx = None + for new, old in safe_zip(flat_q, flat_prev_q): + cur_mx = abs(new - old).max() + if new is old: + logger.error('{0} is {1}'.format(new, old)) + assert False + if mx is None: + mx = cur_mx + else: + mx = T.maximum(mx, cur_mx) + + rval['max_var_param_diff'] = mx + + for layer, new, old in safe_zip(self.hidden_layers, + q, prev_q): + sum_diff = 0. 
+ for sub_new, sub_old in safe_zip(flatten(new), flatten(old)): + sum_diff += abs(sub_new - sub_old).sum() + denom = self.batch_size * \ + layer.get_total_state_space().get_total_dimension() + denom = np.cast[config.floatX](denom) + rval['mean_'+layer.layer_name+'_var_param_diff'] = \ + sum_diff / denom + + X_hat = self.reconstruct(X) + reconstruction_cost = self.visible_layer.recons_cost(X, X_hat) + rval['reconstruction_cost'] = reconstruction_cost + + return rval + + def get_monitoring_data_specs(self): + """ + Get the data_specs describing the data for get_monitoring_channel. + + This implementation returns specification corresponding to unlabeled + inputs. + """ + return (self.get_input_space(), self.get_input_source()) + + def get_test_batch_size(self): + """ + Returns the batch size of the model. + """ + return self.batch_size + + def reconstruct(self, V): + """ + Reconstructs an input using inpainting method. + + Parameters + ---------- + V: tensor-like + Input sample. + + Returns + ------- + recons: tensor-like + Reconstruction of V. + """ + + H = self.mf(V)[0] + + downward_state = self.hidden_layers[0].downward_state(H) + + recons = self.visible_layer.inpaint_update( + layer_above=self.hidden_layers[0], + state_above=downward_state, + drop_mask=None, V=None) + + return recons + + def do_inpainting(self, *args, **kwargs): + """ + Perform inpainting on model. + + Inpainting is defined by the inference procedure. + + Parameters + ---------- + *args: WRITEME + **kwargs: WRITEME + """ + self.setup_inference_procedure() + return self.inference_procedure.do_inpainting(*args, **kwargs) + + def initialize_chains(self, X, Y, theano_rng): + """ + Function to initialize chains for model when performing the neg phase. + TODO: implement in cost functions. + + Parameters + ---------- + X: tensor-like + The data. If none, then persistent (TODO) + Y: tensor-like + Labels. + theano_rng: WRITEME + + Returns + ------ + layer_to_chains: OrderedDict + """ + + if X is None: + raise NotImplementedError("Persistent chains not implemented yet.") + + # Initializing to data + layer_to_clamp = OrderedDict([(self.visible_layer, True)]) + layer_to_chains = self.make_layer_to_symbolic_state(1, theano_rng) + + # initialized the visible layer to data + layer_to_chains[self.visible_layer] = X + + # if supervised, also clamp targets + if Y is not None and self.supervised: + # note: if the Y layer changes to something without linear energy, + # we'll need to make the expected energy clamp Y in the positive + # phase + target_layer = self.hidden_layers[-1] + assert isinstance(target_layer, Softmax) + layer_to_clamp[target_layer] = True + layer_to_chains[target_layer] = Y + + # Note that we replace layer_to_chains with a dict mapping to the new + # state of the chains + # We first initialize the chain by clamping the visible layer and the + # target layer (if it exists) + layer_to_chains = self.sampling_procedure.sample( + layer_to_chains, + theano_rng, + layer_to_clamp=layer_to_clamp, + num_steps=1) + return layer_to_chains + + +class RBM(DBM): + """ + A restricted Boltzmann machine. + + The special case of a DBM with only one hidden layer designed to keep + things simple for researchers interested only in a single layer of + latent variables and DBN. + + Parameters + ---------- + batch_size : int + The batch size the model should use. Some convolutional + LinearTransforms require a compile-time hardcoded batch size, + otherwise this would not be part of the model specification. 
+ visible_layer : DBM.VisibleLayer + The visible layer of the DBM. + hidden_layers : List of DBM.HiddenLayer + The hidden layers. A list of HiddenLayer objects. The first + layer in the list is connected to the visible layer. + niter : int + Number of mean field iterations for variational inference + for the positive phase. + """ + def __init__(self, batch_size, visible_layer, hidden_layer, niter): + self.__dict__.update(locals()) + del self.self + super(RBM, self).__init__(batch_size, visible_layer, [hidden_layer], + niter, + inference_procedure=UpDown(), + sampling_procedure=GibbsEvenOdd()) \ No newline at end of file diff --git a/pylearn2/sandbox/dbm_v2/dbm_cost.py b/pylearn2/sandbox/dbm_v2/dbm_cost.py new file mode 100644 index 0000000000..7925788e3b --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/dbm_cost.py @@ -0,0 +1,660 @@ +""" +This module contains cost functions to use with deep Boltzmann machines +(pylearn2.models.dbm). +""" + +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin", "Devon Hjelm"] +__copyright__ = "Copyright 2012, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import numpy as np +import logging +import warnings + +from theano.compat.python2x import OrderedDict +from theano import config +from theano.sandbox.rng_mrg import MRG_RandomStreams +RandomStreams = MRG_RandomStreams +from theano import tensor as T + +import pylearn2 +from pylearn2.costs.cost import Cost +from pylearn2.costs.cost import ( + FixedVarDescr, DefaultDataSpecsMixin, NullDataSpecsMixin +) + +from pylearn2.sandbox.dbm_v2 import dbm +from pylearn2.sandbox.dbm_v2.layer import BinaryVectorMaxPool +from pylearn2.sandbox.dbm_v2 import flatten +from pylearn2.sandbox.dbm_v2.layer import BinaryVector +from pylearn2.sandbox.dbm_v2.layer import Softmax + +from pylearn2 import utils +from pylearn2.utils import make_name +from pylearn2.utils import safe_izip +from pylearn2.utils import safe_zip +from pylearn2.utils import sharedX +from pylearn2.utils.rng import make_theano_rng + + +logger = logging.getLogger(__name__) + + +# Positive phase methods + +def positive_phase(model, X, Y, num_gibbs_steps=1, supervised=False, + theano_rng=None, method="VARIATIONAL"): + """ + Wrapper function for positive phase. + Method is controled by switch string "method". + + Parameters + ---------- + X: input observables + Y: supervised observables + num_gibbs_steps: number of gibbs steps for sampling method + theano_rng for sampling method + method: method for positive phase: VARIATIONAL or SAMPLING. + """ + + if method == "VARIATIONAL": + return variational_positive_phase(model, X, Y, + supervised=supervised) + elif method == "SAMPLING": + return sampling_positive_phase(model, X, Y, + supervised=supervised, + num_gibbs_steps=num_gibbs_steps, + theano_rng=theano_rng) + else: raise ValueError("Available methods for positive phase are VARIATIONAL and SAMPLING") + +def variational_positive_phase(model, X, Y, supervised): + """ + .. todo:: + + WRITEME + """ + if supervised: + assert Y is not None + # note: if the Y layer changes to something without linear energy, + # we'll need to make the expected energy clamp Y in the positive + # phase + assert isinstance(model.hidden_layers[-1], Softmax) + + q = model.mf(X, Y) + + """ + Use the non-negativity of the KL divergence to construct a lower + bound on the log likelihood. 
We can drop all terms that are + constant with respect to the model parameters: + + log P(v) = L(v, q) + KL(q || P(h|v)) + L(v, q) = log P(v) - KL(q || P(h|v)) + L(v, q) = log P(v) - sum_h q(h) log q(h) + q(h) log P(h | v) + L(v, q) = log P(v) + sum_h q(h) log P(h | v) + const + L(v, q) = log P(v) + sum_h q(h) log P(h, v) + - sum_h q(h) log P(v) + const + L(v, q) = sum_h q(h) log P(h, v) + const + L(v, q) = sum_h q(h) -E(h, v) - log Z + const + + so the cost we want to minimize is + expected_energy + log Z + const + + + Note: for the RBM, this bound is exact, since the KL divergence + goes to 0. + """ + + variational_params = flatten(q) + + # The gradients of the expected energy under q are easy, we can just + # do that in theano + expected_energy_q = model.expected_energy(X, q).mean() + params = list(model.get_params()) + gradients = OrderedDict( + safe_zip(params, T.grad(expected_energy_q, + params, + consider_constant=variational_params, + disconnected_inputs='ignore'))) + return gradients + +def sampling_positive_phase(model, X, Y, supervised, num_gibbs_steps, theano_rng): + """ + .. todo:: + + WRITEME + """ + assert num_gibbs_steps is not None + assert theano_rng is not None + # If there's only one hidden layer, there's no point in sampling. + if len(model.hidden_layers) == 1: num_gibbs_steps = 1 + layer_to_clamp = OrderedDict([(model.visible_layer, True)]) + layer_to_pos_samples = OrderedDict([(model.visible_layer, X)]) + if supervised: + # note: if the Y layer changes to something without linear energy, + # we'll need to make the expected energy clamp Y in the + # positive phase + assert isinstance(model.hidden_layers[-1], Softmax) + layer_to_clamp[model.hidden_layers[-1]] = True + layer_to_pos_samples[model.hidden_layers[-1]] = Y + hid = model.hidden_layers[:-1] + else: + assert Y is None + hid = model.hidden_layers + + for layer in hid: + mf_state = layer.init_mf_state() + + def recurse_zeros(x): + if isinstance(x, tuple): + return tuple([recurse_zeros(e) for e in x]) + return x.zeros_like() + layer_to_pos_samples[layer] = recurse_zeros(mf_state) + + layer_to_pos_samples = model.sampling_procedure.sample( + layer_to_state=layer_to_pos_samples, + layer_to_clamp=layer_to_clamp, + num_steps=num_gibbs_steps, + theano_rng=theano_rng) + q = [layer_to_pos_samples[layer] for layer in model.hidden_layers] + + pos_samples = flatten(q) + + # The gradients of the expected energy under q are easy, we can just + # do that in theano + expected_energy_q = model.energy(X, q).mean() + params = list(model.get_params()) + gradients = OrderedDict( + safe_zip(params, T.grad(expected_energy_q, params, + consider_constant=pos_samples, + disconnected_inputs='ignore'))) + return gradients + +# Negative phase methods + +def negative_phase(model, layer_to_chains, method="STANDARD"): + """ + Wrapper function for negative phase. + + Parameters + ---------- + model: a dbm model. + layer_to_chains: dicitonary of layer chains for sampling. + method: standard or toronto + """ + + if method == "STANDARD": + return standard_negative_phase(model, layer_to_chains) + elif method == "TORONTO": + return toronto_negative_phase(model, layer_to_chains) + else: raise ValueError("Available methods for negative phase are STANDARD and TORONTO") + +def standard_negative_phase(model, layer_to_chains): + """ + .. todo:: + + WRITEME + + TODO:reduce variance of negative phase by + integrating out the even-numbered layers. 
The + Rao-Blackwellize method can do this for you when + expected gradient = gradient of expectation, but + doing this in general is trickier. + """ + params = list(model.get_params()) + + # layer_to_chains = model.rao_blackwellize(layer_to_chains) + expected_energy_p = model.energy( + layer_to_chains[model.visible_layer], + [layer_to_chains[layer] for layer in model.hidden_layers]).mean() + + samples = flatten(layer_to_chains.values()) + for i, sample in enumerate(samples): + if sample.name is None: + sample.name = 'sample_'+str(i) + + neg_phase_grads = OrderedDict( + safe_zip(params, T.grad(-expected_energy_p, params, + consider_constant=samples, + disconnected_inputs='ignore'))) + return neg_phase_grads + +def toronto_negative_phase(model, layer_to_chains): + """ + .. todo:: + + WRITEME + """ + # Ruslan Salakhutdinov's undocumented negative phase from + # http://www.mit.edu/~rsalakhu/code_DBM/dbm_mf.m + # IG copied it here without fully understanding it, so it + # only applies to exactly the same model structure as + # in that code. + + assert isinstance(model.visible_layer, BinaryVector) + assert isinstance(model.hidden_layers[0], BinaryVectorMaxPool) + assert model.hidden_layers[0].pool_size == 1 + assert isinstance(model.hidden_layers[1], BinaryVectorMaxPool) + assert model.hidden_layers[1].pool_size == 1 + assert isinstance(model.hidden_layers[2], Softmax) + assert len(model.hidden_layers) == 3 + + params = list(model.get_params()) + + V_samples = layer_to_chains[model.visible_layer] + H1_samples, H2_samples, Y_samples = [layer_to_chains[layer] for + layer in model.hidden_layers] + + H1_mf = model.hidden_layers[0].mf_update( + state_below=model.visible_layer.upward_state(V_samples), + state_above=model.hidden_layers[1].downward_state(H2_samples), + layer_above=model.hidden_layers[1]) + Y_mf = model.hidden_layers[2].mf_update( + state_below=model.hidden_layers[1].upward_state(H2_samples)) + H2_mf = model.hidden_layers[1].mf_update( + state_below=model.hidden_layers[0].upward_state(H1_mf), + state_above=model.hidden_layers[2].downward_state(Y_mf), + layer_above=model.hidden_layers[2]) + + expected_energy_p = model.energy( + V_samples, [H1_mf, H2_mf, Y_samples]).mean() + + constants = flatten([V_samples, H1_mf, H2_mf, Y_samples]) + + neg_phase_grads = OrderedDict( + safe_zip(params, T.grad(-expected_energy_p, params, + consider_constant=constants))) + return neg_phase_grads + + +class BaseCD(DefaultDataSpecsMixin, Cost): + """ + Parameters + ---------- + num_chains : int + The number of negative chains to use with PCD / SML. + WRITEME : how is this meant to be used with CD? Do you just need to + set it to be equal to the batch size? If so: TODO, get rid of this + redundant aspect of the interface. + num_gibbs_steps : int + The number of Gibbs steps to use in the negative phase. (i.e., if + you want to use CD-k or PCD-k, this is "k"). + supervised : bool + If True, requests class labels and models the joint distrbution over + features and labels. + toronto_neg : bool + If True, use a bit of mean field in the negative phase. + Ruslan Salakhutdinov's matlab code does this. + theano_rng : MRG_RandomStreams, optional + If specified, uses this object to generate all random numbers. + Otherwise, makes its own random number generator. 
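For intuition about how these options fit together, the following is a minimal NumPy sketch of CD-k on a small binary RBM. It mirrors what this cost assembles symbolically (a sampling positive phase followed by a k-step Gibbs negative phase whose statistics are subtracted); all sizes, names and the learning rate below are invented for illustration.

    import numpy as np

    def sigmoid(z):
        return 1. / (1. + np.exp(-z))

    rng = np.random.RandomState(0)
    nvis, nhid, k, lr = 6, 4, 1, 0.1              # k plays the role of num_gibbs_steps
    W = 0.01 * rng.randn(nvis, nhid)
    bv, bh = np.zeros(nvis), np.zeros(nhid)
    V = (rng.rand(10, nvis) > 0.5).astype(float)  # stand-in data batch

    # Positive phase: clamp the visibles to the data and infer the hiddens.
    ph_pos = sigmoid(V.dot(W) + bh)
    h = (rng.rand(*ph_pos.shape) < ph_pos).astype(float)

    # Negative phase: k steps of block Gibbs sampling started from the data.
    for _ in range(k):
        pv = sigmoid(h.dot(W.T) + bv)
        v_neg = (rng.rand(*pv.shape) < pv).astype(float)
        ph_neg = sigmoid(v_neg.dot(W) + bh)
        h = (rng.rand(*ph_neg.shape) < ph_neg).astype(float)

    # CD-k update: data-driven statistics minus model-driven statistics.
    W += lr * (V.T.dot(ph_pos) - v_neg.T.dot(ph_neg)) / len(V)
    bv += lr * (V - v_neg).mean(axis=0)
    bh += lr * (ph_pos - ph_neg).mean(axis=0)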
+ """ + + def __init__(self, num_chains=1, num_gibbs_steps=1, supervised=False, + toronto_neg=False, theano_rng=None, + positive_method = "SAMPLING", negative_method = "STANDARD"): + self.__dict__.update(locals()) + del self.self + + self.theano_rng = make_theano_rng(theano_rng, 2012+10+14, which_method="binomial") + assert supervised in [True, False] + if toronto_neg: + self.negative_method = "TORONTO" + + def expr(self, model, data): + """ + The partition function makes this intractable. + """ + self.get_data_specs(model)[0].validate(data) + + return None + + def _get_positive_phase(self, model, X, Y=None): + """ + Get positive phase. + """ + return positive_phase(model, X, Y, supervised=self.supervised, + method=self.positive_method, + num_gibbs_steps=self.num_gibbs_steps, + theano_rng=self.theano_rng), OrderedDict() + + def _get_negative_phase(self, model, X, Y=None): + """ + .. todo:: + + WRITEME + + d/d theta log Z = (d/d theta Z) / Z + = (d/d theta sum_h sum_v exp(-E(v,h)) ) / Z + = (sum_h sum_v - exp(-E(v,h)) d/d theta E(v,h) ) / Z + = - sum_h sum_v P(v,h) d/d theta E(v,h) + """ + layer_to_chains = model.initialize_chains(X, Y, self.theano_rng) + updates, layer_to_chains = model.get_sampling_updates(layer_to_chains, + self.theano_rng, + num_steps=self.num_gibbs_steps, + return_layer_to_updated=True) + + neg_phase_grads = negative_phase(model, layer_to_chains, method=self.negative_method) + + return neg_phase_grads, updates + + def get_gradients(self, model, data, persistent=False): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + if self.supervised: + X, Y = data + assert Y is not None + else: + X = data + Y = None + + pos_phase_grads, pos_updates = self._get_positive_phase(model, X, Y) + neg_phase_grads, neg_updates = self._get_negative_phase(model, X, Y) + + updates = OrderedDict() + if persistent: + for key, val in pos_updates.items(): + updates[key] = val + for key, val in neg_updates.items(): + updates[key] = val + + gradients = OrderedDict() + for param in list(pos_phase_grads.keys()): + gradients[param] = neg_phase_grads[param] + pos_phase_grads[param] + return gradients, updates + + def get_monitoring_channels(self, model, data): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + rval = OrderedDict() + + if self.supervised: + X, Y = data + else: + X = data + Y = None + + history = model.mf(X, return_history = True) + q = history[-1] + + if self.supervised: + assert len(data) == 2 + Y_hat = q[-1] + true = T.argmax(Y, axis=1) + pred = T.argmax(Y_hat, axis=1) + + #true = Print('true')(true) + #pred = Print('pred')(pred) + + wrong = T.neq(true, pred) + err = T.cast(wrong.mean(), X.dtype) + rval['misclass'] = err + + if len(model.hidden_layers) > 1: + q = model.mf(X, Y=Y) + pen = model.hidden_layers[-2].upward_state(q[-2]) + Y_recons = model.hidden_layers[-1].mf_update(state_below=pen) + pred = T.argmax(Y_recons, axis=1) + wrong = T.neq(true, pred) + + rval['recons_misclass'] = T.cast(wrong.mean(), X.dtype) + + return rval + + +class VariationalCD(BaseCD): + """ + An intractable cost representing the negative log likelihood of a DBM. + The gradient of this bound is computed using a markov chain initialized + with the training example. + + Source: Hinton, G. 
Training Products of Experts by Minimizing + Contrastive Divergence + """ + + def __init__(self, num_gibbs_steps=2, supervised=False, + toronto_neg=False, theano_rng=None): + super(VariationalCD, self).__init__(num_gibbs_steps, + supervised=supervised, + toronto_neg=toronto_neg, + positive_method="VARIATIONAL", + negative_method="STANDARD") + + + +class MF_L1_ActCost(DefaultDataSpecsMixin, Cost): + """ + L1 activation cost on the mean field parameters. + + Adds a cost of: + + coeff * max( abs(mean_activation - target) - eps, 0) + + averaged over units + + for each layer. + + """ + + def __init__(self, targets, coeffs, eps, supervised): + """ + targets: a list, one element per layer, specifying the activation + each layer should be encouraged to have + each element may also be a list depending on the + structure of the layer. + See each layer's get_l1_act_cost for a specification of + what the state should be. + coeffs: a list, one element per layer, specifying the coefficient + to put on the L1 activation cost for each layer + supervised: If true, runs mean field on both X and Y, penalizing + the layers in between only + """ + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, ** kwargs): + + if self.supervised: + X, Y = data + H_hat = model.mf(X, Y= Y) + else: + X = data + H_hat = model.mf(X) + + hidden_layers = model.hidden_layers + if self.supervised: + hidden_layers = hidden_layers[:-1] + H_hat = H_hat[:-1] + + layer_costs = [] + for layer, mf_state, targets, coeffs, eps in \ + safe_zip(hidden_layers, H_hat, self.targets, self.coeffs, + self.eps): + cost = None + try: + cost = layer.get_l1_act_cost(mf_state, targets, coeffs, eps) + except NotImplementedError: + assert isinstance(coeffs, float) and coeffs == 0. + assert cost is None # if this gets triggered, there might + # have been a bug, where costs from lower layers got + # applied to higher layers that don't implement the cost + cost = None + if cost is not None: + layer_costs.append(cost) + + + assert T.scalar() != 0. # make sure theano semantics do what I want + layer_costs = [cost_ for cost_ in layer_costs if cost_ != 0.] + + if len(layer_costs) == 0: + return T.as_tensor_variable(0.) + else: + total_cost = reduce(lambda x, y: x + y, layer_costs) + total_cost.name = 'MF_L1_ActCost' + + assert total_cost.ndim == 0 + + return total_cost + +class MF_L2_ActCost(DefaultDataSpecsMixin, Cost): + """ + An L2 penalty on the amount that the hidden unit mean field parameters + deviate from desired target values. + + TODO: write up parameters list + """ + + def __init__(self, targets, coeffs, supervised=False): + targets = fix(targets) + coeffs = fix(coeffs) + + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, return_locals=False, **kwargs): + """ + .. todo:: + + WRITEME + + If returns locals is True, returns (objective, locals()) + Note that this means adding / removing / changing the value of + local variables is an interface change. 
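Concretely, for a single layer the two activation penalties (MF_L1_ActCost and MF_L2_ActCost) reduce to the NumPy expressions below; batch size, targets and coefficients are invented, and the real per-layer expressions come from each layer's get_l1_act_cost / get_l2_act_cost.

    import numpy as np

    H_hat = np.random.RandomState(0).rand(100, 50)   # mean field state: batch x units
    target, coeff, eps = 0.1, 1e-3, 0.01

    mean_act = H_hat.mean(axis=0)                    # per-unit mean activation

    # MF_L1_ActCost: coeff * max(|mean activation - target| - eps, 0), averaged over units
    l1_act_cost = coeff * np.maximum(np.abs(mean_act - target) - eps, 0.).mean()

    # MF_L2_ActCost: an L2 penalty on the same deviation
    l2_act_cost = coeff * ((mean_act - target) ** 2).mean()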
+ In particular, TorontoSparsity depends on "terms" and "H_hat" + """ + self.get_data_specs(model)[0].validate(data) + if self.supervised: + (X, Y) = data + else: + X = data + Y = None + + H_hat = model.mf(X, Y=Y) + + terms = [] + + hidden_layers = model.hidden_layers + #if self.supervised: + # hidden_layers = hidden_layers[:-1] + + for layer, mf_state, targets, coeffs in \ + safe_zip(hidden_layers, H_hat, self.targets, self.coeffs): + try: + cost = layer.get_l2_act_cost(mf_state, targets, coeffs) + except NotImplementedError: + if isinstance(coeffs, float) and coeffs == 0.: + cost = 0. + else: + raise + terms.append(cost) + + + objective = sum(terms) + + if return_locals: + return objective, locals() + return objective + + +class L2WeightDecay(NullDataSpecsMixin, Cost): + """ + A Cost that applies the following cost function: + + coeff * sum(sqr(weights)) + for each set of weights. + + Parameters + ---------- + coeffs : list + One element per layer, specifying the coefficient + to put on the L1 activation cost for each layer. + Each element may in turn be a list, ie, for CompositeLayers. + """ + + def __init__(self, coeffs): + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, ** kwargs): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + layer_costs = [ layer.get_weight_decay(coeff) + for layer, coeff in safe_izip(model.hidden_layers, self.coeffs) ] + + assert T.scalar() != 0. # make sure theano semantics do what I want + layer_costs = [ cost for cost in layer_costs if cost != 0.] + + if len(layer_costs) == 0: + rval = T.as_tensor_variable(0.) + rval.name = '0_weight_decay' + return rval + else: + total_cost = reduce(lambda x, y: x + y, layer_costs) + total_cost.name = 'DBM_WeightDecay' + + assert total_cost.ndim == 0 + + total_cost.name = 'weight_decay' + + return total_cost + + +class L1WeightDecay(NullDataSpecsMixin, Cost): + """ + A Cost that applies the following cost function: + + coeff * sum(abs(weights)) + for each set of weights. + + Parameters + ---------- + coeffs : list + One element per layer, specifying the coefficient + to put on the L1 activation cost for each layer. + Each element may in turn be a list, ie, for CompositeLayers. + """ + + def __init__(self, coeffs): + self.__dict__.update(locals()) + del self.self + + def expr(self, model, data, ** kwargs): + """ + .. todo:: + + WRITEME + """ + self.get_data_specs(model)[0].validate(data) + layer_costs = [ layer.get_l1_weight_decay(coeff) + for layer, coeff in safe_izip(model.hidden_layers, self.coeffs) ] + + assert T.scalar() != 0. # make sure theano semantics do what I want + layer_costs = [ cost for cost in layer_costs if cost != 0.] + + if len(layer_costs) == 0: + rval = T.as_tensor_variable(0.) + rval.name = '0_l1_weight_decay' + return rval + else: + total_cost = reduce(lambda x, y: x + y, layer_costs) + total_cost.name = 'DBM_L1WeightDecay' + + assert total_cost.ndim == 0 + + total_cost.name = 'l1_weight_decay' + + return total_cost \ No newline at end of file diff --git a/pylearn2/sandbox/dbm_v2/inference_procedure.py b/pylearn2/sandbox/dbm_v2/inference_procedure.py new file mode 100644 index 0000000000..d809ee12e4 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/inference_procedure.py @@ -0,0 +1,484 @@ +""" +Various InferenceProcedures for use with the DBM class. 
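Spelled out, the L2WeightDecay and L1WeightDecay expressions at the end of dbm_cost.py above reduce to the following NumPy sketch, with made-up weight shapes and coefficients (one weight matrix and one coefficient per hidden layer).

    import numpy as np

    rng = np.random.RandomState(0)
    weights = [rng.randn(20, 10), rng.randn(10, 5)]   # one weight matrix per hidden layer
    coeffs = [1e-4, 1e-4]                             # one coefficient per layer

    l2_decay = sum(c * np.sum(W ** 2) for W, c in zip(weights, coeffs))
    l1_decay = sum(c * np.sum(np.abs(W)) for W, c in zip(weights, coeffs))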
+""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin", "Devon Hjelm"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import functools +import logging + +from theano import gof +import theano.tensor as T +import theano +from theano.gof.op import get_debug_values + +from pylearn2.models.dbm import block, flatten +from pylearn2.models.dbm.layer import Softmax +from pylearn2.utils import safe_izip, block_gradient, safe_zip + + +logger = logging.getLogger(__name__) + + +class InferenceProcedure(object): + + """ + A class representing a procedure for performing mean field inference in a + DBM. + Different subclasses can implement different specific procedures, such as + updating the layers in different orders, or using different strategies to + initialize the mean field expectations. + """ + + def set_dbm(self, dbm): + """ + Associates the InferenceProcedure with a specific DBM. + + Parameters + ---------- + dbm : pylearn2.models.dbm.DBM instance + The model to perform inference in. + """ + self.dbm = dbm + + def mf(self, V, Y=None, return_history=False, niter=None, block_grad=None): + """ + Perform mean field inference. Subclasses must implement. + + Parameters + ---------- + V : Input space batch + The values of the input features modeled by the DBM. + Y : (Optional) Target space batch + The values of the labels modeled by the DBM. Must be omitted + if the DBM does not model labels. If the DBM does model + labels, they may be included to perform inference over the + hidden layers only, or included to perform inference over the + labels. + return_history : (Optional) bool + Default: False + If True, returns the full sequence of mean field updates. + niter : (Optional) int + block_grad : (Optional) int + Default: None + If not None, blocks the gradient after `block_grad` + iterations, so that only the last `niter` - `block_grad` + iterations need to be stored when using the backpropagation + algorithm. + + Returns + ------- + result : list + If not `return_history` (default), a list with one element + per inferred layer, containing the full mean field state + of that layer. + Otherwise, a list of such lists, with the outer list + containing one element for each step of inference. + """ + raise NotImplementedError(str(type(self)) + " does not implement mf.") + + def set_batch_size(self, batch_size): + """ + If the inference procedure is dependent on a batch size at all, makes + the necessary internal configurations to work with that batch size. + + Parameters + ---------- + batch_size : int + The number of examples in the batch + """ + # Default implementation is no-op, because default procedure does + # not depend on the batch size. + + def multi_infer(self, V, return_history=False, niter=None, + block_grad=None): + """ + Inference using "the multi-inference trick." See + "Multi-prediction deep Boltzmann machines", Goodfellow et al 2013. + + Subclasses may implement this method, however it is not needed for + any training algorithm, and only expected to work at evaluation + time if the model was trained with multi-prediction training. 
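To make the fixed-point iteration behind mf concrete, here is a small NumPy sketch of mean field in a two-hidden-layer binary DBM. All sizes and parameters are invented; in the real model each per-layer update is delegated to that layer's mf_update, and the variational parameters are initialized from the biases as in init_mf_state.

    import numpy as np

    def sigmoid(z):
        return 1. / (1. + np.exp(-z))

    rng = np.random.RandomState(0)
    nvis, nh1, nh2, niter = 6, 5, 4, 10
    W1, W2 = 0.1 * rng.randn(nvis, nh1), 0.1 * rng.randn(nh1, nh2)
    b1, b2 = np.zeros(nh1), np.zeros(nh2)
    V = (rng.rand(3, nvis) > 0.5).astype(float)       # batch of visibles

    # Initialize the variational parameters from the biases, then iterate
    # the coupled mean field updates to a fixed point.
    H1 = np.tile(sigmoid(b1), (len(V), 1))
    H2 = np.tile(sigmoid(b2), (len(V), 1))
    for _ in range(niter):
        H1 = sigmoid(V.dot(W1) + H2.dot(W2.T) + b1)   # uses both neighbouring layers
        H2 = sigmoid(H1.dot(W2) + b2)

    H_hat = [H1, H2]   # analogous to the list returned by mf()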
+ + Parameters + ---------- + V : input space batch + return_history : bool + If True, returns the complete history of the mean field + iterations, rather than just the final values + niter : int + The number of mean field iterations to run + block_grad : int + If not None, block the gradient after this number of iterations + + Returns + ------- + result : list + A list of mean field states, or if return_history is True, a + list of such lists with one element per mean field iteration + """ + + raise NotImplementedError(str(type(self)) + " does not implement" + " multi_infer.") + + def do_inpainting(self, V, Y=None, drop_mask=None, drop_mask_Y=None, + return_history=False, noise=False, niter=None, + block_grad=None): + """ + Does the inference required for multi-prediction training. + + If you use this method in your research work, please cite: + + Multi-prediction deep Boltzmann machines. Ian J. Goodfellow, + Mehdi Mirza, Aaron Courville, and Yoshua Bengio. NIPS 2013. + + + Gives the mean field expression for units masked out by drop_mask. + Uses self.niter mean field updates. + + Comes in two variants, unsupervised and supervised: + + * unsupervised: Y and drop_mask_Y are not passed to the method. The + method produces V_hat, an inpainted version of V + * supervised: Y and drop_mask_Y are passed to the method. The method + produces V_hat and Y_hat + + Parameters + ---------- + V : tensor_like + Theano batch in `model.input_space` + Y : tensor_like + Theano batch in `model.output_space`, i.e. in the output space of + the last hidden layer. (It's not really a hidden layer anymore, + but oh well. It's convenient to code it this way because the + labels are sort of "on top" of everything else.) *** Y is always + assumed to be a matrix of one-hot category labels. *** + drop_mask : tensor_like + Theano batch in `model.input_space`. Should be all binary, with + 1s indicating that the corresponding element of X should be + "dropped", i.e. hidden from the algorithm and filled in as part + of the inpainting process + drop_mask_Y : tensor_like + Theano vector. Since we assume Y is a one-hot matrix, each row is + a single categorical variable. `drop_mask_Y` is a binary mask + specifying which *rows* to drop. + return_history : bool, optional + WRITEME + noise : bool, optional + WRITEME + niter : int, optional + WRITEME + block_grad : WRITEME + + Returns + ------- + WRITEME + """ + + raise NotImplementedError(str(type(self)) + " does not implement " + "do_inpainting.") + + def is_rbm_compatible(self): + """ + Checks whether inference procedure is compatible with an RBM. + + A restricted Boltzmann machine (RBM) is a deep Boltzmann machine (DBM) + with exactly one hidden layer. Inference of the posterior is exactly + equivalent to one mean field update of the hidden units given the data. + An rbm compatible inference procedure should: + 1) calculate the posterior of the hidden units from the data as + defined by the joint probability P(v,h) = 1/Z e^E(v,h), where E(.) is + the energy over the graph and Z is the marginal. + 2) not involve cross terms between hidden units. + 3) not double or replicate weights. + 4) use exactly one mean field step. 
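Point (1) is easy to check numerically on a tiny model: for a binary RBM with energy E(v, h) = -c'v - b'h - v'Wh, a single mean field update sigmoid(vW + b) coincides with the exact conditional computed from the joint. A NumPy check with invented sizes and parameters:

    import numpy as np
    from itertools import product

    def sigmoid(z):
        return 1. / (1. + np.exp(-z))

    rng = np.random.RandomState(0)
    nvis, nhid = 3, 2
    W = rng.randn(nvis, nhid)
    c, b = rng.randn(nvis), rng.randn(nhid)
    v = np.array([1., 0., 1.])

    # Exact P(h_j = 1 | v): enumerate all hidden configurations of the
    # unnormalized joint p(v, h) ~ exp(v'Wh + c'v + b'h).
    hs = np.array(list(product([0., 1.], repeat=nhid)))
    unnorm = np.exp(hs.dot(W.T.dot(v) + b) + c.dot(v))
    exact = (hs * unnorm[:, None]).sum(axis=0) / unnorm.sum()

    # One mean field step, as an RBM-compatible procedure computes it.
    one_step = sigmoid(v.dot(W) + b)

    assert np.allclose(exact, one_step)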
+ """ + + raise NotImplementedError(str(type(self)) + " does not implement " + "is_rbm_compatible.") + + +class UpDown(InferenceProcedure): + + """ + An InferenceProcedure that initializes the mean field parameters + based on the biases in the model, then alternates between updating + each of the layers bottom-to-top + and updating each of the layers top-to-bottom. + """ + + @functools.wraps(InferenceProcedure.mf) + def mf(self, V, Y=None, return_history=False, niter=None, block_grad=None): + """ + .. todo:: + + WRITEME + """ + + dbm = self.dbm + + assert Y not in [True, False, 0, 1] + assert return_history in [True, False, 0, 1] + + if Y is not None: + dbm.hidden_layers[-1].get_output_space().validate(Y) + + if niter is None: + niter = dbm.niter + + H_hat = [None] + [layer.init_mf_state() + for layer in dbm.hidden_layers[1:]] + + # Make corrections for if we're also running inference on Y + if Y is not None: + # Last layer is clamped to Y + H_hat[-1] = Y + + history = [list(H_hat)] + + # we only need recurrent inference if there are multiple layers + assert (niter > 1) == (len(dbm.hidden_layers) > 1) + + for i in xrange(niter): + # Determine whether to go up or down on this iteration + if i % 2 == 0: + start = 0 + stop = len(H_hat) + inc = 1 + else: + start = len(H_hat) - 1 + stop = -1 + inc = -1 + # Do the mean field updates + for j in xrange(start, stop, inc): + if j == 0: + state_below = dbm.visible_layer.upward_state(V) + else: + state_below = dbm.hidden_layers[ + j - 1].upward_state(H_hat[j - 1]) + if j == len(H_hat) - 1: + state_above = None + layer_above = None + else: + state_above = dbm.hidden_layers[ + j + 1].downward_state(H_hat[j + 1]) + layer_above = dbm.hidden_layers[j + 1] + H_hat[j] = dbm.hidden_layers[j].mf_update( + state_below=state_below, + state_above=state_above, + layer_above=layer_above) + if Y is not None: + H_hat[-1] = Y + + if Y is not None: + H_hat[-1] = Y + + if block_grad == i + 1: + H_hat = block(H_hat) + + history.append(list(H_hat)) + # end for mf iter + + # Run some checks on the output + for layer, state in safe_izip(dbm.hidden_layers, H_hat): + upward_state = layer.upward_state(state) + layer.get_output_space().validate(upward_state) + if Y is not None: + assert all([elem[-1] is Y for elem in history]) + assert H_hat[-1] is Y + + if return_history: + return history + else: + return H_hat + + def do_inpainting(self, V, Y=None, drop_mask=None, drop_mask_Y=None, + return_history=False, noise=False, niter=None, + block_grad=None): + """ + .. todo:: + + WRITEME properly + + Gives the mean field expression for units masked out by drop_mask. + Uses self.niter mean field updates. + + Comes in two variants, unsupervised and supervised: + + * unsupervised: Y and drop_mask_Y are not passed to the method. The + method produces V_hat, an inpainted version of V. + * supervised: Y and drop_mask_Y are passed to the method. The method + produces V_hat and Y_hat. + + If you use this method in your research work, please cite: + + Multi-prediction deep Boltzmann machines. Ian J. Goodfellow, + Mehdi Mirza, Aaron Courville, and Yoshua Bengio. NIPS 2013. + + + Parameters + ---------- + V : tensor_like + Theano batch in `model.input_space` + Y : tensor_like + Theano batch in model.output_space, ie, in the output space of + the last hidden layer (it's not really a hidden layer anymore, + but oh well. It's convenient to code it this way because the + labels are sort of "on top" of everything else). *** Y is always + assumed to be a matrix of one-hot category labels. 
*** + drop_mask : tensor_like + A theano batch in `model.input_space`. Should be all binary, with + 1s indicating that the corresponding element of X should be + "dropped", ie, hidden from the algorithm and filled in as part of + the inpainting process + drop_mask_Y : tensor_like + Theano vector. Since we assume Y is a one-hot matrix, each row is + a single categorical variable. `drop_mask_Y` is a binary mask + specifying which *rows* to drop. + """ + + if Y is not None: + assert isinstance(self.hidden_layers[-1], Softmax) + + model = self.dbm + + """TODO: Should add unit test that calling this with a batch of + different inputs should yield the same output for each + if noise is False and drop_mask is all 1s""" + + if niter is None: + niter = model.niter + + assert drop_mask is not None + assert return_history in [True, False] + assert noise in [True, False] + if Y is None: + if drop_mask_Y is not None: + raise ValueError("do_inpainting got drop_mask_Y but not Y.") + else: + if drop_mask_Y is None: + raise ValueError("do_inpainting got Y but not drop_mask_Y.") + + if Y is not None: + assert isinstance(model.hidden_layers[-1], Softmax) + if drop_mask_Y.ndim != 1: + raise ValueError("do_inpainting assumes Y is a matrix of" + "one-hot labels," + "so each example is only one variable. " + "drop_mask_Y should " + "therefore be a vector, but we got " + "something with ndim " + + str(drop_mask_Y.ndim)) + drop_mask_Y = drop_mask_Y.dimshuffle(0, 'x') + + orig_V = V + orig_drop_mask = drop_mask + + history = [] + + V_hat, V_hat_unmasked = model.visible_layer.init_inpainting_state( + V, drop_mask, noise, return_unmasked=True) + assert V_hat_unmasked.ndim > 1 + + H_hat = [None] + [layer.init_mf_state() + for layer in model.hidden_layers[1:]] + + if Y is not None: + Y_hat_unmasked = model.hidden_layers[ + -1].init_inpainting_state(Y, noise) + Y_hat = drop_mask_Y * Y_hat_unmasked + (1 - drop_mask_Y) * Y + H_hat[-1] = Y_hat + + def update_history(): + assert V_hat_unmasked.ndim > 1 + d = {'V_hat': V_hat, 'H_hat': H_hat, + 'V_hat_unmasked': V_hat_unmasked} + if Y is not None: + d['Y_hat_unmasked'] = Y_hat_unmasked + d['Y_hat'] = H_hat[-1] + history.append(d) + + update_history() + + for i in xrange(niter): + + if i % 2 == 0: + start = 0 + stop = len(H_hat) + inc = 1 + if i > 0: + # Don't start by updating V_hat on iteration 0 or + # this will throw out the noise + V_hat, V_hat_unmasked = model.visible_layer.inpaint_update( + state_above=model.hidden_layers[0].downward_state( + H_hat[0]), + layer_above=model.hidden_layers[0], + V=V, + drop_mask=drop_mask, return_unmasked=True) + V_hat.name = 'V_hat[%d](V_hat = %s)' % (i, V_hat.name) + else: + start = len(H_hat) - 1 + stop = -1 + inc = -1 + for j in xrange(start, stop, inc): + if j == 0: + state_below = model.visible_layer.upward_state(V_hat) + else: + state_below = model.hidden_layers[ + j - 1].upward_state(H_hat[j - 1]) + if j == len(H_hat) - 1: + state_above = None + layer_above = None + else: + state_above = model.hidden_layers[ + j + 1].downward_state(H_hat[j + 1]) + layer_above = model.hidden_layers[j + 1] + H_hat[j] = model.hidden_layers[j].mf_update( + state_below=state_below, + state_above=state_above, + layer_above=layer_above) + if Y is not None and j == len(model.hidden_layers) - 1: + Y_hat_unmasked = H_hat[j] + H_hat[j] = drop_mask_Y * H_hat[j] + (1 - drop_mask_Y) * Y + + if i % 2 == 1: + V_hat, V_hat_unmasked = model.visible_layer.inpaint_update( + state_above=model.hidden_layers[0].downward_state( + H_hat[0]), + 
layer_above=model.hidden_layers[0], + V=V, + drop_mask=drop_mask, return_unmasked=True) + V_hat.name = 'V_hat[%d](V_hat = %s)' % (i, V_hat.name) + + if block_grad == i + 1: + V_hat = block_gradient(V_hat) + V_hat_unmasked = block_gradient(V_hat_unmasked) + H_hat = block(H_hat) + update_history() + # end for i + + # debugging, make sure V didn't get changed in this function + assert V is orig_V + assert drop_mask is orig_drop_mask + + Y_hat = H_hat[-1] + + assert V in theano.gof.graph.ancestors([V_hat]) + if Y is not None: + assert V in theano.gof.graph.ancestors([Y_hat]) + + if return_history: + return history + else: + if Y is not None: + return V_hat, Y_hat + return V_hat + + def is_rbm_compatible(self): + """ + Is implemented as UpDown is RBM compatible. + """ + return \ No newline at end of file diff --git a/pylearn2/sandbox/dbm_v2/ising.py b/pylearn2/sandbox/dbm_v2/ising.py new file mode 100644 index 0000000000..085f5b068a --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/ising.py @@ -0,0 +1,1864 @@ +""" +Implementation of a densely connected Ising model in the +pylearn2.models.dbm framework + +Notes +----- +If :math:`h` can be -1 or 1, and + +.. math:: + + p(h) = \exp(T\dot z \dot h), + +then the expected value of :math:`h` is given by + +.. math:: + + \\tanh(T \dot z), + +and the probability that :math:`h` is 1 is given by + +.. math:: + + \sigma(2T \dot z) +""" + +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import numpy as np + +from theano.compat.python2x import OrderedDict + +from theano import function +from theano.gof.op import get_debug_values +from theano.compile.sharedvalue import SharedVariable +import theano.tensor as T +import warnings + +from pylearn2.expr.nnet import sigmoid_numpy +from pylearn2.linear.matrixmul import MatrixMul +from pylearn2.models.dbm import init_sigmoid_bias_from_array +from pylearn2.models.dbm.layer import HiddenLayer, VisibleLayer +from pylearn2.space import Conv2DSpace +from pylearn2.space import VectorSpace +from pylearn2.utils import sharedX +from pylearn2.utils.rng import make_theano_rng + + +def init_tanh_bias_from_marginals(dataset, use_y=False): + """ + .. todo:: + + WRITEME + """ + if use_y: + X = dataset.y + else: + X = dataset.get_design_matrix() + if not (X.max() == 1): + raise ValueError("Expected design matrix to consist entirely " + "of 0s and 1s, but maximum value is "+str(X.max())) + assert X.min() == -1. + + mean = X.mean(axis=0) + + mean = np.clip(mean, 1e-7, 1-1e-7) + + init_bias = np.arctanh(mean) + + return init_bias + + +class IsingVisible(VisibleLayer): + """ + A DBM visible layer consisting of random variables living + in a `VectorSpace`, with values in {-1, 1}. + + Implements the energy function term :math:`-\mathbf{b}^T \mathbf{h}`. 
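The two identities quoted in the module docstring above can be verified numerically in a few lines (the particular values of T and z are arbitrary):

    import numpy as np

    def sigmoid(z):
        return 1. / (1. + np.exp(-z))

    T, z = 1.7, -0.3                          # arbitrary inverse temperature and input
    p_plus = np.exp(T * z) / (np.exp(T * z) + np.exp(-T * z))    # P(h = +1)

    assert np.isclose(p_plus, sigmoid(2. * T * z))               # sigmoid(2 T z)
    assert np.isclose(p_plus - (1. - p_plus), np.tanh(T * z))    # E[h] = tanh(T z)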
+ + Parameters + ---------- + nvis : int + The dimension of the space + beta : theano shared variable + Shared variable representing a multiplicative factor of the + energy function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered as a + learned parameter + bias_from_marginals : `pylearn2.datasets.dataset.Dataset`, optional + A dataset whose marginals are used to initialize the visible + biases + """ + + def __init__(self, nvis, beta, learn_beta=False, bias_from_marginals=None): + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared variable.") + self.__dict__.update(locals()) + del self.self + # Don't serialize the dataset + del self.bias_from_marginals + + self.space = VectorSpace(nvis) + self.input_space = self.space + + origin = self.space.get_origin() + + if bias_from_marginals is None: + init_bias = np.zeros((nvis,)) + else: + init_bias = init_tanh_bias_from_marginals(bias_from_marginals) + + self.bias = sharedX(init_bias, 'visible_bias') + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.bias.get_value() + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.bias.set_value(biases) + if recenter: + assert self.center + self.offset.set_value(sigmoid_numpy(self.bias.get_value())) + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + rval = [self.bias] + if self.learn_beta: + rval.append(self.beta) + return rval + + def mf_update(self, state_above, layer_above): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above) + + bias = self.bias + + z = msg + bias + + rval = T.tanh(self.beta * z) + + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + assert state_below is None + + msg = layer_above.downward_message(state_above) + + bias = self.bias + + z = msg + bias + + phi = T.nnet.sigmoid(2. * self.beta * z) + + rval = theano_rng.binomial(size=phi.shape, p=phi, dtype=phi.dtype, n=1) + + return rval * 2. - 1. + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.nvis)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.bias.get_value()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.b) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def expected_energy_term(self, state, average, state_below=None, + average_below=None): + """ + .. todo:: + + WRITEME + """ + assert state_below is None + assert average_below is None + assert average in [True, False] + self.space.validate(state) + + # Energy function is linear so it doesn't matter if we're averaging + # or not + rval = -(self.beta * T.dot(state, self.bias)) + + assert rval.ndim == 1 + + return rval + + +class IsingHidden(HiddenLayer): + """ + A hidden layer with :math:`\mathbf{h}` being a vector in {-1, 1}, + implementing the energy function term + + .. 
math:: + + -\mathbf{v}^T \mathbf{W}\mathbf{h} -\mathbf{b}^T \mathbf{h} + + where :math:`\mathbf{W}` and :math:`\mathbf{b}` are parameters of this + layer, and :math:`\mathbf{v}` is the upward state of the layer below. + + Parameters + ---------- + dim : WRITEME + layer_name : WRITEME + beta : theano shared variable + Shared variable representing a multiplicative factor of the energy + function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered as a + learned parameter + irange : WRITEME + sparse_init : WRITEME + sparse_stdev : WRITEME + include_prob : float, optional + Probability of including a weight element in the set of weights + initialized to U(-irange, irange). If not included it is + initialized to 0. + init_bias : WRITEME + W_lr_scale : WRITEME + b_lr_scale : WRITEME + max_col_norm : WRITEME + """ + + def __init__(self, + dim, + layer_name, + beta, + learn_beta=False, + irange=None, + sparse_init=None, + sparse_stdev=1., + include_prob=1.0, + init_bias=0., + W_lr_scale=None, + b_lr_scale=None, + max_col_norm=None): + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared variable.") + self.__dict__.update(locals()) + del self.self + + self.b = sharedX(np.zeros((self.dim,)) + init_bias, + name=layer_name + '_b') + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + rval = OrderedDict() + + if self.W_lr_scale is not None: + W, = self.transformer.get_params() + rval[W] = self.W_lr_scale + + if self.b_lr_scale is not None: + rval[self.b] = self.b_lr_scale + + return rval + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME properly + + Notes + ----- + Note: this resets parameters! + """ + + self.input_space = space + + if isinstance(space, VectorSpace): + self.requires_reformat = False + self.input_dim = space.dim + else: + self.requires_reformat = True + self.input_dim = space.get_total_dimension() + self.desired_space = VectorSpace(self.input_dim) + + self.output_space = VectorSpace(self.dim) + + rng = self.dbm.rng + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange, self.irange, + (self.input_dim, self.dim)) * \ + (rng.uniform(0., 1., (self.input_dim, self.dim)) + < self.include_prob) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.dim)) + W *= self.sparse_stdev + + W = sharedX(W) + W.name = self.layer_name + '_W' + + self.transformer = MatrixMul(W) + + W, = self.transformer.get_params() + assert W.name is not None + + def _modify_updates(self, updates): + """ + .. todo:: + + WRITEME + """ + + if self.max_col_norm is not None: + W, = self.transformer.get_params() + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return VectorSpace(self.dim) + + def get_params(self): + """ + .. 
todo:: + + WRITEME + """ + assert self.b.name is not None + W, = self.transformer.get_params() + assert W.name is not None + rval = self.transformer.get_params() + assert not isinstance(rval, set) + rval = list(rval) + assert self.b not in rval + rval.append(self.b) + if self.learn_beta: + rval.append(self.beta) + return rval + + def get_weight_decay(self, coeff): + """ + .. todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + W, = self.transformer.get_params() + return coeff * T.sqr(W).sum() + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if self.requires_reformat: + # This is not really an unimplemented case. + # We actually don't know how to format the weights + # in design space. We got the data in topo space + # and we don't have access to the dataset + raise NotImplementedError() + W, = self.transformer.get_params() + return W.get_value() + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + W, = self.transformer.get_params() + W.set_value(weights) + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + if self.pool_size != 1: + raise NotImplementedError() + self.offset.set_value(sigmoid_numpy(self.b.get_value())) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + + W, = self.transformer.get_params() + + W = W.T + + W = W.reshape( + (self.detector_layer_dim, self.input_space.shape[0], + self.input_space.shape[1], self.input_space.nchannels) + ) + + W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) + + return function([], W)() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + + W, = self.transformer.get_params() + + assert W.ndim == 2 + + sq_W = T.sqr(W) + + row_norms = T.sqrt(sq_W.sum(axis=1)) + col_norms = T.sqrt(sq_W.sum(axis=0)) + + return OrderedDict([ + ('row_norms_min', row_norms.min()), + ('row_norms_mean', row_norms.mean()), + ('row_norms_max', row_norms.max()), + ('col_norms_min', col_norms.min()), + ('col_norms_mean', col_norms.mean()), + ('col_norms_max', col_norms.max()), + ]) + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P = state + + rval = OrderedDict() + + vars_and_prefixes = [(P, '')] + + for var, prefix in vars_and_prefixes: + v_max = var.max(axis=0) + v_min = var.min(axis=0) + v_mean = var.mean(axis=0) + v_range = v_max - v_min + + # max_x.mean_u is "the mean over *u*nits of the max over + # e*x*amples". 
The x and u are included in the name because + # otherwise its hard to remember which axis is which when reading + # the monitor I use inner.outer rather than outer_of_inner or + # something like that because I want mean_x.* to appear next to + # each other in the alphabetical list, as these are commonly + # plotted together + for key, val in [ + ('max_x.max_u', v_max.max()), + ('max_x.mean_u', v_max.mean()), + ('max_x.min_u', v_max.min()), + ('min_x.max_u', v_min.max()), + ('min_x.mean_u', v_min.mean()), + ('min_x.min_u', v_min.min()), + ('range_x.max_u', v_range.max()), + ('range_x.mean_u', v_range.mean()), + ('range_x.min_u', v_range.min()), + ('mean_x.max_u', v_mean.max()), + ('mean_x.mean_u', v_mean.mean()), + ('mean_x.min_u', v_mean.min()), + ]: + rval[prefix+key] = val + + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to " + + "None so that it may appear after layer_above " + + "/ state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, + self.desired_space) + + z = self.transformer.lmul(state_below) + self.b + + if msg is not None: + z = z + msg + + on_prob = T.nnet.sigmoid(2. * self.beta * z) + + samples = theano_rng.binomial(p=on_prob, n=1, size=on_prob.shape, + dtype=on_prob.dtype) * 2. - 1. + + return samples + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + rval = self.transformer.lmul_T(downward_state) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, + self.dim).astype(self.b.dtype) + \ + self.b.dimshuffle('x', 0) + rval = T.tanh(self.beta * z) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.dim)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.b.get_value()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.b) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + # state = Print('h_state', attrs=['min', 'max'])(state) + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + # Energy function is linear so it doesn't matter if we're averaging or + # not. 
Specifically, our terms are -u^T W d - b^T d where u is the + # upward state of layer below and d is the downward state of this layer + + bias_term = T.dot(state, self.b) + weights_term = (self.transformer.lmul(state_below) * state).sum(axis=1) + + rval = -bias_term - weights_term + + rval *= self.beta + + assert rval.ndim == 1 + + return rval + + def linear_feed_forward_approximation(self, state_below): + """ + .. todo:: + + WRITEME properly + + Used to implement TorontoSparsity. Unclear exactly what properties of + it are important or how to implement it for other layers. + + Properties it must have: + output is same kind of data structure (ie, tuple of theano + 2-tensors) as mf_update + + Properties it probably should have for other layer types: + An infinitesimal change in state_below or the parameters should + cause the same sign of change in the output of + linear_feed_forward_approximation and in mf_update + + Should not have any non-linearities that cause the gradient to + shrink + + Should disregard top-down feedback + """ + + z = self.beta * (self.transformer.lmul(state_below) + self.b) + + if self.pool_size != 1: + # Should probably implement sum pooling for the non-pooled version, + # but in reality it's not totally clear what the right answer is + raise NotImplementedError() + + return z, z + + def mf_update(self, state_below, state_above, layer_above=None, + double_weights=False, iter_name=None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_' + layer_above.layer_name + '_to_' + \ + self.layer_name + '[' + iter_name + ']' + else: + msg = None + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.b + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + if msg is not None: + z = z + msg + h = T.tanh(self.beta * z) + + return h + + +class BoltzmannIsingVisible(VisibleLayer): + """ + An IsingVisible whose parameters are defined in Boltzmann machine space. + + Notes + ----- + All parameter noise/clipping is handled by BoltzmannIsingHidden. + + .. 
todo:: + + WRITEME properly + + Parameters + ---------- + nvis : int + Number of visible units + beta : theano shared variable + Shared variable representing a multiplicative factor of the energy + function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered + as a learned parameter + bias_from_marginals : `pylearn2.datasets.dataset.Dataset`, optional + A dataset whose marginals are used to initialize the visible + biases + sampling_b_stdev : WRITEME + min_ising_b : WRITEME + max_ising_b : WRITEME + """ + + def __init__(self, nvis, beta, learn_beta=False, bias_from_marginals=None, + sampling_b_stdev=None, min_ising_b=None, max_ising_b=None): + + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared " + + "variable.") + self.__dict__.update(locals()) + del self.self + # Don't serialize the dataset + del self.bias_from_marginals + + self.space = VectorSpace(nvis) + self.input_space = self.space + + if bias_from_marginals is None: + init_bias = np.zeros((nvis,)) + else: + # data is in [-1, 1], but want biases for a sigmoid + init_bias = \ + init_sigmoid_bias_from_array(bias_from_marginals.X / 2. + 0.5) + # init_bias = + self.boltzmann_bias = sharedX(init_bias, 'visible_bias') + + self.resample_fn = None + + def finalize_initialization(self): + """ + .. todo:: + + WRITEME + """ + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.layer_above.dbm.batch_size, self.nvis))) + + updates = OrderedDict() + updates[self.boltzmann_bias] = self.boltzmann_bias + updates[self.layer_above.W] = self.layer_above.W + self.enforce_constraints() + + def _modify_updates(self, updates): + """ + .. todo:: + + WRITEME + """ + beta = self.beta + if beta in updates: + updated_beta = updates[beta] + updates[beta] = T.clip(updated_beta, 1., 1000.) + + if any(constraint is not None for constraint in [self.min_ising_b, + self.max_ising_b]): + bmn = self.min_ising_b + if bmn is None: + bmn = - 1e6 + bmx = self.max_ising_b + if bmx is None: + bmx = 1e6 + wmn_above = self.layer_above.min_ising_W + if wmn_above is None: + wmn_above = - 1e6 + wmx_above = self.layer_above.max_ising_W + if wmx_above is None: + wmx_above = 1e6 + + b = updates[self.boltzmann_bias] + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W_above.sum(axis=1) + ising_b = T.clip(ising_b, bmn, bmx) + + ising_W_above = 0.25 * W_above + ising_W_above = T.clip(ising_W_above, wmn_above, wmx_above) + bhn = 2. * (ising_b - ising_W_above.sum(axis=1)) + + updates[self.boltzmann_bias] = bhn + + if self.noisy_sampling_b is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + b = updates[self.boltzmann_bias] + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W_above.sum(axis=1) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + def resample_bias_noise(self, batch_size_changed=False): + """ + .. 
todo:: + + WRITEME + """ + if batch_size_changed: + self.resample_fn = None + + if self.resample_fn is None: + updates = OrderedDict() + + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.dbm.batch_size, self.nvis))) + + if self.noisy_sampling_b is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + b = self.boltzmann_bias + W_above = self.layer_above.W + ising_b = 0.5 * b + 0.25 * W_above.sum(axis=1) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + self.resample_fn = function([], updates=updates) + + self.resample_fn() + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingVisible.get_biases returns the " + + "BOLTZMANN biases, is that what we want?") + return self.boltzmann_bias.get_value() + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + assert False # not really sure what this should do for this layer + + def ising_bias(self, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + if for_sampling and self.layer_above.sampling_b_stdev is not None: + return self.noisy_sampling_b + return \ + 0.5 * self.boltzmann_bias + 0.25 * self.layer_above.W.sum(axis=1) + + def ising_bias_numpy(self): + """ + .. todo:: + + WRITEME + """ + return 0.5 * self.boltzmann_bias.get_value() + \ + 0.25 * self.layer_above.W.get_value().sum(axis=1) + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + rval = [self.boltzmann_bias] + if self.learn_beta: + rval.append(self.beta) + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + assert state_below is None + + msg = layer_above.downward_message(state_above, for_sampling=True) + + bias = self.ising_bias(for_sampling=True) + + z = msg + bias + + phi = T.nnet.sigmoid(2. * self.beta * z) + + rval = theano_rng.binomial(size=phi.shape, p=phi, dtype=phi.dtype, n=1) + + return rval * 2. - 1. + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.nvis)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.ising_bias_numpy()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.ising_bias()) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def mf_update(self, state_above, layer_above): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above, for_sampling=True) + + bias = self.ising_bias(for_sampling=True) + + z = msg + bias + + rval = T.tanh(self.beta * z) + + return rval + + def expected_energy_term(self, state, average, state_below=None, + average_below=None): + """ + .. 
todo:: + + WRITEME + """ + + # state = Print('v_state', attrs=['min', 'max'])(state) + + assert state_below is None + assert average_below is None + assert average in [True, False] + self.space.validate(state) + + # Energy function is linear so it doesn't matter if we're averaging + # or not + rval = -(self.beta * T.dot(state, self.ising_bias())) + + assert rval.ndim == 1 + + return rval + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + ising_b = self.ising_bias() + + rval['ising_b_min'] = ising_b.min() + rval['ising_b_max'] = ising_b.max() + rval['beta'] = self.beta + + if hasattr(self, 'noisy_sampling_b'): + rval['noisy_sampling_b_min'] = self.noisy_sampling_b.min() + rval['noisy_sampling_b_max'] = self.noisy_sampling_b.max() + + return rval + + +class BoltzmannIsingHidden(HiddenLayer): + """ + An IsingHidden whose parameters are defined in Boltzmann machine space. + + .. todo:: + + WRITEME properly + + Parameters + ---------- + dim : WRITEME + layer_name : WRITEME + layer_below : WRITEME + beta : theano shared variable + Shared variable representing a multiplicative factor of the energy + function (the inverse temperature) + learn_beta : boolean, optional + Whether or not the inverse temperature should be considered as a + learned parameter + irange : WRITEME + sparse_init : WRITEME + sparse_stdev : WRITEME + include_prob : WRITEME + init_bias : WRITEME + W_lr_scale : WRITEME + b_lr_scale : WRITEME + beta_lr_scale : WRITEME + max_col_norm : WRITEME + min_ising_b : WRITEME + max_ising_b : WRITEME + min_ising_W : WRITEME + max_ising_W : WRITEME + sampling_W_stdev : WRITEME + sampling_b_stdev : WRITEME + """ + def __init__(self, + dim, + layer_name, + layer_below, + beta, + learn_beta=False, + irange=None, + sparse_init=None, + sparse_stdev=1., + include_prob=1.0, + init_bias=0., + W_lr_scale=None, + b_lr_scale=None, + beta_lr_scale=None, + max_col_norm=None, + min_ising_b=None, + max_ising_b=None, + min_ising_W=None, + max_ising_W=None, + sampling_W_stdev=None, + sampling_b_stdev=None): + if not isinstance(beta, SharedVariable): + raise ValueError("beta needs to be a theano shared variable.") + self.__dict__.update(locals()) + del self.self + + layer_below.layer_above = self + self.layer_above = None + self.resample_fn = None + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + if not hasattr(self, 'beta_lr_scale'): + self.beta_lr_scale = None + + rval = OrderedDict() + + if self.W_lr_scale is not None: + W = self.W + rval[W] = self.W_lr_scale + + if self.b_lr_scale is not None: + rval[self.boltzmann_b] = self.b_lr_scale + + if self.beta_lr_scale is not None: + rval[self.beta] = self.beta_lr_scale + + return rval + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME properly + + Note: this resets parameters! 
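The Boltzmann-to-Ising reparameterization these layers rely on (Ising weights are a quarter of the Boltzmann weights, and Ising biases are half the Boltzmann bias plus a quarter of the incoming weight sums, cf. ising_bias above) can be checked by brute force: converting {0, 1} states to {-1, +1} states changes the energy of every joint configuration by the same constant. A NumPy sketch on an invented single pair of layers:

    import numpy as np
    from itertools import product

    rng = np.random.RandomState(0)
    nvis, nhid = 3, 2
    W = rng.randn(nvis, nhid)                  # Boltzmann weights
    a, b = rng.randn(nvis), rng.randn(nhid)    # Boltzmann visible / hidden biases

    # Ising-space parameters for a single visible-hidden pair (no layer above).
    W_i = 0.25 * W
    a_i = 0.5 * a + 0.25 * W.sum(axis=1)
    b_i = 0.5 * b + 0.25 * W.sum(axis=0)

    def E_boltzmann(v, h):                     # v, h in {0, 1}
        return -a.dot(v) - b.dot(h) - v.dot(W).dot(h)

    def E_ising(x, y):                         # x, y in {-1, +1}
        return -a_i.dot(x) - b_i.dot(y) - x.dot(W_i).dot(y)

    diffs = [E_boltzmann(np.array(v), np.array(h))
             - E_ising(2. * np.array(v) - 1., 2. * np.array(h) - 1.)
             for v in product([0., 1.], repeat=nvis)
             for h in product([0., 1.], repeat=nhid)]

    # Same offset for every state, so both parameterizations define the
    # same Boltzmann distribution.
    assert np.allclose(diffs, diffs[0])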
+ """ + + self.input_space = space + + if isinstance(space, VectorSpace): + self.requires_reformat = False + self.input_dim = space.dim + else: + self.requires_reformat = True + self.input_dim = space.get_total_dimension() + self.desired_space = VectorSpace(self.input_dim) + + self.output_space = VectorSpace(self.dim) + + rng = self.dbm.rng + + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange, self.irange, + (self.input_dim, self.dim)) * \ + (rng.uniform(0., 1., (self.input_dim, self.dim)) + < self.include_prob) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.dim)) + W *= self.sparse_stdev + W = sharedX(W) + W.name = self.layer_name + '_W' + self.W = W + + self.boltzmann_b = sharedX(np.zeros((self.dim,)) + self.init_bias, + name=self.layer_name + '_b') + + def finalize_initialization(self): + """ + .. todo:: + + WRITEME + """ + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.dbm.batch_size, self.dim))) + if self.sampling_W_stdev is not None: + self.noisy_sampling_W = \ + sharedX(np.zeros((self.input_dim, self.dim)), + 'noisy_sampling_W') + + updates = OrderedDict() + updates[self.boltzmann_b] = self.boltzmann_b + updates[self.W] = self.W + if self.layer_above is not None: + updates[self.layer_above.W] = self.layer_above.W + self.enforce_constraints() + + def _modify_updates(self, updates): + """ + .. todo:: + + WRITEME + """ + beta = self.beta + if beta in updates: + updated_beta = updates[beta] + updates[beta] = T.clip(updated_beta, 1., 1000.) + + if self.max_col_norm is not None: + W = self.W + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + if any(constraint is not None for constraint in [self.min_ising_b, + self.max_ising_b, + self.min_ising_W, + self.max_ising_W]): + bmn = self.min_ising_b + if bmn is None: + bmn = - 1e6 + bmx = self.max_ising_b + if bmx is None: + bmx = 1e6 + wmn = self.min_ising_W + if wmn is None: + wmn = - 1e6 + wmx = self.max_ising_W + if wmx is None: + wmx = 1e6 + if self.layer_above is not None: + wmn_above = self.layer_above.min_ising_W + if wmn_above is None: + wmn_above = - 1e6 + wmx_above = self.layer_above.max_ising_W + if wmx_above is None: + wmx_above = 1e6 + + W = updates[self.W] + ising_W = 0.25 * W + ising_W = T.clip(ising_W, wmn, wmx) + + b = updates[self.boltzmann_b] + if self.layer_above is not None: + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) \ + + 0.25 * W_above.sum(axis=1) + else: + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) + ising_b = T.clip(ising_b, bmn, bmx) + + if self.layer_above is not None: + ising_W_above = 0.25 * W_above + ising_W_above = T.clip(ising_W_above, wmn_above, wmx_above) + bhn = 2. * (ising_b - ising_W.sum(axis=0) + - ising_W_above.sum(axis=1)) + else: + bhn = 2. * (ising_b - ising_W.sum(axis=0)) + Wn = 4. 
* ising_W + + updates[self.W] = Wn + updates[self.boltzmann_b] = bhn + + if self.noisy_sampling_W is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + W = updates[self.W] + ising_W = 0.25 * W + + noisy_sampling_W = \ + theano_rng.normal(avg=ising_W, std=self.sampling_W_stdev, + size=ising_W.shape, dtype=ising_W.dtype) + updates[self.noisy_sampling_W] = noisy_sampling_W + + b = updates[self.boltzmann_b] + if self.layer_above is not None: + W_above = updates[self.layer_above.W] + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) \ + + 0.25 * W_above.sum(axis=1) + else: + ising_b = 0.5 * b + 0.25 * W.sum(axis=0) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + def resample_bias_noise(self, batch_size_changed=False): + """ + .. todo:: + + WRITEME + """ + if batch_size_changed: + self.resample_fn = None + + if self.resample_fn is None: + updates = OrderedDict() + + if self.sampling_b_stdev is not None: + self.noisy_sampling_b = \ + sharedX(np.zeros((self.dbm.batch_size, self.dim))) + + if self.noisy_sampling_b is not None: + theano_rng = make_theano_rng(None, self.dbm.rng.randint(2**16), which_method="normal") + + b = self.boltzmann_b + if self.layer_above is not None: + W_above = self.layer_above.W + ising_b = 0.5 * b + 0.25 * self.W.sum(axis=0) \ + + 0.25 * W_above.sum(axis=1) + else: + ising_b = 0.5 * b + 0.25 * self.W.sum(axis=0) + + noisy_sampling_b = \ + theano_rng.normal(avg=ising_b.dimshuffle('x', 0), + std=self.sampling_b_stdev, + size=self.noisy_sampling_b.shape, + dtype=ising_b.dtype) + updates[self.noisy_sampling_b] = noisy_sampling_b + + self.resample_fn = function([], updates=updates) + + self.resample_fn() + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return VectorSpace(self.dim) + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.boltzmann_b.name is not None + W = self.W + assert W.name is not None + rval = [W] + assert not isinstance(rval, set) + rval = list(rval) + assert self.boltzmann_b not in rval + rval.append(self.boltzmann_b) + if self.learn_beta: + rval.append(self.beta) + return rval + + def ising_weights(self, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'sampling_W_stdev'): + self.sampling_W_stdev = None + if for_sampling and self.sampling_W_stdev is not None: + return self.noisy_sampling_W + return 0.25 * self.W + + def ising_b(self, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'sampling_b_stdev'): + self.sampling_b_stdev = None + if for_sampling and self.sampling_b_stdev is not None: + return self.noisy_sampling_b + else: + if self.layer_above is not None: + return 0.5 * self.boltzmann_b + \ + 0.25 * self.W.sum(axis=0) + \ + 0.25 * self.layer_above.W.sum(axis=1) + else: + return 0.5 * self.boltzmann_b + 0.25 * self.W.sum(axis=0) + + def ising_b_numpy(self): + """ + .. todo:: + + WRITEME + """ + if self.layer_above is not None: + return 0.5 * self.boltzmann_b.get_value() + \ + 0.25 * self.W.get_value().sum(axis=0) + \ + 0.25 * self.layer_above.W.get_value().sum(axis=1) + else: + return 0.5 * self.boltzmann_b.get_value() + \ + 0.25 * self.W.get_value().sum(axis=0) + + def get_weight_decay(self, coeff): + """ + .. 
todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + W = self.W + return coeff * T.sqr(W).sum() + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.get_weights returns the " + + "BOLTZMANN weights, is that what we want?") + W = self.W + return W.get_value() + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.set_weights sets the BOLTZMANN " + + "weights, is that what we want?") + W = self.W + W.set_value(weights) + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.boltzmann_b.set_value(biases) + assert not recenter # not really sure what this should do if True + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.get_biases returns the " + + "BOLTZMANN biases, is that what we want?") + return self.boltzmann_b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + warnings.warn("BoltzmannIsingHidden.get_weights_topo returns the " + + "BOLTZMANN weights, is that what we want?") + + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + + W = self.W + + W = W.T + + W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], + self.input_space.shape[1], self.input_space.nchannels)) + + W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) + + return function([], W)() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + + W = self.W + + assert W.ndim == 2 + + sq_W = T.sqr(W) + + row_norms = T.sqrt(sq_W.sum(axis=1)) + col_norms = T.sqrt(sq_W.sum(axis=0)) + + rval = OrderedDict([ + ('boltzmann_row_norms_min', row_norms.min()), + ('boltzmann_row_norms_mean', row_norms.mean()), + ('boltzmann_row_norms_max', row_norms.max()), + ('boltzmann_col_norms_min', col_norms.min()), + ('boltzmann_col_norms_mean', col_norms.mean()), + ('boltzmann_col_norms_max', col_norms.max()), + ]) + + ising_W = self.ising_weights() + + rval['ising_W_min'] = ising_W.min() + rval['ising_W_max'] = ising_W.max() + + ising_b = self.ising_b() + + rval['ising_b_min'] = ising_b.min() + rval['ising_b_max'] = ising_b.max() + + if hasattr(self, 'noisy_sampling_W'): + rval['noisy_sampling_W_min'] = self.noisy_sampling_W.min() + rval['noisy_sampling_W_max'] = self.noisy_sampling_W.max() + rval['noisy_sampling_b_min'] = self.noisy_sampling_b.min() + rval['noisy_sampling_b_max'] = self.noisy_sampling_b.max() + + return rval + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P = state + + rval = OrderedDict() + + vars_and_prefixes = [(P, '')] + + for var, prefix in vars_and_prefixes: + v_max = var.max(axis=0) + v_min = var.min(axis=0) + v_mean = var.mean(axis=0) + v_range = v_max - v_min + + # max_x.mean_u is "the mean over *u*nits of the max over + # e*x*amples". 
The x and u are included in the name because + # otherwise its hard to remember which axis is which when reading + # the monitor I use inner.outer rather than outer_of_inner or + # something like that because I want mean_x.* to appear next to + # each other in the alphabetical list, as these are commonly + # plotted together + for key, val in [ + ('max_x.max_u', v_max.max()), + ('max_x.mean_u', v_max.mean()), + ('max_x.min_u', v_max.min()), + ('min_x.max_u', v_min.max()), + ('min_x.mean_u', v_min.mean()), + ('min_x.min_u', v_min.min()), + ('range_x.max_u', v_range.max()), + ('range_x.mean_u', v_range.mean()), + ('range_x.min_u', v_range.min()), + ('mean_x.max_u', v_mean.max()), + ('mean_x.mean_u', v_mean.mean()), + ('mean_x.min_u', v_mean.min()) + ]: + rval[prefix+key] = val + + return rval + + def sample(self, state_below=None, state_above=None, layer_above=None, + theano_rng=None): + """ + .. todo:: + + WRITEME + """ + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to " + + "None so that it may appear after layer_above " + + "/ state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above, for_sampling=True) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, + self.desired_space) + + z = T.dot(state_below, self.ising_weights(for_sampling=True)) + \ + self.ising_b(for_sampling=True) + + if msg is not None: + z = z + msg + + on_prob = T.nnet.sigmoid(2. * self.beta * z) + + samples = theano_rng.binomial(p=on_prob, n=1, size=on_prob.shape, + dtype=on_prob.dtype) * 2. - 1. + + return samples + + def downward_message(self, downward_state, for_sampling=False): + """ + .. todo:: + + WRITEME + """ + rval = T.dot(downward_state, + self.ising_weights(for_sampling=for_sampling).T) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, + self.dim).astype(self.boltzmann_b.dtype) + \ + self.ising_b().dimshuffle('x', 0) + rval = T.tanh(self.beta * z) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + driver = numpy_rng.uniform(0., 1., (num_examples, self.dim)) + on_prob = sigmoid_numpy(2. * self.beta.get_value() * + self.ising_b_numpy()) + sample = 2. * (driver < on_prob) - 1. + + rval = sharedX(sample, name='v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + mean = T.nnet.sigmoid(2. * self.beta * self.ising_b()) + rval = theano_rng.binomial(size=(num_examples, self.dim), p=mean) + rval = 2. * (rval) - 1. + + return rval + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. 
todo:: + + WRITEME + """ + + # state = Print('h_state', attrs=['min', 'max'])(state) + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + # Energy function is linear so it doesn't matter if we're averaging or + # not. Specifically, our terms are -u^T W d - b^T d where u is the + # upward state of layer below and d is the downward state of this layer + + bias_term = T.dot(state, self.ising_b()) + weights_term = \ + (T.dot(state_below, self.ising_weights()) * state).sum(axis=1) + + rval = -bias_term - weights_term + + rval *= self.beta + + assert rval.ndim == 1 + + return rval + + def linear_feed_forward_approximation(self, state_below): + """ + .. todo:: + + WRITEME properly + + Used to implement TorontoSparsity. Unclear exactly what properties of + it are important or how to implement it for other layers. + + Properties it must have: + output is same kind of data structure (ie, tuple of theano + 2-tensors) as mf_update + + Properties it probably should have for other layer types: + An infinitesimal change in state_below or the parameters should + cause the same sign of change in the output of + linear_feed_forward_approximation and in mf_update + + Should not have any non-linearities that cause the gradient to + shrink + + Should disregard top-down feedback + """ + + z = self.beta * (T.dot(state_below, self.ising_weights()) + self.ising_b()) + + return z + + def mf_update(self, state_below, state_above, layer_above=None, + double_weights=False, iter_name=None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got " + + "shape of %d" % (self.dbm.batch_size, + sb.shape[0])) + assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, + self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_' + layer_above.layer_name + '_to_' + \ + self.layer_name + '[' + iter_name+']' + else: + msg = None + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = T.dot(state_below, self.ising_weights()) + self.ising_b() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + if msg is not None: + z = z + msg + h = T.tanh(self.beta * z) + + return h + + def get_l2_act_cost(self, state, target, coeff): + """ + .. 
todo:: + + WRITEME + """ + avg = state.mean(axis=0) + diff = avg - target + return coeff * T.sqr(diff).mean() diff --git a/pylearn2/sandbox/dbm_v2/layer.py b/pylearn2/sandbox/dbm_v2/layer.py new file mode 100644 index 0000000000..5bc2c50b13 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/layer.py @@ -0,0 +1,4124 @@ +""" +Common DBM Layer classes +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import functools +import logging +import numpy as np +import time +import warnings + +from theano import tensor as T, function, config +import theano +from theano.compat import OrderedDict +from theano.gof.op import get_debug_values +from theano.printing import Print + +from pylearn2.expr.nnet import sigmoid_numpy +from pylearn2.expr.probabilistic_max_pooling import max_pool_channels, max_pool_b01c, max_pool, max_pool_c01b +from pylearn2.linear.conv2d import make_random_conv2D, make_sparse_random_conv2D +from pylearn2.linear.conv2d_c01b import setup_detector_layer_c01b +from pylearn2.linear.matrixmul import MatrixMul +from pylearn2.models import Model +from pylearn2.sandbox.dbm_v2 import init_sigmoid_bias_from_marginals +from pylearn2.space import VectorSpace, CompositeSpace, Conv2DSpace, Space +from pylearn2.utils import is_block_gradient +from pylearn2.utils import sharedX, safe_zip, py_integer_types, block_gradient +from pylearn2.utils.exc import reraise_as +from pylearn2.utils.rng import make_theano_rng +from pylearn2.utils import safe_union + + +logger = logging.getLogger(__name__) + + +class Layer(Model): + """ + Abstract class. + A layer of a DBM. + May only belong to one DBM. + + Each layer has a state ("total state") that can be split into + the piece that is visible to the layer above ("upward state") + and the piece that is visible to the layer below ("downward state"). + (Since visible layers don't have a downward state, the downward_state + method only appears in the DBM_HiddenLayer subclass) + + For simple layers, all three of these are the same thing. + """ + + def get_dbm(self): + """ + Returns the DBM that this layer belongs to, or None + if it has not been assigned to a DBM yet. + """ + + if hasattr(self, 'dbm'): + return self.dbm + + return None + + def set_dbm(self, dbm): + """ + Assigns this layer to a DBM. + + Parameters + ---------- + dbm : WRITEME + """ + assert self.get_dbm() is None + self.dbm = dbm + + def get_total_state_space(self): + """ + Returns the Space that the layer's total state lives in. + """ + raise NotImplementedError(str(type(self))+" does not implement " +\ + "get_total_state_space()") + + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + return OrderedDict() + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + return OrderedDict() + + def upward_state(self, total_state): + """ + Takes total_state and turns it into the state that layer_above should + see when computing P( layer_above | this_layer). + + So far this has two uses: + + * If this layer consists of a detector sub-layer h that is pooled + into a pooling layer p, then total_state = (p,h) but layer_above + should only see p. + * If the conditional P( layer_above | this_layer) depends on + parameters of this_layer, sometimes you can play games with + the state to avoid needing the layers to communicate. 
So far + the only instance of this usage is when the visible layer + is N( Wh, beta). This makes the hidden layer be + sigmoid( v beta W + b). Rather than having the hidden layer + explicitly know about beta, we can just pass v beta as + the upward state. + + Parameters + ---------- + total_state : WRITEME + + Notes + ----- + This method should work both for computing sampling updates + and for computing mean field updates. So far I haven't encountered + a case where it needs to do different things for those two + contexts. + """ + return total_state + + def make_state(self, num_examples, numpy_rng): + """ + Returns a shared variable containing an actual state (not a mean field + state) for this variable. + + Parameters + ---------- + num_examples : WRITEME + numpy_rng : WRITEME + + Returns + ------- + WRITEME + """ + + raise NotImplementedError("%s doesn't implement make_state" % + type(self)) + + def make_symbolic_state(self, num_examples, theano_rng): + """ + Returns a theano symbolic variable containing an actual state (not a + mean field state) for this variable. + + Parameters + ---------- + num_examples : WRITEME + numpy_rng : WRITEME + + Returns + ------- + WRITEME + """ + + raise NotImplementedError("%s doesn't implement make_symbolic_state" % + type(self)) + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + Returns an expression for samples of this layer's state, conditioned on + the layers above and below Should be valid as an update to the shared + variable returned by self.make_state + + Parameters + ---------- + state_below : WRITEME + Corresponds to layer_below.upward_state(full_state_below), + where full_state_below is the same kind of object as you get + out of layer_below.make_state + state_above : WRITEME + Corresponds to layer_above.downward_state(full_state_above) + + theano_rng : WRITEME + An MRG_RandomStreams instance + + Returns + ------- + WRITEME + + Notes + ----- + This can return multiple expressions if this layer's total state + consists of more than one shared variable. + """ + + if hasattr(self, 'get_sampling_updates'): + raise AssertionError("Looks like "+str(type(self))+" needs to rename get_sampling_updates to sample.") + + raise NotImplementedError("%s doesn't implement sample" % + type(self)) + + def expected_energy_term(self, state, + average, + state_below, + average_below): + """ + Returns a term of the expected energy of the entire model. + This term should correspond to the expected value of terms + of the energy function that: + + - involve this layer only + - if there is a layer below, include terms that involve both this layer + and the layer below + + Do not include terms that involve the layer below only. + Do not include any terms that involve the layer above, if it + exists, in any way (the interface doesn't let you see the layer + above anyway). + + Parameters + ---------- + state_below : WRITEME + Upward state of the layer below. + state : WRITEME + Total state of this layer + average_below : bool + If True, the layer below is one of the variables to integrate + over in the expectation, and state_below gives its variational + parameters. 
If False, that layer is to be held constant and + state_below gives a set of assignments to it average: like + average_below, but for 'state' rather than 'state_below' + + Returns + ------- + rval : tensor_like + A 1D theano tensor giving the expected energy term for each example + """ + raise NotImplementedError(str(type(self))+" does not implement expected_energy_term.") + + def finalize_initialization(self): + """ + Some layers' initialization depends on layer above being initialized, + which is why this method is called after `set_input_space` has been + called. + """ + pass + + +class VisibleLayer(Layer): + """ + Abstract class. + A layer of a DBM that may be used as a visible layer. + Currently, all implemented layer classes may be either visible + or hidden but not both. It may be worth making classes that can + play both roles though. This would allow getting rid of the BinaryVector + class. + """ + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return self.get_input_space() + + +class HiddenLayer(Layer): + """ + Abstract class. + A layer of a DBM that may be used as a hidden layer. + """ + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return total_state + + def get_stdev_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_stdev_rewards") + + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_range_rewards") + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_l1_act_cost") + + def get_l2_act_cost(self, state, target, coeff): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError(str(type(self))+" does not implement get_l2_act_cost") + + +class BinaryVector(VisibleLayer): + """ + A DBM visible layer consisting of binary random variables living + in a VectorSpace. + + Parameters + ---------- + nvis : int + Dimension of the space + bias_from_marginals : pylearn2.datasets.dataset.Dataset + Dataset, whose marginals are used to initialize the visible biases + center : bool + WRITEME + copies : int + WRITEME + """ + def __init__(self, + nvis, + bias_from_marginals = None, + center = False, + copies = 1, learn_init_inpainting_state = False): + + self.__dict__.update(locals()) + del self.self + # Don't serialize the dataset + del self.bias_from_marginals + + self.space = VectorSpace(nvis) + self.input_space = self.space + + origin = self.space.get_origin() + + if bias_from_marginals is None: + init_bias = np.zeros((nvis,)) + else: + init_bias = init_sigmoid_bias_from_marginals(bias_from_marginals) + + self.bias = sharedX(init_bias, 'visible_bias') + + if center: + self.offset = sharedX(sigmoid_numpy(init_bias)) + + def get_biases(self): + """ + Returns + ------- + biases : ndarray + The numpy value of the biases + """ + return self.bias.get_value() + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.bias.set_value(biases) + if recenter: + assert self.center + self.offset.set_value(sigmoid_numpy(self.bias.get_value())) + + def upward_state(self, total_state): + """ + .. 
todo:: + + WRITEME + """ + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + rval = total_state - self.offset + else: + rval = total_state + + if not hasattr(self, 'copies'): + self.copies = 1 + + return rval * self.copies + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + return [self.bias] + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + + assert state_below is None + if self.copies != 1: + raise NotImplementedError() + + msg = layer_above.downward_message(state_above) + + bias = self.bias + + z = msg + bias + + phi = T.nnet.sigmoid(z) + + rval = theano_rng.binomial(size = phi.shape, p = phi, dtype = phi.dtype, + n = 1 ) + + return rval + + def mf_update(self, state_above, layer_above): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above) + mu = self.bias + + z = msg + mu + + rval = T.nnet.sigmoid(z) + + return rval + + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'copies'): + self.copies = 1 + if self.copies != 1: + raise NotImplementedError() + driver = numpy_rng.uniform(0.,1., (num_examples, self.nvis)) + mean = sigmoid_numpy(self.bias.get_value()) + sample = driver < mean + + rval = sharedX(sample, name = 'v_sample_shared') + + return rval + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'copies'): + self.copies = 1 + if self.copies != 1: + raise NotImplementedError() + mean = T.nnet.sigmoid(self.bias) + rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean, + dtype=theano.config.floatX) + + return rval + + def expected_energy_term(self, state, average, state_below = None, average_below = None): + """ + .. todo:: + + WRITEME + """ + + if self.center: + state = state - self.offset + + assert state_below is None + assert average_below is None + assert average in [True, False] + self.space.validate(state) + + # Energy function is linear so it doesn't matter if we're averaging or not + rval = -T.dot(state, self.bias) + + assert rval.ndim == 1 + + return rval * self.copies + + def init_inpainting_state(self, V, drop_mask, noise = False, return_unmasked = False): + """ + .. 
todo:: + + WRITEME + """ + assert drop_mask is None or drop_mask.ndim > 1 + + unmasked = T.nnet.sigmoid(self.bias.dimshuffle('x',0)) + # this condition is needed later if unmasked is used as V_hat + assert unmasked.ndim == 2 + # this condition is also needed later if unmasked is used as V_hat + assert hasattr(unmasked.owner.op, 'scalar_op') + if drop_mask is not None: + masked_mean = unmasked * drop_mask + else: + masked_mean = unmasked + if not hasattr(self, 'learn_init_inpainting_state'): + self.learn_init_inpainting_state = 0 + if not self.learn_init_inpainting_state: + masked_mean = block_gradient(masked_mean) + masked_mean.name = 'masked_mean' + + if noise: + theano_rng = theano.sandbox.rng_mrg.MRG_RandomStreams(42) + # we want a set of random mean field parameters, not binary samples + unmasked = T.nnet.sigmoid(theano_rng.normal(avg = 0., + std = 1., size = masked_mean.shape, + dtype = masked_mean.dtype)) + masked_mean = unmasked * drop_mask + masked_mean.name = 'masked_noise' + + if drop_mask is None: + rval = masked_mean + else: + masked_V = V * (1-drop_mask) + rval = masked_mean + masked_V + rval.name = 'init_inpainting_state' + + if return_unmasked: + assert unmasked.ndim > 1 + return rval, unmasked + + return rval + + + def inpaint_update(self, state_above, layer_above, drop_mask = None, V = None, return_unmasked = False): + """ + .. todo:: + + WRITEME + """ + msg = layer_above.downward_message(state_above) + mu = self.bias + + z = msg + mu + z.name = 'inpainting_z_[unknown_iter]' + + unmasked = T.nnet.sigmoid(z) + + if drop_mask is not None: + rval = drop_mask * unmasked + (1-drop_mask) * V + else: + rval = unmasked + + rval.name = 'inpainted_V[unknown_iter]' + + if return_unmasked: + owner = unmasked.owner + assert owner is not None + op = owner.op + assert hasattr(op, 'scalar_op') + assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid) + return rval, unmasked + + return rval + + + def recons_cost(self, V, V_hat_unmasked, drop_mask = None, use_sum=False): + """ + .. todo:: + + WRITEME + """ + if use_sum: + raise NotImplementedError() + + V_hat = V_hat_unmasked + + assert hasattr(V_hat, 'owner') + owner = V_hat.owner + assert owner is not None + op = owner.op + block_grad = False + if is_block_gradient(op): + assert isinstance(op.scalar_op, theano.scalar.Identity) + block_grad = True + real, = owner.inputs + owner = real.owner + op = owner.op + + if not hasattr(op, 'scalar_op'): + raise ValueError("Expected V_hat_unmasked to be generated by an Elemwise op, got "+str(op)+" of type "+str(type(op))) + assert isinstance(op.scalar_op, T.nnet.sigm.ScalarSigmoid) + z ,= owner.inputs + if block_grad: + z = block_gradient(z) + + if V.ndim != V_hat.ndim: + raise ValueError("V and V_hat_unmasked should have same ndim, but are %d and %d." % (V.ndim, V_hat.ndim)) + unmasked_cost = V * T.nnet.softplus(-z) + (1 - V) * T.nnet.softplus(z) + assert unmasked_cost.ndim == V_hat.ndim + + if drop_mask is None: + masked_cost = unmasked_cost + else: + masked_cost = drop_mask * unmasked_cost + + return masked_cost.mean() + +class BinaryVectorMaxPool(HiddenLayer): + """ + A hidden layer that does max-pooling on binary vectors. + It has two sublayers, the detector layer and the pooling + layer. The detector layer is its downward state and the pooling + layer is its upward state. 
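+
+    A rough construction sketch (illustrative only; the enclosing DBM, which
+    supplies the rng and batch size used during initialization, is assumed
+    to be set up elsewhere)::
+
+        h1 = BinaryVectorMaxPool(detector_layer_dim=500, pool_size=5,
+                                 layer_name='h1', irange=0.05)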
+ + Parameters + ---------- + detector_layer_dim : WRITEME + pool_size : WRITEME + layer_name : WRITEME + irange : WRITEME + sparse_init : WRITEME + sparse_stdev : WRITEME + include_prob : , optional + Probability of including a weight element in the set of weights + initialized to U(-irange, irange). If not included it is + initialized to 0. + init_bias : WRITEME + W_lr_scale : WRITEME + b_lr_scale : WRITEME + center : WRITEME + mask_weights : WRITEME + max_col_norm : WRITEME + copies : WRITEME + """ + # TODO: this layer uses (pooled, detector) as its total state, + # which can be confusing when listing all the states in + # the network left to right. Change this and + # pylearn2.expr.probabilistic_max_pooling to use + # (detector, pooled) + + def __init__(self, + detector_layer_dim, + pool_size, + layer_name, + irange = None, + sparse_init = None, + sparse_stdev = 1., + include_prob = 1.0, + init_bias = 0., + W_lr_scale = None, + b_lr_scale = None, + center = False, + mask_weights = None, + max_col_norm = None, + copies = 1): + self.__dict__.update(locals()) + del self.self + + self.b = sharedX( np.zeros((self.detector_layer_dim,)) + init_bias, name = layer_name + '_b') + + if self.center: + if self.pool_size != 1: + raise NotImplementedError() + self.offset = sharedX(sigmoid_numpy(self.b.get_value())) + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + rval = OrderedDict() + + if self.W_lr_scale is not None: + W, = self.transformer.get_params() + rval[W] = self.W_lr_scale + + if self.b_lr_scale is not None: + rval[self.b] = self.b_lr_scale + + return rval + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + + Notes + ----- + This resets parameters! + """ + + self.input_space = space + + if isinstance(space, VectorSpace): + self.requires_reformat = False + self.input_dim = space.dim + else: + self.requires_reformat = True + self.input_dim = space.get_total_dimension() + self.desired_space = VectorSpace(self.input_dim) + + + if not (self.detector_layer_dim % self.pool_size == 0): + raise ValueError("detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" % + (self.detector_layer_dim, self.pool_size, self.detector_layer_dim % self.pool_size)) + + self.h_space = VectorSpace(self.detector_layer_dim) + self.pool_layer_dim = self.detector_layer_dim / self.pool_size + self.output_space = VectorSpace(self.pool_layer_dim) + + rng = self.dbm.rng + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange, + self.irange, + (self.input_dim, self.detector_layer_dim)) * \ + (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim)) + < self.include_prob) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.detector_layer_dim)) + def mask_rejects(idx, i): + if self.mask_weights is None: + return False + return self.mask_weights[idx, i] == 0. 
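+            # Note on the loop below: each detector unit receives exactly
+            # sparse_init nonzero incoming weights, drawn from N(0, 1) at
+            # positions that are not already used and not rejected by
+            # mask_weights; the whole matrix is then scaled by sparse_stdev.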
+ for i in xrange(self.detector_layer_dim): + assert self.sparse_init <= self.input_dim + for j in xrange(self.sparse_init): + idx = rng.randint(0, self.input_dim) + while W[idx, i] != 0 or mask_rejects(idx, i): + idx = rng.randint(0, self.input_dim) + W[idx, i] = rng.randn() + W *= self.sparse_stdev + + W = sharedX(W) + W.name = self.layer_name + '_W' + + self.transformer = MatrixMul(W) + + W ,= self.transformer.get_params() + assert W.name is not None + + if self.mask_weights is not None: + expected_shape = (self.input_dim, self.detector_layer_dim) + if expected_shape != self.mask_weights.shape: + raise ValueError("Expected mask with shape "+str(expected_shape)+" but got "+str(self.mask_weights.shape)) + self.mask = sharedX(self.mask_weights) + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + + # Patch old pickle files + if not hasattr(self, 'mask_weights'): + self.mask_weights = None + if not hasattr(self, 'max_col_norm'): + self.max_col_norm = None + + if self.mask_weights is not None: + W ,= self.transformer.get_params() + if W in updates: + updates[W] = updates[W] * self.mask + + if self.max_col_norm is not None: + W, = self.transformer.get_params() + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return CompositeSpace((self.output_space, self.h_space)) + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.b.name is not None + W ,= self.transformer.get_params() + assert W.name is not None + rval = self.transformer.get_params() + assert not isinstance(rval, set) + rval = list(rval) + assert self.b not in rval + rval.append(self.b) + return rval + + def get_weight_decay(self, coeff): + """ + .. todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + W ,= self.transformer.get_params() + return coeff * T.sqr(W).sum() + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if self.requires_reformat: + # This is not really an unimplemented case. + # We actually don't know how to format the weights + # in design space. We got the data in topo space + # and we don't have access to the dataset + raise NotImplementedError() + W ,= self.transformer.get_params() + return W.get_value() + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + W, = self.transformer.get_params() + W.set_value(weights) + + def set_biases(self, biases, recenter = False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + if self.pool_size != 1: + raise NotImplementedError() + self.offset.set_value(sigmoid_numpy(self.b.get_value())) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def get_weights_view_shape(self): + """ + .. todo:: + + WRITEME + """ + total = self.detector_layer_dim + cols = self.pool_size + if cols == 1: + # Let the PatchViewer decidew how to arrange the units + # when they're not pooled + raise NotImplementedError() + # When they are pooled, make each pooling unit have one row + rows = total / cols + return rows, cols + + + def get_weights_topo(self): + """ + .. 
todo:: + + WRITEME + """ + + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + + W ,= self.transformer.get_params() + + W = W.T + + W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], + self.input_space.shape[1], self.input_space.num_channels)) + + W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) + + return function([], W)() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + self.h_space.validate(h) + self.output_space.validate(p) + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + return p - self.offset + + if not hasattr(self, 'copies'): + self.copies = 1 + + return p * self.copies + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + return h - self.offset + + return h * self.copies + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + + W ,= self.transformer.get_params() + + assert W.ndim == 2 + + sq_W = T.sqr(W) + + row_norms = T.sqrt(sq_W.sum(axis=1)) + col_norms = T.sqrt(sq_W.sum(axis=0)) + + return OrderedDict([ + ('row_norms_min' , row_norms.min()), + ('row_norms_mean' , row_norms.mean()), + ('row_norms_max' , row_norms.max()), + ('col_norms_min' , col_norms.min()), + ('col_norms_mean' , col_norms.mean()), + ('col_norms_max' , col_norms.max()), + ]) + + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P, H = state + + rval = OrderedDict() + + if self.pool_size == 1: + vars_and_prefixes = [ (P,'') ] + else: + vars_and_prefixes = [ (P, 'p_'), (H, 'h_') ] + + for var, prefix in vars_and_prefixes: + v_max = var.max(axis=0) + v_min = var.min(axis=0) + v_mean = var.mean(axis=0) + v_range = v_max - v_min + + # max_x.mean_u is "the mean over *u*nits of the max over e*x*amples" + # The x and u are included in the name because otherwise its hard + # to remember which axis is which when reading the monitor + # I use inner.outer rather than outer_of_inner or something like that + # because I want mean_x.* to appear next to each other in the alphabetical + # list, as these are commonly plotted together + for key, val in [ + ('max_x.max_u', v_max.max()), + ('max_x.mean_u', v_max.mean()), + ('max_x.min_u', v_max.min()), + ('min_x.max_u', v_min.max()), + ('min_x.mean_u', v_min.mean()), + ('min_x.min_u', v_min.min()), + ('range_x.max_u', v_range.max()), + ('range_x.mean_u', v_range.mean()), + ('range_x.min_u', v_range.min()), + ('mean_x.max_u', v_mean.max()), + ('mean_x.mean_u', v_mean.mean()), + ('mean_x.min_u', v_mean.min()) + ]: + rval[prefix+key] = val + + return rval + + def get_stdev_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. 
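+        # Note: the loop below adds (0.5 - stdev).mean() * coeff for each
+        # penalized sub-state, where stdev is each unit's standard deviation
+        # over the batch; when pool_size == 1 only the detector state is
+        # penalized, since pools and detectors then coincide.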
+ + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if isinstance(coeffs, str): + coeffs = float(coeffs) + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + assert all([isinstance(elem, float) for elem in [c]]) + if c == 0.: + continue + mn = s.mean(axis=0) + dev = s - mn + stdev = T.sqrt(T.sqr(dev).mean(axis=0)) + rval += (0.5 - stdev).mean()*c + + return rval + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if isinstance(coeffs, str): + coeffs = float(coeffs) + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + assert all([isinstance(elem, float) for elem in [c]]) + if c == 0.: + continue + mx = s.max(axis=0) + assert hasattr(mx.owner.op, 'grad') + assert mx.ndim == 1 + mn = s.min(axis=0) + assert hasattr(mn.owner.op, 'grad') + assert mn.ndim == 1 + r = mx - mn + rval += (1 - r).mean()*c + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps = None): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if not isinstance(target, float): + raise TypeError("BinaryVectorMaxPool.get_l1_act_cost expected target of type float " + \ + " but an instance named "+self.layer_name + " got target "+str(target) + " of type "+str(type(target))) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + _, state = state + state = [state] + target = [target] + coeff = [coeff] + if eps is None: + eps = [0.] + else: + eps = [eps] + else: + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + if eps is None: + eps = [0., 0.] + if target[1] > target[0]: + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c, e in safe_zip(state, target, coeff, eps): + assert all([isinstance(elem, float) or hasattr(elem, 'dtype') for elem in [t, c, e]]) + if c == 0.: + continue + m = s.mean(axis=0) + assert m.ndim == 1 + rval += T.maximum(abs(m-t)-e,0.).mean()*c + + return rval + + def get_l2_act_cost(self, state, target, coeff): + """ + .. todo:: + + WRITEME + """ + rval = 0. 
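+        # Note: the loop below adds coeff * mean((mean_activation - target)**2)
+        # per penalized sub-state, where the mean activation is taken over
+        # the batch; when pool_size == 1 the pooled and detector states
+        # coincide, so only a single (state, target, coeff) triple is used.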
+ + P, H = state + self.output_space.validate(P) + self.h_space.validate(H) + + + if self.pool_size == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + if not isinstance(target, float): + raise TypeError("BinaryVectorMaxPool.get_l1_act_cost expected target of type float " + \ + " but an instance named "+self.layer_name + " got target "+str(target) + " of type "+str(type(target))) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + _, state = state + state = [state] + target = [target] + coeff = [coeff] + else: + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + if target[1] > target[0]: + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c in safe_zip(state, target, coeff): + assert all([isinstance(elem, float) or hasattr(elem, 'dtype') for elem in [t, c]]) + if c == 0.: + continue + m = s.mean(axis=0) + assert m.ndim == 1 + rval += T.square(m-t).mean()*c + + return rval + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + if self.copies != 1: + raise NotImplementedError() + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + z = self.transformer.lmul(state_below) + self.b + p, h, p_sample, h_sample = max_pool_channels(z, + self.pool_size, msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + self.h_space.validate(downward_state) + rval = self.transformer.lmul_T(downward_state) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval * self.copies + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.b.dtype) + \ + self.b.dimshuffle('x', 0) + rval = max_pool_channels(z = z, + pool_size = self.pool_size) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + """ Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + + if not hasattr(self, 'copies'): + self.copies = 1 + + if self.copies != 1: + raise NotImplementedError() + + + empty_input = self.h_space.get_origin_batch(num_examples) + empty_output = self.output_space.get_origin_batch(num_examples) + + h_state = sharedX(empty_input) + p_state = sharedX(empty_output) + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), which_method="binomial") + + default_z = T.zeros_like(h_state) + self.b + + p_exp, h_exp, p_sample, h_sample = max_pool_channels( + z = default_z, + pool_size = self.pool_size, + theano_rng = theano_rng) + + assert h_sample.dtype == default_z.dtype + + f = function([], updates = [ + (p_state , p_sample), + (h_state , h_sample) + ]) + + f() + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. 
todo:: + + WRITEME + """ + """ + Returns a theano symbolic variable containing an actual state + (not a mean field state) for this variable. + """ + + if not hasattr(self, 'copies'): + self.copies = 1 + + if self.copies != 1: + raise NotImplementedError() + + default_z = T.alloc(self.b, num_examples, self.detector_layer_dim) + + p_exp, h_exp, p_sample, h_sample = max_pool_channels(z=default_z, + pool_size=self.pool_size, + theano_rng=theano_rng) + + assert h_sample.dtype == default_z.dtype + + return p_sample, h_sample + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + # Don't need to do anything special for centering, upward_state / downward state + # make it all just work + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = T.dot(downward_state, self.b) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval * self.copies + + def linear_feed_forward_approximation(self, state_below): + """ + Used to implement TorontoSparsity. Unclear exactly what properties of + it are important or how to implement it for other layers. + + Properties it must have: output is same kind of data structure (ie, + tuple of theano 2-tensors) as mf_update. + + Properties it probably should have for other layer types: an + infinitesimal change in state_below or the parameters should cause the + same sign of change in the output of linear_feed_forward_approximation + and in mf_update + + Should not have any non-linearities that cause the gradient to shrink + + Should disregard top-down feedback + + Parameters + ---------- + state_below : WRITEME + """ + + z = self.transformer.lmul(state_below) + self.b + + if self.pool_size != 1: + # Should probably implement sum pooling for the non-pooled version, + # but in reality it's not totally clear what the right answer is + raise NotImplementedError() + + return z, z + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. 
todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.b + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = max_pool_channels(z, self.pool_size, msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + + +class Softmax(HiddenLayer): + """ + A layer representing a single softmax distribution of a + set of discrete categories. + + Parameters + ---------- + n_classes : int + The number of discrete categories. + layer_name : str + The name of the layer. + irange : float + If not None, initialze the weights in U(-irange, irange) + sparse_init : int + If not None, initialize `sparse_init` weights per column + to N(0, sparse_istdev^2) + sparse_istdev : float + see above + W_lr_scale : float + Scale the learning rate on the weights by this amount + b_lr_scale : float + Scale the learning rate on the biases by this amount + max_col_norm : float + If not None, constrain the columns of the weight matrix + to have at most this L2 norm + copies : int + Make this many copies of the random variables, all sharing + the same weights. This allows the undirected model to + behave as if it has asymmetric connections. + center : bool + If True, use Gregoire Montavon's centering trick. + learn_init_inpainting_state : bool + If True, and using inpainting-based methods (MP-DBM), learn + a parameter controlling the initial value of the mean field + state for this layer. 
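+
+    A rough construction sketch (illustrative only; the layer is assumed to
+    be attached to a DBM before use, since set_input_space draws its initial
+    weights from the DBM's rng)::
+
+        y = Softmax(n_classes=10, layer_name='y', irange=0.05)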
+ """ + + presynaptic_name = "presynaptic_Y_hat" + + def __init__(self, n_classes, layer_name, irange = None, + sparse_init = None, sparse_istdev = 1., W_lr_scale = None, + b_lr_scale = None, + max_col_norm = None, + copies = 1, center = False, + learn_init_inpainting_state = True): + if isinstance(W_lr_scale, str): + W_lr_scale = float(W_lr_scale) + + self.__dict__.update(locals()) + del self.self + + assert isinstance(n_classes, py_integer_types) + + self.output_space = VectorSpace(n_classes) + self.b = sharedX( np.zeros((n_classes,)), name = 'softmax_b') + + if self.center: + b = self.b.get_value() + self.offset = sharedX(np.exp(b) / np.exp(b).sum()) + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + + if not hasattr(self, 'max_col_norm'): + self.max_col_norm = None + + if self.max_col_norm is not None: + W = self.W + if W in updates: + updated_W = updates[W] + col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) + desired_norms = T.clip(col_norms, 0, self.max_col_norm) + updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) + + @functools.wraps(Model.get_lr_scalers) + def get_lr_scalers(self): + + rval = OrderedDict() + + # Patch old pickle files + if not hasattr(self, 'W_lr_scale'): + self.W_lr_scale = None + + if self.W_lr_scale is not None: + assert isinstance(self.W_lr_scale, float) + rval[self.W] = self.W_lr_scale + + if not hasattr(self, 'b_lr_scale'): + self.b_lr_scale = None + + if self.b_lr_scale is not None: + assert isinstance(self.b_lr_scale, float) + rval[self.b] = self.b_lr_scale + + return rval + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return self.output_space + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + mx = state.max(axis=1) + + return OrderedDict([ + ('mean_max_class' , mx.mean()), + ('max_max_class' , mx.max()), + ('min_max_class' , mx.min()) + ]) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + self.input_space = space + + if not isinstance(space, Space): + raise TypeError("Expected Space, got "+ + str(space)+" of type "+str(type(space))) + + self.input_dim = space.get_total_dimension() + self.needs_reformat = not isinstance(space, VectorSpace) + + self.desired_space = VectorSpace(self.input_dim) + + if not self.needs_reformat: + assert self.desired_space == self.input_space + + rng = self.dbm.rng + + if self.irange is not None: + assert self.sparse_init is None + W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes)) + else: + assert self.sparse_init is not None + W = np.zeros((self.input_dim, self.n_classes)) + for i in xrange(self.n_classes): + for j in xrange(self.sparse_init): + idx = rng.randint(0, self.input_dim) + while W[idx, i] != 0.: + idx = rng.randint(0, self.input_dim) + W[idx, i] = rng.randn() * self.sparse_istdev + + self.W = sharedX(W, 'softmax_W' ) + + self._params = [ self.b, self.W ] + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + if not isinstance(self.input_space, Conv2DSpace): + raise NotImplementedError() + desired = self.W.get_value().T + ipt = self.desired_space.format_as(desired, self.input_space) + rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) + return rval + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if not isinstance(self.input_space, VectorSpace): + raise NotImplementedError() + + return self.W.get_value() + + def set_weights(self, weights): + """ + .. 
todo:: + + WRITEME + """ + self.W.set_value(weights) + + def set_biases(self, biases, recenter=False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + self.offset.set_value( (np.exp(biases) / np.exp(biases).sum()).astype(self.offset.dtype)) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() + + def get_weights_format(self): + """ + .. todo:: + + WRITEME + """ + return ('v', 'h') + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + + if self.copies != 1: + raise NotImplementedError("need to draw self.copies samples and average them together.") + + if state_above is not None: + # If you implement this case, also add a unit test for it. + # Or at least add a warning that it is not tested. + raise NotImplementedError() + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") + + self.input_space.validate(state_below) + + # patch old pickle files + if not hasattr(self, 'needs_reformat'): + self.needs_reformat = self.needs_reshape + del self.needs_reshape + + if self.needs_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + self.desired_space.validate(state_below) + + + z = T.dot(state_below, self.W) + self.b + h_exp = T.nnet.softmax(z) + h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype) + + return h_sample + + def mf_update(self, state_below, state_above = None, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + if state_above is not None: + raise NotImplementedError() + + if double_weights: + raise NotImplementedError() + + self.input_space.validate(state_below) + + # patch old pickle files + if not hasattr(self, 'needs_reformat'): + self.needs_reformat = self.needs_reshape + del self.needs_reshape + + if self.needs_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + for value in get_debug_values(state_below): + if value.shape[0] != self.dbm.batch_size: + raise ValueError("state_below should have batch size "+str(self.dbm.batch_size)+" but has "+str(value.shape[0])) + + self.desired_space.validate(state_below) + + assert self.W.ndim == 2 + assert state_below.ndim == 2 + + b = self.b + + Z = T.dot(state_below, self.W) + b + + rval = T.nnet.softmax(Z) + + for value in get_debug_values(rval): + assert value.shape[0] == self.dbm.batch_size + + return rval + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + + if not hasattr(self, 'copies'): + self.copies = 1 + + rval = T.dot(downward_state, self.W.T) * self.copies + + rval = self.desired_space.format_as(rval, self.input_space) + + return rval + + def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale): + """ + The cost of reconstructing `Y` as `Y_hat`. Specifically, + the negative log probability. + + This cost is for use with multi-prediction training. + + Parameters + ---------- + Y : target space batch + The data labels + Y_hat_unmasked : target space batch + The output of this layer's `mf_update`; the predicted + values of `Y`. Even though the model is only predicting + the dropped values, we take predictions for all the + values here. 
+ drop_mask_Y : 1-D theano tensor + A batch of 0s/1s, with 1s indicating that variables + have been dropped, and should be included in the + reconstruction cost. One indicator per example in the + batch, since each example in this layer only has one + random variable in it. + scale : float + Multiply the cost by this amount. + We need to do this because the visible layer also goes into + the cost. We use the mean over units and examples, so that + the scale of the cost doesn't change too much with batch + size or example size. + We need to multiply this cost by scale to make sure that + it is put on the same scale as the reconstruction cost + for the visible units. ie, scale should be 1/nvis + """ + + + Y_hat = Y_hat_unmasked + assert hasattr(Y_hat, 'owner') + owner = Y_hat.owner + assert owner is not None + op = owner.op + if isinstance(op, Print): + assert len(owner.inputs) == 1 + Y_hat, = owner.inputs + owner = Y_hat.owner + op = owner.op + assert isinstance(op, T.nnet.Softmax) + z ,= owner.inputs + assert z.ndim == 2 + + z = z - z.max(axis=1).dimshuffle(0, 'x') + log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x')) + # we use sum and not mean because this is really one variable per row + log_prob_of = (Y * log_prob).sum(axis=1) + masked = log_prob_of * drop_mask_Y + assert masked.ndim == 1 + + rval = masked.mean() * scale * self.copies + + return - rval + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + rval = T.nnet.softmax(self.b.dimshuffle('x', 0)) + T.alloc(0., self.dbm.batch_size, self.n_classes).astype(config.floatX) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + """ Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + + if self.copies != 1: + raise NotImplementedError("need to make self.copies samples and average them together.") + + t1 = time.time() + + empty_input = self.output_space.get_origin_batch(num_examples) + h_state = sharedX(empty_input) + + default_z = T.zeros_like(h_state) + self.b + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + h_exp = T.nnet.softmax(default_z) + + h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype) + + h_state = sharedX( self.output_space.get_origin_batch( + num_examples)) + + + t2 = time.time() + + f = function([], updates = [( + h_state , h_sample + )]) + + t3 = time.time() + + f() + + t4 = time.time() + + logger.info('{0}.make_state took {1}'.format(self, t4-t1)) + logger.info('\tcompose time: {0}'.format(t2-t1)) + logger.info('\tcompile time: {0}'.format(t3-t2)) + logger.info('\texecute time: {0}'.format(t4-t3)) + + h_state.name = 'softmax_sample_shared' + + return h_state + + def make_symbolic_state(self, num_examples, theano_rng): + """ + .. todo:: + + WRITEME + """ + """ + Returns a symbolic variable containing an actual state + (not a mean field state) for this variable. + """ + + if self.copies != 1: + raise NotImplementedError("need to make self.copies samples and average them together.") + + default_z = T.alloc(self.b, num_examples, self.n_classes) + + h_exp = T.nnet.softmax(default_z) + + h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype) + + return h_sample + + def get_weight_decay(self, coeff): + """ + .. 
todo:: + + WRITEME + """ + if isinstance(coeff, str): + coeff = float(coeff) + assert isinstance(coeff, float) or hasattr(coeff, 'dtype') + return coeff * T.sqr(self.W).sum() + + def upward_state(self, state): + """ + .. todo:: + + WRITEME + """ + if self.center: + return state - self.offset + return state + + def downward_state(self, state): + """ + .. todo:: + + WRITEME + """ + if not hasattr(self, 'center'): + self.center = False + if self.center: + """TODO: write a unit test verifying that inference or sampling + below a centered Softmax layer works""" + return state - self.offset + return state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + if self.center: + state = state - self.offset + + self.input_space.validate(state_below) + if self.needs_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + self.desired_space.validate(state_below) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = T.dot(state, self.b) + weights_term = (T.dot(state_below, self.W) * state).sum(axis=1) + + rval = -bias_term - weights_term + + rval *= self.copies + + assert rval.ndim == 1 + + return rval + + def init_inpainting_state(self, Y, noise): + """ + .. todo:: + + WRITEME + """ + if noise: + theano_rng = make_theano_rng(None, 2012+10+30, which_method="binomial") + return T.nnet.softmax(theano_rng.normal(avg=0., size=Y.shape, std=1., dtype='float32')) + rval = T.nnet.softmax(self.b) + if not hasattr(self, 'learn_init_inpainting_state'): + self.learn_init_inpainting_state = 1 + if not self.learn_init_inpainting_state: + rval = block_gradient(rval) + return rval + + def install_presynaptic_outputs(self, outputs_dict, batch_size): + """ + .. todo:: + + WRITEME + """ + + assert self.presynaptic_name not in outputs_dict + outputs_dict[self.presynaptic_name] = self.output_space.make_shared_batch(batch_size, self.presynaptic_name) + + +class GaussianVisLayer(VisibleLayer): + """ + Implements a visible layer that is conditionally gaussian with + diagonal variance. The layer lives in a Conv2DSpace. 
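+    When `nvis` is specified instead of `rows`/`cols`/`channels`, the layer
+    lives in a VectorSpace instead. Up to additive constants, a visible
+    configuration v contributes an energy of 0.5 * beta * (v - mu)**2, summed
+    over visible units (see `expected_energy_term`), so beta acts as a
+    per-unit precision (or per-channel precision when tied across locations).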
+ + Parameters + ---------- + rows, cols, channels : WRITEME + the shape of the space + learn_init_inpainting : bool, optional + WRITEME + nvis : WRITEME + init_beta : WRITEME + the initial value of the precision parameter + min_beta : WRITEME + clip beta so it is at least this big (default 1) + init_mu : WRITEME + the initial value of the mean parameter + tie_beta : WRITEME + None or a string specifying how to tie beta 'locations' = tie beta + across locations, ie beta should be a vector with one elem per channel + tie_mu : WRITEME + None or a string specifying how to tie mu 'locations' = tie mu across + locations, ie mu should be a vector with one elem per channel + bias_from_marginals : WRITEME + beta_lr_scale : WRITEME + axes : tuple + WRITEME + """ + def __init__(self, + rows = None, + cols = None, + learn_init_inpainting_state=True, + channels = None, + nvis = None, + init_beta = 1., + min_beta = 1., + init_mu = None, + tie_beta = None, + tie_mu = None, + bias_from_marginals = None, + beta_lr_scale = 'by_sharing', + axes = ('b', 0, 1, 'c')): + + warnings.warn("GaussianVisLayer math very faith based, need to finish working through gaussian.lyx") + + self.__dict__.update(locals()) + del self.self + + if bias_from_marginals is not None: + del self.bias_from_marginals + if self.nvis is None: + raise NotImplementedError() + assert init_mu is None + init_mu = bias_from_marginals.X.mean(axis=0) + + if init_mu is None: + init_mu = 0. + if nvis is None: + assert rows is not None + assert cols is not None + assert channels is not None + self.space = Conv2DSpace(shape=[rows,cols], num_channels=channels, axes=axes) + # To make GaussianVisLayer compatible with any axis ordering + self.batch_axis=list(axes).index('b') + self.axes_to_sum = range(len(axes)) + self.axes_to_sum.remove(self.batch_axis) + else: + assert rows is None + assert cols is None + assert channels is None + self.space = VectorSpace(nvis) + self.axes_to_sum = 1 + self.batch_axis = None + self.input_space = self.space + + origin = self.space.get_origin() + + beta_origin = origin.copy() + assert tie_beta in [ None, 'locations'] + if tie_beta == 'locations': + assert nvis is None + beta_origin = np.zeros((self.space.num_channels,)) + self.beta = sharedX(beta_origin + init_beta,name = 'beta') + assert self.beta.ndim == beta_origin.ndim + + mu_origin = origin.copy() + assert tie_mu in [None, 'locations'] + if tie_mu == 'locations': + assert nvis is None + mu_origin = np.zeros((self.space.num_channels,)) + self.mu = sharedX( mu_origin + init_mu, name = 'mu') + assert self.mu.ndim == mu_origin.ndim + + + + def get_monitoring_channels(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + rval['beta_min'] = self.beta.min() + rval['beta_mean'] = self.beta.mean() + rval['beta_max'] = self.beta.max() + + return rval + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + if self.mu is None: + return [self.beta] + return [self.beta, self.mu] + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + if self.nvis is None: + rows, cols = self.space.shape + num_loc = float(rows * cols) + + assert self.tie_beta in [None, 'locations'] + if self.beta_lr_scale == 'by_sharing': + if self.tie_beta == 'locations': + assert self.nvis is None + rval[self.beta] = 1. 
/ num_loc + elif self.beta_lr_scale == None: + pass + else: + rval[self.beta] = self.beta_lr_scale + + assert self.tie_mu in [None, 'locations'] + if self.tie_mu == 'locations': + warn = True + assert self.nvis is None + rval[self.mu] = 1./num_loc + logger.warning("mu lr_scaler hardcoded to 1/sharing") + + return rval + + @functools.wraps(Model._modify_updates) + def _modify_updates(self, updates): + if self.beta in updates: + updated_beta = updates[self.beta] + updates[self.beta] = T.clip(updated_beta, + self.min_beta,1e6) + + def set_biases(self, bias): + """ + Set mean parameter + + Parameters + ---------- + bias: WRITEME + Vector of size nvis + """ + self.mu = sharedX(bias, name = 'mu') + + def broadcasted_mu(self): + """ + Returns mu, broadcasted to have the same shape as a batch of data + """ + + if self.tie_mu == 'locations': + def f(x): + if x == 'c': + return 0 + return 'x' + axes = [f(ax) for ax in self.axes] + rval = self.mu.dimshuffle(*axes) + else: + assert self.tie_mu is None + if self.nvis is None: + axes = [0, 1, 2] + axes.insert(self.axes.index('b'), 'x') + rval = self.mu.dimshuffle(*axes) + else: + rval = self.mu.dimshuffle('x', 0) + + self.input_space.validate(rval) + + return rval + + def broadcasted_beta(self): + """ + Returns beta, broadcasted to have the same shape as a batch of data + """ + return self.broadcast_beta(self.beta) + + def broadcast_beta(self, beta): + """ + .. todo:: + + WRITEME + """ + """ + Returns beta, broadcasted to have the same shape as a batch of data + """ + + if self.tie_beta == 'locations': + def f(x): + if x == 'c': + return 0 + return 'x' + axes = [f(ax) for ax in self.axes] + rval = beta.dimshuffle(*axes) + else: + assert self.tie_beta is None + if self.nvis is None: + axes = [0, 1, 2] + axes.insert(self.axes.index('b'), 'x') + rval = beta.dimshuffle(*axes) + else: + rval = beta.dimshuffle('x', 0) + + self.input_space.validate(rval) + + return rval + + def init_inpainting_state(self, V, drop_mask, noise = False, return_unmasked = False): + """ + .. todo:: + + WRITEME + """ + + """for Vv, drop_mask_v in get_debug_values(V, drop_mask): + assert Vv.ndim == 4 + assert drop_mask_v.ndim in [3,4] + for i in xrange(drop_mask.ndim): + if Vv.shape[i] != drop_mask_v.shape[i]: + print Vv.shape + print drop_mask_v.shape + assert False + """ + + unmasked = self.broadcasted_mu() + + if drop_mask is None: + assert not noise + assert not return_unmasked + return unmasked + masked_mu = unmasked * drop_mask + if not hasattr(self, 'learn_init_inpainting_state'): + self.learn_init_inpainting_state = True + if not self.learn_init_inpainting_state: + masked_mu = block_gradient(masked_mu) + masked_mu.name = 'masked_mu' + + if noise: + theano_rng = make_theano_rng(None, 42, which_method="binomial") + unmasked = theano_rng.normal(avg = 0., + std = 1., size = masked_mu.shape, + dtype = masked_mu.dtype) + masked_mu = unmasked * drop_mask + masked_mu.name = 'masked_noise' + + + masked_V = V * (1-drop_mask) + rval = masked_mu + masked_V + rval.name = 'init_inpainting_state' + + if return_unmasked: + return rval, unmasked + return rval + + + def expected_energy_term(self, state, average, state_below = None, average_below = None): + """ + .. 
todo:: + + WRITEME + """ + assert state_below is None + assert average_below is None + self.space.validate(state) + if average: + raise NotImplementedError(str(type(self))+" doesn't support integrating out variational parameters yet.") + else: + rval = 0.5 * (self.beta * T.sqr(state - self.mu)).sum(axis=self.axes_to_sum) + assert rval.ndim == 1 + return rval + + + def inpaint_update(self, state_above, layer_above, drop_mask = None, V = None, + return_unmasked = False): + """ + .. todo:: + + WRITEME + """ + + msg = layer_above.downward_message(state_above) + mu = self.broadcasted_mu() + + z = msg + mu + z.name = 'inpainting_z_[unknown_iter]' + + if drop_mask is not None: + rval = drop_mask * z + (1-drop_mask) * V + else: + rval = z + + rval.name = 'inpainted_V[unknown_iter]' + + if return_unmasked: + return rval, z + + return rval + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + assert state_below is None + msg = layer_above.downward_message(state_above) + mu = self.mu + + z = msg + mu + rval = theano_rng.normal(size = z.shape, avg = z, dtype = z.dtype, + std = 1. / T.sqrt(self.beta)) + return rval + + def recons_cost(self, V, V_hat_unmasked, drop_mask = None, use_sum=False): + """ + .. todo:: + + WRITEME + """ + + return self._recons_cost(V=V, V_hat_unmasked=V_hat_unmasked, drop_mask=drop_mask, use_sum=use_sum, beta=self.beta) + + + def _recons_cost(self, V, V_hat_unmasked, beta, drop_mask=None, use_sum=False): + """ + .. todo:: + + WRITEME + """ + V_hat = V_hat_unmasked + + assert V.ndim == V_hat.ndim + beta = self.broadcasted_beta() + unmasked_cost = 0.5 * beta * T.sqr(V-V_hat) - 0.5*T.log(beta / (2*np.pi)) + assert unmasked_cost.ndim == V_hat.ndim + + if drop_mask is None: + masked_cost = unmasked_cost + else: + masked_cost = drop_mask * unmasked_cost + + if use_sum: + return masked_cost.mean(axis=0).sum() + + return masked_cost.mean() + + return masked_cost.mean() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + if self.nvis is None and total_state.ndim != 4: + raise ValueError("total_state should have 4 dimensions, has "+str(total_state.ndim)) + assert total_state is not None + V = total_state + self.input_space.validate(V) + upward_state = (V - self.broadcasted_mu()) * self.broadcasted_beta() + return upward_state + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + + shape = [num_examples] + + if self.nvis is None: + rows, cols = self.space.shape + channels = self.space.num_channels + shape.append(rows) + shape.append(cols) + shape.append(channels) + else: + shape.append(self.nvis) + + sample = numpy_rng.randn(*shape) + + sample *= 1./np.sqrt(self.beta.get_value()) + sample += self.mu.get_value() + rval = sharedX(sample, name = 'v_sample_shared') + + return rval + + def install_presynaptic_outputs(self, outputs_dict, batch_size): + """ + .. todo:: + + WRITEME + """ + + outputs_dict['output_V_weighted_pred_sum'] = self.space.make_shared_batch(batch_size) + + def ensemble_prediction(self, symbolic, outputs_dict, ensemble): + """ + .. todo:: + + WRITEME + """ + """ + Output a symbolic expression for V_hat_unmasked based on taking the + geometric mean over the ensemble and renormalizing. + n - 1 members of the ensemble have modified outputs_dict and the nth + gives its prediction in "symbolic". The parameters for the nth one + are currently loaded in the model. 
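+
+        Concretely, each member i contributes beta_i * V_hat_i to
+        `output_V_weighted_pred_sum`, and the combined prediction is
+
+            V_hat = sum_i(beta_i * V_hat_i) / sum_i(beta_i),
+
+        i.e. a precision-weighted average of the member predictions.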
+ """ + + weighted_pred_sum = outputs_dict['output_V_weighted_pred_sum'] \ + + self.broadcasted_beta() * symbolic + + beta_sum = sum(ensemble.get_ensemble_variants(self.beta)) + + unmasked_V_hat = weighted_pred_sum / self.broadcast_beta(beta_sum) + + return unmasked_V_hat + + def ensemble_recons_cost(self, V, V_hat_unmasked, drop_mask=None, + use_sum=False, ensemble=None): + """ + .. todo:: + + WRITEME + """ + + beta = sum(ensemble.get_ensemble_variants(self.beta)) / ensemble.num_copies + + return self._recons_cost(V=V, V_hat_unmasked=V_hat_unmasked, beta=beta, drop_mask=drop_mask, + use_sum=use_sum) + + +class ConvMaxPool(HiddenLayer): + """ + .. todo:: + + WRITEME + """ + + def __init__(self, + output_channels, + kernel_rows, + kernel_cols, + pool_rows, + pool_cols, + layer_name, + center = False, + irange = None, + sparse_init = None, + scale_by_sharing = True, + init_bias = 0., + border_mode = 'valid', + output_axes = ('b', 'c', 0, 1)): + self.__dict__.update(locals()) + del self.self + + assert (irange is None) != (sparse_init is None) + + self.b = sharedX( np.zeros((output_channels,)) + init_bias, name = layer_name + '_b') + assert border_mode in ['full','valid'] + + def broadcasted_bias(self): + """ + .. todo:: + + WRITEME + """ + + assert self.b.ndim == 1 + + shuffle = [ 'x' ] * 4 + shuffle[self.output_axes.index('c')] = 0 + + return self.b.dimshuffle(*shuffle) + + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return CompositeSpace((self.h_space, self.output_space)) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + """ Note: this resets parameters!""" + if not isinstance(space, Conv2DSpace): + raise TypeError("ConvMaxPool can only act on a Conv2DSpace, but received " + + str(type(space))+" as input.") + self.input_space = space + self.input_rows, self.input_cols = space.shape + self.input_channels = space.num_channels + + if self.border_mode == 'valid': + self.h_rows = self.input_rows - self.kernel_rows + 1 + self.h_cols = self.input_cols - self.kernel_cols + 1 + else: + assert self.border_mode == 'full' + self.h_rows = self.input_rows + self.kernel_rows - 1 + self.h_cols = self.input_cols + self.kernel_cols - 1 + + + if not( self.h_rows % self.pool_rows == 0): + raise ValueError("h_rows = %d, pool_rows = %d. 
Should be divisible but remainder is %d" % + (self.h_rows, self.pool_rows, self.h_rows % self.pool_rows)) + assert self.h_cols % self.pool_cols == 0 + + self.h_space = Conv2DSpace(shape = (self.h_rows, self.h_cols), num_channels = self.output_channels, + axes = self.output_axes) + self.output_space = Conv2DSpace(shape = (self.h_rows / self.pool_rows, + self.h_cols / self.pool_cols), + num_channels = self.output_channels, + axes = self.output_axes) + + logger.info('{0}: detector shape: {1} ' + 'pool shape: {2}'.format(self.layer_name, + self.h_space.shape, + self.output_space.shape)) + + if tuple(self.output_axes) == ('b', 0, 1, 'c'): + self.max_pool = max_pool_b01c + elif tuple(self.output_axes) == ('b', 'c', 0, 1): + self.max_pool = max_pool + else: + raise NotImplementedError() + + if self.irange is not None: + self.transformer = make_random_conv2D(self.irange, input_space = space, + output_space = self.h_space, kernel_shape = (self.kernel_rows, self.kernel_cols), + batch_size = self.dbm.batch_size, border_mode = self.border_mode, rng = self.dbm.rng) + else: + self.transformer = make_sparse_random_conv2D(self.sparse_init, input_space = space, + output_space = self.h_space, kernel_shape = (self.kernel_rows, self.kernel_cols), + batch_size = self.dbm.batch_size, border_mode = self.border_mode, rng = self.dbm.rng) + self.transformer._filters.name = self.layer_name + '_W' + + + W ,= self.transformer.get_params() + assert W.name is not None + + if self.center: + p_ofs, h_ofs = self.init_mf_state() + self.p_offset = sharedX(self.output_space.get_origin(), 'p_offset') + self.h_offset = sharedX(self.h_space.get_origin(), 'h_offset') + f = function([], updates={self.p_offset: p_ofs[0,:,:,:], self.h_offset: h_ofs[0,:,:,:]}) + f() + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.b.name is not None + W ,= self.transformer.get_params() + assert W.name is not None + + return [ W, self.b] + + def state_to_b01c(self, state): + """ + .. todo:: + + WRITEME + """ + + if tuple(self.output_axes) == ('b',0,1,'c'): + return state + return [ Conv2DSpace.convert(elem, self.output_axes, ('b', 0, 1, 'c')) + for elem in state ] + + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + if c == 0.: + continue + # Range over everything but the channel index + # theano can only take gradient through max if the max is over 1 axis or all axes + # so I manually unroll the max for the case I use here + assert self.h_space.axes == ('b', 'c', 0, 1) + assert self.output_space.axes == ('b', 'c', 0, 1) + mx = s.max(axis=3).max(axis=2).max(axis=0) + assert hasattr(mx.owner.op, 'grad') + mn = s.min(axis=3).max(axis=2).max(axis=0) + assert hasattr(mn.owner.op, 'grad') + assert mx.ndim == 1 + assert mn.ndim == 1 + r = mx - mn + rval += (1. - r).mean() * c + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME + """ + """ + + target: if pools contain more than one element, should be a list with + two elements. the first element is for the pooling units and + the second for the detector units. + + """ + rval = 0. 
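+        # For each group of units (pooling units and detector units, unless
+        # the pool size is 1), the penalty added below is
+        #     coeff * mean(max(|E[activation] - target| - eps, 0))
+        # where E[activation] is averaged over the batch and spatial axes
+        # (one value per channel) and the outer mean is over channels.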
+ + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(target, float) + assert isinstance(coeff, float) + _, state = state + state = [state] + target = [target] + coeff = [coeff] + if eps is None: + eps = 0. + eps = [eps] + else: + if eps is None: + eps = [0., 0.] + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + p_target, h_target = target + if h_target > p_target and (coeff[0] != 0. and coeff[1] != 0.): + # note that, within each group, E[p] is the sum of E[h] + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c, e in safe_zip(state, target, coeff, eps): + if c == 0.: + continue + # Average over everything but the channel index + m = s.mean(axis= [ ax for ax in range(4) if self.output_axes[ax] != 'c' ]) + assert m.ndim == 1 + rval += T.maximum(abs(m-t)-e,0.).mean()*c + + return rval + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + if self.scale_by_sharing: + # scale each learning rate by 1 / # times param is reused + h_rows, h_cols = self.h_space.shape + num_h = float(h_rows * h_cols) + return OrderedDict([(self.transformer._filters, 1./num_h), + (self.b, 1. / num_h)]) + else: + return OrderedDict() + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return p + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return h + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P, H = state + + if tuple(self.output_axes) == ('b',0,1,'c'): + p_max = P.max(axis=(0,1,2)) + p_min = P.min(axis=(0,1,2)) + p_mean = P.mean(axis=(0,1,2)) + else: + assert tuple(self.output_axes) == ('b','c',0,1) + p_max = P.max(axis=(0,2,3)) + p_min = P.min(axis=(0,2,3)) + p_mean = P.mean(axis=(0,2,3)) + p_range = p_max - p_min + + rval = { + 'p_max_max' : p_max.max(), + 'p_max_mean' : p_max.mean(), + 'p_max_min' : p_max.min(), + 'p_min_max' : p_min.max(), + 'p_min_mean' : p_min.mean(), + 'p_min_max' : p_min.max(), + 'p_range_max' : p_range.max(), + 'p_range_mean' : p_range.mean(), + 'p_range_min' : p_range.min(), + 'p_mean_max' : p_mean.max(), + 'p_mean_mean' : p_mean.mean(), + 'p_mean_min' : p_mean.min() + } + + return rval + + def get_weight_decay(self, coeffs): + """ + .. todo:: + + WRITEME + """ + W , = self.transformer.get_params() + return coeffs * T.sqr(W).sum() + + + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. 
todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if not hasattr(state_below, 'ndim'): + raise TypeError("state_below should be a TensorType, got " + + str(state_below) + " of type " + str(type(state_below))) + if state_below.ndim != 4: + raise ValueError("state_below should have ndim 4, has "+str(state_below.ndim)) + + if double_weights: + state_below = 2. * state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = self.max_pool(z, (self.pool_rows, self.pool_cols), msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + + if state_above is not None: + msg = layer_above.downward_message(state_above) + try: + self.output_space.validate(msg) + except TypeError, e: + reraise_as(TypeError(str(type(layer_above))+".downward_message gave something that was not the right type: "+str(e))) + else: + msg = None + + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + p, h, p_sample, h_sample = self.max_pool(z, + (self.pool_rows, self.pool_cols), msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + self.h_space.validate(downward_state) + return self.transformer.lmul_T(downward_state) + + def set_batch_size(self, batch_size): + """ + .. todo:: + + WRITEME + """ + self.transformer.set_batch_size(batch_size) + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + outp, inp, rows, cols = range(4) + raw = self.transformer._filters.get_value() + + return np.transpose(raw,(outp,rows,cols,inp)) + + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + default_z = self.broadcasted_bias() + shape = { + 'b': self.dbm.batch_size, + 0: self.h_space.shape[0], + 1: self.h_space.shape[1], + 'c': self.h_space.num_channels + } + # work around theano bug with broadcasted stuff + default_z += T.alloc(*([0.]+[shape[elem] for elem in self.h_space.axes])).astype(default_z.dtype) + assert default_z.ndim == 4 + + p, h = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols)) + + return p, h + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + """ Returns a shared variable containing an actual state + (not a mean field state) for this variable. 
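+            The sample is drawn from the distribution defined by the biases
+            alone (default_z below is just the broadcasted bias), not from a
+            mean field state.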
+ """ + + t1 = time.time() + + empty_input = self.h_space.get_origin_batch(self.dbm.batch_size) + h_state = sharedX(empty_input) + + default_z = T.zeros_like(h_state) + self.broadcasted_bias() + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + p_exp, h_exp, p_sample, h_sample = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols), + theano_rng = theano_rng) + + p_state = sharedX( self.output_space.get_origin_batch( + self.dbm.batch_size)) + + + t2 = time.time() + + f = function([], updates = [ + (p_state, p_sample), + (h_state, h_sample) + ]) + + t3 = time.time() + + f() + + t4 = time.time() + + logger.info('{0}.make_state took'.format(self, t4-t1)) + logger.info('\tcompose time: {0}'.format(t2-t1)) + logger.info('\tcompile time: {0}'.format(t3-t2)) + logger.info('\texecute time: {0}'.format(t4-t3)) + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = (downward_state * self.broadcasted_bias()).sum(axis=(1,2,3)) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=(1,2,3)) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval + + +class ConvC01B_MaxPool(HiddenLayer): + """ + .. todo:: + + WRITEME + """ + + def __init__(self, + output_channels, + kernel_shape, + pool_rows, + pool_cols, + layer_name, + center = False, + irange = None, + sparse_init = None, + scale_by_sharing = True, + init_bias = 0., + pad = 0, + partial_sum = 1): + self.__dict__.update(locals()) + del self.self + + assert (irange is None) != (sparse_init is None) + self.output_axes = ('c', 0, 1, 'b') + self.detector_channels = output_channels + self.tied_b = 1 + + def broadcasted_bias(self): + """ + .. todo:: + + WRITEME + """ + + if self.b.ndim != 1: + raise NotImplementedError() + + shuffle = [ 'x' ] * 4 + shuffle[self.output_axes.index('c')] = 0 + + return self.b.dimshuffle(*shuffle) + + + def get_total_state_space(self): + """ + .. todo:: + + WRITEME + """ + return CompositeSpace((self.h_space, self.output_space)) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + """ Note: this resets parameters!""" + + setup_detector_layer_c01b(layer=self, + input_space=space, rng=self.dbm.rng, + irange=self.irange) + + if not tuple(space.axes) == ('c', 0, 1, 'b'): + raise AssertionError("You're not using c01b inputs. Ian is enforcing c01b inputs while developing his pipeline to make sure it runs at maximal speed. If you really don't want to use c01b inputs, you can remove this check and things should work. 
If they don't work it's only because they're not tested.") + if self.dummy_channels != 0: + raise NotImplementedError(str(type(self))+" does not support adding dummy channels for cuda-convnet compatibility yet, you must implement that feature or use inputs with <=3 channels or a multiple of 4 channels") + + self.input_rows = self.input_space.shape[0] + self.input_cols = self.input_space.shape[1] + self.h_rows = self.detector_space.shape[0] + self.h_cols = self.detector_space.shape[1] + + if not(self.h_rows % self.pool_rows == 0): + raise ValueError(self.layer_name + ": h_rows = %d, pool_rows = %d. Should be divisible but remainder is %d" % + (self.h_rows, self.pool_rows, self.h_rows % self.pool_rows)) + assert self.h_cols % self.pool_cols == 0 + + self.h_space = Conv2DSpace(shape = (self.h_rows, self.h_cols), num_channels = self.output_channels, + axes = self.output_axes) + self.output_space = Conv2DSpace(shape = (self.h_rows / self.pool_rows, + self.h_cols / self.pool_cols), + num_channels = self.output_channels, + axes = self.output_axes) + + logger.info('{0} : detector shape: {1} ' + 'pool shape: {2}'.format(self.layer_name, + self.h_space.shape, + self.output_space.shape)) + + assert tuple(self.output_axes) == ('c', 0, 1, 'b') + self.max_pool = max_pool_c01b + + if self.center: + p_ofs, h_ofs = self.init_mf_state() + self.p_offset = sharedX(self.output_space.get_origin(), 'p_offset') + self.h_offset = sharedX(self.h_space.get_origin(), 'h_offset') + f = function([], updates={self.p_offset: p_ofs[:,:,:,0], self.h_offset: h_ofs[:,:,:,0]}) + f() + + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + assert self.b.name is not None + W ,= self.transformer.get_params() + assert W.name is not None + + return [ W, self.b] + + def state_to_b01c(self, state): + """ + .. todo:: + + WRITEME + """ + + if tuple(self.output_axes) == ('b',0,1,'c'): + return state + return [ Conv2DSpace.convert(elem, self.output_axes, ('b', 0, 1, 'c')) + for elem in state ] + + def get_range_rewards(self, state, coeffs): + """ + .. todo:: + + WRITEME + """ + rval = 0. + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(coeffs, float) + _, state = state + state = [state] + coeffs = [coeffs] + else: + assert all([len(elem) == 2 for elem in [state, coeffs]]) + + for s, c in safe_zip(state, coeffs): + if c == 0.: + continue + # Range over everything but the channel index + # theano can only take gradient through max if the max is over 1 axis or all axes + # so I manually unroll the max for the case I use here + assert self.h_space.axes == ('b', 'c', 0, 1) + assert self.output_space.axes == ('b', 'c', 0, 1) + mx = s.max(axis=3).max(axis=2).max(axis=0) + assert hasattr(mx.owner.op, 'grad') + mn = s.min(axis=3).max(axis=2).max(axis=0) + assert hasattr(mn.owner.op, 'grad') + assert mx.ndim == 1 + assert mn.ndim == 1 + r = mx - mn + rval += (1. - r).mean() * c + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME properly + + Parameters + ---------- + state : WRITEME + target : WRITEME + if pools contain more than one element, should be a list + with two elements. the first element is for the pooling + units and the second for the detector units. + coeff : WRITEME + eps : WRITEME + """ + rval = 0. 
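+        # Same activation penalty as ConvMaxPool.get_l1_act_cost: the mean
+        # activation is taken over the batch and spatial axes of the
+        # ('c', 0, 1, 'b') layout, leaving one value per channel.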
+ + + if self.pool_rows == 1 and self.pool_cols == 1: + # If the pool size is 1 then pools = detectors + # and we should not penalize pools and detectors separately + assert len(state) == 2 + assert isinstance(target, float) + assert isinstance(coeff, float) + _, state = state + state = [state] + target = [target] + coeff = [coeff] + if eps is None: + eps = 0. + eps = [eps] + else: + if eps is None: + eps = [0., 0.] + assert all([len(elem) == 2 for elem in [state, target, coeff]]) + p_target, h_target = target + if h_target > p_target and (coeff[0] != 0. and coeff[1] != 0.): + # note that, within each group, E[p] is the sum of E[h] + warnings.warn("Do you really want to regularize the detector units to be more active than the pooling units?") + + for s, t, c, e in safe_zip(state, target, coeff, eps): + if c == 0.: + continue + # Average over everything but the channel index + m = s.mean(axis= [ ax for ax in range(4) if self.output_axes[ax] != 'c' ]) + assert m.ndim == 1 + rval += T.maximum(abs(m-t)-e,0.).mean()*c + + return rval + + def get_lr_scalers(self): + """ + .. todo:: + + WRITEME + """ + + rval = OrderedDict() + + if self.scale_by_sharing: + # scale each learning rate by 1 / # times param is reused + h_rows, h_cols = self.h_space.shape + num_h = float(h_rows * h_cols) + rval[self.transformer._filters] = 1. /num_h + rval[self.b] = 1. / num_h + + return rval + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return p + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + p,h = total_state + + if not hasattr(self, 'center'): + self.center = False + + if self.center: + p -= self.p_offset + h -= self.h_offset + + return h + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + + P, H = state + + axes = tuple([i for i, ax in enumerate(self.output_axes) if ax != 'c']) + p_max = P.max(axis=(0,1,2)) + p_min = P.min(axis=(0,1,2)) + p_mean = P.mean(axis=(0,1,2)) + + p_range = p_max - p_min + + rval = { + 'p_max_max' : p_max.max(), + 'p_max_mean' : p_max.mean(), + 'p_max_min' : p_max.min(), + 'p_min_max' : p_min.max(), + 'p_min_mean' : p_min.mean(), + 'p_min_max' : p_min.max(), + 'p_range_max' : p_range.max(), + 'p_range_mean' : p_range.mean(), + 'p_range_min' : p_range.min(), + 'p_mean_max' : p_mean.max(), + 'p_mean_mean' : p_mean.mean(), + 'p_mean_min' : p_mean.min() + } + + return rval + + def get_weight_decay(self, coeffs): + """ + .. todo:: + + WRITEME + """ + W , = self.transformer.get_params() + return coeffs * T.sqr(W).sum() + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if not hasattr(state_below, 'ndim'): + raise TypeError("state_below should be a TensorType, got " + + str(state_below) + " of type " + str(type(state_below))) + if state_below.ndim != 4: + raise ValueError("state_below should have ndim 4, has "+str(state_below.ndim)) + + if double_weights: + state_below = 2. 
* state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = self.max_pool(z, (self.pool_rows, self.pool_cols), msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError("Need to update for C01B") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + try: + self.output_space.validate(msg) + except TypeError, e: + reraise_as(TypeError(str(type(layer_above))+".downward_message gave something that was not the right type: "+str(e))) + else: + msg = None + + z = self.transformer.lmul(state_below) + self.broadcasted_bias() + p, h, p_sample, h_sample = self.max_pool(z, + (self.pool_rows, self.pool_cols), msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + self.h_space.validate(downward_state) + return self.transformer.lmul_T(downward_state) + + def set_batch_size(self, batch_size): + """ + .. todo:: + + WRITEME + """ + self.transformer.set_batch_size(batch_size) + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + return self.transformer.get_weights_topo() + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + default_z = self.broadcasted_bias() + shape = { + 'b': self.dbm.batch_size, + 0: self.h_space.shape[0], + 1: self.h_space.shape[1], + 'c': self.h_space.num_channels + } + # work around theano bug with broadcasted stuff + default_z += T.alloc(*([0.]+[shape[elem] for elem in self.h_space.axes])).astype(default_z.dtype) + assert default_z.ndim == 4 + + p, h = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols)) + + return p, h + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + raise NotImplementedError("Need to update for C01B") + + t1 = time.time() + + empty_input = self.h_space.get_origin_batch(self.dbm.batch_size) + h_state = sharedX(empty_input) + + default_z = T.zeros_like(h_state) + self.broadcasted_bias() + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + p_exp, h_exp, p_sample, h_sample = self.max_pool( + z = default_z, + pool_shape = (self.pool_rows, self.pool_cols), + theano_rng = theano_rng) + + p_state = sharedX( self.output_space.get_origin_batch( + self.dbm.batch_size)) + + + t2 = time.time() + + f = function([], updates = [ + (p_state, p_sample), + (h_state, h_sample) + ]) + + t3 = time.time() + + f() + + t4 = time.time() + + logger.info('{0}.make_state took {1}'.format(self, t4-t1)) + logger.info('\tcompose time: {0}'.format(t2-t1)) + logger.info('\tcompile time: {0}'.format(t3-t2)) + logger.info('\texecute time: {0}'.format(t4-t3)) + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. 
todo:: + + WRITEME + """ + + raise NotImplementedError("Need to update for C01B") + self.input_space.validate(state_below) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = (downward_state * self.broadcasted_bias()).sum(axis=(1,2,3)) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=(1,2,3)) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval + + +class BVMP_Gaussian(BinaryVectorMaxPool): + """ + Like BinaryVectorMaxPool, but must have GaussianVisLayer + as its input. Uses its beta to bias the hidden units appropriately. + See gaussian.lyx + + beta is *not* considered a parameter of this layer, it's just an + external factor influencing how this layer behaves. + Gradient can still flow to beta, but it will only be included in + the parameters list if some class other than this layer includes it. + + .. todo:: + + WRITEME : parameter list + """ + + def __init__(self, + input_layer, + detector_layer_dim, + pool_size, + layer_name, + irange = None, + sparse_init = None, + sparse_stdev = 1., + include_prob = 1.0, + init_bias = 0., + W_lr_scale = None, + b_lr_scale = None, + center = False, + mask_weights = None, + max_col_norm = None, + copies = 1): + warnings.warn("BVMP_Gaussian math is very faith-based, need to complete gaussian.lyx") + + args = locals() + + del args['input_layer'] + del args['self'] + super(BVMP_Gaussian, self).__init__(**args) + self.input_layer = input_layer + + def get_weights(self): + """ + .. todo:: + + WRITEME + """ + if self.requires_reformat: + # This is not really an unimplemented case. + # We actually don't know how to format the weights + # in design space. We got the data in topo space + # and we don't have access to the dataset + raise NotImplementedError() + W ,= self.transformer.get_params() + W = W.get_value() + + x = raw_input("multiply by beta?") + if x == 'y': + beta = self.input_layer.beta.get_value() + return (W.T * beta).T + assert x == 'n' + return W + + def set_weights(self, weights): + """ + .. todo:: + + WRITEME + """ + raise NotImplementedError("beta would make get_weights for visualization not correspond to set_weights") + W, = self.transformer.get_params() + W.set_value(weights) + + def set_biases(self, biases, recenter = False): + """ + .. todo:: + + WRITEME + """ + self.b.set_value(biases) + if recenter: + assert self.center + if self.pool_size != 1: + raise NotImplementedError() + self.offset.set_value(sigmoid_numpy(self.b.get_value())) + + def get_biases(self): + """ + .. todo:: + + WRITEME + """ + return self.b.get_value() - self.beta_bias().eval() + + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. 
todo:: + + WRITEME + """ + raise NotImplementedError("need to account for beta") + if self.copies != 1: + raise NotImplementedError() + + if theano_rng is None: + raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") + + if state_above is not None: + msg = layer_above.downward_message(state_above) + else: + msg = None + + if self.requires_reformat: + state_below = self.input_space.format_as(state_below, self.desired_space) + + z = self.transformer.lmul(state_below) + self.b + p, h, p_sample, h_sample = max_pool_channels(z, + self.pool_size, msg, theano_rng) + + return p_sample, h_sample + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + rval = self.transformer.lmul_T(downward_state) + + if self.requires_reformat: + rval = self.desired_space.format_as(rval, self.input_space) + + return rval * self.copies + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + # work around theano bug with broadcasted vectors + z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.b.dtype) + \ + self.b.dimshuffle('x', 0) + self.beta_bias() + rval = max_pool_channels(z = z, + pool_size = self.pool_size) + return rval + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME properly + + Returns a shared variable containing an actual state + (not a mean field state) for this variable. + """ + raise NotImplementedError("need to account for beta") + + if not hasattr(self, 'copies'): + self.copies = 1 + + if self.copies != 1: + raise NotImplementedError() + + + empty_input = self.h_space.get_origin_batch(num_examples) + empty_output = self.output_space.get_origin_batch(num_examples) + + h_state = sharedX(empty_input) + p_state = sharedX(empty_output) + + theano_rng = make_theano_rng(None, numpy_rng.randint(2 ** 16), + which_method="binomial") + + default_z = T.zeros_like(h_state) + self.b + + p_exp, h_exp, p_sample, h_sample = max_pool_channels( + z = default_z, + pool_size = self.pool_size, + theano_rng = theano_rng) + + assert h_sample.dtype == default_z.dtype + + f = function([], updates = [ + (p_state , p_sample), + (h_state , h_sample) + ]) + + f() + + p_state.name = 'p_sample_shared' + h_state.name = 'h_sample_shared' + + return p_state, h_state + + def expected_energy_term(self, state, average, state_below, average_below): + """ + .. 
todo:: + + WRITEME + """ + raise NotImplementedError("need to account for beta, and maybe some oether stuff") + + # Don't need to do anything special for centering, upward_state / downward state + # make it all just work + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + downward_state = self.downward_state(state) + self.h_space.validate(downward_state) + + # Energy function is linear so it doesn't matter if we're averaging or not + # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below + # and d is the downward state of this layer + + bias_term = T.dot(downward_state, self.b) + weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1) + + rval = -bias_term - weights_term + + assert rval.ndim == 1 + + return rval * self.copies + + def linear_feed_forward_approximation(self, state_below): + """ + .. todo:: + + WRITEME properly + + Used to implement TorontoSparsity. Unclear exactly what properties of it are + important or how to implement it for other layers. + + Properties it must have: + output is same kind of data structure (ie, tuple of theano 2-tensors) + as mf_update + + Properties it probably should have for other layer types: + An infinitesimal change in state_below or the parameters should cause the same sign of change + in the output of linear_feed_forward_approximation and in mf_update + + Should not have any non-linearities that cause the gradient to shrink + + Should disregard top-down feedback + """ + raise NotImplementedError("need to account for beta") + + z = self.transformer.lmul(state_below) + self.b + + if self.pool_size != 1: + # Should probably implement sum pooling for the non-pooled version, + # but in reality it's not totally clear what the right answer is + raise NotImplementedError() + + return z, z + + def beta_bias(self): + """ + .. todo:: + + WRITEME + """ + W, = self.transformer.get_params() + beta = self.input_layer.beta + assert beta.ndim == 1 + return - 0.5 * T.dot(beta, T.sqr(W)) + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + + self.input_space.validate(state_below) + + if self.requires_reformat: + if not isinstance(state_below, tuple): + for sb in get_debug_values(state_below): + if sb.shape[0] != self.dbm.batch_size: + raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) + assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim + + state_below = self.input_space.format_as(state_below, self.desired_space) + + if iter_name is None: + iter_name = 'anon' + + if state_above is not None: + assert layer_above is not None + msg = layer_above.downward_message(state_above) + msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' + else: + msg = None + + if double_weights: + state_below = 2. 
* state_below + state_below.name = self.layer_name + '_'+iter_name + '_2state' + z = self.transformer.lmul(state_below) + self.b + self.beta_bias() + if self.layer_name is not None and iter_name is not None: + z.name = self.layer_name + '_' + iter_name + '_z' + p,h = max_pool_channels(z, self.pool_size, msg) + + p.name = self.layer_name + '_p_' + iter_name + h.name = self.layer_name + '_h_' + iter_name + + return p, h + +class CompositeLayer(HiddenLayer): + """ + A Layer constructing by aligning several other Layer + objects side by side + + Parameters + ---------- + components : WRITEME + A list of layers that are combined to form this layer + inputs_to_components : None or dict mapping int to list of int + Should be None unless the input space is a CompositeSpace + If inputs_to_components[i] contains j, it means input i will + be given as input to component j. + If an input dodes not appear in the dictionary, it will be given + to all components. + + This field allows one CompositeLayer to have another as input + without forcing each component to connect to all members + of the CompositeLayer below. For example, you might want to + have both densely connected and convolutional units in all + layers, but a convolutional unit is incapable of taking a + non-topological input space. + """ + + + def __init__(self, layer_name, components, inputs_to_components = None): + self.layer_name = layer_name + + self.components = list(components) + assert isinstance(components, list) + for component in components: + assert isinstance(component, HiddenLayer) + self.num_components = len(components) + self.components = list(components) + + if inputs_to_components is None: + self.inputs_to_components = None + else: + if not isinstance(inputs_to_components, dict): + raise TypeError("CompositeLayer expected inputs_to_components to be a dict, got "+str(type(inputs_to_components))) + self.inputs_to_components = OrderedDict() + for key in inputs_to_components: + assert isinstance(key, int) + assert key >= 0 + value = inputs_to_components[key] + assert isinstance(value, list) + assert all([isinstance(elem, int) for elem in value]) + assert min(value) >= 0 + assert max(value) < self.num_components + self.inputs_to_components[key] = list(value) + + def set_input_space(self, space): + """ + .. todo:: + + WRITEME + """ + self.input_space = space + + if not isinstance(space, CompositeSpace): + assert self.inputs_to_components is None + self.routing_needed = False + else: + if self.inputs_to_components is None: + self.routing_needed = False + else: + self.routing_needed = True + assert max(self.inputs_to_components) < space.num_components + # Invert the dictionary + self.components_to_inputs = OrderedDict() + for i in xrange(self.num_components): + inputs = [] + for j in xrange(space.num_components): + if i in self.inputs_to_components[j]: + inputs.append(i) + if len(inputs) < space.num_components: + self.components_to_inputs[i] = inputs + + for i, component in enumerate(self.components): + if self.routing_needed and i in self.components_to_inputs: + cur_space = space.restrict(self.components_to_inputs[i]) + else: + cur_space = space + + component.set_input_space(cur_space) + + self.output_space = CompositeSpace([ component.get_output_space() for component in self.components ]) + + def make_state(self, num_examples, numpy_rng): + """ + .. todo:: + + WRITEME + """ + return tuple(component.make_state(num_examples, numpy_rng) for + component in self.components) + + def get_total_state_space(self): + """ + .. 
todo:: + + WRITEME + """ + return CompositeSpace([component.get_total_state_space() for component in self.components]) + + def set_batch_size(self, batch_size): + """ + .. todo:: + + WRITEME + """ + for component in self.components: + component.set_batch_size(batch_size) + + def set_dbm(self, dbm): + """ + .. todo:: + + WRITEME + """ + for component in self.components: + component.set_dbm(dbm) + + def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): + """ + .. todo:: + + WRITEME + """ + rval = [] + + for i, component in enumerate(self.components): + if self.routing_needed and i in self.components_to_inputs: + cur_state_below =self.input_space.restrict_batch(state_below, self.components_to_inputs[i]) + else: + cur_state_below = state_below + + class RoutingLayer(object): + def __init__(self, idx, layer): + self.__dict__.update(locals()) + del self.self + self.layer_name = 'route_'+str(idx)+'_'+layer.layer_name + + def downward_message(self, state): + return self.layer.downward_message(state)[self.idx] + + if layer_above is not None: + cur_layer_above = RoutingLayer(i, layer_above) + else: + cur_layer_above = None + + mf_update = component.mf_update(state_below = cur_state_below, + state_above = state_above, + layer_above = cur_layer_above, + double_weights = double_weights, + iter_name = iter_name) + + rval.append(mf_update) + + return tuple(rval) + + def init_mf_state(self): + """ + .. todo:: + + WRITEME + """ + return tuple([component.init_mf_state() for component in self.components]) + + + def get_weight_decay(self, coeffs): + """ + .. todo:: + + WRITEME + """ + return sum([component.get_weight_decay(coeff) for component, coeff + in safe_zip(self.components, coeffs)]) + + def upward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return tuple([component.upward_state(elem) + for component, elem in + safe_zip(self.components, total_state)]) + + def downward_state(self, total_state): + """ + .. todo:: + + WRITEME + """ + return tuple([component.downward_state(elem) + for component, elem in + safe_zip(self.components, total_state)]) + + def downward_message(self, downward_state): + """ + .. todo:: + + WRITEME + """ + if isinstance(self.input_space, CompositeSpace): + num_input_components = self.input_space.num_components + else: + num_input_components = 1 + + rval = [ None ] * num_input_components + + def add(x, y): + if x is None: + return y + if y is None: + return x + return x + y + + for i, packed in enumerate(safe_zip(self.components, downward_state)): + component, state = packed + if self.routing_needed and i in self.components_to_inputs: + input_idx = self.components_to_inputs[i] + else: + input_idx = range(num_input_components) + + partial_message = component.downward_message(state) + + if len(input_idx) == 1: + partial_message = [ partial_message ] + + assert len(input_idx) == len(partial_message) + + for idx, msg in safe_zip(input_idx, partial_message): + rval[idx] = add(rval[idx], msg) + + if len(rval) == 1: + rval = rval[0] + else: + rval = tuple(rval) + + self.input_space.validate(rval) + + return rval + + def get_l1_act_cost(self, state, target, coeff, eps): + """ + .. todo:: + + WRITEME + """ + return sum([ comp.get_l1_act_cost(s, t, c, e) \ + for comp, s, t, c, e in safe_zip(self.components, state, target, coeff, eps)]) + + def get_range_rewards(self, state, coeffs): + """ + .. 
todo:: + + WRITEME + """ + return sum([comp.get_range_rewards(s, c) + for comp, s, c in safe_zip(self.components, state, coeffs)]) + + def get_params(self): + """ + .. todo:: + + WRITEME + """ + return reduce(lambda x, y: safe_union(x, y), + [component.get_params() for component in self.components]) + + def get_weights_topo(self): + """ + .. todo:: + + WRITEME + """ + logger.info('Get topological weights for which layer?') + for i, component in enumerate(self.components): + logger.info('{0} {1}'.format(i, component.layer_name)) + x = raw_input() + return self.components[int(x)].get_weights_topo() + + def get_monitoring_channels_from_state(self, state): + """ + .. todo:: + + WRITEME + """ + rval = OrderedDict() + + for layer, s in safe_zip(self.components, state): + d = layer.get_monitoring_channels_from_state(s) + for key in d: + rval[layer.layer_name+'_'+key] = d[key] + + return rval + + def sample(self, state_below = None, state_above = None, + layer_above = None, + theano_rng = None): + """ + .. todo:: + + WRITEME + """ + rval = [] + + for i, component in enumerate(self.components): + if self.routing_needed and i in self.components_to_inputs: + cur_state_below =self.input_space.restrict_batch(state_below, self.components_to_inputs[i]) + else: + cur_state_below = state_below + + class RoutingLayer(object): + def __init__(self, idx, layer): + self.__dict__.update(locals()) + del self.self + self.layer_name = 'route_'+str(idx)+'_'+layer.layer_name + + def downward_message(self, state): + return self.layer.downward_message(state)[self.idx] + + if layer_above is not None: + cur_layer_above = RoutingLayer(i, layer_above) + else: + cur_layer_above = None + + sample = component.sample(state_below = cur_state_below, + state_above = state_above, + layer_above = cur_layer_above, + theano_rng = theano_rng) + + rval.append(sample) + + return tuple(rval) diff --git a/pylearn2/sandbox/dbm_v2/sampling_procedure.py b/pylearn2/sandbox/dbm_v2/sampling_procedure.py new file mode 100644 index 0000000000..134f94a37d --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/sampling_procedure.py @@ -0,0 +1,210 @@ +""" +.. todo:: + + WRITEME +""" +__authors__ = ["Ian Goodfellow", "Vincent Dumoulin"] +__copyright__ = "Copyright 2012-2013, Universite de Montreal" +__credits__ = ["Ian Goodfellow"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +from theano.compat import OrderedDict +from pylearn2.utils import py_integer_types + + +class SamplingProcedure(object): + """ + Procedure for sampling from a DBM. + """ + + def set_dbm(self, dbm): + """ + .. todo:: + + WRITEME + """ + self.dbm = dbm + + def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, + num_steps=1): + """ + Samples from self.dbm using `layer_to_state` as starting values. + + Parameters + ---------- + layer_to_state : dict + Maps the DBM's Layer instances to theano variables representing + batches of samples of them. + theano_rng : theano.sandbox.rng_mrg.MRG_RandomStreams + WRITEME + layer_to_clamp : dict, optional + Maps Layers to bools. If a layer is not in the dictionary, + defaults to False. True indicates that this layer should be + clamped, so we are sampling from a conditional distribution + rather than the joint distribution. + num_steps : int, optional + WRITEME + + Returns + ------- + layer_to_updated_state : dict + Maps the DBM's Layer instances to theano variables representing + batches of updated samples of them. 
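+
+        Notes
+        -----
+        A hypothetical usage sketch (the surrounding variable names are
+        assumptions; only `sample` and `set_dbm` belong to this interface)::
+
+            procedure = GibbsEvenOdd()
+            procedure.set_dbm(dbm)
+            # Clamp the visible layer to sample the hidden layers
+            # conditioned on the data.
+            updated = procedure.sample(layer_to_state, theano_rng,
+                                       layer_to_clamp={dbm.visible_layer: True},
+                                       num_steps=5)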
+ """ + raise NotImplementedError(str(type(self))+" does not implement " + + "sample.") + + +class GibbsEvenOdd(SamplingProcedure): + """ + The specific sampling schedule used to sample all of the even-idexed + layers of model.hidden_layers, then the visible layer and all the + odd-indexed layers. + """ + + def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, + num_steps=1): + """ + .. todo:: + + WRITEME + """ + # Validate num_steps + assert isinstance(num_steps, py_integer_types) + assert num_steps > 0 + + # Implement the num_steps > 1 case by repeatedly calling the + # num_steps == 1 case + if num_steps != 1: + for i in xrange(num_steps): + layer_to_state = self.sample(layer_to_state, theano_rng, + layer_to_clamp, num_steps=1) + return layer_to_state + + # The rest of the function is the num_steps = 1 case + # Current code assumes this, though we could certainly relax this + # constraint + assert len(self.dbm.hidden_layers) > 0 + + # Validate layer_to_clamp / make sure layer_to_clamp is a fully + # populated dictionary + if layer_to_clamp is None: + layer_to_clamp = OrderedDict() + + for key in layer_to_clamp: + assert (key is self.dbm.visible_layer or + key in self.dbm.hidden_layers) + + for layer in [self.dbm.visible_layer] + self.dbm.hidden_layers: + if layer not in layer_to_clamp: + layer_to_clamp[layer] = False + + # Assemble the return value + layer_to_updated = OrderedDict() + + for i, this_layer in list(enumerate(self.dbm.hidden_layers))[::2]: + # Iteration i does the Gibbs step for hidden_layers[i] + + # Get the sampled state of the layer below so we can condition + # on it in our Gibbs update + if i == 0: + layer_below = self.dbm.visible_layer + else: + layer_below = self.dbm.hidden_layers[i-1] + state_below = layer_to_state[layer_below] + state_below = layer_below.upward_state(state_below) + + # Get the sampled state of the layer above so we can condition + # on it in our Gibbs step + if i + 1 < len(self.dbm.hidden_layers): + layer_above = self.dbm.hidden_layers[i + 1] + state_above = layer_to_state[layer_above] + state_above = layer_above.downward_state(state_above) + else: + state_above = None + layer_above = None + + if layer_to_clamp[this_layer]: + this_state = layer_to_state[this_layer] + this_sample = this_state + else: + # Compute the Gibbs sampling update + # Sample the state of this layer conditioned + # on its Markov blanket (the layer above and + # layer below) + this_sample = this_layer.sample(state_below=state_below, + state_above=state_above, + layer_above=layer_above, + theano_rng=theano_rng) + + layer_to_updated[this_layer] = this_sample + + #Sample the visible layer + vis_state = layer_to_state[self.dbm.visible_layer] + if layer_to_clamp[self.dbm.visible_layer]: + vis_sample = vis_state + else: + first_hid = self.dbm.hidden_layers[0] + state_above = layer_to_updated[first_hid] + state_above = first_hid.downward_state(state_above) + + vis_sample = self.dbm.visible_layer.sample(state_above=state_above, + layer_above=first_hid, + theano_rng=theano_rng) + layer_to_updated[self.dbm.visible_layer] = vis_sample + + # Sample the odd-numbered layers + for i, this_layer in list(enumerate(self.dbm.hidden_layers))[1::2]: + + # Get the sampled state of the layer below so we can condition + # on it in our Gibbs update + layer_below = self.dbm.hidden_layers[i-1] + + # We want to sample from each conditional distribution + # ***sequentially*** so we must use the updated version + # of the state for the layers whose updates we have + # calculcated already, in 
layer_to_updated. + # If we used the original value from + # layer_to_state + # then we would sample from each conditional + # ***simultaneously*** which does not implement MCMC + # sampling. + state_below = layer_to_updated[layer_below] + + state_below = layer_below.upward_state(state_below) + + # Get the sampled state of the layer above so we can condition + # on it in our Gibbs step + if i + 1 < len(self.dbm.hidden_layers): + layer_above = self.dbm.hidden_layers[i + 1] + state_above = layer_to_updated[layer_above] + state_above = layer_above.downward_state(state_above) + else: + state_above = None + layer_above = None + + if layer_to_clamp[this_layer]: + this_state = layer_to_state[this_layer] + this_sample = this_state + else: + # Compute the Gibbs sampling update + # Sample the state of this layer conditioned + # on its Markov blanket (the layer above and + # layer below) + this_sample = this_layer.sample(state_below=state_below, + state_above=state_above, + layer_above=layer_above, + theano_rng=theano_rng) + + layer_to_updated[this_layer] = this_sample + + # Check that all layers were updated + assert all([layer in layer_to_updated for layer in layer_to_state]) + # Check that we didn't accidentally treat any other object as a layer + assert all([layer in layer_to_state for layer in layer_to_updated]) + # Check that clamping worked + assert all([(layer_to_state[layer] is layer_to_updated[layer]) == + layer_to_clamp[layer] for layer in layer_to_state]) + + return layer_to_updated diff --git a/pylearn2/sandbox/dbm_v2/test_dbm.py b/pylearn2/sandbox/dbm_v2/test_dbm.py new file mode 100644 index 0000000000..d5f5abe646 --- /dev/null +++ b/pylearn2/sandbox/dbm_v2/test_dbm.py @@ -0,0 +1,1214 @@ +from pylearn2.sandbox.dbm_v2.dbm import DBM +from pylearn2.sandbox.dbm_v2.dbm import RBM +from pylearn2.sandbox.dbm_v2.layer import BinaryVector, BinaryVectorMaxPool, Softmax, GaussianVisLayer + +__authors__ = ["Ian Goodfellow", "Devon Hjelm"] +__copyright__ = "Copyright 2012, Universite de Montreal" +__credits__ = ["Ian Goodfellow", "Devon Hjelm"] +__license__ = "3-clause BSD" +__maintainer__ = "LISA Lab" + +import numpy as np +import random +assert hasattr(np, 'exp') + +from theano import config +from theano import function +from theano import printing +from theano import tensor as T +from theano.sandbox.rng_mrg import MRG_RandomStreams + +from pylearn2.expr.basic import is_binary +from pylearn2.expr.nnet import inverse_sigmoid_numpy +from pylearn2.sandbox.dbm_v2.dbm_cost import VariationalCD +from pylearn2.sandbox.dbm_v2.dbm_cost import BaseCD +import pylearn2.testing.datasets as datasets +from pylearn2.space import VectorSpace +from pylearn2.utils import sharedX +from pylearn2.utils import safe_zip +from pylearn2.utils.data_specs import DataSpecsMapping + + +class DummyLayer(object): + """ + A layer that we build for the test that just uses a state + as its downward message. + """ + + def downward_state(self, state): + return state + + def downward_message(self, state): + return state + + +class DummyDBM(object): + """ + A dummy DBM for some of the tests below. + """ + def __init__(self, rng): + self.rng = rng + + +class TestBinaryVector: + """ + Testing class for DBM BinaryVector. 
+ """ + def setUp(self): + pass + @staticmethod + def check_samples(value, expected_shape, expected_mean, tol): + """ + Tests that a matrix of binary samples (observations in rows, variables + in columns) + 1) Has the right shape + 2) Is binary + 3) Converges to the right mean + """ + assert value.shape == expected_shape + assert is_binary(value) + mean = value.mean(axis=0) + max_error = np.abs(mean-expected_mean).max() + print 'Actual mean:' + print mean + print 'Expected mean:' + print expected_mean + print 'Maximal error:', max_error + if max_error > tol: + raise ValueError("Samples don't seem to have the right mean.") + + def test_make_state(self): + # Verifies that BinaryVector.make_state creates + # a shared variable whose value passes check_samples + + n = 5 + num_samples = 1000 + tol = .04 + + layer = BinaryVector(nvis = n) + + rng = np.random.RandomState([2012,11,1]) + + mean = rng.uniform(1e-6, 1. - 1e-6, (n,)) + + z = inverse_sigmoid_numpy(mean) + + layer.set_biases(z.astype(config.floatX)) + + init_state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + + value = init_state.get_value() + + TestBinaryVector.check_samples(value, (num_samples, n), mean, tol) + + def test_sample(self): + # Verifies that BinaryVector.sample returns an expression + # whose value passes check_samples + + assert hasattr(np, 'exp') + + n = 5 + num_samples = 1000 + tol = .04 + + vis = BinaryVector(nvis=n) + hid = DummyLayer() + + rng = np.random.RandomState([2012,11,1,259]) + + mean = rng.uniform(1e-6, 1. - 1e-6, (n,)) + + ofs = rng.randn(n) + + vis.set_biases(ofs.astype(config.floatX)) + + z = inverse_sigmoid_numpy(mean) - ofs + + z_var = sharedX(np.zeros((num_samples, n)) + z) + + theano_rng = MRG_RandomStreams(2012+11+1) + + sample = vis.sample(state_above=z_var, layer_above=hid, + theano_rng=theano_rng) + + sample = sample.eval() + + TestBinaryVector.check_samples(sample, (num_samples, n), mean, tol) + + +class TestGaussianVisLayer: + + def setUp(self): + pass + + @staticmethod + def check_samples(value, nsamples, nvis, rows, cols, channels, expected_mean, tol): + """ + Tests that a matrix of Gaussian samples (observations in rows, variables + in columns) + 1) Has the right shape + 2) Is not binary + 3) Converges to the right mean + + """ + if nvis: + expected_shape = (nsamples, nvis) + else: + expected_shape = (nsamples,rows,cols,channels) + assert value.shape == expected_shape + assert not is_binary(value) + mean = value.mean(axis=0) + max_error = np.abs(mean-expected_mean).max() + print 'Actual mean:' + print mean + print 'Expected mean:' + print expected_mean + print 'Maximal error:', max_error + print 'Tolerable variance:', tol + if max_error > tol: + raise ValueError("Samples don't seem to have the right mean.") + else: + print 'Mean is within expected range' + + def test_make_state(self, n=5, rows=None, cols=None, channels=None, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.make_state. + Verified that GaussianVisLayer creates a shared variable whose value passes check_samples. + In this case the layer lives in a VectorSpace. + + """ + beta = 1/tol # precision parameter + assert (n is None and (rows is not None and cols is not None and channels is not None)) or\ + (n is not None and (rows == cols == channels == None)),\ + "n must be None or rows, cols, and channels must be None" + + rng = np.random.RandomState([2012,11,1]) + if n is not None: + layer = GaussianVisLayer(nvis = n, init_beta=beta) + mean = rng.uniform(1e-6, 1. 
- 1e-6, (n,)) + else: + # axes for batch, rows, cols, channels, can be given in any order + axes = ['b', 0, 1, 'c'] + random.shuffle(axes) + axes = tuple(axes) + layer = GaussianVisLayer(rows=rows, cols=cols, channels=channels, + init_beta=beta, axes=axes) + mean = rng.uniform(1e-6, 1. - 1e-6, (rows, cols, channels)) + + z = mean + layer.set_biases(z.astype(config.floatX)) + init_state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + value = init_state.get_value() + TestGaussianVisLayer.check_samples(value, num_samples, n, rows, cols, channels, mean, tol) + + def test_make_state_conv(self, n=None, rows=3, cols=3, channels=3, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.make_state. + Verifies that GaussianVisLayer.make_state creates a shared variable + whose value passes check_samples. In this case the layer lives in a Conv2DSpace. + + Parameters: + ---------- + n: detector layer dimension. + num_samples: number of samples or observations over each dimension. + tol: tolerace in comparisons + rows: number of rows in convolutional detector. Must be None if n is not None + cols: number of cols in convolutional detector. Must be None if n is not None + channels: number of channels in convolutional detector. Must be None if n is not None + """ + self.test_make_state(n, rows, cols, channels, num_samples, tol) + + def test_sample(self, n=5, rows=None, cols=None, channels=None, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.sample returns an expression whose value passes check_samples. + In this case the layer lives in a VectorSpace. + + Parameters: + ----------- + n: detector layer dimension. + num_samples: number of samples or observations over each dimension. + tol: tolerace in comparisons + rows: number of rows in convolutional detector. Must be None if n is not None + cols: number of cols in convolutional detector. Must be None if n is not None + channels: number of channels in convolutional detector. Must be None if n is not None + """ + assert hasattr(np, 'exp') + + beta = 1/tol # precision parameter + assert (n is None and (rows is not None and cols is not None and channels is not None)) or\ + (n is not None and (rows == cols == channels == None)),\ + "n must be None or rows, cols, and channels must be None" + + rng = np.random.RandomState([2012,11,1,259]) + if n is not None: + vis = GaussianVisLayer(nvis=n, init_beta=beta) + mean = rng.uniform(1e-6, 1. - 1e-6, (n,)) + ofs = rng.randn(n) + else: + # axes for batch, rows, cols, channels, can be given in any order + axes = ['b', 0, 1, 'c'] + random.shuffle(axes) + axes = tuple(axes) + vis = GaussianVisLayer(nvis=None,rows=rows, cols=cols, + channels=channels, init_beta=beta, axes=axes) + mean = rng.uniform(1e-6, 1. - 1e-6, (rows, cols, channels)) + ofs = rng.randn(rows,cols,channels) + + hid = DummyLayer() + vis.set_biases(ofs.astype(config.floatX)) + z=mean -ofs # linear activation function + + if n is not None: + z_var = sharedX(np.zeros((num_samples, n)) + z) + else: + z_var = sharedX(np.zeros((num_samples, rows, cols, channels)) + z) + + theano_rng = MRG_RandomStreams(2012+11+1) + sample = vis.sample(state_above=z_var, layer_above=hid, + theano_rng=theano_rng) + sample = sample.eval() + TestGaussianVisLayer.check_samples(sample, num_samples, n, rows, cols, channels, mean, tol) + + def test_sample_conv(self, n=None, rows=3, cols=3, channels=3, num_samples=1000, tol=0.042): + """ + Verifies that GaussianVisLayer.sample returns an expression whose value passes check_samples. 
+ In this case the layer lives in a Conv2DSpace. + + Parameters: + ----------- + n: detector layer dimension. Set to None for convolutional. + num_samples: number of samples or observations over each dimension. + tol: tolerace in comparisons + rows: number of rows in convolutional detector. Must be None if n is not None + cols: number of cols in convolutional detector. Must be None if n is not None + channels: number of channels in convolutional detector. Must be None if n is not None + """ + self.test_sample(n, rows, cols, channels, num_samples, tol) + + +def check_bvmp_samples(value, num_samples, n, pool_size, mean, tol): + """ + bvmp=BinaryVectorMaxPool + value: a tuple giving (pooled batch, detector batch) (all made with same params) + num_samples: number of samples there should be in the batch + n: detector layer dimension + pool_size: size of each pool region + mean: (expected value of pool unit, expected value of detector units) + tol: amount the emprical mean is allowed to deviate from the analytical expectation + + checks that: + 1) all values are binary + 2) detector layer units are mutually exclusive + 3) pooled unit is max of the detector units + 4) correct number of samples is present + 5) variables are of the right shapes + 6) samples converge to the right expected value + """ + + pv, hv = value + + assert n % pool_size == 0 + num_pools = n // pool_size + + assert pv.ndim == 2 + assert pv.shape[0] == num_samples + assert pv.shape[1] == num_pools + + assert hv.ndim == 2 + assert hv.shape[0] == num_samples + assert hv.shape[1] == n + + assert is_binary(pv) + assert is_binary(hv) + + for i in xrange(num_pools): + sub_p = pv[:,i] + assert sub_p.shape == (num_samples,) + sub_h = hv[:,i*pool_size:(i+1)*pool_size] + assert sub_h.shape == (num_samples, pool_size) + if not np.all(sub_p == sub_h.max(axis=1)): + for j in xrange(num_samples): + print sub_p[j], sub_h[j,:] + assert sub_p[j] == sub_h[j,:] + assert False + assert np.max(sub_h.sum(axis=1)) == 1 + + p, h = mean + assert p.ndim == 1 + assert h.ndim == 1 + emp_p = pv.mean(axis=0) + emp_h = hv.mean(axis=0) + + max_diff = np.abs(p - emp_p).max() + if max_diff > tol: + print 'expected value of pooling units: ',p + print 'empirical expectation: ',emp_p + print 'maximum difference: ',max_diff + raise ValueError("Pooling unit samples have an unlikely mean.") + max_diff = np.abs(h - emp_h).max() + if max_diff > tol: + assert False + +def test_bvmp_make_state(): + + # Verifies that BinaryVector.make_state creates + # a shared variable whose value passes check_binary_samples + + num_pools = 3 + num_samples = 1000 + tol = .04 + rng = np.random.RandomState([2012,11,1,9]) + # pool_size=1 is an important corner case + for pool_size in [1, 2, 5]: + n = num_pools * pool_size + + layer = BinaryVectorMaxPool( + detector_layer_dim=n, + layer_name='h', + irange=1., + pool_size=pool_size) + + # This is just to placate mf_update below + input_space = VectorSpace(1) + class DummyDBM(object): + def __init__(self): + self.rng = rng + layer.set_dbm(DummyDBM()) + layer.set_input_space(input_space) + + layer.set_biases(rng.uniform(-pool_size, 1., (n,)).astype(config.floatX)) + + # To find the mean of the samples, we use mean field with an input of 0 + mean = layer.mf_update( + state_below=T.alloc(0., 1, 1), + state_above=None, + layer_above=None) + + mean = function([], mean)() + + mean = [ mn[0,:] for mn in mean ] + + state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + + value = [elem.get_value() for elem in state] + + 
check_bvmp_samples(value, num_samples, n, pool_size, mean, tol) + + +def make_random_basic_binary_dbm( + rng, + pool_size_1, + num_vis = None, + num_pool_1 = None, + num_pool_2 = None, + pool_size_2 = None, + center = False + ): + """ + Makes a DBM with BinaryVector for the visible layer, + and two hidden layers of type BinaryVectorMaxPool. + The weights and biases are initialized randomly with + somewhat large values (i.e., not what you'd want to + use for learning) + + rng: A numpy RandomState. + pool_size_1: The size of the pools to use in the first + layer. + """ + + if num_vis is None: + num_vis = rng.randint(1,11) + if num_pool_1 is None: + num_pool_1 = rng.randint(1,11) + if num_pool_2 is None: + num_pool_2 = rng.randint(1,11) + if pool_size_2 is None: + pool_size_2 = rng.randint(1,6) + + num_h1 = num_pool_1 * pool_size_1 + num_h2 = num_pool_2 * pool_size_2 + + v = BinaryVector(num_vis, center=center) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX), recenter=center) + + h1 = BinaryVectorMaxPool( + detector_layer_dim = num_h1, + pool_size = pool_size_1, + layer_name = 'h1', + center = center, + irange = 1.) + h1.set_biases(rng.uniform(-1., 1., (num_h1,)).astype(config.floatX), recenter=center) + + h2 = BinaryVectorMaxPool( + center = center, + detector_layer_dim = num_h2, + pool_size = pool_size_2, + layer_name = 'h2', + irange = 1.) + h2.set_biases(rng.uniform(-1., 1., (num_h2,)).astype(config.floatX), recenter=center) + + dbm = DBM(visible_layer = v, + hidden_layers = [h1, h2], + batch_size = 1, + niter = 50) + + return dbm + + +def test_bvmp_mf_energy_consistent(): + + # A test of the BinaryVectorMaxPool class + # Verifies that the mean field update is consistent with + # the energy function + + # Specifically, in a DBM consisting of (v, h1, h2), the + # lack of intra-layer connections means that + # P(h1|v, h2) is factorial so mf_update tells us the true + # conditional. 
+ # We also know P(h1[i] | h1[-i], v) + # = P(h, v) / P(h[-i], v) + # = P(h, v) / sum_h[i] P(h, v) + # = exp(-E(h, v)) / sum_h[i] exp(-E(h, v)) + # So we can check that computing P(h[i] | v) with both + # methods works the same way + + rng = np.random.RandomState([2012,11,1,613]) + + def do_test(pool_size_1): + + # Make DBM and read out its pieces + dbm = make_random_basic_binary_dbm( + rng = rng, + pool_size_1 = pool_size_1, + ) + + v = dbm.visible_layer + h1, h2 = dbm.hidden_layers + + num_p = h1.get_output_space().dim + + # Choose which unit we will test + p_idx = rng.randint(num_p) + + # Randomly pick a v, h1[-p_idx], and h2 to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + h1_state = layer_to_state[h1] + h2_state = layer_to_state[h2] + + # Debugging checks + num_h = h1.detector_layer_dim + assert num_p * pool_size_1 == num_h + pv, hv = h1_state + assert pv.get_value().shape == (1, num_p) + assert hv.get_value().shape == (1, num_h) + + # Infer P(h1[i] | h2, v) using mean field + expected_p, expected_h = h1.mf_update( + state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2) + + expected_p = expected_p[0, p_idx] + expected_h = expected_h[0, p_idx * pool_size : (p_idx + 1) * pool_size] + + expected_p, expected_h = function([], [expected_p, expected_h])() + + # Infer P(h1[i] | h2, v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [h1_state, h2_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + p_state, h_state = h1_state + + def compute_unnormalized_prob(which_detector): + write_h = np.zeros((pool_size_1,)) + if which_detector is None: + write_p = 0. + else: + write_p = 1. + write_h[which_detector] = 1. + + h_value = h_state.get_value() + p_value = p_state.get_value() + + h_value[0, p_idx * pool_size : (p_idx + 1) * pool_size] = write_h + p_value[0, p_idx] = write_p + + h_state.set_value(h_value) + p_state.set_value(p_value) + + return unnormalized_prob() + + off_prob = compute_unnormalized_prob(None) + on_probs = [compute_unnormalized_prob(idx) for idx in xrange(pool_size)] + denom = off_prob + sum(on_probs) + off_prob /= denom + on_probs = [on_prob / denom for on_prob in on_probs] + assert np.allclose(1., off_prob + sum(on_probs)) + + # np.asarray(on_probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((pool_size_1,)) + for i in xrange(pool_size_1): + wtf_numpy[i] = on_probs[i] + on_probs = wtf_numpy + + # Check that they match + if not np.allclose(expected_p, 1. - off_prob): + print 'mean field expectation of p:',expected_p + print 'expectation of p based on enumerating energy function values:',1. - off_prob + print 'pool_size_1:',pool_size_1 + + assert False + if not np.allclose(expected_h, on_probs): + print 'mean field expectation of h:',expected_h + print 'expectation of h based on enumerating energy function values:',on_probs + assert False + + # 1 is an important corner case + # We must also run with a larger number to test the general case + for pool_size in [1, 2, 5]: + do_test(pool_size) + + +def test_bvmp_mf_energy_consistent_center(): + """ + A test of the BinaryVectorMaxPool class + Verifies that the mean field update is consistent with + the energy function when using Gregoire Montavon's centering + trick. 
+ + Specifically, in a DBM consisting of (v, h1, h2), the + lack of intra-layer connections means that + P(h1|v, h2) is factorial so mf_update tells us the true + conditional. + We also know P(h1[i] | h1[-i], v) + = P(h, v) / P(h[-i], v) + = P(h, v) / sum_h[i] P(h, v) + = exp(-E(h, v)) / sum_h[i] exp(-E(h, v)) + So we can check that computing P(h[i] | v) with both + methods works the same way + + :return: + """ + rng = np.random.RandomState([2012,11,1,613]) + + def do_test(pool_size_1): + + # Make DBM and read out its pieces + dbm = make_random_basic_binary_dbm( + rng = rng, + pool_size_1 = pool_size_1, + pool_size_2 = 1, # centering is only updated for pool size 1 + center = True + ) + + v = dbm.visible_layer + h1, h2 = dbm.hidden_layers + + num_p = h1.get_output_space().dim + + # Choose which unit we will test + p_idx = rng.randint(num_p) + + # Randomly pick a v, h1[-p_idx], and h2 to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + h1_state = layer_to_state[h1] + h2_state = layer_to_state[h2] + + # Debugging checks + num_h = h1.detector_layer_dim + assert num_p * pool_size_1 == num_h + pv, hv = h1_state + assert pv.get_value().shape == (1, num_p) + assert hv.get_value().shape == (1, num_h) + + # Infer P(h1[i] | h2, v) using mean field + expected_p, expected_h = h1.mf_update( + state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2) + + expected_p = expected_p[0, p_idx] + expected_h = expected_h[0, p_idx * pool_size_1 : (p_idx + 1) * pool_size_1] + + expected_p, expected_h = function([], [expected_p, expected_h])() + + # Infer P(h1[i] | h2, v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [h1_state, h2_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + p_state, h_state = h1_state + + def compute_unnormalized_prob(which_detector): + write_h = np.zeros((pool_size_1,)) + if which_detector is None: + write_p = 0. + else: + write_p = 1. + write_h[which_detector] = 1. + + h_value = h_state.get_value() + p_value = p_state.get_value() + + h_value[0, p_idx * pool_size_1 : (p_idx + 1) * pool_size_1] = write_h + p_value[0, p_idx] = write_p + + h_state.set_value(h_value) + p_state.set_value(p_value) + + return unnormalized_prob() + + off_prob = compute_unnormalized_prob(None) + on_probs = [compute_unnormalized_prob(idx) for idx in xrange(pool_size_1)] + denom = off_prob + sum(on_probs) + off_prob /= denom + on_probs = [on_prob / denom for on_prob in on_probs] + assert np.allclose(1., off_prob + sum(on_probs)) + + # np.asarray(on_probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((pool_size_1,)) + for i in xrange(pool_size_1): + wtf_numpy[i] = on_probs[i] + on_probs = wtf_numpy + + # Check that they match + if not np.allclose(expected_p, 1. - off_prob): + print 'mean field expectation of p:',expected_p + print 'expectation of p based on enumerating energy function values:',1. 
- off_prob + print 'pool_size_1:',pool_size_1 + + assert False + if not np.allclose(expected_h, on_probs): + print 'mean field expectation of h:',expected_h + print 'expectation of h based on enumerating energy function values:',on_probs + assert False + + # 1 is the only pool size for which centering is implemented + do_test(1) + +def test_bvmp_mf_sample_consistent(): + + # A test of the BinaryVectorMaxPool class + # Verifies that the mean field update is consistent with + # the sampling function + + # Specifically, in a DBM consisting of (v, h1, h2), the + # lack of intra-layer connections means that + # P(h1|v, h2) is factorial so mf_update tells us the true + # conditional. + # We can thus use mf_update to compute the expected value + # of a sample of h1 from v and h2, and check that samples + # drawn using the layer's sample method convert to that + # value. + + rng = np.random.RandomState([2012,11,1,1016]) + theano_rng = MRG_RandomStreams(2012+11+1+1036) + num_samples = 1000 + tol = .042 + + def do_test(pool_size_1): + + # Make DBM and read out its pieces + dbm = make_random_basic_binary_dbm( + rng = rng, + pool_size_1 = pool_size_1, + ) + + v = dbm.visible_layer + h1, h2 = dbm.hidden_layers + + num_p = h1.get_output_space().dim + + # Choose which unit we will test + p_idx = rng.randint(num_p) + + # Randomly pick a v, h1[-p_idx], and h2 to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + h1_state = layer_to_state[h1] + h2_state = layer_to_state[h2] + + # Debugging checks + num_h = h1.detector_layer_dim + assert num_p * pool_size_1 == num_h + pv, hv = h1_state + assert pv.get_value().shape == (1, num_p) + assert hv.get_value().shape == (1, num_h) + + # Infer P(h1[i] | h2, v) using mean field + expected_p, expected_h = h1.mf_update( + state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2) + + expected_p = expected_p[0, :] + expected_h = expected_h[0, :] + + expected_p, expected_h = function([], [expected_p, expected_h])() + + # copy all the states out into a batch size of num_samples + cause_copy = sharedX(np.zeros((num_samples,))).dimshuffle(0,'x') + v_state = v_state[0,:] + cause_copy + p, h = h1_state + h1_state = (p[0,:] + cause_copy, h[0,:] + cause_copy) + p, h = h2_state + h2_state = (p[0,:] + cause_copy, h[0,:] + cause_copy) + + h1_samples = h1.sample(state_below = v.upward_state(v_state), + state_above = h2.downward_state(h2_state), + layer_above = h2, theano_rng = theano_rng) + + h1_samples = function([], h1_samples)() + + + check_bvmp_samples(h1_samples, num_samples, num_h, pool_size, (expected_p, expected_h), tol) + + + # 1 is an important corner case + # We must also run with a larger number to test the general case + for pool_size in [1, 2, 5]: + do_test(pool_size) + +def check_multinomial_samples(value, expected_shape, expected_mean, tol): + """ + Tests that a matrix of multinomial samples (observations in rows, variables + in columns) + 1) Has the right shape + 2) Is binary + 3) Has one 1 per row + 4) Converges to the right mean + """ + assert value.shape == expected_shape + assert is_binary(value) + assert np.all(value.sum(axis=1) == 1) + mean = value.mean(axis=0) + max_error = np.abs(mean-expected_mean).max() + if max_error > tol: + print 'Actual mean:' + print mean + print 'Expected mean:' + print expected_mean + print 'Maximal error:', max_error + raise ValueError("Samples don't seem to have the right mean.") + +def 
test_softmax_make_state(): + + # Verifies that BinaryVector.make_state creates + # a shared variable whose value passes check_multinomial_samples + + n = 5 + num_samples = 1000 + tol = .04 + + layer = Softmax(n_classes = n, layer_name = 'y') + + rng = np.random.RandomState([2012, 11, 1, 11]) + + z = 3 * rng.randn(n) + + mean = np.exp(z) + mean /= mean.sum() + + layer.set_biases(z.astype(config.floatX)) + + state = layer.make_state(num_examples=num_samples, + numpy_rng=rng) + + value = state.get_value() + + check_multinomial_samples(value, (num_samples, n), mean, tol) + +def test_softmax_mf_energy_consistent(): + + # A test of the Softmax class + # Verifies that the mean field update is consistent with + # the energy function + + # Since a Softmax layer contains only one random variable + # (with n_classes possible values) the mean field assumption + # does not impose any restriction so mf_update simply gives + # the true expected value of h given v. + # We also know P(h | v) + # = P(h, v) / P( v) + # = P(h, v) / sum_h P(h, v) + # = exp(-E(h, v)) / sum_h exp(-E(h, v)) + # So we can check that computing P(h | v) with both + # methods works the same way + + rng = np.random.RandomState([2012,11,1,1131]) + + # Make DBM + num_vis = rng.randint(1,11) + n_classes = rng.randint(1, 11) + + v = BinaryVector(num_vis) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX)) + + y = Softmax( + n_classes = n_classes, + layer_name = 'y', + irange = 1.) + y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX)) + + dbm = DBM(visible_layer = v, + hidden_layers = [y], + batch_size = 1, + niter = 50) + + # Randomly pick a v to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + y_state = layer_to_state[y] + + # Infer P(y | v) using mean field + expected_y = y.mf_update( + state_below = v.upward_state(v_state)) + + expected_y = expected_y[0, :] + + expected_y = expected_y.eval() + + # Infer P(y | v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [y_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + def compute_unnormalized_prob(which): + write_y = np.zeros((n_classes,)) + write_y[which] = 1. + + y_value = y_state.get_value() + + y_value[0, :] = write_y + + y_state.set_value(y_value) + + return unnormalized_prob() + + probs = [compute_unnormalized_prob(idx) for idx in xrange(n_classes)] + denom = sum(probs) + probs = [on_prob / denom for on_prob in probs] + + # np.asarray(probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((n_classes,)) + for i in xrange(n_classes): + wtf_numpy[i] = probs[i] + probs = wtf_numpy + + if not np.allclose(expected_y, probs): + print 'mean field expectation of h:',expected_y + print 'expectation of h based on enumerating energy function values:',probs + assert False + +def test_softmax_mf_energy_consistent_centering(): + + # A test of the Softmax class + # Verifies that the mean field update is consistent with + # the energy function when using the centering trick + + # Since a Softmax layer contains only one random variable + # (with n_classes possible values) the mean field assumption + # does not impose any restriction so mf_update simply gives + # the true expected value of h given v. 
+ # We also know P(h | v) + # = P(h, v) / P( v) + # = P(h, v) / sum_h P(h, v) + # = exp(-E(h, v)) / sum_h exp(-E(h, v)) + # So we can check that computing P(h | v) with both + # methods works the same way + + rng = np.random.RandomState([2012,11,1,1131]) + + # Make DBM + num_vis = rng.randint(1,11) + n_classes = rng.randint(1, 11) + + v = BinaryVector(num_vis, center=True) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX), recenter=True) + + y = Softmax( + n_classes = n_classes, + layer_name = 'y', + irange = 1., center=True) + y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX), recenter=True) + + dbm = DBM(visible_layer = v, + hidden_layers = [y], + batch_size = 1, + niter = 50) + + # Randomly pick a v to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + y_state = layer_to_state[y] + + # Infer P(y | v) using mean field + expected_y = y.mf_update( + state_below = v.upward_state(v_state)) + + expected_y = expected_y[0, :] + + expected_y = expected_y.eval() + + # Infer P(y | v) using the energy function + energy = dbm.energy(V = v_state, + hidden = [y_state]) + unnormalized_prob = T.exp(-energy) + assert unnormalized_prob.ndim == 1 + unnormalized_prob = unnormalized_prob[0] + unnormalized_prob = function([], unnormalized_prob) + + def compute_unnormalized_prob(which): + write_y = np.zeros((n_classes,)) + write_y[which] = 1. + + y_value = y_state.get_value() + + y_value[0, :] = write_y + + y_state.set_value(y_value) + + return unnormalized_prob() + + probs = [compute_unnormalized_prob(idx) for idx in xrange(n_classes)] + denom = sum(probs) + probs = [on_prob / denom for on_prob in probs] + + # np.asarray(probs) doesn't make a numpy vector, so I do it manually + wtf_numpy = np.zeros((n_classes,)) + for i in xrange(n_classes): + wtf_numpy[i] = probs[i] + probs = wtf_numpy + + if not np.allclose(expected_y, probs): + print 'mean field expectation of h:',expected_y + print 'expectation of h based on enumerating energy function values:',probs + assert False + +def test_softmax_mf_sample_consistent(): + + # A test of the Softmax class + # Verifies that the mean field update is consistent with + # the sampling function + + # Since a Softmax layer contains only one random variable + # (with n_classes possible values) the mean field assumption + # does not impose any restriction so mf_update simply gives + # the true expected value of h given v. + # We can thus use mf_update to compute the expected value + # of a sample of y conditioned on v, and check that samples + # drawn using the layer's sample method convert to that + # value. + + rng = np.random.RandomState([2012,11,1,1154]) + theano_rng = MRG_RandomStreams(2012+11+1+1154) + num_samples = 1000 + tol = .042 + + # Make DBM + num_vis = rng.randint(1,11) + n_classes = rng.randint(1, 11) + + v = BinaryVector(num_vis) + v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX)) + + y = Softmax( + n_classes = n_classes, + layer_name = 'y', + irange = 1.) 
+ y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX)) + + dbm = DBM(visible_layer = v, + hidden_layers = [y], + batch_size = 1, + niter = 50) + + # Randomly pick a v to condition on + # (Random numbers are generated via dbm.rng) + layer_to_state = dbm.make_layer_to_state(1) + v_state = layer_to_state[v] + y_state = layer_to_state[y] + + # Infer P(y | v) using mean field + expected_y = y.mf_update( + state_below = v.upward_state(v_state)) + + expected_y = expected_y[0, :] + + expected_y = expected_y.eval() + + # copy all the states out into a batch size of num_samples + cause_copy = sharedX(np.zeros((num_samples,))).dimshuffle(0,'x') + v_state = v_state[0,:] + cause_copy + y_state = y_state[0,:] + cause_copy + + y_samples = y.sample(state_below = v.upward_state(v_state), theano_rng=theano_rng) + + y_samples = function([], y_samples)() + + check_multinomial_samples(y_samples, (num_samples, n_classes), expected_y, tol) + + +def test_make_symbolic_state(): + # Tests whether the returned p_sample and h_sample have the right + # dimensions + num_examples = 40 + theano_rng = MRG_RandomStreams(2012+11+1) + + visible_layer = BinaryVector(nvis=100) + rval = visible_layer.make_symbolic_state(num_examples=num_examples, + theano_rng=theano_rng) + + hidden_layer = BinaryVectorMaxPool(detector_layer_dim=500, + pool_size=1, + layer_name='h', + irange=0.05, + init_bias=-2.0) + p_sample, h_sample = hidden_layer.make_symbolic_state(num_examples=num_examples, + theano_rng=theano_rng) + + softmax_layer = Softmax(n_classes=10, layer_name='s', irange=0.05) + h_sample_s = softmax_layer.make_symbolic_state(num_examples=num_examples, + theano_rng=theano_rng) + + required_shapes = [(40, 100), (40, 500), (40, 500), (40, 10)] + f = function(inputs=[], + outputs=[rval, p_sample, h_sample, h_sample_s]) + + for s, r in zip(f(), required_shapes): + assert s.shape == r + + + +def check_gradients(expected_grad, actual_grad, corr_tol=0.8, mean_tol=0.05): + corr = np.corrcoef(expected_grad.flatten(), actual_grad.flatten())[0,1] + assert corr >= corr_tol,\ + ("Correlation did not pass: (%.2f > %.2f)\n" % (corr_tol, corr)) +\ + ("Expected:\n %r\n" % expected_grad) +\ + ("Actual:\n %r" % actual_grad) + assert abs(np.mean(expected_grad) - np.mean(actual_grad)) < mean_tol,\ + "Mean did not pass (%.2f expected vs %.2f actual)" %\ + (np.mean(expected_grad), np.mean(actual_grad)) + +def make_rbm(num_visible, num_hidden, batch_size, center=False, rng=None): + if rng is None: + rng = np.random.RandomState([2014,10,7]) + + visible_layer = BinaryVector(nvis=num_visible) + visible_layer.set_biases(rng.uniform(-1., 1., (num_visible,)).astype(config.floatX)) + hidden_layer = BinaryVectorMaxPool(detector_layer_dim=num_hidden, + pool_size=1, + layer_name='h', + irange=0.05, + init_bias=-2.0, + center=center) + hidden_layer.set_biases(rng.uniform(-1., 1., (num_hidden,)).astype(config.floatX), recenter=center) + model = RBM(visible_layer=visible_layer, + hidden_layer=hidden_layer, + batch_size=batch_size, niter=1) + + return model + +class Test_CD(object): + """ + Class to test contrastive divergence. 
+ """ + + @staticmethod + def check_rbm_pos_phase(rbm, cost, X, tol=0.8): + + pos_grads, updates = cost._get_positive_phase(rbm, X) + + visible_layer = rbm.visible_layer + hidden_layer = rbm.hidden_layers[0] + P_H0_given_X = hidden_layer.mf_update(state_below=visible_layer.upward_state(X), + state_above=None, layer_above=None)[1] + + dW_pos_exp = -1 * np.dot(X.eval().T, P_H0_given_X.eval()) / rbm.batch_size + dW_pos_act = pos_grads[hidden_layer.transformer.get_params()[0]].eval() + check_gradients(dW_pos_exp, dW_pos_act, corr_tol=tol) + + dvb_pos_exp = -np.mean(X.eval(), axis=0) + dvb_pos_act = pos_grads[visible_layer.bias].eval() + check_gradients(dvb_pos_exp, dvb_pos_act, corr_tol=tol) + + dvh_pos_exp = -np.mean(P_H0_given_X.eval(), axis=0) + dvh_pos_act = pos_grads[hidden_layer.b].eval() + check_gradients(dvh_pos_exp, dvh_pos_act, corr_tol=tol) + + return pos_grads, updates + + @staticmethod + def check_rbm_neg_phase(rbm, cost, X, theano_rng=None, tol=0.85): + + assert theano_rng is not None + + neg_grads, updates = cost._get_negative_phase(rbm, X) + + visible_layer = rbm.visible_layer + hidden_layer = rbm.hidden_layers[0] + + P_H0_given_X = hidden_layer.mf_update(state_below = visible_layer.upward_state(X), + state_above=None, layer_above=None)[1] + H0 = hidden_layer.sample(state_below=visible_layer.upward_state(X), + state_above=None, layer_above=None, + theano_rng=theano_rng)[1] + V1 = visible_layer.sample(state_above=H0, layer_above=hidden_layer, + theano_rng=theano_rng) + P_H1_given_V1 = hidden_layer.mf_update(state_below=visible_layer.upward_state(V1), + state_above=None, layer_above=None)[1] + dW_neg_act = neg_grads[hidden_layer.transformer.get_params()[0]].eval() + dW_neg_exp = np.dot(V1.eval().T, P_H1_given_V1.eval()) / rbm.batch_size + check_gradients(dW_neg_exp, dW_neg_act, corr_tol=tol) + + dvb_neg_exp = np.mean(V1.eval(), axis=0) + dvb_neg_act = neg_grads[visible_layer.bias].eval() + check_gradients(dvb_neg_exp, dvb_neg_act, corr_tol=tol) + + dvh_neg_exp = np.mean(P_H1_given_V1.eval(), axis=0) + dvh_neg_act = neg_grads[hidden_layer.b].eval() + check_gradients(dvh_neg_exp, dvh_neg_act, corr_tol=tol) + + return neg_grads, updates + + def test_rbm(self, num_visible=100, num_hidden=50, batch_size=5000, variational=False): + rng = np.random.RandomState([2014,10,7]) + theano_rng = MRG_RandomStreams(2024+30+9) + + # Set up the RBM (One hidden layer DBM) + rbm = make_rbm(num_visible, num_hidden, batch_size, rng=rng) + + if variational: + cost = VariationalCD(num_gibbs_steps=1) + else: + cost = BaseCD(num_gibbs_steps=1) + + # Set the data + X = sharedX(rng.randn(batch_size, num_visible)) + # Get the gradients from the cost function + grads, updates = cost.get_gradients(rbm, X) + Test_CD.check_rbm_pos_phase(rbm, cost, X) + Test_CD.check_rbm_neg_phase(rbm, cost, X, theano_rng=theano_rng) + + def test_rbm_varational(self, num_visible=100, num_hidden=50, batch_size=200): + self.test_rbm(num_visible, num_hidden, batch_size, variational=True) From d3e0293dfa7cc9ca49071099fda903aae881ded8 Mon Sep 17 00:00:00 2001 From: rdevon Date: Tue, 28 Oct 2014 18:51:57 -0600 Subject: [PATCH 3/5] Updated whitelist. Travis hopefully passes now. 
--- pylearn2/devtools/tests/test_format.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pylearn2/devtools/tests/test_format.py b/pylearn2/devtools/tests/test_format.py index f10e931379..ed5f337118 100644 --- a/pylearn2/devtools/tests/test_format.py +++ b/pylearn2/devtools/tests/test_format.py @@ -78,6 +78,14 @@ "sandbox/lisa_rl/bandit/classifier_bandit.py", "sandbox/lisa_rl/bandit/classifier_agent.py", "sandbox/lisa_rl/bandit/plot_reward.py", + "sandbox/dbm_v2/layer.py", + "sandbox/dbm_v2/dbm.py", + "sandbox/dbm_v2/dbm_cost.py", + "sandbox/dbm_v2/__init__.py", + "sandbox/dbm_v2/test_dbm.py", + "sandbox/dbm_v2/ising.py", + "sandbox/dbm_v2/sampling_procedure.py", + "sandbox/dbm_v2/inference_procedure.py", "config/old_config.py", "utils/utlc.py", "utils/tests/test_serial.py", From aafda2e8588355e7667b913d77a1957c04607fb6 Mon Sep 17 00:00:00 2001 From: rdevon Date: Tue, 28 Oct 2014 18:51:57 -0600 Subject: [PATCH 4/5] Updated whitelist. Travis hopefully passes now. --- pylearn2/devtools/tests/test_format.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pylearn2/devtools/tests/test_format.py b/pylearn2/devtools/tests/test_format.py index f10e931379..a7de68b56c 100644 --- a/pylearn2/devtools/tests/test_format.py +++ b/pylearn2/devtools/tests/test_format.py @@ -78,6 +78,14 @@ "sandbox/lisa_rl/bandit/classifier_bandit.py", "sandbox/lisa_rl/bandit/classifier_agent.py", "sandbox/lisa_rl/bandit/plot_reward.py", + "sandbox/dbm_v2/layer.py", + "sandbox/dbm_v2/dbm.py", + "sandbox/dbm_v2/dbm_cost.py", + "sandbox/dbm_v2/__init__.py", + "sandbox/dbm_v2/test_dbm.py", + "sandbox/dbm_v2/ising.py", + "sandbox/dbm_v2/sampling_procedure.py", + "sandbox/dbm_v2/inference_procedure.py", "config/old_config.py", "utils/utlc.py", "utils/tests/test_serial.py", @@ -400,6 +408,14 @@ 'sandbox/lisa_rl/bandit/classifier_agent.py', 'sandbox/lisa_rl/bandit/gaussian_bandit.py', 'sandbox/lisa_rl/__init__.py', + "sandbox/dbm_v2/layer.py", + "sandbox/dbm_v2/dbm.py", + "sandbox/dbm_v2/dbm_cost.py", + "sandbox/dbm_v2/__init__.py", + "sandbox/dbm_v2/test_dbm.py", + "sandbox/dbm_v2/ising.py", + "sandbox/dbm_v2/sampling_procedure.py", + "sandbox/dbm_v2/inference_procedure.py", 'config/old_config.py', 'config/tests/test_yaml_parse.py', 'config/yaml_parse.py', From 7df1aa6e591d2bf25b8f71ba175d8f12ac76f9eb Mon Sep 17 00:00:00 2001 From: rdevon Date: Sat, 1 Nov 2014 15:09:10 -0600 Subject: [PATCH 5/5] Modified sampling procedure for CD in RBM and modified tests to not use just random weights and input. For now I switched to a sampling pricedure that's more true to CD-1 for RBM. Modified tests to use data that is closer to weights than just random. --- pylearn2/sandbox/dbm_v2/sampling_procedure.py | 178 +++++++----------- pylearn2/sandbox/dbm_v2/test_dbm.py | 11 +- 2 files changed, 76 insertions(+), 113 deletions(-) diff --git a/pylearn2/sandbox/dbm_v2/sampling_procedure.py b/pylearn2/sandbox/dbm_v2/sampling_procedure.py index 134f94a37d..a439633936 100644 --- a/pylearn2/sandbox/dbm_v2/sampling_procedure.py +++ b/pylearn2/sandbox/dbm_v2/sampling_procedure.py @@ -58,146 +58,106 @@ def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, class GibbsEvenOdd(SamplingProcedure): """ + Even-odd Gibbs sampling. The specific sampling schedule used to sample all of the even-idexed layers of model.hidden_layers, then the visible layer and all the odd-indexed layers. 
""" - def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, - num_steps=1): + def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, num_steps=1): """ - .. todo:: + Samples from self.dbm using `layer_to_state` as starting values. + Parameters + ---------- + layer_to_state : dict + Maps the DBM's Layer instances to theano variables representing + batches of samples of them. + theano_rng : theano.sandbox.rng_mrg.MRG_RandomStreams WRITEME + layer_to_clamp : dict, optional + Maps Layers to bools. If a layer is not in the dictionary, + defaults to False. True indicates that this layer should be + clamped, so we are sampling from a conditional distribution + rather than the joint distribution. + num_steps : int, optional + Steps to sample the odd units. Evens are always done num_steps+1 + TODO: this is a hack to make CD work for RBM for now. + + Returns + ------- + layer_to_updated_state : dict + Maps the DBM's Layer instances to theano variables representing + batches of updated samples of them. """ - # Validate num_steps - assert isinstance(num_steps, py_integer_types) - assert num_steps > 0 - - # Implement the num_steps > 1 case by repeatedly calling the - # num_steps == 1 case - if num_steps != 1: - for i in xrange(num_steps): - layer_to_state = self.sample(layer_to_state, theano_rng, - layer_to_clamp, num_steps=1) - return layer_to_state - - # The rest of the function is the num_steps = 1 case - # Current code assumes this, though we could certainly relax this - # constraint - assert len(self.dbm.hidden_layers) > 0 - - # Validate layer_to_clamp / make sure layer_to_clamp is a fully - # populated dictionary + + assert isinstance(num_steps, py_integer_types) and num_steps >= 0 + if layer_to_clamp is None: layer_to_clamp = OrderedDict() for key in layer_to_clamp: - assert (key is self.dbm.visible_layer or - key in self.dbm.hidden_layers) + assert key in self.dbm.hidden_layers + [self.dbm.visible_layer] - for layer in [self.dbm.visible_layer] + self.dbm.hidden_layers: + # Set layer to clamps. + for layer in self.dbm.hidden_layers + [self.dbm.visible_layer]: if layer not in layer_to_clamp: layer_to_clamp[layer] = False - # Assemble the return value + # Set all updates to the initial states. + # For now, assert that the layer_to_state is full, but we will change + # this later. Right now the cost function initializes everything. + for layer in [self.dbm.visible_layer] + self.dbm.hidden_layers: + assert layer in layer_to_state layer_to_updated = OrderedDict() + for layer in [self.dbm.visible_layer] + self.dbm.hidden_layers: + layer_to_updated[layer] = layer_to_state[layer] - for i, this_layer in list(enumerate(self.dbm.hidden_layers))[::2]: - # Iteration i does the Gibbs step for hidden_layers[i] + def update(i, this_layer): + if layer_to_clamp[this_layer]: + return - # Get the sampled state of the layer below so we can condition - # on it in our Gibbs update - if i == 0: - layer_below = self.dbm.visible_layer + # States and layers below + if i == -1: # visible layer, will change. 
+ layer_below = None + state_below = None else: - layer_below = self.dbm.hidden_layers[i-1] - state_below = layer_to_state[layer_below] - state_below = layer_below.upward_state(state_below) - - # Get the sampled state of the layer above so we can condition - # on it in our Gibbs step + if i == 0: + layer_below = self.dbm.visible_layer + elif i > 0: + layer_below = self.dbm.hidden_layers[i-1] + state_below = layer_to_updated[layer_below] + state_below = layer_below.upward_state(state_below) + + # States and layers above if i + 1 < len(self.dbm.hidden_layers): layer_above = self.dbm.hidden_layers[i + 1] - state_above = layer_to_state[layer_above] + state_above = layer_to_updated[layer_above] state_above = layer_above.downward_state(state_above) else: - state_above = None layer_above = None + state_above = None - if layer_to_clamp[this_layer]: - this_state = layer_to_state[this_layer] - this_sample = this_state - else: - # Compute the Gibbs sampling update - # Sample the state of this layer conditioned - # on its Markov blanket (the layer above and - # layer below) - this_sample = this_layer.sample(state_below=state_below, - state_above=state_above, - layer_above=layer_above, - theano_rng=theano_rng) + this_sample = this_layer.sample(state_below=state_below, + state_above=state_above, + layer_above=layer_above, + theano_rng=theano_rng) layer_to_updated[this_layer] = this_sample - #Sample the visible layer - vis_state = layer_to_state[self.dbm.visible_layer] - if layer_to_clamp[self.dbm.visible_layer]: - vis_sample = vis_state - else: - first_hid = self.dbm.hidden_layers[0] - state_above = layer_to_updated[first_hid] - state_above = first_hid.downward_state(state_above) - - vis_sample = self.dbm.visible_layer.sample(state_above=state_above, - layer_above=first_hid, - theano_rng=theano_rng) - layer_to_updated[self.dbm.visible_layer] = vis_sample - - # Sample the odd-numbered layers - for i, this_layer in list(enumerate(self.dbm.hidden_layers))[1::2]: - - # Get the sampled state of the layer below so we can condition - # on it in our Gibbs update - layer_below = self.dbm.hidden_layers[i-1] - - # We want to sample from each conditional distribution - # ***sequentially*** so we must use the updated version - # of the state for the layers whose updates we have - # calculcated already, in layer_to_updated. - # If we used the original value from - # layer_to_state - # then we would sample from each conditional - # ***simultaneously*** which does not implement MCMC - # sampling. 
- state_below = layer_to_updated[layer_below] - - state_below = layer_below.upward_state(state_below) - - # Get the sampled state of the layer above so we can condition - # on it in our Gibbs step - if i + 1 < len(self.dbm.hidden_layers): - layer_above = self.dbm.hidden_layers[i + 1] - state_above = layer_to_updated[layer_above] - state_above = layer_above.downward_state(state_above) - else: - state_above = None - layer_above = None + evens = list(enumerate(self.dbm.hidden_layers))[::2] + # Odds are the visible layer plus the odd hidden layers + odds = [(-1, self.dbm.visible_layer)] + list(enumerate(self.dbm.hidden_layers))[1::2] - if layer_to_clamp[this_layer]: - this_state = layer_to_state[this_layer] - this_sample = this_state - else: - # Compute the Gibbs sampling update - # Sample the state of this layer conditioned - # on its Markov blanket (the layer above and - # layer below) - this_sample = this_layer.sample(state_below=state_below, - state_above=state_above, - layer_above=layer_above, - theano_rng=theano_rng) + update_count = OrderedDict((layer, 0) for l in [self.dbm.visible_layer] + self.dbm.hidden_layers) - layer_to_updated[this_layer] = this_sample + for i, this_layer in evens: + update(i, this_layer) + for s in xrange(num_steps): + for i, this_layer in odds: + update(i, this_layer) + for i, this_layer in evens: + update(i, this_layer) # Check that all layers were updated assert all([layer in layer_to_updated for layer in layer_to_state]) @@ -207,4 +167,4 @@ def sample(self, layer_to_state, theano_rng, layer_to_clamp=None, assert all([(layer_to_state[layer] is layer_to_updated[layer]) == layer_to_clamp[layer] for layer in layer_to_state]) - return layer_to_updated + return layer_to_updated \ No newline at end of file diff --git a/pylearn2/sandbox/dbm_v2/test_dbm.py b/pylearn2/sandbox/dbm_v2/test_dbm.py index d5f5abe646..6c98a9d287 100644 --- a/pylearn2/sandbox/dbm_v2/test_dbm.py +++ b/pylearn2/sandbox/dbm_v2/test_dbm.py @@ -1135,7 +1135,7 @@ class Test_CD(object): """ @staticmethod - def check_rbm_pos_phase(rbm, cost, X, tol=0.8): + def check_rbm_pos_phase(rbm, cost, X, tol=0.90): pos_grads, updates = cost._get_positive_phase(rbm, X) @@ -1159,7 +1159,7 @@ def check_rbm_pos_phase(rbm, cost, X, tol=0.8): return pos_grads, updates @staticmethod - def check_rbm_neg_phase(rbm, cost, X, theano_rng=None, tol=0.85): + def check_rbm_neg_phase(rbm, cost, X, theano_rng=None, tol=0.90): assert theano_rng is not None @@ -1203,8 +1203,11 @@ def test_rbm(self, num_visible=100, num_hidden=50, batch_size=5000, variational= else: cost = BaseCD(num_gibbs_steps=1) - # Set the data - X = sharedX(rng.randn(batch_size, num_visible)) + # Set the data to a noisy version of the weights. + assert batch_size % num_hidden == 0, "Need to replicate weights evenly across batches for now." + W, _ = rbm.hidden_layers[0].get_params() + X = sharedX(np.tile(W.T.eval(), (batch_size // num_hidden, 1)) +\ + theano_rng.normal(std=np.std(W.T.eval()) / 10., size=(batch_size, num_visible)).eval()) # Get the gradients from the cost function grads, updates = cost.get_gradients(rbm, X) Test_CD.check_rbm_pos_phase(rbm, cost, X)
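
Note for readers following the schedule change in PATCH 5/5: the sketch below is a minimal numpy illustration, not the pylearn2 API, of the even-odd update order that the revised GibbsEvenOdd uses when the DBM has a single hidden layer (an RBM). The even (hidden) layer is sampled num_steps + 1 times and the odd (visible) layer num_steps times, so num_steps=1 reproduces the v0 -> h0 -> v1 -> h1 chain that CD-1 expects. All names here (even_odd_gibbs_rbm, sample_bernoulli, the toy shapes) are illustrative assumptions and do not appear in the patch.

import numpy as np

rng = np.random.RandomState(0)

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def sample_bernoulli(p, rng):
    # Draw binary samples with per-unit probabilities p.
    return (rng.uniform(size=p.shape) < p).astype(p.dtype)

def even_odd_gibbs_rbm(v0, W, b_vis, b_hid, rng, num_steps=1):
    """Toy even-odd Gibbs schedule for a one-hidden-layer DBM (an RBM).

    The hidden (even) layer is updated num_steps + 1 times, the visible
    (odd) layer num_steps times, mirroring the schedule after this patch.
    """
    v = v0
    h = sample_bernoulli(sigmoid(v.dot(W) + b_hid), rng)        # even pass
    for _ in range(num_steps):
        v = sample_bernoulli(sigmoid(h.dot(W.T) + b_vis), rng)  # odd pass
        h = sample_bernoulli(sigmoid(v.dot(W) + b_hid), rng)    # even pass
    return v, h

# CD-1 style usage: v0 is data, (v1, h1) supply the negative-phase statistics.
nvis, nhid, batch = 6, 4, 8
W = 0.1 * rng.randn(nvis, nhid)
b_vis = np.zeros(nvis)
b_hid = np.zeros(nhid)
v0 = sample_bernoulli(np.full((batch, nvis), 0.5), rng)
v1, h1 = even_odd_gibbs_rbm(v0, W, b_vis, b_hid, rng, num_steps=1)
h0_mean = sigmoid(v0.dot(W) + b_hid)
h1_mean = sigmoid(v1.dot(W) + b_hid)
# CD-1 weight-gradient estimate (up to the sign convention used by the cost):
dW = (v0.T.dot(h0_mean) - v1.T.dot(h1_mean)) / batch

With num_steps=1 this is exactly one reconstruction step, which is why the revised test in test_dbm.py can compare the cost's negative-phase gradients against hand-computed V1 and P(H1|V1) statistics.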