
Bug in nested recurrent model

Open Beronx86 opened this issue 8 years ago • 2 comments

Multiple top_bricks are found while building the nested recurrent model, even though there is in fact only one, so an error occurs. The theano.function compiled from the nested recurrent method works fine, however. I think the error happens because the outermost scan instance does not find all the variables when the computation graph is built. The bug can be reproduced with the following code.

import numpy
from theano import tensor, function
from blocks.bricks import Initializable, Linear
from blocks.bricks.recurrent import BaseRecurrent, GatedRecurrent, recurrent
from blocks.bricks.parallel import Fork
from blocks.initialization import IsotropicGaussian, Constant
from blocks.utils import dict_union
from blocks.model import Model
# from recurrent import recurrent


class InnerRecurrent(BaseRecurrent, Initializable):
    def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
        self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')

        self.inner_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=inner_input_dim, name='inner_input_fork')
        self.outer_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=outer_input_dim, name='inner_outer_fork')

        super(InnerRecurrent, self).__init__(**kwargs)

        self.children = [
            self.inner_gru, self.inner_input_fork, self.outer_input_fork]

    def _push_allocation_config(self):
        self.inner_input_fork.output_dims = self.inner_gru.get_dims(
            self.inner_input_fork.output_names)
        self.outer_input_fork.output_dims = self.inner_gru.get_dims(
            self.outer_input_fork.output_names)

    @recurrent(sequences=['inner_inputs'], states=['states'],
               contexts=['outer_inputs'], outputs=['states'])
    def apply(self, inner_inputs, states, outer_inputs):
        forked_inputs = self.inner_input_fork.apply(inner_inputs, as_dict=True)
        forked_states = self.outer_input_fork.apply(outer_inputs, as_dict=True)

        gru_inputs = {key: forked_inputs[key] + forked_states[key]
                      for key in forked_inputs.keys()}

        new_states = self.inner_gru.apply(
            iterate=False,
            **dict_union(gru_inputs, {'states': states}))
        return new_states  # averaged over the inner time axis later, in OuterLinear.apply

    def get_dim(self, name):
        if name == 'states':
            return self.inner_gru.get_dim(name)
        else:
            raise AttributeError(name)


class OuterLinear(BaseRecurrent, Initializable):
    def __init__(self, inner_recurrent, inner_dim, **kwargs):
        self.inner_recurrent = inner_recurrent
        self.linear_map = Linear(input_dim=inner_dim, output_dim=1)

        super(OuterLinear, self).__init__(**kwargs)

        self.children = [self.inner_recurrent, self.linear_map]

    @recurrent(sequences=['outer_inputs'], states=[],
               contexts=['inner_inputs'], outputs=['weighted_averages'])
    def apply(self, outer_inputs, inner_inputs):
        inner_states = self.inner_recurrent.apply(
            inner_inputs=inner_inputs, outer_inputs=outer_inputs)
        linear_outs = self.linear_map.apply(inner_states)
        return linear_outs.mean(axis=0)


def test_nested_recurrent():
    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.)}

    inner_input_dim = 11
    outer_input_dim = 17
    inner_dim = 5

    batch_size = 3
    inner_steps = 19
    outer_steps = 7

    inner_recurrent = InnerRecurrent(inner_input_dim, outer_input_dim,
                                     inner_dim)
    nested_recurrent = OuterLinear(inner_recurrent, inner_dim, **inits)
    nested_recurrent.push_allocation_config()
    nested_recurrent.initialize()

    inner_inputs = tensor.tensor3()
    outer_inputs = tensor.tensor3()

    nested_outs = nested_recurrent.apply(
        outer_inputs=outer_inputs, inner_inputs=inner_inputs)

    func = function(inputs=[inner_inputs, outer_inputs], outputs=nested_outs,
                    allow_input_downcast=True)

    inner_input_val = numpy.random.uniform(
        size=(inner_steps, batch_size, inner_input_dim))
    outer_input_val = numpy.random.uniform(
        size=(outer_steps, batch_size, outer_input_dim))

    # everything works fine up to this line
    outputs_val = func(inner_input_val, outer_input_val)

    # the bug occurs here: building the Model raises an error about multiple top bricks
    outs_mean = nested_outs.mean()
    model = Model(outs_mean)


if __name__ == '__main__':
    test_nested_recurrent()

Beronx86 avatar Jun 13 '16 12:06 Beronx86

Right, there are issues with nested scan.

Generally, it's not the best idea to use nested scan in Theano unless you really need it; try to unroll the inner computation instead. Or try using the scan function explicitly instead of @recurrent; I've seen some people make that work.
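
For reference, a minimal sketch of one reading of the second suggestion: drive the outer loop with theano.scan yourself instead of decorating the outer apply with @recurrent. The function and variable names below are illustrative, not from the issue, and the bricks are assumed to be allocated and initialized as in the reproduction script:

import theano


def outer_apply(outer_inputs, inner_inputs, inner_recurrent, linear_map):
    # Explicit outer loop over outer_inputs; the inner brick still builds
    # its own scan internally via @recurrent.
    def step(outer_input, inner_seq):
        # One outer time step: run the inner recurrence over the whole inner
        # sequence, project each state, and average over the inner time axis.
        inner_states = inner_recurrent.apply(
            inner_inputs=inner_seq, outer_inputs=outer_input)
        return linear_map.apply(inner_states).mean(axis=0)

    weighted_averages, _ = theano.scan(
        fn=step,
        sequences=[outer_inputs],
        non_sequences=[inner_inputs])
    return weighted_averages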

dmitriy-serdyuk avatar Jun 13 '16 21:06 dmitriy-serdyuk

I think I fixed the bug: https://github.com/mila-udem/blocks/pull/1113. The cause is that ComputationGraph.scan_variables in graph/__init__.py assumed there are no nested recurrent bricks.
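
For context, a minimal sketch of the idea: when gathering the variables of Scan ops, descend recursively so that a scan nested inside another scan is also traversed. This is illustrative only; the function name, the use of theano.gof.graph.ancestors, and the traversal details are assumptions, not the code from the PR:

import theano
from theano.scan_module.scan_op import Scan


def collect_scan_variables(variables, _seen=None):
    # Gather inner-graph variables of every Scan op reachable from
    # `variables`, recursing into scans nested inside other scans.
    _seen = set() if _seen is None else _seen
    collected = []
    for var in variables:
        node = var.owner
        if node is not None and isinstance(node.op, Scan) and node not in _seen:
            _seen.add(node)
            # Variables of this scan's inner graph.
            inner_vars = theano.gof.graph.ancestors(node.op.outputs)
            collected.extend(inner_vars)
            # The inner graph may itself contain Scan ops: recurse.
            collected.extend(collect_scan_variables(inner_vars, _seen))
    return collected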

Beronx86 avatar Jun 14 '16 04:06 Beronx86