bigbird
bigbird copied to clipboard
Pegasus variables mapping
I have my own pretrained Pegasus model, and now I want to fine-tune it using BigBird. This is my variable-mapping function:
import re
import collections
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Map Pegasus checkpoint variable names onto the current graph variables.

    Each name listed in `init_checkpoint` is prefixed with 'pegasus/' and
    rewritten through a fixed sequence of substring substitutions. When the
    rewritten name matches one of `tvars` (compared with any trailing
    ':<digits>' suffix removed), the pair is recorded.

    Args:
        tvars: iterable of objects exposing a `.name` attribute.
        init_checkpoint: checkpoint path handed to `tf.train.list_variables`.

    Returns:
        A tuple `(assignment_map, initialized_variable_names)`:
        `assignment_map` is an OrderedDict from the original checkpoint name
        to the matching entry of `tvars`; `initialized_variable_names` is a
        dict whose keys are the rewritten names with ':0' appended (values
        are always 1).
    """
    # Index the graph variables by name, without the ':<n>' output suffix.
    graph_vars = collections.OrderedDict()
    for tvar in tvars:
        full_name = tvar.name
        suffix_match = re.match('^(.*):\\d+$', full_name)
        key = full_name if suffix_match is None else suffix_match.group(1)
        graph_vars[key] = tvar

    # Substitutions are applied in this exact order; order matters because
    # 'ffn/dense_1' must be rewritten before the generic 'ffn' rule, and
    # 'memory_attention/output' before the generic 'memory_attention' rule.
    substitutions = (
        ('embeddings/weights', 'embeddings/word_embeddings'),
        ('self/output', 'output'),
        ('ffn/dense_1', 'output/dense'),
        ('ffn', 'intermediate'),
        ('memory_attention/output', 'attention/encdec_output'),
        ('memory_attention', 'attention/encdec'),
    )

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    for entry in tf.train.list_variables(init_checkpoint):
        # Second element is read but unused, exactly as in the original
        # (presumably the variable's shape -- TODO confirm).
        ckpt_name, _unused = entry[0], entry[1]
        target = 'pegasus/' + ckpt_name
        for old, new in substitutions:
            target = target.replace(old, new)
        if target in graph_vars:
            assignment_map[ckpt_name] = graph_vars[target]
            initialized_variable_names[target + ':0'] = 1
    return (assignment_map, initialized_variable_names)
Output (the resulting assignment map):
OrderedDict([('decoder/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_0/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_0/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/attention/self/key/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_0/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_0/attention/self/query/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_0/attention/self/value/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_0/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_0/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_0/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/ffn/dense/bias',
<tf.Variable 'pegasus/decoder/layer_0/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('decoder/layer_0/ffn/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_0/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('decoder/layer_0/ffn/dense_1/bias',
<tf.Variable 'pegasus/decoder/layer_0/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/ffn/dense_1/kernel',
<tf.Variable 'pegasus/decoder/layer_0/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('decoder/layer_0/memory_attention/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_0/attention/encdec/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/memory_attention/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_0/attention/encdec/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_0/memory_attention/key/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/encdec/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_0/memory_attention/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/encdec_output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_0/memory_attention/query/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/encdec/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_0/memory_attention/value/kernel',
<tf.Variable 'pegasus/decoder/layer_0/attention/encdec/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_1/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_1/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_1/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_1/attention/self/key/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/attention/self/query/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/attention/self/value/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_1/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_1/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_1/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_1/ffn/dense/bias',
<tf.Variable 'pegasus/decoder/layer_1/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('decoder/layer_1/ffn/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_1/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('decoder/layer_1/ffn/dense_1/bias',
<tf.Variable 'pegasus/decoder/layer_1/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_1/ffn/dense_1/kernel',
<tf.Variable 'pegasus/decoder/layer_1/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('decoder/layer_1/memory_attention/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_1/attention/encdec/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_1/memory_attention/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_1/attention/encdec/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_1/memory_attention/key/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/encdec/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/memory_attention/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/encdec_output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/memory_attention/query/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/encdec/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_1/memory_attention/value/kernel',
<tf.Variable 'pegasus/decoder/layer_1/attention/encdec/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_2/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_2/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_2/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_2/attention/self/key/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/attention/self/query/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/attention/self/value/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_2/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_2/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_2/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_2/ffn/dense/bias',
<tf.Variable 'pegasus/decoder/layer_2/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('decoder/layer_2/ffn/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_2/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('decoder/layer_2/ffn/dense_1/bias',
<tf.Variable 'pegasus/decoder/layer_2/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_2/ffn/dense_1/kernel',
<tf.Variable 'pegasus/decoder/layer_2/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('decoder/layer_2/memory_attention/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_2/attention/encdec/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_2/memory_attention/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_2/attention/encdec/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_2/memory_attention/key/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/encdec/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/memory_attention/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/encdec_output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/memory_attention/query/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/encdec/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_2/memory_attention/value/kernel',
<tf.Variable 'pegasus/decoder/layer_2/attention/encdec/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_3/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_3/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_3/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_3/attention/self/key/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/attention/self/query/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/attention/self/value/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_3/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_3/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_3/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_3/ffn/dense/bias',
<tf.Variable 'pegasus/decoder/layer_3/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('decoder/layer_3/ffn/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_3/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('decoder/layer_3/ffn/dense_1/bias',
<tf.Variable 'pegasus/decoder/layer_3/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_3/ffn/dense_1/kernel',
<tf.Variable 'pegasus/decoder/layer_3/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('decoder/layer_3/memory_attention/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_3/attention/encdec/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_3/memory_attention/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_3/attention/encdec/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_3/memory_attention/key/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/encdec/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/memory_attention/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/encdec_output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/memory_attention/query/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/encdec/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_3/memory_attention/value/kernel',
<tf.Variable 'pegasus/decoder/layer_3/attention/encdec/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_4/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_4/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_4/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_4/attention/self/key/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/attention/self/query/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/attention/self/value/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_4/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_4/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_4/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_4/ffn/dense/bias',
<tf.Variable 'pegasus/decoder/layer_4/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('decoder/layer_4/ffn/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_4/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('decoder/layer_4/ffn/dense_1/bias',
<tf.Variable 'pegasus/decoder/layer_4/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_4/ffn/dense_1/kernel',
<tf.Variable 'pegasus/decoder/layer_4/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('decoder/layer_4/memory_attention/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_4/attention/encdec/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_4/memory_attention/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_4/attention/encdec/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_4/memory_attention/key/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/encdec/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/memory_attention/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/encdec_output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/memory_attention/query/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/encdec/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_4/memory_attention/value/kernel',
<tf.Variable 'pegasus/decoder/layer_4/attention/encdec/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_5/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_5/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_5/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_5/attention/self/key/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/attention/self/query/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/attention/self/value/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_5/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_5/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_5/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_5/ffn/dense/bias',
<tf.Variable 'pegasus/decoder/layer_5/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('decoder/layer_5/ffn/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_5/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('decoder/layer_5/ffn/dense_1/bias',
<tf.Variable 'pegasus/decoder/layer_5/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_5/ffn/dense_1/kernel',
<tf.Variable 'pegasus/decoder/layer_5/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('decoder/layer_5/memory_attention/LayerNorm/beta',
<tf.Variable 'pegasus/decoder/layer_5/attention/encdec/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_5/memory_attention/LayerNorm/gamma',
<tf.Variable 'pegasus/decoder/layer_5/attention/encdec/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('decoder/layer_5/memory_attention/key/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/encdec/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/memory_attention/output/dense/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/encdec_output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/memory_attention/query/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/encdec/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('decoder/layer_5/memory_attention/value/kernel',
<tf.Variable 'pegasus/decoder/layer_5/attention/encdec/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('embeddings/weights',
<tf.Variable 'pegasus/embeddings/word_embeddings:0' shape=(32128, 512) dtype=float32_ref>),
('encoder/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_0/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_0/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_0/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_0/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_0/attention/self/key/kernel',
<tf.Variable 'pegasus/encoder/layer_0/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_0/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_0/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_0/attention/self/query/kernel',
<tf.Variable 'pegasus/encoder/layer_0/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_0/attention/self/value/kernel',
<tf.Variable 'pegasus/encoder/layer_0/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_0/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_0/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_0/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_0/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_0/ffn/dense/bias',
<tf.Variable 'pegasus/encoder/layer_0/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('encoder/layer_0/ffn/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_0/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('encoder/layer_0/ffn/dense_1/bias',
<tf.Variable 'pegasus/encoder/layer_0/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_0/ffn/dense_1/kernel',
<tf.Variable 'pegasus/encoder/layer_0/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('encoder/layer_1/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_1/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_1/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_1/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_1/attention/self/key/kernel',
<tf.Variable 'pegasus/encoder/layer_1/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_1/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_1/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_1/attention/self/query/kernel',
<tf.Variable 'pegasus/encoder/layer_1/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_1/attention/self/value/kernel',
<tf.Variable 'pegasus/encoder/layer_1/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_1/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_1/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_1/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_1/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_1/ffn/dense/bias',
<tf.Variable 'pegasus/encoder/layer_1/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('encoder/layer_1/ffn/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_1/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('encoder/layer_1/ffn/dense_1/bias',
<tf.Variable 'pegasus/encoder/layer_1/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_1/ffn/dense_1/kernel',
<tf.Variable 'pegasus/encoder/layer_1/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('encoder/layer_2/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_2/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_2/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_2/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_2/attention/self/key/kernel',
<tf.Variable 'pegasus/encoder/layer_2/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_2/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_2/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_2/attention/self/query/kernel',
<tf.Variable 'pegasus/encoder/layer_2/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_2/attention/self/value/kernel',
<tf.Variable 'pegasus/encoder/layer_2/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_2/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_2/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_2/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_2/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_2/ffn/dense/bias',
<tf.Variable 'pegasus/encoder/layer_2/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('encoder/layer_2/ffn/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_2/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('encoder/layer_2/ffn/dense_1/bias',
<tf.Variable 'pegasus/encoder/layer_2/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_2/ffn/dense_1/kernel',
<tf.Variable 'pegasus/encoder/layer_2/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('encoder/layer_3/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_3/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_3/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_3/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_3/attention/self/key/kernel',
<tf.Variable 'pegasus/encoder/layer_3/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_3/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_3/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_3/attention/self/query/kernel',
<tf.Variable 'pegasus/encoder/layer_3/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_3/attention/self/value/kernel',
<tf.Variable 'pegasus/encoder/layer_3/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_3/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_3/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_3/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_3/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_3/ffn/dense/bias',
<tf.Variable 'pegasus/encoder/layer_3/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('encoder/layer_3/ffn/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_3/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('encoder/layer_3/ffn/dense_1/bias',
<tf.Variable 'pegasus/encoder/layer_3/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_3/ffn/dense_1/kernel',
<tf.Variable 'pegasus/encoder/layer_3/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('encoder/layer_4/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_4/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_4/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_4/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_4/attention/self/key/kernel',
<tf.Variable 'pegasus/encoder/layer_4/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_4/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_4/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_4/attention/self/query/kernel',
<tf.Variable 'pegasus/encoder/layer_4/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_4/attention/self/value/kernel',
<tf.Variable 'pegasus/encoder/layer_4/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_4/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_4/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_4/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_4/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_4/ffn/dense/bias',
<tf.Variable 'pegasus/encoder/layer_4/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('encoder/layer_4/ffn/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_4/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('encoder/layer_4/ffn/dense_1/bias',
<tf.Variable 'pegasus/encoder/layer_4/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_4/ffn/dense_1/kernel',
<tf.Variable 'pegasus/encoder/layer_4/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>),
('encoder/layer_5/attention/self/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_5/attention/self/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_5/attention/self/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_5/attention/self/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_5/attention/self/key/kernel',
<tf.Variable 'pegasus/encoder/layer_5/attention/self/key/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_5/attention/self/output/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_5/attention/output/dense/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_5/attention/self/query/kernel',
<tf.Variable 'pegasus/encoder/layer_5/attention/self/query/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_5/attention/self/value/kernel',
<tf.Variable 'pegasus/encoder/layer_5/attention/self/value/kernel:0' shape=(512, 512) dtype=float32_ref>),
('encoder/layer_5/ffn/LayerNorm/beta',
<tf.Variable 'pegasus/encoder/layer_5/intermediate/LayerNorm/beta:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_5/ffn/LayerNorm/gamma',
<tf.Variable 'pegasus/encoder/layer_5/intermediate/LayerNorm/gamma:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_5/ffn/dense/bias',
<tf.Variable 'pegasus/encoder/layer_5/intermediate/dense/bias:0' shape=(3072,) dtype=float32_ref>),
('encoder/layer_5/ffn/dense/kernel',
<tf.Variable 'pegasus/encoder/layer_5/intermediate/dense/kernel:0' shape=(512, 3072) dtype=float32_ref>),
('encoder/layer_5/ffn/dense_1/bias',
<tf.Variable 'pegasus/encoder/layer_5/output/dense/bias:0' shape=(512,) dtype=float32_ref>),
('encoder/layer_5/ffn/dense_1/kernel',
<tf.Variable 'pegasus/encoder/layer_5/output/dense/kernel:0' shape=(3072, 512) dtype=float32_ref>)])
My Pegasus config, copied from https://github.com/google-research/bigbird/blob/master/bigbird/summarization/pegasus_large.sh:
# Model / training hyperparameters for the Pegasus fine-tuning run.
# Key order is kept identical to the original literal.
bert_config = dict(
    # transformer basic configs
    attention_probs_dropout_prob=0.1,
    hidden_act='relu',
    hidden_dropout_prob=0.1,
    hidden_size=512,
    initializer_range=0.02,
    intermediate_size=3072,
    max_position_embeddings=4096,
    max_encoder_length=2048,
    max_decoder_length=512,
    num_attention_heads=8,
    num_hidden_layers=6,
    type_vocab_size=2,
    scope='pegasus',
    use_bias=False,
    rescale_embedding=True,
    vocab_model_file=None,
    # sparse mask configs
    attention_type='block_sparse',
    norm_type='prenorm',
    block_size=64,
    num_rand_blocks=3,
    vocab_size=32128,
    beam_size=1,
    alpha=0.0,
    couple_encoder_decoder=False,
    num_warmup_steps=10000,
    learning_rate=0.1,
    label_smoothing=0.1,
    optimizer='Adafactor',
    use_tpu=True,
)
I'm not sure this mapping is correct, and fine-tuning is really slow, so any guidance on the variable mapping would be really helpful.
@manzilz