# NOTE(review): restored from a whitespace-mangled git-diff hunk of
# tests/test_TFNetworkLayer.py (added after test_name_scope_share_params).
def test_reuse_params_map_custom_transitive_dependency():
  """
  Construct (train + search) a Transformer-like net with transitive
  ``reuse_params`` sharing inside the rec loop:

  * ``target_embed_raw`` reuses "W" of ``base:source_embed_raw``
    (plain ``reuse_layer`` mapping).
  * ``output_prob`` reuses "W" from ``target_embed_raw`` via a ``custom``
    function (transposed) — i.e. transitively from ``base:source_embed_raw``.
  """
  config = Config()
  n_in, n_out = 3, 3
  # One shared initializer spec; every linear-like layer below uses it.
  ff_init = "variance_scaling_initializer(mode='fan_in', distribution='uniform', scale=1.0)"
  net_dict = {
    'dec_01_att_key': {'class': 'split_dims', 'axis': 'F', 'dims': (8, 64), 'from': ['dec_01_att_key0']},
    'dec_01_att_key0': {
      'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
      'forward_weights_init': ff_init, 'from': ['encoder']},
    'dec_01_att_value': {'class': 'split_dims', 'axis': 'F', 'dims': (8, 64), 'from': ['dec_01_att_value0']},
    'dec_01_att_value0': {
      'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
      'forward_weights_init': ff_init, 'from': ['encoder']},
    'decision': {'class': 'decide', 'from': ['output'], 'loss': 'edit_distance', 'loss_opts': {}, 'target': 'classes'},
    'enc_01': {'class': 'copy', 'from': ['enc_01_ff_out']},
    'enc_01_ff_conv1': {
      'class': 'linear', 'activation': 'relu', 'with_bias': True, 'n_out': 2048,
      'forward_weights_init': ff_init, 'from': ['enc_01_ff_laynorm']},
    'enc_01_ff_conv2': {
      'class': 'linear', 'activation': None, 'with_bias': True, 'n_out': 512, 'dropout': 0.3,
      'forward_weights_init': ff_init, 'from': ['enc_01_ff_conv1']},
    'enc_01_ff_drop': {'class': 'dropout', 'dropout': 0.3, 'from': ['enc_01_ff_conv2']},
    'enc_01_ff_laynorm': {'class': 'layer_norm', 'from': ['enc_01_self_att_out']},
    'enc_01_ff_out': {
      'class': 'combine', 'kind': 'add', 'n_out': 512, 'from': ['enc_01_self_att_out', 'enc_01_ff_drop']},
    'enc_01_self_att_att': {
      'class': 'self_attention', 'attention_dropout': 0.3, 'attention_left_only': False,
      'forward_weights_init': ff_init, 'from': ['enc_01_self_att_laynorm'],
      'n_out': 512, 'num_heads': 8, 'total_key_dim': 512},
    'enc_01_self_att_drop': {'class': 'dropout', 'dropout': 0.3, 'from': ['enc_01_self_att_lin']},
    'enc_01_self_att_laynorm': {'class': 'layer_norm', 'from': ['source_embed']},
    'enc_01_self_att_lin': {
      'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
      'forward_weights_init': ff_init, 'from': ['enc_01_self_att_att']},
    'enc_01_self_att_out': {
      'class': 'combine', 'kind': 'add', 'n_out': 512, 'from': ['source_embed', 'enc_01_self_att_drop']},
    'encoder': {'class': 'layer_norm', 'from': ['enc_01']},
    'output': {
      'class': 'rec', 'from': [], 'target': 'classes',
      'max_seq_len': "max_len_from('base:encoder') * 3",
      'unit': {
        'dec_01': {'class': 'copy', 'from': ['dec_01_ff_out']},
        'dec_01_att0': {
          'class': 'generic_attention', 'base': 'base:dec_01_att_value', 'weights': 'dec_01_att_weights_drop'},
        'dec_01_att_att': {'class': 'merge_dims', 'axes': ['dim:8', 'dim:64'], 'from': ['dec_01_att0']},
        'dec_01_att_drop': {'class': 'dropout', 'dropout': 0.3, 'from': ['dec_01_att_lin']},
        'dec_01_att_energy': {
          'class': 'dot', 'from': ['base:dec_01_att_key', 'dec_01_att_query'],
          'red1': 'F', 'red2': 'F', 'var1': 'T', 'var2': 'T?'},
        'dec_01_att_laynorm': {'class': 'layer_norm', 'from': ['dec_01_self_att_out']},
        'dec_01_att_lin': {
          'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
          'forward_weights_init': ff_init, 'from': ['dec_01_att_att']},
        'dec_01_att_out': {
          'class': 'combine', 'kind': 'add', 'n_out': 512, 'from': ['dec_01_self_att_out', 'dec_01_att_drop']},
        'dec_01_att_query': {'class': 'split_dims', 'axis': 'F', 'dims': (8, 64), 'from': ['dec_01_att_query0']},
        'dec_01_att_query0': {
          'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
          'forward_weights_init': ff_init, 'from': ['dec_01_att_laynorm']},
        'dec_01_att_weights': {
          'class': 'softmax_over_spatial', 'axis': 'stag:extern_data:data',
          'energy_factor': 0.125, 'from': ['dec_01_att_energy']},
        'dec_01_att_weights_drop': {
          'class': 'dropout', 'dropout': 0.3, 'dropout_noise_shape': {'*': None},
          'from': ['dec_01_att_weights']},
        'dec_01_ff_conv1': {
          'class': 'linear', 'activation': 'relu', 'with_bias': True, 'n_out': 2048,
          'forward_weights_init': ff_init, 'from': ['dec_01_ff_laynorm']},
        'dec_01_ff_conv2': {
          'class': 'linear', 'activation': None, 'with_bias': True, 'n_out': 512, 'dropout': 0.3,
          'forward_weights_init': ff_init, 'from': ['dec_01_ff_conv1']},
        'dec_01_ff_drop': {'class': 'dropout', 'dropout': 0.3, 'from': ['dec_01_ff_conv2']},
        'dec_01_ff_laynorm': {'class': 'layer_norm', 'from': ['dec_01_att_out']},
        'dec_01_ff_out': {
          'class': 'combine', 'kind': 'add', 'n_out': 512, 'from': ['dec_01_att_out', 'dec_01_ff_drop']},
        'dec_01_self_att_att': {
          'class': 'self_attention', 'attention_dropout': 0.3, 'attention_left_only': True,
          'forward_weights_init': ff_init, 'from': ['dec_01_self_att_laynorm'],
          'n_out': 512, 'num_heads': 8, 'total_key_dim': 512},
        'dec_01_self_att_drop': {'class': 'dropout', 'dropout': 0.3, 'from': ['dec_01_self_att_lin']},
        'dec_01_self_att_laynorm': {'class': 'layer_norm', 'from': ['target_embed']},
        'dec_01_self_att_lin': {
          'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
          'forward_weights_init': ff_init, 'from': ['dec_01_self_att_att']},
        'dec_01_self_att_out': {
          'class': 'combine', 'kind': 'add', 'n_out': 512, 'from': ['target_embed', 'dec_01_self_att_drop']},
        'decoder': {'class': 'layer_norm', 'from': ['dec_01']},
        'end': {'class': 'compare', 'from': ['output'], 'value': 0},
        'output': {
          'class': 'choice', 'beam_size': 12, 'from': ['output_prob'], 'initial_output': 0, 'target': 'classes'},
        'output_prob': {
          'class': 'softmax', 'dropout': 0.0, 'with_bias': True, 'target': 'classes',
          'forward_weights_init': ff_init, 'from': ['decoder'],
          'loss': 'ce', 'loss_opts': {'label_smoothing': 0.2, 'use_normalized_loss': True},
          # Transitive sharing: W is taken (transposed) from target_embed_raw,
          # which itself shares its W with base:source_embed_raw.
          'reuse_params': {
            'map': {
              'W': {
                'custom': (lambda reuse_layer, **kwargs: tf.transpose(reuse_layer.params["W"])),
                'reuse_layer': 'target_embed_raw'},
              'b': None}}},
        'target_embed': {'class': 'dropout', 'dropout': 0.0, 'from': ['target_embed_with_pos']},
        'target_embed_raw': {
          'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
          'forward_weights_init': ff_init, 'from': ['prev:output'],
          # Direct sharing with the encoder-side source embedding.
          'reuse_params': {'map': {'W': {'reuse_layer': 'base:source_embed_raw'}, 'b': None}}},
        'target_embed_weighted': {'class': 'eval', 'eval': 'source(0) * 22.627417', 'from': ['target_embed_raw']},
        'target_embed_with_pos': {
          'class': 'positional_encoding', 'add_to_input': True, 'from': ['target_embed_weighted']}}},
    'source_embed': {'class': 'dropout', 'dropout': 0.0, 'from': ['source_embed_with_pos']},
    'source_embed_raw': {
      'class': 'linear', 'activation': None, 'with_bias': False, 'n_out': 512,
      'forward_weights_init': ff_init, 'from': 'data:data'},
    'source_embed_weighted': {'class': 'eval', 'eval': 'source(0) * 22.627417', 'from': ['source_embed_raw']},
    'source_embed_with_pos': {
      'class': 'positional_encoding', 'add_to_input': True, 'from': ['source_embed_weighted']}}
  config.update({
    "num_outputs": n_out,
    "num_inputs": n_in,
    "network": net_dict})
  with make_scope() as session:
    print("Construct for training")
    from returnn.tf.layers.rec import RecLayer, _SubnetworkRecCell
    train_net = TFNetwork(config=config, train_flag=True)
    train_net.construct_from_dict(config.typed_dict["network"])
  with make_scope() as session:
    print("Construct for search")
    search_net = TFNetwork(config=config, train_flag=False, eval_flag=True, search_flag=True)
    search_net.construct_from_dict(config.typed_dict["network"])


def test_SliceLayer_output_placeholder():
  with make_scope() as session:
    net = TFNetwork(extern_data=ExternData())