oneflow
oneflow copied to clipboard
Reduce the time cost of printing the op graph
Libai T5
# T5-large model config (NOTE: dimensions below are reduced from the real T5-large — presumably shrunk for this debug run; verify if reproducing)
model.cfg.num_attention_heads = 12
model.cfg.hidden_size = 384
model.cfg.hidden_layers = 6
生成 op graph 用于 debug
(MODULE:model.t5_model.encoder.layers.4.post_attention_layernorm:LayerNorm((384,), eps=1e-05, elementwise_affine=True)): (
(INPUT:_model.t5_model.encoder.layers.4.post_attention_layernorm_input.0.0_2:tensor(..., placement=oneflow.placement(type="cuda", ranks=[0, 1]),
sbp=(oneflow.sbp.split(dim=0),), is_lazy='True', size=(32, 512, 384),
dtype=oneflow.float32, grad_fn=<add_n_backward>))
(PARAMETER:model.t5_model.encoder.layers.4.post_attention_layernorm.weight:tensor(..., placement=oneflow.placement(type="cuda", ranks=[0, 1]),
sbp=(oneflow.sbp.broadcast,), size=(384,), dtype=oneflow.float32,
grad_fn=<accumulate_grad>)): ()
(PARAMETER:model.t5_model.encoder.layers.4.post_attention_layernorm.bias:tensor(..., placement=oneflow.placement(type="cuda", ranks=[0, 1]),
sbp=(oneflow.sbp.broadcast,), size=(384,), dtype=oneflow.float32,
grad_fn=<accumulate_grad>)): ()
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.weight() -> (out:sbp=(B), size=(384), dtype=(oneflow.float32)), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.bias() -> (out:sbp=(B), size=(384), dtype=(oneflow.float32)), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154(model.t5_model.encoder.layers.4.self_attention-fused_bias_add_mask_scale-152/out_0:(sbp=(S(0)), size=(32, 512, 384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.post_attention_layernorm.weight-out-cast_f2h/out_0:(sbp=(B), size=(384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.post_attention_layernorm.bias-out-cast_f2h/out_0:(sbp=(B), size=(384), dtype=(oneflow.float16))) -> (model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154/y_0:(sbp=(S(0)), size=(32, 512, 384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154/mean_0:(sbp=(S(0)), size=(32, 512), dtype=(oneflow.float32)), model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154/inv_variance_0:(sbp=(S(0)), size=(32, 512), dtype=(oneflow.float32))), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm_param_grad-1074(model.t5_model.encoder.layers.4.mlp.dense_h_to_4h-broadcast_matmul-1070/out_0:(sbp=(S(0)), size=(32, 512, 384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.self_attention-fused_bias_add_mask_scale-152/out_0:(sbp=(S(0)), size=(32, 512, 384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154/mean_0:(sbp=(S(0)), size=(32, 512), dtype=(oneflow.float32)), model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154/inv_variance_0:(sbp=(S(0)), size=(32, 512), dtype=(oneflow.float32))) -> (model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm_param_grad-1074/gamma_diff_0:(sbp=(P), size=(384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm_param_grad-1074/beta_diff_0:(sbp=(P), size=(384), dtype=(oneflow.float16))), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm_grad-1075(model.t5_model.encoder.layers.4.mlp.dense_h_to_4h-broadcast_matmul-1070/out_0:(sbp=(S(0)), size=(32, 512, 384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.self_attention-fused_bias_add_mask_scale-152/out_0:(sbp=(S(0)), size=(32, 512, 384), dtype=(oneflow.float16)), model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154/mean_0:(sbp=(S(0)), size=(32, 512), dtype=(oneflow.float32)), model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm-154/inv_variance_0:(sbp=(S(0)), size=(32, 512), dtype=(oneflow.float32)), model.t5_model.encoder.layers.4.post_attention_layernorm.weight-out-cast_f2h/out_0:(sbp=(B), size=(384), dtype=(oneflow.float16))) -> (model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm_grad-1075/dx_0:(sbp=(S(0)), size=(32, 512, 384), dtype=(oneflow.float16))), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.weight-out-cast_f2h(model.t5_model.encoder.layers.4.post_attention_layernorm.weight/out:(sbp=(B), size=(384), dtype=(oneflow.float32))) -> (model.t5_model.encoder.layers.4.post_attention_layernorm.weight-out-cast_f2h/out_0:(sbp=(B), size=(384), dtype=(oneflow.float16))), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.bias-out-cast_f2h(model.t5_model.encoder.layers.4.post_attention_layernorm.bias/out:(sbp=(B), size=(384), dtype=(oneflow.float32))) -> (model.t5_model.encoder.layers.4.post_attention_layernorm.bias-out-cast_f2h/out_0:(sbp=(B), size=(384), dtype=(oneflow.float16))), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: System-ModelDiffScale-ScalarMul-637(model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm_param_grad-1074/gamma_diff_0:(sbp=(B), size=(384), dtype=(oneflow.float16)), System-Boxing-Identity-891/out:(sbp=(B), size=(1), dtype=(oneflow.float32))) -> (System-ModelDiffScale-ScalarMul-637/y_0:(sbp=(B), size=(384), dtype=(oneflow.float32))), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: System-ModelDiffScale-ScalarMul-662(model.t5_model.encoder.layers.4.post_attention_layernorm-layer_norm_param_grad-1074/beta_diff_0:(sbp=(B), size=(384), dtype=(oneflow.float16)), System-Boxing-Identity-891/out:(sbp=(B), size=(1), dtype=(oneflow.float32))) -> (System-ModelDiffScale-ScalarMul-662/y_0:(sbp=(B), size=(384), dtype=(oneflow.float32))), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.weight-m() -> (out:sbp=(B), size=(384), dtype=(oneflow.float32)), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.weight-v() -> (out:sbp=(B), size=(384), dtype=(oneflow.float32)), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.weight_optimizer(model.t5_model.encoder.layers.4.post_attention_layernorm.weight/out:(sbp=(B), size=(384), dtype=(oneflow.float32)), System-ModelDiffScale-ScalarMul-637/y_0:(sbp=(B), size=(384), dtype=(oneflow.float32)), System-Boxing-Identity-889/out:(sbp=(B), size=(1), dtype=(oneflow.float32)), System-ClipGradient-GlobalNorm-Clamp-755/y_0:(sbp=(B), size=(), dtype=(oneflow.float32)), System-Train-DynamicLossScale-GraphBase_0-CountNotFinite/out_0:(sbp=(B), size=(1), dtype=(oneflow.int64)), model.t5_model.encoder.layers.4.post_attention_layernorm.weight-m/out:(sbp=(B), size=(384), dtype=(oneflow.float32)), model.t5_model.encoder.layers.4.post_attention_layernorm.weight-v/out:(sbp=(B), size=(384), dtype=(oneflow.float32)), System-Boxing-Identity-890/out:(sbp=(B), size=(1), dtype=(oneflow.float32)), System-Boxing-Identity-887/out:(sbp=(B), size=(1), dtype=(oneflow.float32))) -> (), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.bias-m() -> (out:sbp=(B), size=(384), dtype=(oneflow.float32)), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.bias-v() -> (out:sbp=(B), size=(384), dtype=(oneflow.float32)), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OPERATOR: model.t5_model.encoder.layers.4.post_attention_layernorm.bias_optimizer(model.t5_model.encoder.layers.4.post_attention_layernorm.bias/out:(sbp=(B), size=(384), dtype=(oneflow.float32)), System-ModelDiffScale-ScalarMul-662/y_0:(sbp=(B), size=(384), dtype=(oneflow.float32)), System-Boxing-Identity-889/out:(sbp=(B), size=(1), dtype=(oneflow.float32)), System-ClipGradient-GlobalNorm-Clamp-755/y_0:(sbp=(B), size=(), dtype=(oneflow.float32)), System-Train-DynamicLossScale-GraphBase_0-CountNotFinite/out_0:(sbp=(B), size=(1), dtype=(oneflow.int64)), model.t5_model.encoder.layers.4.post_attention_layernorm.bias-m/out:(sbp=(B), size=(384), dtype=(oneflow.float32)), model.t5_model.encoder.layers.4.post_attention_layernorm.bias-v/out:(sbp=(B), size=(384), dtype=(oneflow.float32)), System-Boxing-Identity-890/out:(sbp=(B), size=(1), dtype=(oneflow.float32)), System-Boxing-Identity-887/out:(sbp=(B), size=(1), dtype=(oneflow.float32))) -> (), placement=(oneflow.placement(type="cuda", ranks=[0, 1])))
(OUTPUT:_model.t5_model.encoder.layers.4.post_attention_layernorm_output.0.0_2:tensor(..., placement=oneflow.placement(type="cuda", ranks=[0, 1]),
sbp=(oneflow.sbp.split(dim=0),), is_lazy='True', size=(32, 512, 384),
dtype=oneflow.float32, grad_fn=<layer_norm_backward>))
)
对其生成 op graph 的时间开销
- Master 开销 144.8s
- 本分支开销 1.2s
Code got formatted by CI. Please request CI again if you still want to have this PR merged. If the PR is from a forked repo, please download the patch files from the GitHub Actions web page and apply them locally.
Code got formatted by CI. Please request CI again if you still want to have this PR merged. If the PR is from a forked repo, please download the patch files from the GitHub Actions web page and apply them locally.
Code got formatted by CI. Please request CI again if you still want to have this PR merged. If the PR is from a forked repo, please download the patch files from the GitHub Actions web page and apply them locally.
Speed stats:
GPU Name: GeForce GTX 1080
❌ OneFlow resnet50 time: 140.8ms (= 14077.3ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 163.8ms (= 16383.5ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.16 (= 163.8ms / 140.8ms)
OneFlow resnet50 time: 85.8ms (= 8576.1ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 102.3ms (= 10225.4ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.19 (= 102.3ms / 85.8ms)
OneFlow resnet50 time: 58.3ms (= 11652.5ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 78.6ms (= 15722.0ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.35 (= 78.6ms / 58.3ms)
OneFlow resnet50 time: 45.2ms (= 9030.7ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 79.1ms (= 15819.7ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.75 (= 79.1ms / 45.2ms)
OneFlow resnet50 time: 39.9ms (= 7985.0ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 76.2ms (= 15247.1ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.91 (= 76.2ms / 39.9ms)
View latest API docs preview at: https://staging.oneflow.info/docs/Oneflow-Inc/oneflow/pr/9320/
Speed stats:
GPU Name: GeForce GTX 1080
❌ OneFlow resnet50 time: 141.2ms (= 14121.6ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 163.9ms (= 16393.4ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.16 (= 163.9ms / 141.2ms)
OneFlow resnet50 time: 85.4ms (= 8542.6ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 102.9ms (= 10287.0ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.20 (= 102.9ms / 85.4ms)
OneFlow resnet50 time: 57.8ms (= 11556.6ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 82.2ms (= 16435.7ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.42 (= 82.2ms / 57.8ms)
OneFlow resnet50 time: 45.9ms (= 9181.3ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 70.3ms (= 14064.8ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.53 (= 70.3ms / 45.9ms)
OneFlow resnet50 time: 40.6ms (= 8121.4ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 77.4ms (= 15475.7ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.91 (= 77.4ms / 40.6ms)
CI failed when running job: cuda-misc. PR label automerge has been removed
Speed stats:
GPU Name: GeForce GTX 1080
❌ OneFlow resnet50 time: 141.2ms (= 14122.4ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 170.9ms (= 17092.9ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.21 (= 170.9ms / 141.2ms)
OneFlow resnet50 time: 86.6ms (= 8660.6ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 107.2ms (= 10717.5ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.24 (= 107.2ms / 86.6ms)
OneFlow resnet50 time: 57.8ms (= 11554.5ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 77.9ms (= 15585.0ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.35 (= 77.9ms / 57.8ms)
OneFlow resnet50 time: 44.3ms (= 8865.5ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 69.5ms (= 13896.8ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.57 (= 69.5ms / 44.3ms)
OneFlow resnet50 time: 39.9ms (= 7972.6ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 67.1ms (= 13425.9ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.68 (= 67.1ms / 39.9ms)
View latest API docs preview at: https://staging.oneflow.info/docs/Oneflow-Inc/oneflow/pr/9320/
Speed stats:
Speed stats:
GPU Name: GeForce GTX 1080
❌ OneFlow resnet50 time: 140.5ms (= 14045.9ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 162.9ms (= 16288.3ms / 100, input_shape=[16, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.16 (= 162.9ms / 140.5ms)
OneFlow resnet50 time: 85.7ms (= 8572.7ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 102.3ms (= 10227.3ms / 100, input_shape=[8, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.19 (= 102.3ms / 85.7ms)
OneFlow resnet50 time: 58.3ms (= 11651.2ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 85.0ms (= 16993.9ms / 200, input_shape=[4, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.46 (= 85.0ms / 58.3ms)
OneFlow resnet50 time: 45.5ms (= 9105.6ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 76.0ms (= 15201.3ms / 200, input_shape=[2, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.67 (= 76.0ms / 45.5ms)
OneFlow resnet50 time: 42.9ms (= 8570.5ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
PyTorch resnet50 time: 67.7ms (= 13530.6ms / 200, input_shape=[1, 3, 224, 224], ddp, world size=2)
✔️ Relative speed: 1.58 (= 67.7ms / 42.9ms)
View latest API docs preview at: https://staging.oneflow.info/docs/Oneflow-Inc/oneflow/pr/9320/