Skip to content

Commit

Permalink
add optimizer case (DeepLink-org#314)
Browse files Browse the repository at this point in the history
* optimizer case (sgd, adadelta, rmsprop, adam, adamw)
  • Loading branch information
LeungChiNan authored Sep 15, 2023
1 parent 31b62f6 commit 8b85a47
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 59 deletions.
131 changes: 74 additions & 57 deletions diopi_test/python/conformance/diopi_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4514,23 +4514,25 @@
atol_half=1e-4,
rtol_half=1e-3,
para=dict(
nesterov=[False, True],
lr=[0.1, 0.1],
momentum=[0.01, 0.01],
weight_decay=[0, 0.1],
dampening=[0.1, 0],
lr=[0.05, 0.001, 0.1, 0.1, 0, 3, 0.2, 0.07],
momentum=[0.5, 0, 0.01, 0.01, 1, 0.5, 2, 1.2],
weight_decay=[0, 0.5, 0, 0.1, 3, 2.3, 4.0, 5],
dampening=[0, -0.5, 0.1, 0, 2, 3.0, 0, 6.5],
nesterov=[True, False, False, True, False, False, True, False],
),
tensor_para=dict(
dtype=[Dtype.float32, Dtype.float16, Dtype.float64],
args=[
{
"ins": ['param', 'param_grad'],
"shape": [(2, 3, 16), (4, 32, 7, 7)],
"shape": [(), (16, 8), (2, 3, 16), (4, 32, 7, 7), (4, 16, 3, 8, 2),
(0,), (3, 0), (4, 0, 9)],
"gen_fn": Genfunc.randn,
},
{
"ins": ['buf'],
"shape": [(2, 3, 16), (4, 32, 7, 7)],
"shape": [(), (16, 8), (2, 3, 16), (4, 32, 7, 7), (4, 16, 3, 8, 2),
(0,), (3, 0), (4, 0, 9)],
"gen_fn": Genfunc.rand,
},
]
Expand Down Expand Up @@ -4739,39 +4741,50 @@
),
),

# 'adam': dict(
# name=['adam', 'adamw'],
# interface=["CustomizedTest"],
# atol=1e-4,
# rtol=1e-3,
# atol_half=1e-4,
# rtol_half=1e-3,

# para=dict(
# lr=[0.1, 0.1],
# beta1=[0.9, 0.8],
# beta2=[0.99, 0.88],
# eps=[1e-08, 1e-09],
# step=[1, 4],
# weight_decay=[0, 0.1],
# amsgrad=[False, True],
# ),
# tensor_para=dict(
# dtype=[Dtype.float16, Dtype.float32, Dtype.float64],
# args=[
# {
# "ins": ['param', 'param_grad'],
# "shape": [(2, 3, 16), (4, 32, 7, 7)],
# "gen_fn": Genfunc.rand,
# },
# {
# "ins": ['exp_avg', 'exp_avg_sq', 'max_exp_avg_sq'],
# "shape": [(2, 3, 16), (4, 32, 7, 7)],
# "gen_fn": Genfunc.zeros,
# },
# ]
# ),
# ),
# FIXME adamw、adam输出精度不一致
'adam': dict(
name=['adam', 'adamw'],
interface=["CustomizedTest"],
atol=1e-4,
rtol=1e-3,
atol_half=1e-4,
rtol_half=1e-3,
para=dict(
# lr=[0, -0.2, 2, 0.001, 0.1, 3.2, -2, 0],
# beta1=[0, -1, 0.004, 0.9, 0.8, -2, 4.3, 0],
# beta2=[0.3, 0, -2, 0.99, 0.88, 1, -4, 0],
# eps=[-1e-02, 0, 1e-2, 1e-08, 1e-09, 0, 2, 1e-4],
# step=[3, 2, 0, 1, 4, 2, 4, 5],
# weight_decay=[-0.2, 0, 2, 0, 0.1, 2.5, 0, -3],
# amsgrad=[False, True, True, False, True, False, True, True],
lr=[0, 3.2, -2, 0],
beta1=[0, -2, 4.3, 0],
beta2=[0.3, 1, -4, 0],
eps=[-1e-02, 0, 2, 1e-4],
step=[3, 2, 4, 5],
weight_decay=[-0.2, 2.5, 0, -3],
amsgrad=[False, False, True, True],
),
tensor_para=dict(
dtype=[Dtype.float16, Dtype.float32, Dtype.float64],
args=[
{
"ins": ['param', 'param_grad'],
# "shape": [(), (16,), (16, 8), (2, 3, 16), (4, 32, 7, 7),
# (0,), (4, 0), (12, 0, 9)],
"shape": [(), (0,), (4, 0), (12, 0, 9)],
"gen_fn": Genfunc.randn,
},
{
"ins": ['exp_avg', 'exp_avg_sq', 'max_exp_avg_sq'],
# "shape": [(), (16,), (16, 8), (2, 3, 16), (4, 32, 7, 7),
# (0,), (4, 0), (12, 0, 9)],
"shape": [(), (0,), (4, 0), (12, 0, 9)],
"gen_fn": Genfunc.randn,
},
]
),
),

# FIXME conv_transpose2d特定参数组合,反向传播失败
'conv_transpose2d': dict(
Expand Down Expand Up @@ -5043,23 +5056,25 @@
atol_half=1e-4,
rtol_half=1e-3,
para=dict(
lr=[0.1, 0.1],
rho=[0.9, 0.88],
eps=[1e-6, 1e-6],
weight_decay=[0, 0.1],
lr=[1.0, 0, -0.5, 0.1, 0.1, 2.3, -2, 0],
rho=[-1, 1.2, 0, 0.9, 0.88, -3, 0.5, 0],
eps=[1e-2, 0, -1e-4, 1e-6, 1e-6, 0, 1e-4, -1e-6],
weight_decay=[1.2, 0.5, -1.3, 0, 0.1, 0.5, 0, -1.2],
),
tensor_para=dict(
dtype=[Dtype.float32, Dtype.float16, Dtype.float64],
args=[
{
"ins": ['param', 'param_grad'],
"shape": [(2, 3, 16), (4, 32, 7, 7)],
"gen_fn": Genfunc.rand,
"shape": [(), (16,), (16, 8), (2, 3, 16), (4, 32, 7, 7),
(0,), (4, 0), (12, 0, 9)],
"gen_fn": Genfunc.randn,
},
{
"ins": ['square_avg', 'acc_delta'],
"shape": [(2, 3, 16), (4, 32, 7, 7)],
"gen_fn": Genfunc.zeros,
"shape": [(), (16,), (16, 8), (2, 3, 16), (4, 32, 7, 7),
(0,), (4, 0), (12, 0, 9)],
"gen_fn": Genfunc.randn,
},
]
),
Expand All @@ -5073,25 +5088,27 @@
atol=1e-5,
rtol=1e-3,
para=dict(
lr=[0.1, 0.01],
alpha=[0.9, 0.99],
eps=[1e-6, 1e-8],
weight_decay=[0, 0.1],
momentum=[0, 0.1],
centered=[False, True],
lr=[0, 1.2, -0.05, 0.1, 0.01, 0, 2, 2.3],
alpha=[-0.3, 0, 1.2, 0.9, 0.99, 3, 0, 0.4],
eps=[1e-2, 0, -1e-4, 1e-6, 1e-8, 0, 1e-4, -1e-6],
weight_decay=[1.2, 0.5, -1.3, 0, 0.1, 0.5, 0, -1.2],
momentum=[-2, 0.3, 1, 0, 0.1, 0.05, -3, 0],
centered=[True, False, True, False, True, True, False, True],
),
tensor_para=dict(
dtype=[Dtype.float32, Dtype.float16, Dtype.float64],
args=[
{
"ins": ['param', 'param_grad'],
"shape": [(2, 3, 16), (4, 32, 7, 7)],
"shape": [(), (16,), (16, 8), (2, 3, 16), (4, 32, 7, 7),
(0,), (4, 0), (12, 0, 9)],
"gen_fn": Genfunc.randn,
},
{
"ins": ['square_avg', 'grad_avg', 'momentum_buffer'],
"shape": [(2, 3, 16), (4, 32, 7, 7)],
"gen_fn": Genfunc.zeros,
"shape": [(), (16,), (16, 8), (2, 3, 16), (4, 32, 7, 7),
(0,), (4, 0), (12, 0, 9)],
"gen_fn": Genfunc.randn,
},
]
),
Expand Down
5 changes: 3 additions & 2 deletions diopi_test/python/conformance/gen_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def adamw(param, param_grad, exp_avg, exp_avg_sq, max_exp_avg_sq, lr, beta1, bet
exp_avgs = [exp_avg]
exp_avg_sqs = [exp_avg_sq]
max_exp_avg_sqs = [max_exp_avg_sq]
state_steps = [step]
state_steps = [torch.tensor(float(step))]

torch.optim._functional.adamw(params_with_grad,
grads,
Expand All @@ -455,7 +455,8 @@ def adamw(param, param_grad, exp_avg, exp_avg_sq, max_exp_avg_sq, lr, beta1, bet
beta2=beta2,
lr=lr,
weight_decay=weight_decay,
eps=eps)
eps=eps,
maximize=False)
return param, param_grad, exp_avg, exp_avg_sq, max_exp_avg_sq

def adadelta(param, param_grad, square_avg, acc_delta, lr, rho, eps, weight_decay):
Expand Down
11 changes: 11 additions & 0 deletions impl/camb/device_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,6 +999,8 @@
args=[
{
"ins": ['param', 'param_grad'],
# FIXME Running diopi_functions.adam failed because the input param_grad was modified
"shape": [Skip(())],
"dtype": [Skip(Dtype.float16)],
},
]
Expand Down Expand Up @@ -1045,6 +1047,15 @@
name=["adadelta"],
atol_half=1e-3,
rtol_half=1e-3,
tensor_para=dict(
args=[
{
# can't get correct result
"ins": ['param', 'param_grad'],
"dtype": [Skip(Dtype.float16)],
},
]
),
),

'rmsprop': dict(
Expand Down

0 comments on commit 8b85a47

Please sign in to comment.