From 4d720fa6084d8126e619e8d76daa96bf26ebd2cf Mon Sep 17 00:00:00 2001
From: Fabian Pedregosa <pedregosa@google.com>
Date: Tue, 26 Mar 2024 01:51:59 -0700
Subject: [PATCH] Fix init_value and end_value in cosine decay

Problem:
* the starting value was not always init_value
* the last value was not always end_value

Solution:
* changed the formulas (they are now compatible with the pytorch implementation https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingLR.html)
* added test to check that starting and end value coincide with `init_value` and `end_value` respectively

Misc:
   renamed alpha -> end_value in warmup_cosine_decay_schedule
PiperOrigin-RevId: 619109148
---
 optax/schedules/_schedule.py      | 14 +++++------
 optax/schedules/_schedule_test.py | 41 ++++++++++++++++++++++++-------
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/optax/schedules/_schedule.py b/optax/schedules/_schedule.py
index d5a3f0c70..21f49cd24 100644
--- a/optax/schedules/_schedule.py
+++ b/optax/schedules/_schedule.py
@@ -266,11 +266,11 @@ def cosine_decay_schedule(
 
   .. math::
 
-     \frac{I (1 - E)}{2}(1+\cos(\pi\,\frac{t}{T})^p) + E\,,
+     \frac{(I - E)}{2}(1+\cos(\pi\,\frac{t}{T})^p) + E\,,
 
   where :math:`T` is the number of decay steps (``decay_steps``), :math:`p` is
   the ``exponent``, :math:`I` is the initial value (``init_value``) and
-  :math:`E` is the end value,.
+  :math:`E` is the end value (``end_value``).
 
   References:
     Loshchilov et al., `SGDR: Stochastic Gradient Descent with Warm Restarts
@@ -286,8 +286,8 @@ def cosine_decay_schedule(
       ``t`` is the current timestep and ``T`` is the ``decay_steps``. The
       exponent modifies this to be ``(0.5 * (1 + cos(pi * t/T))) ** exponent``.
       Defaults to 1.0.
-    alpha: The minimum value of the multiplier used to adjust the
-      learning rate. Defaults to 0.0.
+    alpha: Deprecated, use end_value instead. The minimum value of the
+      multiplier used to adjust the learning rate. Defaults to 0.0.
 
   Returns:
     schedule
@@ -316,8 +316,7 @@ def cosine_decay_schedule(
   def schedule(count):
     count = jnp.minimum(count, decay_steps)
     cosine_decay = 0.5 * (1 + jnp.cos(jnp.pi * count / decay_steps))
-    decayed = (1 - end_value) * cosine_decay ** exponent + end_value
-    return init_value * decayed
+    return (init_value - end_value) * cosine_decay ** exponent + end_value
 
   return schedule
 
@@ -501,7 +500,6 @@ def warmup_cosine_decay_schedule(
     schedule
       A function that maps step counts to values
   """
-  alpha = 0. if peak_value == 0. else end_value / peak_value
   schedules = [
       linear_schedule(
           init_value=init_value,
@@ -511,7 +509,7 @@ def warmup_cosine_decay_schedule(
       cosine_decay_schedule(
           init_value=peak_value,
           decay_steps=decay_steps - warmup_steps,
-          alpha=alpha,
+          end_value=end_value,
           exponent=exponent,
       ),
   ]
diff --git a/optax/schedules/_schedule_test.py b/optax/schedules/_schedule_test.py
index 8826f981a..5b26de957 100644
--- a/optax/schedules/_schedule_test.py
+++ b/optax/schedules/_schedule_test.py
@@ -300,6 +300,24 @@ def test_immutable_count(self):
 
 class CosineDecayTest(chex.TestCase):
 
+  @chex.all_variants
+  def test_init_value_end_value(self):
+    """Check cosine schedule decay for the entire training schedule."""
+    initial_value = 1.5
+    end_value = 0.2
+    num_steps = 10
+    schedule_fn = self.variant(
+        _schedule.cosine_decay_schedule(initial_value, num_steps, end_value))
+    # Test that generated values equal the expected schedule values.
+    generated_vals = []
+    for count in range(num_steps + 1):
+      # Compute next value.
+      generated_vals.append(schedule_fn(count))
+
+    # Test that the first and last values are correct.
+    self.assertAlmostEqual(generated_vals[0], initial_value)
+    self.assertAlmostEqual(generated_vals[-1], end_value)
+
   @chex.all_variants
   def test_decay_count_smaller_count(self):
     """Check cosine schedule decay for the entire training schedule."""
@@ -345,23 +363,28 @@ def test_decay_count_greater_count(self):
   def test_decay_count_greater_count_with_end_value(self):
     """Check cosine schedule decay for a part of the training schedule."""
     # Get schedule function.
-    initial_value = 0.1
+    initial_value = 0.2
+    end_value = 0.1
+    num_steps = 5
     schedule_fn = self.variant(
-        _schedule.cosine_decay_schedule(initial_value, 5, 0.1))
+        _schedule.cosine_decay_schedule(initial_value, num_steps, end_value))
     # Test that generated values equal the expected schedule values.
     generated_vals = []
-    for count in range(12):
+    for count in range(2 * num_steps):
       # Compute next value.
       generated_vals.append(schedule_fn(count))
 
     # Test output.
-    expected_multipliers = np.array(
-        0.5 + 0.5 * np.cos(
-            np.pi * np.array(
-                [0.0, 0.2, 0.4, 0.6, 0.8, 1., 1., 1., 1., 1., 1., 1.])))
-    expected_multipliers = 0.9 * expected_multipliers + 0.1
+    cos_values = 0.5 * (1 + np.cos(np.pi * np.linspace(0, 1, num_steps + 1)))
+    expected_values = (
+        (initial_value - end_value) * cos_values + end_value
+    )
+    # padd with [end_value] at the end.
+    expected_values = np.concatenate(
+        (expected_values, [end_value] * (num_steps - 1))
+    )
     np.testing.assert_allclose(
-        initial_value * expected_multipliers,
+        expected_values,
         np.array(generated_vals), atol=1e-3)
 
   def test_cosine_alpha_exception(self):