diff --git a/optimization/optimizers/gradient_descent/CMakeLists.txt b/optimization/optimizers/gradient_descent/CMakeLists.txt
index a1f0d4b4bb6971067e6b1edf22f5a6f28e59a18b..bb3b52e461bcfa71127efe678e0f49bdf1473c63 100644
--- a/optimization/optimizers/gradient_descent/CMakeLists.txt
+++ b/optimization/optimizers/gradient_descent/CMakeLists.txt
@@ -4,6 +4,6 @@ add_executable(gradient_descent
 target_link_libraries(gradient_descent optimization_lib clara)
 
 if (VISUALIZE)
-    make_plot_target(gradient_descent 2d ARGS -d 2 -l 0.0015)
+    make_plot_target(gradient_descent 2d ARGS -d 2 -l 0.0015 -n 10000)
     make_plot_target(gradient_descent 10d ARGS -d 10 -l 0.0005 -n 10000)
 endif()
diff --git a/optimization/optimizers/gradient_descent/gradient_descent.h b/optimization/optimizers/gradient_descent/gradient_descent.h
index bcd5bf20abd88a4b7549b5e71d95a47b5840231c..1061e181f11a3252ee7514a318b82b102409e85b 100644
--- a/optimization/optimizers/gradient_descent/gradient_descent.h
+++ b/optimization/optimizers/gradient_descent/gradient_descent.h
@@ -2,6 +2,7 @@
 #define OPTIMIZATION_GRADIENT_DESCENT_H
 
 #include "gradient_descent_log.h"
+#include <iostream>
 
 namespace optimization {
 
@@ -10,10 +11,14 @@ template <int32_t N>
 class GradientDescent : public GradientDescentLog<N> {
 public:
     GradientDescent() {}
-    GradientDescent(Scalar learning_rate, uint32_t n_steps)
-        : learning_rate_(learning_rate), n_steps_(n_steps)
+    GradientDescent(Scalar learning_rate, uint32_t max_evaluations, Scalar gradient_threshold)
+        : learning_rate_(learning_rate), max_evaluations_(max_evaluations), gradient_threshold_(gradient_threshold)
     {}
 
+    uint32_t n_evaluations() const { return n_evaluations_; }
+    // We evaluate the objective function exactly once per iteration.
+    uint32_t n_iterations() const { return n_evaluations_; }
+
     Scalar& learning_rate() { return learning_rate_; }
     Scalar learning_rate() const { return learning_rate_; }
 
@@ -27,9 +32,17 @@ public:
         GradientDescentLog<N>::initialize(objective);
         GradientDescentLog<N>::push_back(point, value, gradient);
 
-        for (uint32_t i = 0; i < n_steps_; ++i) {
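+        // Step until the gradient norm falls to the threshold or the evaluation
+        // budget is spent; the evaluation of the initial point counts as the first.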
+        for (n_evaluations_ = 1; n_evaluations_ < max_evaluations_; ++n_evaluations_) {
+            if (gradient.norm() <= gradient_threshold_) {
+                std::cout << "Gradient norm below threshold: " << gradient.norm() << '\n';
+                break;
+            }
+
             point -= learning_rate_ * gradient;
             objective.eval(point, value, gradient);
+
             GradientDescentLog<N>::push_back(point, value, gradient);
         }
 
@@ -37,8 +50,11 @@
     }
 
 private:
+    uint32_t n_evaluations_ = 0;  // total objective evaluations, including the initial one
+
     Scalar learning_rate_;
-    uint32_t n_steps_;
+    uint32_t max_evaluations_;
+    Scalar gradient_threshold_;  // terminate once the gradient norm is at or below this value
 };
 
 }
diff --git a/optimization/optimizers/gradient_descent/main.cpp b/optimization/optimizers/gradient_descent/main.cpp
index 47820b6b617bef26700a636b00caf851d83334ee..dceb38c2e631eb0a58d684fb74d2a6b0d293a902 100644
--- a/optimization/optimizers/gradient_descent/main.cpp
+++ b/optimization/optimizers/gradient_descent/main.cpp
@@ -19,8 +19,9 @@ int main(int const argc, char const** argv) {
     std::string log_file_path;
     std::string vis_file_path;
     uint32_t dim = 2;
-    uint32_t n_steps = 1000;
     Scalar learning_rate = 0.0015;
+    uint32_t max_evaluations = 1000;
+    Scalar gradient_threshold = 1e-8;
     Scalar x0 = -1;
     Scalar y0 = 2;
 
@@ -28,8 +29,9 @@ int main(int const argc, char const** argv) {
         clara::Arg(log_file_path, "log_file_path")("Location of the optimization log") |
         clara::Arg(vis_file_path, "vis_file_path")("Location of the visualization file") |
         clara::Opt(dim, "dim")["-d"]["--dimension"]("Dimensionality of the objective") |
-        clara::Opt(n_steps, "n_steps")["-n"]["--n-steps"]("Number of gradient descent steps") |
         clara::Opt(learning_rate, "learning_rate")["-l"]["--learning-rate"]("Gradient descent learning rate") |
+        clara::Opt(max_evaluations, "max_evaluations")["-n"]["--max-evaluations"]("Maximum number of objective function evaluations") |
+        clara::Opt(gradient_threshold, "gradient_threshold")["-g"]["--gradient-threshold"]("Terminate once the gradient norm falls below this threshold") |
         clara::Opt(x0, "x0")["-x"]["--x0"]("X coordinate of initial point") |
         clara::Opt(y0, "y0")["-y"]["--y0"]("Y coordinate of initial point");
     auto const result = cli.parse(clara::Args(argc, argv));
@@ -52,9 +54,9 @@ int main(int const argc, char const** argv) {
     Objective objective;
     objective.dim() = dim;
 
-    GradientDescent<-1> optimizer(learning_rate, n_steps);
+    GradientDescent<-1> optimizer(learning_rate, max_evaluations, gradient_threshold);
     VectorXs minimum = optimizer.optimize(objective, initial_point);
-    std::cout << "n evaluations: " << n_steps << '\n';
+    std::cout << "n evaluations: " << optimizer.n_evaluations() << '\n';
     std::cout << "final point: " << minimum << '\n';
 
     #ifdef VISUALIZE