Commit c87a5e0e authored by Erik Strand

Improve memory efficiency

This enables some move semantics for potentially big arguments. I also
removed a redundant vector from the internal state of cgd.
parents cf6c5c38 174ba2da
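
The core pattern the commit applies is shown below in miniature: each public optimize() accepts the initial point either by const reference (copied into internal state) or by rvalue reference (moved into internal state), and both delegate to a private optimize_impl(). The overloads are written out explicitly because Eigen forbids passing its types by value. This is a minimal sketch under those assumptions, not the library's actual class:

#include <Eigen/Dense>
#include <utility>

using Vector = Eigen::VectorXd;

class Optimizer {
public:
    // Lvalue overload: the caller keeps its vector; we copy it.
    template <typename Objective>
    Vector const& optimize(Objective& objective, Vector const& initial_point) {
        point_ = initial_point;             // copy
        return optimize_impl(objective);
    }
    // Rvalue overload: steal the caller's buffer, avoiding an allocation.
    template <typename Objective>
    Vector const& optimize(Objective& objective, Vector&& initial_point) {
        point_ = std::move(initial_point);  // move
        return optimize_impl(objective);
    }

private:
    template <typename Objective>
    Vector const& optimize_impl(Objective& objective) {
        // The real work reads and updates point_ here.
        return point_;
    }

    Vector point_;
};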
@@ -35,6 +35,7 @@ public:
inline Eigen::Map<const Eigen::MatrixXd> point();
Scalar value() { return cmaes_Get(&cma_es_, "fbestever"); }
// TODO: Support passing initial point by rvalue reference.
template <typename Objective>
Eigen::Map<const Eigen::MatrixXd> optimize(
Objective& objective,
......
@@ -3,7 +3,6 @@
#include "logs/nothing.h"
#include "optimizers/line_search/brent.h"
#include "optimizers/line_search/line_objective.h"
#include <iostream>
@@ -33,12 +32,37 @@ public:
VectorNs<N> const& gradient() const { return gradient_; }
Scalar value() const { return value_; }
// There are four cases for optimize, since you can use a log or not, and can pass an rvalue or
// not. These methods ultimately all call optimize_impl to do the real work.
// Note: I explicitly write out the Vector const& and Vector&& cases rather than passing by
// value and moving since Eigen forbids passing by value:
// https://eigen.tuxfamily.org/dox/group__TopicPassingByValue.html
template <typename Objective>
Vector const& optimize(Objective& objective, Vector const& initial_point);
Vector const& optimize(Objective& objective, Vector const& initial_point) {
ConjugateGradientDescentLogNothing log;
return optimize(objective, initial_point, log);
}
template <typename Objective, typename Log>
Vector const& optimize(Objective& objective, Vector const& initial_point, Log& log) {
point_ = initial_point;
return optimize_impl(objective, log);
}
template <typename Objective>
Vector const& optimize(Objective& objective, Vector&& initial_point) {
ConjugateGradientDescentLogNothing log;
return optimize(objective, std::move(initial_point), log);
}
template <typename Objective, typename Log>
Vector const& optimize(Objective& objective, Vector const& initial_point, Log& log);
Vector const& optimize(Objective& objective, Vector&& initial_point, Log& log) {
point_ = std::move(initial_point);
return optimize_impl(objective, log);
}
private:
// Actually does the optimization, once point_ has been initialized.
template <typename Objective, typename Log>
Vector const& optimize_impl(Objective& objective, Log& log);
// hyperparameters
Scalar gradient_threshold_;
Scalar progress_threshold_;
@@ -48,49 +72,40 @@ private:
// algorithm state
uint32_t n_evaluations_;
uint32_t n_iterations_;
// This object holds the current point (x0) and search direction (dir).
// TODO allow this to live here, somehow
//LineObjective<Objective, N> line_objective_;
VectorNs<N> point_;
VectorNs<N> direction_;
VectorNs<N> new_point_;
Vector point_; // where we are currently
Vector direction_; // the direction we're searching
Vector gradient_; // the gradient at point_
// This variable sees double duty. When we're line searching, it holds the candidate points.
// Between evaluating the function at the line minimum and computing the next search direction,
// it stores the gradient at the point of the previous iteration.
Vector new_point_or_last_gradient_;
Scalar alpha_; // stores the most recent jump distance, which we use as a guess for the next one
Scalar value_;
Scalar last_value_;
Vector gradient_;
Vector last_gradient_;
// algorithm constants
static constexpr Scalar tiny_ = std::numeric_limits<Scalar>::epsilon();
};
//..................................................................................................
template <int32_t N>
template <typename Objective>
VectorNs<N> const& ConjugateGradientDescent<N>::optimize(
Objective& objective,
Vector const& initial_point
) {
ConjugateGradientDescentLogNothing log;
return optimize(objective, initial_point, log);
}
//..................................................................................................
template <int32_t N>
template <typename Objective, typename Log>
VectorNs<N> const& ConjugateGradientDescent<N>::optimize(
VectorNs<N> const& ConjugateGradientDescent<N>::optimize_impl(
Objective& objective,
Vector const& initial_point,
Log& log
) {
n_evaluations_ = 0;
n_iterations_ = 0;
point_ = initial_point;
alpha_ = -1;
// point_ has already been initialized
direction_.resize(point_.size());
gradient_.resize(point_.size());
last_gradient_.resize(point_.size());
new_point_or_last_gradient_.resize(point_.size());
alpha_ = -1;
LineObjective<Objective, N> line_objective{objective, point_, direction_, new_point_};
auto const line_objective = [&](Scalar t, Scalar& value) {
new_point_or_last_gradient_ = point_ + t * direction_;
objective(new_point_or_last_gradient_, value);
};
BracketFinder bracket;
Brent line_minimizer;
@@ -106,19 +121,23 @@ VectorNs<N> const& ConjugateGradientDescent<N>::optimize(
}
while (true) {
// Find the minimum along this direction.
// Find the minimum along this direction. The next two lines are the only ones that use
// new_point_or_last_gradient_ as new_point_ (happens inside the lambda line_objective).
bracket.bracket(line_objective, Scalar(0), alpha_);
alpha_ = line_minimizer.optimize(line_objective, bracket).point;
n_evaluations_ += bracket.n_evaluations();
n_evaluations_ += line_minimizer.n_evaluations();
// Note: at some point new_point_ already had this new value, but right now there's no
// guarantee that it's the current value.
point_ += alpha_ * direction_;
// Can we re-use evaluation here? Would need to rewrite line search to have guarantees on
// which point it evaluated last. Probably best to do this after I've written dbrent.
last_value_ = value_;
gradient_.swap(last_gradient_);
// Basically this line means new_point_or_last_gradient_ = std::move(gradient_), but I want
// to be clear that we're not releasing any memory here. For the rest of the loop,
// new_point_or_last_gradient_ stores the last gradient.
new_point_or_last_gradient_.swap(gradient_);
// Again, we evaluated the objective here already during the line search, but we probably
// threw that data out.
// Note: at some point new_point_or_last_gradient_ already had this new value, but right now
// there's no guarantee that it's the current value.
point_ += alpha_ * direction_;
objective(point_, value_, gradient_);
++n_iterations_;
@@ -149,10 +168,11 @@ VectorNs<N> const& ConjugateGradientDescent<N>::optimize(
return point_;
}
// Choose the next search direction. It is conjugate to all prior directions.
// I view this as the start of the next iteration.
Scalar const gamma =
gradient_.dot(gradient_ - last_gradient_) / last_gradient_.dot(last_gradient_);
// Choose the next search direction. It is conjugate to all prior directions. Recall that
// new_point_or_last_gradient_ currently stores the last gradient. I view this as the start
// of the next iteration.
Scalar const gamma = gradient_.dot(gradient_ - new_point_or_last_gradient_)
/ new_point_or_last_gradient_.dot(new_point_or_last_gradient_);
direction_ = gradient_ + gamma * direction_;
}
}
......
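
For reference, the direction update at the end of this hunk is the Polak-Ribière conjugate gradient formula (up to the sign convention the objective uses for its gradient). With g_k the gradient at iteration k and d_k the search direction:

\gamma_k = \frac{g_k^\top \left( g_k - g_{k-1} \right)}{g_{k-1}^\top g_{k-1}},
\qquad
d_k = g_k + \gamma_k \, d_{k-1}

At that point in the loop, g_{k-1} lives in new_point_or_last_gradient_ (it was swapped in before the new gradient was computed), which is what the numerator and denominator read.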
@@ -60,9 +60,9 @@ int main(int const argc, char const** argv) {
// Only log stuff if we're going to use it.
if (log_file_path.empty() && vis_file_path.empty()) {
optimizer.optimize(objective, initial_point);
optimizer.optimize(objective, std::move(initial_point));
} else {
optimizer.optimize(objective, initial_point, log);
optimizer.optimize(objective, std::move(initial_point), log);
}
std::cout << "n evaluations: " << optimizer.n_evaluations() << '\n';
......
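
One caveat at the call sites above: after std::move(initial_point), the vector is left in a valid but unspecified state, so moving is only safe if initial_point is not read again later in main. A minimal illustration of the distinction, using a hypothetical optimizer with the two overloads from this commit:

Eigen::VectorXd initial_point = Eigen::VectorXd::Zero(10);
optimizer.optimize(objective, initial_point);             // lvalue: copied, still usable
optimizer.optimize(objective, std::move(initial_point));  // rvalue: moved; treat as empty afterwards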
@@ -25,6 +25,7 @@ public:
VectorNs<N> const& gradient() const { return gradient_; }
Scalar value() const { return value_; }
// TODO: Support passing initial point by rvalue reference.
template <typename Objective>
VectorNs<N> const& optimize(Objective& objective, VectorNs<N> const& initial_point);
......
#ifndef OPTIMIZATION_LINE_SEARCH_LINE_OBJECTIVE_H
#define OPTIMIZATION_LINE_SEARCH_LINE_OBJECTIVE_H
#include "utils/vector.h"
namespace optimization {
//--------------------------------------------------------------------------------------------------
template <typename Objective, int32_t N>
struct LineObjective {
public:
LineObjective(Objective& o, VectorNs<N>& x0, VectorNs<N>& dir, VectorNs<N>& x) :
objective_(o), x0_(x0), dir_(dir), x_(x)
{}
void operator()(Scalar t, Scalar& value) {
x_ = x0_ + t * dir_;
objective_(x_, value);
}
void operator()(Scalar t, Scalar& value, VectorNs<N>& gradient) {
x_ = x0_ + t * dir_;
objective_(x_, value, gradient);
}
private:
Objective& objective_;
VectorNs<N>& x0_;
VectorNs<N>& dir_;
VectorNs<N>& x_;
};
}
#endif
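
This header appears in full here, presumably because the commit removes it now that the conjugate gradient code builds the line objective inline as a lambda capturing point_, direction_, and the scratch vector. Anything with the call shape void(Scalar, Scalar&) can stand in for it. A hedged sketch of how such a callable is consumed, using only the BracketFinder and Brent calls that appear in the diff above:

// Assumes Scalar, BracketFinder, and Brent from the surrounding library.
template <typename LineObj>
Scalar minimize_along_line(LineObj& line_objective, Scalar alpha_guess) {
    BracketFinder bracket;
    Brent line_minimizer;
    bracket.bracket(line_objective, Scalar(0), alpha_guess);        // bracket a minimum starting from t = 0
    return line_minimizer.optimize(line_objective, bracket).point;  // refine the bracket with Brent's method
}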
@@ -46,9 +46,9 @@ int main(int const argc, char const** argv) {
// Only log stuff if we're going to use it.
if (log_file_path.empty() && vis_file_path.empty()) {
optimizer.optimize(objective, simplex);
optimizer.optimize(objective, std::move(simplex));
} else {
optimizer.optimize(objective, simplex, log);
optimizer.optimize(objective, std::move(simplex), log);
}
std::cout << "n evaluations: " << optimizer.n_evaluations() << '\n';
......
@@ -48,14 +48,17 @@ public:
}
Scalar value() const { return simplex_values_[i_lowest_]; }
// Constructs a simplex from an initial point by offsetting all coordinate values.
// These methods return the best point, as a reference to a column (like point()).
// All optimization methods return the best point, as a reference to a column (like point()).
// These two construct a simplex from an initial point by offsetting all coordinate values.
template <typename Objective>
decltype(auto) optimize(
Objective& objective,
VectorD const& initial_point,
Scalar offset = 1
);
) {
NelderMeadLogNothing log;
return optimize(objective, initial_point, offset, log);
}
template <typename Objective, typename Log>
decltype(auto) optimize(
Objective& objective,
@@ -63,14 +66,34 @@ public:
Scalar offset,
Log& log
);
// Each column of simplex is a vertex.
// These four take a full simplex. Each column is a vertex.
template <typename Objective>
decltype(auto) optimize(Objective& objective, MatrixDN const& simplex);
decltype(auto) optimize(Objective& objective, MatrixDN&& simplex) {
NelderMeadLogNothing log;
return optimize(objective, std::move(simplex), log);
}
template <typename Objective, typename Log>
decltype(auto) optimize(Objective& objective, MatrixDN const& simplex, Log& log);
decltype(auto) optimize(Objective& objective, MatrixDN&& simplex, Log& log) {
simplex_vertices_ = std::move(simplex);
return optimize_impl(objective, log);
}
template <typename Objective>
decltype(auto) optimize(Objective& objective, MatrixDN const& simplex) {
NelderMeadLogNothing log;
return optimize(objective, simplex, log);
}
template <typename Objective, typename Log>
decltype(auto) optimize(Objective& objective, MatrixDN const& simplex, Log& log) {
simplex_vertices_ = simplex;
return optimize_impl(objective, log);
}
private:
// Does the optimization assuming simplex_vertices_ has been initialized.
template <typename Objective, typename Log>
decltype(auto) optimize_impl(Objective& objective, Log& log);
// Helper method for optimize_impl.
template <typename Objective>
Scalar try_new_point(Objective& objective, Scalar factor);
@@ -102,18 +125,6 @@ private:
static constexpr Scalar tiny_ = 1e-10;
};
//..................................................................................................
template <int32_t D>
template <typename Objective>
decltype(auto) NelderMead<D>::optimize(
Objective& objective,
VectorD const& initial_point,
Scalar offset
) {
NelderMeadLogNothing log;
return optimize(objective, initial_point, offset, log);
}
//..................................................................................................
template <int32_t D>
template <typename Objective, typename Log>
@@ -123,30 +134,20 @@ decltype(auto) NelderMead<D>::optimize(
Scalar offset,
Log& log
) {
MatrixDN simplex;
simplex.resize(initial_point.size(), initial_point.size() + 1);
simplex.col(0) = initial_point;
simplex_vertices_.resize(initial_point.size(), initial_point.size() + 1);
simplex_vertices_.col(0) = initial_point;
for (uint32_t i = 0; i < initial_point.size(); ++i) {
simplex.col(i + 1) = initial_point;
simplex(i, i + 1) += offset;
simplex_vertices_.col(i + 1) = initial_point;
simplex_vertices_(i, i + 1) += offset;
}
return optimize(objective, simplex, log);
}
//..................................................................................................
template <int32_t D>
template <typename Objective>
decltype(auto) NelderMead<D>::optimize(Objective& objective, MatrixDN const& simplex) {
NelderMeadLogNothing log;
return optimize(objective, simplex, log);
return optimize_impl(objective, log);
}
//..................................................................................................
template <int32_t D>
template <typename Objective, typename Log>
decltype(auto) NelderMead<D>::optimize(Objective& objective, MatrixDN const& simplex, Log& log) {
// initialize state
simplex_vertices_ = simplex;
decltype(auto) NelderMead<D>::optimize_impl(Objective& objective, Log& log) {
// initialize state, assuming simplex_vertices_ is already set
n_vertices_ = simplex_vertices_.cols();
dim_ = simplex_vertices_.rows();
assert(n_vertices_ == dim_ + 1);
......
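
For reference, the simplex construction in the last hunk takes an initial point p in R^D and an offset, and builds the D + 1 vertices

v_0 = p, \qquad v_i = p + \text{offset} \cdot e_i \quad (i = 1, \dots, D)

where e_i is the i-th coordinate direction. For example, with p = (0, 0) and offset = 1 the vertices are (0, 0), (1, 0), and (0, 1), stored as the columns of simplex_vertices_.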