Commit 15a787be authored by Erik Strand

Add seventh pset

parents 3e72766c b5f845e7
@@ -11,8 +11,9 @@ find_package (Eigen3 3.3 REQUIRED NO_MODULE)
include(cmake/shared_settings.cmake)
add_subdirectory(extern)
add_subdirectory(common)
add_subdirectory(shared_code)
add_subdirectory(notes)
add_subdirectory(pset_03)
add_subdirectory(pset_04)
add_subdirectory(pset_05/cpp)
add_subdirectory(pset_07/cpp)
add_library(common INTERFACE)
target_link_libraries(common INTERFACE shared_settings Eigen3::Eigen)
target_include_directories(common INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
@@ -5,4 +5,4 @@ add_executable(nbody
main.cpp
glut_grapher.cpp
)
target_link_libraries(nbody common ${OPENGL_LIBRARIES} ${GLUT_LIBRARY})
target_link_libraries(nbody shared_code ${OPENGL_LIBRARIES} ${GLUT_LIBRARY})
add_executable(rand1
main1.cpp
)
target_link_libraries(rand1 shared_settings Eigen3::Eigen)
target_link_libraries(rand1 shared_settings shared_code Eigen3::Eigen)
add_executable(rand2
main2.cpp
)
target_link_libraries(rand2 shared_settings Eigen3::Eigen)
target_link_libraries(rand2 shared_settings shared_code Eigen3::Eigen)
@@ -5,7 +5,7 @@ add_library(sim
glut_grapher.cpp
glut_grapher.h
)
target_link_libraries(sim PUBLIC common ${OPENGL_LIBRARIES} ${GLUT_LIBRARY})
target_link_libraries(sim PUBLIC shared_code ${OPENGL_LIBRARIES} ${GLUT_LIBRARY})
target_include_directories(sim PUBLIC
${OPENGL_INCLUDE_DIRS}
${GLUT_INCLUDE_DIRS}
add_executable(svd
svd.cpp
)
target_link_libraries(svd shared_settings shared_code)
add_executable(levmar
levenberg_marquardt.cpp
)
target_link_libraries(levmar shared_settings shared_code)
#include "xorshift.h"
#include <Eigen/Dense>
#include <iostream>
#include <random>
#include <vector>
using namespace std;
using namespace Eigen;
//--------------------------------------------------------------------------------------------------
XorShift64 rg1;
std::random_device rd{};
std::mt19937 rg2{rd()};
//--------------------------------------------------------------------------------------------------
Eigen::ArrayXd draw_n_uniform(uint32_t n) {
    Eigen::ArrayXd result(n);
    for (uint32_t i = 0; i < n; ++i) {
        result[i] = rg1.draw_double();
    }
    return result;
}
//--------------------------------------------------------------------------------------------------
// TODO Box-Muller transform my uniform samples instead of using STL
Eigen::ArrayXd draw_n_normal(uint32_t n, double std_deviation) {
    std::normal_distribution<> normal{0, std_deviation};
    Eigen::ArrayXd result(n);
    for (uint32_t i = 0; i < n; ++i) {
        result[i] = normal(rg2);
    }
    return result;
}
//--------------------------------------------------------------------------------------------------
int main() {
    uint32_t constexpr n_warmup = 1000;
    for (uint32_t i = 0; i < n_warmup; ++i) {
        rg1.advance();
    }

    uint32_t constexpr n_samples = 100;
    double const std_deviation = 0.1;
    Eigen::ArrayXd samples = draw_n_uniform(n_samples);
    Eigen::ArrayXd errors = draw_n_normal(n_samples, std_deviation);
    //Eigen::ArrayXd values = sin(2 + 3 * samples) + errors;
    Eigen::ArrayXd values = sin(2 + 3 * samples);

    double a = 1;
    double b = 1;
    double lambda = 10;
    constexpr bool vary_lambda = false;

    double error = (values - sin(a + b * samples)).matrix().squaredNorm();
    double new_error;
    Eigen::Vector2d grad;
    Eigen::Matrix2d mat;
    Eigen::ArrayXd tmp(n_samples);
    Eigen::Vector2d step;

    std::cout << "error: " << error << '\n';
    for (uint32_t i = 0; i < 1000; ++i) {
        // compute gradient
        tmp = (values - sin(a + b * samples)) * cos(a + b * samples);
        grad[0] = -2 * tmp.sum();
        grad[1] = -2 * (samples * tmp).sum();

        // compute levenberg marquardt matrix (based on hessian)
        tmp = values * sin(a + b * samples) + cos(2 * (a + b * samples));
        mat(0, 0) = tmp.sum() + lambda;
        mat(0, 1) = (samples * tmp).sum();
        mat(1, 0) = mat(0, 1);
        mat(1, 1) = (samples * samples * tmp).sum() + lambda;

        // compute the step
        step = -mat.inverse() * grad; // Levenberg-Marquardt
        //step = -lambda * grad; // gradient descent

        // apply the step
        a += step[0];
        b += step[1];
        new_error = (values - sin(a + b * samples)).matrix().squaredNorm();

        if (vary_lambda) {
            if (new_error < error) {
                lambda *= 0.8;
            } else {
                lambda *= 1.2;
            }
        }
        error = new_error;
        //std::cout << "error: " << error << '\n';
    }
    std::cout << "error: " << error << '\n';
    std::cout << "a = " << a << ", b = " << b << '\n';
}
#include "xorshift.h"
#include <Eigen/Dense>
#include <iostream>
#include <random>
#include <vector>
using namespace std;
using namespace Eigen;
//--------------------------------------------------------------------------------------------------
XorShift64 rg1;
std::random_device rd{};
std::mt19937 rg2{rd()};
//--------------------------------------------------------------------------------------------------
Eigen::ArrayXd draw_n_uniform(uint32_t n) {
    Eigen::ArrayXd result(n);
    for (uint32_t i = 0; i < n; ++i) {
        result[i] = rg1.draw_double();
    }
    return result;
}
//--------------------------------------------------------------------------------------------------
// TODO Box-Muller transform my uniform samples instead of using STL
Eigen::ArrayXd draw_n_normal(uint32_t n, double std_deviation) {
    std::normal_distribution<> normal{0, std_deviation};
    Eigen::ArrayXd result(n);
    for (uint32_t i = 0; i < n; ++i) {
        result[i] = normal(rg2);
    }
    return result;
}
//--------------------------------------------------------------------------------------------------
int main() {
    uint32_t constexpr n_warmup = 1000;
    for (uint32_t i = 0; i < n_warmup; ++i) {
        rg1.advance();
    }

    uint32_t constexpr n_samples = 100;
    double const std_deviation = 0.5;
    Eigen::ArrayXd samples = draw_n_uniform(n_samples);
    Eigen::ArrayXd errors = draw_n_normal(n_samples, std_deviation);
    Eigen::ArrayXd values = 2 + 3 * samples + errors;

    Eigen::MatrixXd mat(n_samples, 2);
    for (uint32_t i = 0; i < n_samples; ++i) {
        mat(i, 0) = 1;
        mat(i, 1) = samples[i];
    }

    // Need at least ThinU and ThinV to use mat_svd.solve.
    JacobiSVD<MatrixXd> mat_svd(mat, ComputeThinU | ComputeThinV);
    std::cout << "singular values:\n" << mat_svd.singularValues() << '\n';
    //cout << "left singular vectors:\n" << mat_svd.matrixU() << '\n';
    //cout << "right singular vectors:\n" << mat_svd.matrixV() << '\n';

    Eigen::VectorXd solution = mat_svd.solve(values.matrix());
    std::cout << "solution:\n" << solution << '\n';
}
from sympy import *

# Symbolic derivatives of the squared error used in the Levenberg-Marquardt writeup.
a = Symbol("a", real=True)
b = Symbol("b", real=True)
x = Symbol("x", real=True)
y = Symbol("y", real=True)

error = (y - sin(a + b * x))**2

# first derivatives
error_da = simplify(diff(error, a))
error_db = simplify(diff(error, b))
print(latex(error_da))
print(latex(error_db))

# second derivatives
error_daa = simplify(diff(error_da, a))
error_dab = simplify(diff(error_da, b))
error_dba = simplify(diff(error_db, a))
error_dbb = simplify(diff(error_db, b))
print(latex(error_daa))
print(latex(error_dab))
print(latex(error_dba))
print(latex(error_dbb))
add_library(shared_code INTERFACE)
target_link_libraries(shared_code INTERFACE shared_settings Eigen3::Eigen)
target_include_directories(shared_code INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
---
title: Problem Set 7 (Function Fitting)
---
## 1
{:.question}
Generate 100 points $$x$$ uniformly distributed between 0 and 1, and let $$y = 2 + 3x + \zeta$$,
where $$\zeta$$ is a Gaussian random variable with a standard deviation of 0.5. Use an SVD to fit
$$y = a + bx$$ to this data set, finding $$a$$ and $$b$$. Evaluate the errors in $$a$$ and $$b$$
using equation (12.34), by bootstrapping to generate 100 datasets, and from fitting an ensemble of
100 independent data sets.
I used [Eigen](http://eigen.tuxfamily.org/index.php?title=Main_Page) to compute the SVD (in C++).
My code is
[here](https://gitlab.cba.mit.edu/erik/nmm_2020_site/-/tree/master/_code/pset_07/cpp/svd.cpp). I
found $$a = 2.10697$$ and $$b = 2.91616$$.
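The svd.cpp linked above does the single fit; the following is a minimal sketch of the bootstrap
part of the error estimate the problem asks for. It is illustrative only (it is not part of the
linked code, and it regenerates the data with `std::random` rather than the xorshift generator used
above): resample the 100 points with replacement, refit each resampled set with the same
`JacobiSVD` call, and take the spread of the refit coefficients as the errors in $$a$$ and $$b$$.

```cpp
// Illustrative sketch only: not part of the linked svd.cpp.
#include <Eigen/Dense>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <random>

int main() {
    constexpr uint32_t n_samples = 100;
    constexpr uint32_t n_bootstrap = 100;
    std::mt19937 rng{std::random_device{}()};
    std::uniform_real_distribution<> uniform{0.0, 1.0};
    std::normal_distribution<> noise{0.0, 0.5};

    // one data set: y = 2 + 3x + zeta
    Eigen::ArrayXd x(n_samples), y(n_samples);
    for (uint32_t i = 0; i < n_samples; ++i) {
        x[i] = uniform(rng);
        y[i] = 2 + 3 * x[i] + noise(rng);
    }

    // resample with replacement and refit
    std::uniform_int_distribution<uint32_t> pick{0u, n_samples - 1};
    Eigen::ArrayXd a_est(n_bootstrap), b_est(n_bootstrap);
    for (uint32_t k = 0; k < n_bootstrap; ++k) {
        Eigen::MatrixXd mat(n_samples, 2);
        Eigen::VectorXd rhs(n_samples);
        for (uint32_t i = 0; i < n_samples; ++i) {
            uint32_t const j = pick(rng);
            mat(i, 0) = 1;
            mat(i, 1) = x[j];
            rhs[i] = y[j];
        }
        Eigen::JacobiSVD<Eigen::MatrixXd> svd(mat, Eigen::ComputeThinU | Eigen::ComputeThinV);
        Eigen::VectorXd sol = svd.solve(rhs);
        a_est[k] = sol[0];
        b_est[k] = sol[1];
    }

    // the spread of the bootstrap fits estimates the errors in a and b
    auto std_dev = [](Eigen::ArrayXd const& v) {
        return std::sqrt((v - v.mean()).square().sum() / (v.size() - 1));
    };
    std::cout << "sigma_a ~ " << std_dev(a_est) << ", sigma_b ~ " << std_dev(b_est) << '\n';
}
```

The ensemble version is the same loop with a fresh data set generated on every iteration instead of
a resampled one.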
## 2
{:.question}
Generate 100 points $$x$$ uniformly distributed between 0 and 1, and let $$y = \sin(2 + 3x) +
\zeta$$, where $$\zeta$$ is a Gaussian random variable with a standard deviation of 0.1. Write a
Levenberg-Marquardt routine to fit $$y = \sin(a + bx)$$ to this data set starting from $$a = b = 1$$
(remembering that the second-derivative term can be dropped in the Hessian), and investigate the
convergence for both fixed and adaptively adjusted $$\lambda$$ values.
We want to minimize
$$
\chi^2(a, b) = \sum_{i = 1}^N \left( y_i - \sin{\left(a + b x_i \right)} \right)^2
$$
The first derivatives of this function are
$$
\begin{aligned}
\frac{\partial \chi^2(a, b)}{\partial a} &= -2 \sum_{i = 1}^N \left(y_i - \sin{\left(a + b x_i \right)}\right) \cos{\left(a + b x_i \right)} \\
\frac{\partial \chi^2(a, b)}{\partial b} &= -2 \sum_{i = 1}^N x_i \left(y_i - \sin{\left(a + b x_i \right)}\right) \cos{\left(a + b x_i \right)}
\end{aligned}
$$
The second derivatives are
$$
\begin{aligned}
\frac{\partial^2 \chi^2(a, b)}{\partial a^2} &= 2 \sum_{i = 1}^N \left( y_i \sin{\left(a + b x_i \right)} + \cos{\left(2 (a + b x_i) \right)} \right) \\
\frac{\partial^2 \chi^2(a, b)}{\partial a \partial b} &= 2 \sum_{i = 1}^N x_i \left(y_i \sin{\left(a + b x_i \right)} + \cos{\left(2 (a + b x_i) \right)}\right) \\
\frac{\partial^2 \chi^2(a, b)}{\partial b^2} &= 2 \sum_{i = 1}^N x_i^2 \left(y_i \sin{\left(a + b x_i \right)} + \cos{\left(2 (a + b x_i) \right)}\right) \\
\end{aligned}
$$
Using these we can construct our matrix $$\bold{M}$$.
$$
\bold{M} =
\begin{bmatrix}
\Large{\frac{1}{2} \frac{\partial^2 \chi^2}{\partial a^2}} \small{(1 + \lambda)} & \Large{\frac{1}{2} \frac{\partial^2 \chi^2}{\partial a \partial b}} \\
\Large{\frac{1}{2} \frac{\partial^2 \chi^2}{\partial b \partial a}} & \Large{\frac{1}{2} \frac{\partial^2 \chi^2}{\partial b^2}} \small{(1 + \lambda)}
\end{bmatrix}
$$
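Each iteration then applies the step

$$
\begin{bmatrix} \delta a \\ \delta b \end{bmatrix}
= -\bold{M}^{-1}
\begin{bmatrix} \partial \chi^2 / \partial a \\ \partial \chi^2 / \partial b \end{bmatrix}
$$

For small $$\lambda$$, $$\bold{M}$$ is essentially (half) the approximate Hessian, so the step
points in the Newton direction; for large $$\lambda$$ the diagonal dominates and the step
degenerates into gradient descent with a step size that shrinks like $$1 / \lambda$$. That
interpolation is what makes adapting $$\lambda$$ (shrinking it after a successful step, growing it
after a failed one) useful.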
My implementation lives
[here](https://gitlab.cba.mit.edu/erik/nmm_2020_site/-/tree/master/_code/pset_07/cpp/levenberg_marquardt.cpp).
## 3
{:.question}
An alternative way to choose among models is to select the one that makes the weakest assumptions
about the data; this is the purpose of maximum entropy methods. Assume that what is measured is a
set of expectation values for functions $$f_i$$ of a random variable $$x$$,
$$
\langle f_i(x) \rangle = \int_{-\infty}^\infty p(x) f_i(x) dx
$$
### (a)
{:.question}
Given these measurements, find the compatible normalized probability distribution $$p(x)$$ that
maximizes the differential entropy
$$
S = - \int_{-\infty}^\infty p(x) \log p(x) dx
$$
This is an optimization problem subject to multiple equality constraints, so Lagrange multipliers
are a natural choice. Our Lagrangian is
$$
\begin{aligned}
L(p(x), \lambda_0, \ldots, \lambda_N) = &- \int_{-\infty}^\infty p(x) \log p(x) dx
+ \lambda_0 \left( \int_{-\infty}^\infty p(x) dx - 1 \right) \\
&+ \sum_{i = 1}^N \lambda_i \left( \int_{-\infty}^\infty p(x) f_i(x) dx - \langle f_i(x) \rangle \right)
\end{aligned}
$$
Note that I've added an additional constraint that $$p(x)$$ is normalized.
The solution will satisfy
$$
\frac{\delta L}{\delta p(x)} = \frac{\partial L}{\partial \lambda_i} = 0
$$
The derivatives with respect to the Lagrange multipliers are simple enough; they just reproduce our
original constraints. But what is the derivative with respect to a function? It turns out the
[functional derivative](https://en.wikipedia.org/wiki/Functional_derivative) has a rigorous
definition in the [calculus of variations](https://en.wikipedia.org/wiki/Calculus_of_variations).
Ignoring the motivation for a moment, it turns out that in this case, we can just throw away
everything that's not an integral, and differentiate the expressions inside the integrals as if
$$p(x)$$ were a scalar.
$$
\begin{aligned}
\frac{\delta L}{\delta p(x)}
&= -\frac{d}{d p(x)} p(x) \log p(x) + \lambda_0 \frac{d}{d p(x)} p(x)
+ \sum_{i = 1}^N \lambda_i \frac{d}{d p(x)} p(x) f_i(x) \\
&= -\log p(x) - 1 + \lambda_0 + \sum_{i = 1}^N \lambda_i f_i(x)
\end{aligned}
$$
This should be equal to zero, or equivalently,
$$
p(x) = \exp \left(- 1 + \lambda_0 + \sum_{i = 1}^N \lambda_i f_i(x) \right)
$$
Thus the general form of the solution to such a problem takes the form of an [exponential
family](https://en.wikipedia.org/wiki/Exponential_family).
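As a quick illustration of this form (an aside, not part of the problem): if the only measurement
is the mean, $$\langle x \rangle = \mu$$, and the support is restricted to $$x \geq 0$$, the recipe
gives $$p(x) = e^{\lambda_0 - 1} e^{\lambda_1 x}$$, which is only normalizable for $$\lambda_1 <
0$$. Imposing normalization and the mean constraint then pins it down to

$$
p(x) = \frac{1}{\mu} e^{-x / \mu}
$$

i.e. the exponential distribution.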
Now, let's dig into that functional derivative a little more. I assume we're working in
[$$L^2$$](https://en.wikipedia.org/wiki/Square-integrable_function), i.e. the space of square
integrable real valued functions. (This is a [Banach
space](https://en.wikipedia.org/wiki/Banach_space). Calculus of variations is often considered in
Banach spaces, but can be defined more generally as well.) Let $$F$$ be a function that maps $$L^2$$
to $$\mathbb{R}$$ via an integral
$$
\int_{-\infty}^\infty f(p(x)) dx
$$
Here $$f$$ is any sufficiently smooth real valued function (smooth enough for the Taylor expansion
below). For any square integrable function $$\eta(x)$$, consider the
modified function $$p(x) + \epsilon \eta(x)$$ (for some real $$\epsilon$$). Think of this as
$$p(x)$$ with a small perturbation in the direction of $$\eta(x)$$. Then the equivalent of a
directional derivative of $$F$$ (in the direction of $$\eta(x)$$) can be found by taking the first
terms of a Taylor expansion in $$\epsilon$$.
$$
\begin{aligned}
\int_{-\infty}^\infty f(p(x) + \epsilon \eta(x)) dx
&= \int_{-\infty}^\infty \sum_{n = 0}^\infty \frac{(\epsilon \eta(x))^n}{n!} f^{(n)}(p(x)) dx \\
&= \sum_{n = 0}^\infty \int_{-\infty}^\infty \frac{(\epsilon \eta(x))^n}{n!} f^{(n)}(p(x)) dx \\
&= \int_{-\infty}^\infty f(p(x)) dx + \epsilon \int_{-\infty}^\infty \eta(x) f'(p(x)) dx + \mathcal{O}(\epsilon^2)
\end{aligned}
$$
Recall that the derivative of a function $$f : \mathbb{R} \rightarrow \mathbb{R}$$ at $$x$$ can be
defined as the number $$f'(x)$$ such that $$f(x + h) = f(x) + h f'(x) + \mathcal{O}(h^2)$$. So it
shouldn't seem totally crazy that the derivative of $$F$$ (in the direction of $$\eta(x)$$) is
$$
\delta F[p(x); \eta(x)] = \int_{-\infty}^\infty \eta(x) f'(p(x)) dx
$$
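To make this concrete (a small check, not part of the original argument), take $$f(p) = p^2$$, so
$$F[p(x)] = \int_{-\infty}^\infty p(x)^2 dx$$. Expanding directly,

$$
\int_{-\infty}^\infty \left( p(x) + \epsilon \eta(x) \right)^2 dx
= \int_{-\infty}^\infty p(x)^2 dx
+ 2 \epsilon \int_{-\infty}^\infty p(x) \eta(x) dx
+ \epsilon^2 \int_{-\infty}^\infty \eta(x)^2 dx
$$

so $$\delta F[p(x); \eta(x)] = \int_{-\infty}^\infty 2 p(x) \eta(x) dx$$, matching
$$f'(p(x)) = 2 p(x)$$.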
Now if we're trying to maximize (or minimize) $$F$$, we want to find a $$p(x)$$ such that all
directional derivatives of $$F$$ are zero. The [fundamental lemma of calculus of
variations](https://en.wikipedia.org/wiki/Fundamental_lemma_of_calculus_of_variations) tells us that
if
$$
\int_{-\infty}^\infty \eta(x) f'(p(x)) dx = 0
$$
for all $$\eta(x)$$, then $$f'(p(x))$$ must be identically zero. So this is why (at least in
handwaving form) we ended up setting the derivatives of the expressions inside of integrals to zero.
(If we wanted to be more rigorous, at least in the Banach space setting, we would show that the
above formula agrees with the definition of the [Fr&eacute;chet
derivative](https://en.wikipedia.org/wiki/Fr%C3%A9chet_derivative).)
### (b)
{:.question}
What is the maximum entropy distribution if we know only the second moment?
Assume the second moment is
$$
\sigma^2 = \int_{-\infty}^\infty p(x) x^2 dx
$$
We know the solution has the form
$$
\begin{aligned}
p(x)
&= \exp \left(- 1 + \lambda_0 + \lambda_1 x^2 \right) \\
&= e^{\lambda_0 - 1} e^{\lambda_1 x^2}
\end{aligned}
$$
We just need to find $$\lambda_0$$ and $$\lambda_1$$.
For $$p(x)$$ to be normalizable (and square integrable), it has to vanish as $$x$$ approaches
positive or negative infinity. So let's flip the sign of $$\lambda_1$$, and assume this new
$$\lambda_1$$ is positive from here on out. Our modified general form is then
$$
p(x) = e^{\lambda_0 - 1} e^{-\lambda_1 x^2}
$$
To ensure that the distribution is normalized, we require
$$
\begin{aligned}
1
&= e^{\lambda_0 - 1} \int_{-\infty}^\infty e^{-\lambda_1 x^2} dx \\
&= \frac{e^{\lambda_0 - 1}}{\sqrt{\lambda_1}} \int_{-\infty}^\infty e^{-x^2} dx \\
&= \frac{e^{\lambda_0 - 1}}{\sqrt{\lambda_1}} \sqrt{\pi} \\
\end{aligned}
$$
The $$\sqrt{\pi}$$ comes from the well known [Gaussian
integral](https://en.wikipedia.org/wiki/Gaussian_integral).
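Explicitly, substituting $$u = \sqrt{\lambda_1} x$$ (which is where the $$1 / \sqrt{\lambda_1}$$
factor above comes from),

$$
\int_{-\infty}^\infty e^{-\lambda_1 x^2} dx
= \frac{1}{\sqrt{\lambda_1}} \int_{-\infty}^\infty e^{-u^2} du
= \sqrt{\frac{\pi}{\lambda_1}}
$$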
And for the second moment to be $$\sigma^2$$,
$$
\begin{aligned}
\sigma^2
&= e^{\lambda_0 - 1} \int_{-\infty}^\infty e^{-\lambda_1 x^2} x^2 dx \\
&= -e^{\lambda_0 - 1} \int_{-\infty}^\infty \frac{-1}{2 \lambda_1} e^{-\lambda_1 x^2} dx \\
&= e^{\lambda_0 - 1} \frac{1}{2 \lambda_1} \frac{1}{\sqrt{\lambda_1}} \int_{-\infty}^\infty e^{-x^2} dx \\
&= e^{\lambda_0 - 1} \frac{1}{2 \lambda_1} \frac{1}{\sqrt{\lambda_1}} \sqrt{\pi} \\
\end{aligned}
$$
Here I integrated by parts (then changed variables). Note that
$$
\frac{d}{dx} \frac{-1}{2 \lambda_1} e^{-\lambda_1 x^2} = x e^{-\lambda_1 x^2}
$$
The boundary terms disappear, so I omitted them.
Plugging in the first result to the second,
$$
\sigma^2 = \frac{1}{2 \lambda_1} \implies \lambda_1 = \frac{1}{2 \sigma^2}
$$
So then
$$
1 = e^{\lambda_0 - 1} \sigma \sqrt{2 \pi} \implies e^{\lambda_0 - 1} = \frac{1}{\sigma \sqrt{2 \pi}}
$$
Thus the solution is
$$
p(x) = \frac{1}{\sigma \sqrt{2 \pi}} e^{-x^2 / (2 \sigma^2)}
$$
This is a Gaussian with zero mean and variance $$\sigma^2$$. Note that the entropy of a Gaussian
doesn't depend on its mean, so if the constraint were placed on the variance (the second moment
about the mean) rather than on the moment about the origin, any mean would do.
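As a sanity check (added here, not something the problem asks for), the entropy this distribution
achieves is

$$
\begin{aligned}
S &= -\int_{-\infty}^\infty p(x) \left( -\log \left( \sigma \sqrt{2 \pi} \right) - \frac{x^2}{2 \sigma^2} \right) dx \\
&= \log \left( \sigma \sqrt{2 \pi} \right) + \frac{1}{2}
= \frac{1}{2} \log \left( 2 \pi e \sigma^2 \right)
\end{aligned}
$$

so no distribution with second moment $$\sigma^2$$ can have entropy greater than
$$\frac{1}{2} \log (2 \pi e \sigma^2)$$.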
## 4
{:.question}
Now consider the reverse situation. Let's say that we know that a data set $$\{x_n\}_{n=1}^N$$ was
drawn from a Gaussian distribution with variance $$\sigma^2$$ and unknown mean $$\mu$$. Try to find
an optimal estimator of the mean (one that is unbiased and has the smallest possible error in the
estimate).
Let our estimator be
$$
\hat{\mu} = f(x_1, \ldots, x_N) = \frac{1}{N} \sum_{n=1}^N x_n
$$
The sum of two normally distributed random variables $$X_1 \sim N(\mu_1, \sigma_1^2)$$ and $$X_2
\sim N(\mu_2, \sigma_2^2)$$ is also normally distributed:
$$
X_1 + X_2 \sim N(\mu_1 + \mu_2, \sigma_1^2 + \sigma_2^2)
$$
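(One way to see this, added for completeness: the moment generating function of a sum of
independent variables is the product of the individual ones, and

$$
e^{\mu_1 t + \sigma_1^2 t^2 / 2} \, e^{\mu_2 t + \sigma_2^2 t^2 / 2}
= e^{(\mu_1 + \mu_2) t + (\sigma_1^2 + \sigma_2^2) t^2 / 2}
$$

is again the moment generating function of a Gaussian.)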
So by induction, $$\sum_{n=1}^N x_n \sim N(N \mu, N \sigma^2)$$. Thus
$$
\hat{\mu} \sim N(\mu, \sigma^2/N)
$$
since scaling a random variable by $$1/N$$ scales its mean by $$1/N$$ and its variance by $$1/N^2$$.
As such the mean of our estimator is $$\mu$$, which means it's unbiased. Its variance is
$$\sigma^2/N$$. Let's see how this compares to the Cram&eacute;r–Rao bound.
The score of a Gaussian (with respect to $$\mu$$) is
$$
\begin{aligned}
\frac{\partial}{\partial \mu} \log(N(\mu, \sigma^2))
&= \frac{\partial}{\partial \mu} \log \left( \frac{1}{\sigma \sqrt{2 \pi}} e^{-(x - \mu)^2 / (2 \sigma^2)} \right) \\
&= \frac{\partial}{\partial \mu} \left[ \log \left( \frac{1}{\sigma \sqrt{2 \pi}} \right) - \frac{(x - \mu)^2}{2 \sigma^2} \right] \\
&= \frac{x - \mu}{\sigma^2}
\end{aligned}
$$
The Fisher information is the variance of the score. And since the expectation of the score is
always zero, its variance is just its expected square.
$$
\begin{aligned}
J(\mu)
&= E_{N(\mu, \sigma^2)} \left[ \left( \frac{x - \mu}{\sigma^2} \right)^2 \right] \\
&= \int_{-\infty}^\infty \frac{1}{\sigma \sqrt{2 \pi}} e^{-(x - \mu)^2 / (2 \sigma^2)} \left( \frac{x - \mu}{\sigma^2} \right)^2 dx \\
&= \frac{1}{\sigma^4} \int_{-\infty}^\infty \frac{1}{\sigma \sqrt{2 \pi}} e^{-(x - \mu)^2 / (2 \sigma^2)} (x - \mu)^2 dx \\
&= \frac{1}{\sigma^4} \int_{-\infty}^\infty \frac{1}{\sigma \sqrt{2 \pi}} e^{-x^2 / (2 \sigma^2)} x^2 dx \\
&= \frac{1}{\sigma^4} E_{N(0, \sigma^2)}[x^2] \\
&= \frac{1}{\sigma^2}
\end{aligned}
$$
(Recall that $$E[x^2]$$ for a Gaussian with zero mean is its variance $$\sigma^2$$.) So the Fisher
information of the collection of $$N$$ samples is $$J_N(\mu) = N / \sigma^2$$.
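This uses the fact that for independent samples the joint score is the sum of the individual
scores, so its variance (the Fisher information) is the sum of the individual variances:

$$
\frac{\partial}{\partial \mu} \log \prod_{n=1}^N p(x_n ; \mu, \sigma^2)
= \sum_{n=1}^N \frac{x_n - \mu}{\sigma^2}
$$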
The Cram&eacute;r–Rao bound states that the variance of any unbiased estimator of $$\mu$$ is no
less than $$1 / J_N(\mu)$$. So our estimator is as good as possible, since its variance is
$$
E[(\hat{\mu} - \mu)^2] = \frac{\sigma^2}{N} = \frac{1}{J_N(\mu)}
$$
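As a quick numerical sanity check (illustrative only, not part of the linked code; it uses
`std::random` in the style of the random draw programs above), the empirical variance of the sample
mean over many independent data sets should come out close to $$\sigma^2 / N$$.

```cpp
// Illustrative check that Var(sample mean) ~= sigma^2 / N for Gaussian draws.
#include <cstdint>
#include <iostream>
#include <random>

int main() {
    constexpr uint32_t n_samples = 100;   // N
    constexpr uint32_t n_trials = 10000;  // number of independent data sets
    constexpr double mu = 2.0;
    constexpr double sigma = 0.5;

    std::mt19937 rng{std::random_device{}()};
    std::normal_distribution<> normal{mu, sigma};

    double sum = 0;
    double sum_sq = 0;
    for (uint32_t t = 0; t < n_trials; ++t) {
        double mean = 0;
        for (uint32_t i = 0; i < n_samples; ++i) {
            mean += normal(rng);
        }
        mean /= n_samples;
        sum += mean;
        sum_sq += mean * mean;
    }

    // empirical variance of the estimator vs the Cramer-Rao value sigma^2 / N
    double const var = sum_sq / n_trials - (sum / n_trials) * (sum / n_trials);
    std::cout << "empirical variance of the sample mean: " << var << '\n';
    std::cout << "sigma^2 / N:                           " << sigma * sigma / n_samples << '\n';
}
```

With $$\sigma = 0.5$$ and $$N = 100$$ both numbers land near $$0.0025$$.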