pymbar
pymbar copied to clipboard
MBAR uncertainty estimates in a low-overlap case
Here is an example that came up in a discussion with @mrshirts , where MBAR's uncertainty estimate can revert to a small number when the overlap is brought close to 0.
For comparison, naive implementations of a few options for expressing the covariance Theta as a function of overlap -- from section 3.4. of [Klimovich, Shirts, Mobley, 2015] Guidelines for the analysis of free energy calculations -- behave as expected on this example.
import numpy as np
import pymbar # pymbar==3.1.0
import matplotlib.pyplot as plt
print('pymbar version: ', pymbar.version.full_version)
# test system where we can drive overlap to 0
def make_test_system(outlier_mean=0.0):
"""
test system:
two overlapping Gaussians (at mu = 0, 0.5),
and a third Gaussian at mu = outlier_mean
with a very small stddev
"""
testsystem = pymbar.testsystems.HarmonicOscillatorsTestCase(
O_k=[0.0, 0.5, outlier_mean], # note: O_k = [0.0, 0.0, outlier_mean] can illustrate a different challenge
K_k=[10, 10, 1000],
)
return testsystem
def sample(testsystem, seed=2023):
x_n, u_kn, N_k, _ = testsystem.sample(N_k=testsystem.n_states * [1000], seed=seed)
return x_n, u_kn, N_k
# a few ways to compute the covariance Theta from the overlap,
# from Section 3.4 of https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4420631/
def naive_theta_from_overlap(overlap, N_k):
""" Theta = (O^-1 - I)^+ N^-1
Note: this will break if the overlap matrix is singular!
"""
I = np.eye(len(overlap))
theta = np.linalg.pinv(np.linalg.inv(overlap) - I)
return np.dot(theta, np.diag(1 / np.array(N_k)))
def evd_theta_from_overlap(overlap, N_k):
""" Q(Lam^-1 - I)^+ Q^-1 N^-1
where O = Q Lam Q^-1
"""
eigvals, eigvecs = np.linalg.eig(overlap)
pinv_expr = np.linalg.pinv(np.diag((1 / eigvals) - 1))
theta = np.dot(np.dot(eigvecs, pinv_expr), eigvecs.T)
return np.dot(theta, np.diag(1 / np.array(N_k)))
def evd_theta_from_overlap_2(overlap, N_k):
"""evd_theta_from_overlap
but use a simplification for pinv, using observation in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4420631/
"In this case, because it is a diagonal matrix,
we can give a simple formula for the pseudoinverse;
(Λ−1 –I)+ is a diagonal matrix with one zero diagonal entry
(corresponding to the largest eigenvalue, which is 1),
and the other entries corresponding to the ith eigenvalue λi
being λi/(1 – λi)."
"""
eigvals, eigvecs = np.linalg.eig(overlap)
#pinv_expr = np.linalg.pinv(np.diag((1 / eigvals) - 1))
pinv_diag = eigvals / (1 - eigvals)
pinv_expr = np.diag(pinv_diag)
# is it necessary to mask out vals <= 0?
# pinv_expr = np.diag(np.where(pinv_diag >= 0, pinv_diag, 0))
theta = np.dot(np.dot(eigvecs, pinv_expr), eigvecs.T)
return np.dot(theta, np.diag(1 / np.array(N_k)))
# uncertainty from Theta
def variance_from_theta(theta, i, j):
return theta[i, i] + theta[j, j] - 2 * theta[i, j]
# scan of variance estimates as we drive overlap to 0
def get_variance_estimates_on_testsystem(outlier_mean):
"""Compute uncertainty a few ways"""
testsystem = make_test_system(outlier_mean)
x_n, u_kn, N_k = sample(testsystem)
mbar = pymbar.MBAR(u_kn, N_k)
Deltaf, dDeltaf = mbar.getFreeEnergyDifferences()
overlap = mbar.computeOverlap()['matrix']
theta_naive = naive_theta_from_overlap(overlap, N_k)
theta_evd = evd_theta_from_overlap(overlap, N_k)
# theta_evd, but with a simplification for the pseudoinverse
theta_evd_2 = evd_theta_from_overlap_2(overlap, N_k)
return np.array([
variance_from_theta(theta_naive, 0, 2),
variance_from_theta(theta_evd, 0, 2),
variance_from_theta(theta_evd_2, 0, 2),
dDeltaf[0, 2] ** 2, # pymbar
])
x_grid = np.linspace(0, 5)
variance_estimates = []
for i in range(len(x_grid)):
variance_estimates.append(get_variance_estimates_on_testsystem(x_grid[i]))
variance_estimates = np.array(variance_estimates)
labels = [
r'naive $\Theta$',
r'EVD $\Theta$',
r'EVD $\Theta$ w/ custom pinv',
r'pymbar 3.1.0',
]
for i in range(4):
plt.plot(x_grid, variance_estimates[:,i], '.', label=labels[i])
plt.yscale('log')
plt.legend()
plt.xlabel('distance\n(increasing this makes overlap go to 0)')
plt.ylabel('estimated variance in free energy difference')
plt.show()
Thanks @maxentile! I will review and see if the other implementations are performant enough for larger numbers of states / numbers of samples, and at least code in the alternatives to the current implementation.
Thank you for investigating this!
I will review and see if the other implementations are performant enough for larger numbers of states / numbers of samples, and at least code in the alternatives to the current implementation.
Note the naive_theta_from_overlap
implementation is likely unsafe in general. (Breaks if overlap
is singular, and will throw errors if this example is modified from O_k = [0.0, 0.5, outlier_mean]
to O_k = [0.0, 0.0, outlier_mean]
, for instance.)
(And the other 2 implementations above, evd_theta_from_overlap
, evd_theta_from_overlap_2
, may well have other problems -- I haven't tested beyond this example.)