pymbar icon indicating copy to clipboard operation
pymbar copied to clipboard

MBAR uncertainty estimates in a low-overlap case

Open maxentile opened this issue 2 years ago • 2 comments

Here is an example that came up in a discussion with @mrshirts , where MBAR's uncertainty estimate can revert to a small number when the overlap is brought close to 0.

For comparison, naive implementations of a few options for expressing the covariance Theta as a function of overlap -- from section 3.4. of [Klimovich, Shirts, Mobley, 2015] Guidelines for the analysis of free energy calculations -- behave as expected on this example.

import numpy as np
import pymbar  # pymbar==3.1.0

import matplotlib.pyplot as plt
print('pymbar version: ', pymbar.version.full_version)


# test system where we can drive overlap to 0
def make_test_system(outlier_mean=0.0):
    """
    test system:
        two overlapping Gaussians (at mu = 0, 0.5),
        and a third Gaussian at mu = outlier_mean
        with a very small stddev
    """

    testsystem = pymbar.testsystems.HarmonicOscillatorsTestCase(
        O_k=[0.0, 0.5, outlier_mean], # note: O_k = [0.0, 0.0, outlier_mean] can illustrate a different challenge
        K_k=[10, 10, 1000],
    )
    return testsystem


def sample(testsystem, seed=2023):
    x_n, u_kn, N_k, _ = testsystem.sample(N_k=testsystem.n_states * [1000], seed=seed)
    return x_n, u_kn, N_k


# a few ways to compute the covariance Theta from the overlap, 
# from Section 3.4 of https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4420631/

def naive_theta_from_overlap(overlap, N_k):
    """ Theta = (O^-1 - I)^+ N^-1 
    
    Note: this will break if the overlap matrix is singular!
    """
    I = np.eye(len(overlap))
    theta = np.linalg.pinv(np.linalg.inv(overlap) - I)
    return np.dot(theta, np.diag(1 / np.array(N_k)))


def evd_theta_from_overlap(overlap, N_k):
    """ Q(Lam^-1 - I)^+ Q^-1 N^-1
    where O = Q Lam Q^-1
    """
    
    eigvals, eigvecs = np.linalg.eig(overlap)
    pinv_expr = np.linalg.pinv(np.diag((1 / eigvals) - 1))
    
    theta = np.dot(np.dot(eigvecs, pinv_expr), eigvecs.T)
    return np.dot(theta, np.diag(1 / np.array(N_k)))


def evd_theta_from_overlap_2(overlap, N_k):
    """evd_theta_from_overlap
    
    but use a simplification for pinv, using observation in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4420631/
     "In this case, because it is a diagonal matrix,
       we can give a simple formula for the pseudoinverse;
       (Λ−1 –I)+ is a diagonal matrix with one zero diagonal entry
       (corresponding to the largest eigenvalue, which is 1),
       and the other entries corresponding to the ith eigenvalue λi
       being λi/(1 – λi)."
    """
    eigvals, eigvecs = np.linalg.eig(overlap)
    
    #pinv_expr = np.linalg.pinv(np.diag((1 / eigvals) - 1))
    pinv_diag = eigvals / (1 - eigvals)
    pinv_expr = np.diag(pinv_diag)
    
    # is it necessary to mask out vals <= 0?
    # pinv_expr = np.diag(np.where(pinv_diag >= 0, pinv_diag, 0)) 
    
    theta = np.dot(np.dot(eigvecs, pinv_expr), eigvecs.T)
    return np.dot(theta, np.diag(1 / np.array(N_k)))


# uncertainty from Theta
def variance_from_theta(theta, i, j):
    return theta[i, i] + theta[j, j] - 2 * theta[i, j]


# scan of variance estimates as we drive overlap to 0

def get_variance_estimates_on_testsystem(outlier_mean):
    """Compute uncertainty a few ways"""
    testsystem = make_test_system(outlier_mean)

    x_n, u_kn, N_k = sample(testsystem)
    mbar = pymbar.MBAR(u_kn, N_k)
    Deltaf, dDeltaf = mbar.getFreeEnergyDifferences()
    overlap = mbar.computeOverlap()['matrix']
    

    theta_naive = naive_theta_from_overlap(overlap, N_k)
    theta_evd = evd_theta_from_overlap(overlap, N_k)
    
    # theta_evd, but with a simplification for the pseudoinverse
    theta_evd_2 = evd_theta_from_overlap_2(overlap, N_k)

    return np.array([
        variance_from_theta(theta_naive, 0, 2),
        variance_from_theta(theta_evd, 0, 2),
        variance_from_theta(theta_evd_2, 0, 2),
        dDeltaf[0, 2] ** 2,  # pymbar
    ])

x_grid = np.linspace(0, 5)
variance_estimates = []
for i in range(len(x_grid)):
    variance_estimates.append(get_variance_estimates_on_testsystem(x_grid[i]))
    
variance_estimates = np.array(variance_estimates)

labels = [
    r'naive $\Theta$',
    r'EVD $\Theta$',
    r'EVD $\Theta$ w/ custom pinv',
    r'pymbar 3.1.0',
]
for i in range(4):
    plt.plot(x_grid, variance_estimates[:,i], '.', label=labels[i])
    plt.yscale('log')
plt.legend()
plt.xlabel('distance\n(increasing this makes overlap go to 0)')
plt.ylabel('estimated variance in free energy difference')
plt.show()

image

maxentile avatar Feb 06 '23 17:02 maxentile

Thanks @maxentile! I will review and see if the other implementations are performant enough for larger numbers of states / numbers of samples, and at least code in the alternatives to the current implementation.

mrshirts avatar Feb 06 '23 18:02 mrshirts

Thank you for investigating this!

I will review and see if the other implementations are performant enough for larger numbers of states / numbers of samples, and at least code in the alternatives to the current implementation.

Note the naive_theta_from_overlap implementation is likely unsafe in general. (Breaks if overlap is singular, and will throw errors if this example is modified from O_k = [0.0, 0.5, outlier_mean] to O_k = [0.0, 0.0, outlier_mean], for instance.)

(And the other 2 implementations above, evd_theta_from_overlap, evd_theta_from_overlap_2 , may well have other problems -- I haven't tested beyond this example.)

maxentile avatar Feb 06 '23 18:02 maxentile