
Question about DMR pair: could the output be in error?

Open WeiHea opened this issue 7 months ago • 4 comments

I added some intermediate-variable prints to your open-source code and found that the LLR score calculated via your `beta_llk` differs from the output of my Python implementation. The specific details are as follows. Is this due to an error in the Rust toolkits the implementation relies on, or is it a mistake in my Python implementation?

    pub(super) fn new(
        control_counts: AggregatedCounts,
        exp_counts: AggregatedCounts,
        interval: DmrInterval,
    ) -> MkResult<Self> {
        // hewei 2025-05-21
        // Debug print the counts before calculations
        let _guard = DEBUG_LOCK.lock().unwrap();

        println!("Control counts:");
        println!("  Total: {}", control_counts.total);
        println!("  Mod code counts: {:?}", control_counts.mod_code_counts);
        
        println!("Experimental counts:");
        println!("  Total: {}", exp_counts.total);
        println!("  Mod code counts: {:?}", exp_counts.mod_code_counts);
        // 

        let score = llk_ratio(&control_counts, &exp_counts)?;

        println!("LLR score: {}", score);
        //////////////////////////////////////

        let coh_res = cohen_h(&control_counts, &exp_counts);
        Ok(Self {
            control_counts,
            exp_counts,
            interval,
            score,
            cohen_hresult: coh_res,
        })
    }


fn llk_beta(
    control_counts: &AggregatedCounts,
    exp_counts: &AggregatedCounts,
) -> anyhow::Result<f64> {
    let all_mods = control_counts
        .mod_code_counts
        .keys()
        .copied()
        .chain(exp_counts.mod_code_counts.keys().copied())
        .collect::<HashSet<ModCodeRepr>>();
    if all_mods.len() != 1 {
        bail!("should have exactly one modification to use beta llk")
    }
    let raw_mod_code =
        all_mods.into_iter().take(1).collect::<Vec<ModCodeRepr>>()[0];

    let control_methyls =
        *control_counts.mod_code_counts.get(&raw_mod_code).unwrap_or(&0);
    let control_canonicals = control_counts.get_canonical_counts();
    let llk_control = beta_llk(control_methyls, control_canonicals);

    println!("control_methyls: {}", control_methyls);
    println!("control_canonicals: {}", control_canonicals);
    println!("llk_control: {}", llk_control);

    let exp_methyls =
        *exp_counts.mod_code_counts.get(&raw_mod_code).unwrap_or(&0);
    let exp_canonicals = exp_counts.get_canonical_counts();
    let llk_exp = beta_llk(exp_methyls, exp_canonicals);

    println!("exp_methyls: {}", exp_methyls);
    println!("exp_canonicals: {}", exp_canonicals);
    println!("llk_exp: {}", llk_exp);

    let llk_same = beta_llk(
        exp_methyls + control_methyls,
        exp_canonicals + control_canonicals,
    );

    Ok(llk_control + llk_exp - llk_same)
}

pub(super) fn llk_ratio(
    control_counts: &AggregatedCounts,
    exp_counts: &AggregatedCounts,
) -> MkResult<f64> {
    let n_categories = std::cmp::max(
        control_counts.mod_code_counts.keys().len(),
        exp_counts.mod_code_counts.keys().len(),
    ) + 1; // plus 1 for canonical
    if n_categories < 2 {
        return Ok(0f64);
    }

    println!("n_categories: {}", n_categories);

    let res = if n_categories == 2 {
        llk_beta(control_counts, exp_counts)
    } else {
        llk_dirichlet(control_counts, exp_counts)
    };
    res.map_err(|e| {
        debug!("failed to calculate LLR score, {e}");
        MkError::LlrCalcError
    })
}

The modkit dmr output:

> reading reference FASTA at "./dmr_common/reference.fasta"
> 1 common sequence(s) between FASTA and both samples
> loaded 70 regions
> loading 6 regions at a time
> checking for regions contained in samples
Control counts:
  Total: 4143
  Mod code counts: {Code('m'): 268}
Experimental counts:
  Total: 2374
  Mod code counts: {Code('m'): 243}
n_categories: 2
LLR score: 14.026544029662546

In my Python version:

from scipy.special import betaln

def counts_to_trials(count_methyl: int, count_canonical: int) -> list[bool]:
    return [True] * count_methyl + [False] * count_canonical

def beta_llk(count_methyl: int, count_canonical: int) -> float:
    trials = counts_to_trials(count_methyl, count_canonical)
    alpha_prior = 0.5  # Jeffreys prior
    beta_prior = 0.5
    alpha_post = alpha_prior + sum(trials)
    beta_post = beta_prior + len(trials) - sum(trials)

    print(f"alpha_post: {alpha_post}, beta_post: {beta_post}")
    log_likelihood = betaln(alpha_post, beta_post) - betaln(alpha_prior, beta_prior)
    return log_likelihood

def llk_beta(
    control_methyls: int, control_canonicals: int,
    exp_methyls: int, exp_canonicals: int
) -> float:
    llk_control = beta_llk(control_methyls, control_canonicals)
    llk_exp = beta_llk(exp_methyls, exp_canonicals)
    llk_same = beta_llk(
        control_methyls + exp_methyls,
        control_canonicals + exp_canonicals
    )
    return llk_control + llk_exp - llk_same


control_methyls = 268
control_canonicals = 4143 - 268
exp_methyls = 243
exp_canonicals = 2374 - 243

result = llk_beta(control_methyls, control_canonicals, exp_methyls, exp_canonicals)
print(f"log likelihood ratio: {result}")

Output of Python version:

alpha_post: 268.5, beta_post: 3875.5
alpha_post: 243.5, beta_post: 2131.5
alpha_post: 511.5, beta_post: 6006.5
log likelihood ratio: 10.487190674690055
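
For reference, the per-sample quantity that beta_llk above computes is the beta-binomial log marginal likelihood under a Jeffreys prior, without the binomial coefficient:

log m(X, n) = log B(alpha + X, beta + n − X) − log B(alpha, beta),  with alpha = beta = 0.5

and the reported ratio is log m(X_a, n_a) + log m(X_b, n_b) − log m(X_a + X_b, n_a + n_b).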

WeiHea avatar May 21 '25 02:05 WeiHea

import math
from typing import List, Tuple


def counts_to_trials(count_methyl: int, count_canonical: int) -> List[bool]:
    return [True] * count_methyl + [False] * count_canonical


class BernoulliData:
    def __init__(self, data: List[bool]):
        self.data = data

    def counts(self) -> Tuple[int, int]:
        successes = sum(self.data)
        failures = len(self.data) - successes
        return successes, failures


class Beta:
    def __init__(self, alpha: float, beta: float):
        self.alpha = alpha
        self.beta = beta

    @staticmethod
    def jeffreys():
        return Beta(0.5, 0.5)

    def posterior(self, data: BernoulliData):
        successes, failures = data.counts()
        return Beta(
            self.alpha + successes,
            self.beta + failures
        )

    def ln_m(self, data: BernoulliData) -> float:
        successes, failures = data.counts()
        return (
            ln_beta(self.alpha + successes, self.beta + failures)
            - ln_beta(self.alpha, self.beta)
        )


def ln_beta(a: float, b: float) -> float:
    return math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b)


def beta_llk(count_methyl: int, count_canonical: int) -> float:
    trials = counts_to_trials(count_methyl, count_canonical)
    data = BernoulliData(trials)
    prior = Beta.jeffreys()
    posterior = prior.posterior(data)
    return posterior.ln_m(data)


def simplified_llr_beta(
    control_methyl: int,
    control_canonical: int,
    exp_methyl: int,
    exp_canonical: int,
) -> float:
    llk_control = beta_llk(control_methyl, control_canonical)
    llk_exp = beta_llk(exp_methyl, exp_canonical)
    llk_same = beta_llk(
        control_methyl + exp_methyl,
        control_canonical + exp_canonical
    )
    return llk_control + llk_exp - llk_same
In beta_llk above, ln_m is called on the posterior rather than on the prior:

    posterior = prior.posterior(data)
    posterior.ln_m(data)  # posterior.alpha already includes the successes, and ln_m adds them again

which is equivalent to:

log B(alpha + 2X, beta + 2(n − X)) − log B(alpha + X, beta + (n − X))

This is clearly no longer a marginal likelihood; instead it reuses the updated posterior to compute the "likelihood". Is there any special use for this approach?
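
If that reading is right, the two variants can be compared directly. Below is a minimal sketch (not from the thread; the helper names log_marginal, log_posterior_reuse, and llr are mine) that evaluates both formulas on the counts reported earlier in this issue:

from scipy.special import betaln

ALPHA, BETA = 0.5, 0.5  # Jeffreys prior

def log_marginal(x, n):
    # log B(alpha + x, beta + n - x) - log B(alpha, beta): marginal likelihood under the prior
    return betaln(ALPHA + x, BETA + n - x) - betaln(ALPHA, BETA)

def log_posterior_reuse(x, n):
    # log B(alpha + 2x, beta + 2(n - x)) - log B(alpha + x, beta + n - x):
    # ln_m evaluated with the posterior in place of the prior
    return betaln(ALPHA + 2 * x, BETA + 2 * (n - x)) - betaln(ALPHA + x, BETA + (n - x))

def llr(per_sample, x_a, n_a, x_b, n_b):
    # separate-rates score minus shared-rate score
    return per_sample(x_a, n_a) + per_sample(x_b, n_b) - per_sample(x_a + x_b, n_a + n_b)

# counts from this issue: control 268 modified / 4143 total, experiment 243 / 2374
print("prior-based marginal LLR:", llr(log_marginal, 268, 4143, 243, 2374))
print("posterior-reuse LLR:     ", llr(log_posterior_reuse, 268, 4143, 243, 2374))

Whichever of the two printed values reproduces the 14.0265 reported by modkit dmr above would indicate which formula the Rust code is effectively using.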

shuiliunian avatar May 21 '25 08:05 shuiliunian

Hello @WeiHea and @shuiliunian, this issue looks pretty similar to https://github.com/nanoporetech/modkit/issues/438; could we consolidate them? I can work on getting a comparison together.

ArtRand avatar May 22 '25 14:05 ArtRand

Yes. By the way, why does the likelihood ratio ignore the binomial coefficient, and why does it no longer work correctly once the binomial coefficient is added?

shuiliunian avatar May 22 '25 14:05 shuiliunian

from scipy.special import betaln, gammaln


def beta_binom_log_marginal(X, n):
    alpha_prior = 0.5
    beta_prior = 0.5
    if n == 0:
        return 0.0  # with no data the marginal likelihood is 1, so its log is 0
    log_comb = gammaln(n + 1) - gammaln(X + 1) - gammaln(n - X + 1)
    log_beta_num = betaln(2 * X + alpha_prior, 2 * (n - X) + beta_prior)
    log_beta_denom = betaln(X + alpha_prior, (n - X) + beta_prior)
    return log_comb + (log_beta_num - log_beta_denom)


def compute_score(condition_a, condition_b):
    # Determine if binomial or multinomial
    X_a, n_a = condition_a
    X_b, n_b = condition_b
    X_ab = X_a + X_b
    n_ab = n_a + n_b
    log_l_a = beta_binom_log_marginal(X_a, n_a)
    log_l_b = beta_binom_log_marginal(X_b, n_b)
    log_l_ab = beta_binom_log_marginal(X_ab, n_ab)

    return (log_l_a + log_l_b) - log_l_ab

For example, the code above is what I get if I derive it from the mathematical principles written in the modkit documentation, but it is wrong and cannot measure similarity correctly.
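
One possible resolution (a sketch, not taken from the thread or the modkit docs): if the shared-rate model is taken to be the same two binomial observations sharing a single rate p, rather than one pooled draw of X_ab out of n_ab, then both models carry the identical factors C(n_a, X_a) and C(n_b, X_b), and the binomial coefficients cancel in the log-ratio. The helper names below (log_comb, separate_log_marginal, shared_log_marginal, log_bayes_factor) are mine:

from scipy.special import betaln, gammaln

ALPHA = BETA = 0.5  # Jeffreys prior

def log_comb(n, x):
    # log C(n, x)
    return gammaln(n + 1) - gammaln(x + 1) - gammaln(n - x + 1)

def separate_log_marginal(x, n):
    # log of the integral of Bin(x | n, p) * Beta(p; ALPHA, BETA) over p
    return log_comb(n, x) + betaln(ALPHA + x, BETA + n - x) - betaln(ALPHA, BETA)

def shared_log_marginal(x_a, n_a, x_b, n_b):
    # log of the integral of Bin(x_a | n_a, p) * Bin(x_b | n_b, p) * Beta(p; ALPHA, BETA) over p;
    # note the coefficients are the per-sample ones, not C(n_a + n_b, x_a + x_b)
    return (log_comb(n_a, x_a) + log_comb(n_b, x_b)
            + betaln(ALPHA + x_a + x_b, BETA + (n_a - x_a) + (n_b - x_b))
            - betaln(ALPHA, BETA))

def log_bayes_factor(x_a, n_a, x_b, n_b):
    # the log_comb terms appear in both models and cancel, leaving the
    # coefficient-free betaln expression used in the Python version earlier in the thread
    return (separate_log_marginal(x_a, n_a) + separate_log_marginal(x_b, n_b)
            - shared_log_marginal(x_a, n_a, x_b, n_b))

print(log_bayes_factor(268, 4143, 243, 2374))

Under this reading the coefficient terms cancel exactly, and the score reduces to the coefficient-free betaln expression from the earlier Python version, so including or dropping the combinatorial term makes no difference to the ratio.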

shuiliunian avatar May 22 '25 14:05 shuiliunian