Question about DMR pair: could the output be in error?
I added some prints of intermediate variables to your open-source code and found that the llk_ratio score calculated via your beta_llk differs from the output of my Python implementation. The details are below. Is this due to an error in the Rust library implementation, or is it a mistake in my Python implementation?
pub(super) fn new(
    control_counts: AggregatedCounts,
    exp_counts: AggregatedCounts,
    interval: DmrInterval,
) -> MkResult<Self> {
    // hewei 2025-05-21
    // Debug print the counts before calculations
    let _guard = DEBUG_LOCK.lock().unwrap();
    println!("Control counts:");
    println!(" Total: {}", control_counts.total);
    println!(" Mod code counts: {:?}", control_counts.mod_code_counts);
    println!("Experimental counts:");
    println!(" Total: {}", exp_counts.total);
    println!(" Mod code counts: {:?}", exp_counts.mod_code_counts);
    //
    let score = llk_ratio(&control_counts, &exp_counts)?;
    println!("LLR score: {}", score);
    //////////////////////////////////////
    let coh_res = cohen_h(&control_counts, &exp_counts);
    Ok(Self {
        control_counts,
        exp_counts,
        interval,
        score,
        cohen_hresult: coh_res,
    })
}
fn llk_beta(
    control_counts: &AggregatedCounts,
    exp_counts: &AggregatedCounts,
) -> anyhow::Result<f64> {
    let all_mods = control_counts
        .mod_code_counts
        .keys()
        .copied()
        .chain(exp_counts.mod_code_counts.keys().copied())
        .collect::<HashSet<ModCodeRepr>>();
    if all_mods.len() != 1 {
        bail!("should have exactly one modification to use beta llk")
    }
    let raw_mod_code =
        all_mods.into_iter().take(1).collect::<Vec<ModCodeRepr>>()[0];
    let control_methyls =
        *control_counts.mod_code_counts.get(&raw_mod_code).unwrap_or(&0);
    let control_canonicals = control_counts.get_canonical_counts();
    let llk_control = beta_llk(control_methyls, control_canonicals);
    println!("control_methyls: {}", control_methyls);
    println!("control_canonicals: {}", control_canonicals);
    println!("llk_control: {}", llk_control);
    let exp_methyls =
        *exp_counts.mod_code_counts.get(&raw_mod_code).unwrap_or(&0);
    let exp_canonicals = exp_counts.get_canonical_counts();
    let llk_exp = beta_llk(exp_methyls, exp_canonicals);
    println!("exp_methyls: {}", exp_methyls);
    println!("exp_canonicals: {}", exp_canonicals);
    println!("llk_exp: {}", llk_exp);
    let llk_same = beta_llk(
        exp_methyls + control_methyls,
        exp_canonicals + control_canonicals,
    );
    Ok(llk_control + llk_exp - llk_same)
}
pub(super) fn llk_ratio(
    control_counts: &AggregatedCounts,
    exp_counts: &AggregatedCounts,
) -> MkResult<f64> {
    let n_categories = std::cmp::max(
        control_counts.mod_code_counts.keys().len(),
        exp_counts.mod_code_counts.keys().len(),
    ) + 1; // plus 1 for canonical
    if n_categories < 2 {
        return Ok(0f64);
    }
    println!("n_categories: {}", n_categories);
    let res = if n_categories == 2 {
        llk_beta(control_counts, exp_counts)
    } else {
        llk_dirichlet(control_counts, exp_counts)
    };
    res.map_err(|e| {
        debug!("failed to calculate LLR score, {e}");
        MkError::LlrCalcError
    })
}
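As a side note on which branch this data takes: with only Code('m') present in both samples there are two categories (the modification plus canonical), so the beta path is used, which matches the n_categories: 2 line in the output below. A small Python paraphrase of that selection logic (my own reading, not modkit code):

# Paraphrase of the n_categories selection in llk_ratio above (my reading, not modkit code).
control_mod_codes = {"m"}  # mod codes observed in the control sample
exp_mod_codes = {"m"}      # mod codes observed in the experimental sample

n_categories = max(len(control_mod_codes), len(exp_mod_codes)) + 1  # plus 1 for canonical
assert n_categories == 2   # two categories -> the llk_beta branch, not llk_dirichlet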
The modkit dmr output:
> reading reference FASTA at "./dmr_common/reference.fasta"
> 1 common sequence(s) between FASTA and both samples
> loaded 70 regions
> loading 6 regions at a time
> checking for regions contained in samples
Control counts:
Total: 4143
Mod code counts: {Code('m'): 268}
Experimental counts:
Total: 2374
Mod code counts: {Code('m'): 243}
n_categories: 2
LLR score: 14.026544029662546
In my Python version:

from scipy.special import betaln

def counts_to_trials(count_methyl: int, count_canonical: int) -> list[bool]:
    return [True] * count_methyl + [False] * count_canonical

def beta_llk(count_methyl: int, count_canonical: int) -> float:
    trials = counts_to_trials(count_methyl, count_canonical)
    alpha_prior = 0.5  # Jeffreys prior
    beta_prior = 0.5
    alpha_post = alpha_prior + sum(trials)
    beta_post = beta_prior + len(trials) - sum(trials)
    print(f"alpha_post: {alpha_post}, beta_post: {beta_post}")
    log_likelihood = betaln(alpha_post, beta_post) - betaln(alpha_prior, beta_prior)
    return log_likelihood

def llk_beta(
    control_methyls: int, control_canonicals: int,
    exp_methyls: int, exp_canonicals: int
) -> float:
    llk_control = beta_llk(control_methyls, control_canonicals)
    llk_exp = beta_llk(exp_methyls, exp_canonicals)
    llk_same = beta_llk(
        control_methyls + exp_methyls,
        control_canonicals + exp_canonicals
    )
    return llk_control + llk_exp - llk_same

control_methyls = 268
control_canonicals = 4143 - 268
exp_methyls = 243
exp_canonicals = 2374 - 243
result = llk_beta(control_methyls, control_canonicals, exp_methyls, exp_canonicals)
print(f"log likelihood ratio: {result}")
Output of Python version:
alpha_post: 268.5, beta_post: 3875.5
alpha_post: 243.5, beta_post: 2131.5
alpha_post: 511.5, beta_post: 6006.5
log likelihood ratio: 10.487190674690055
A second Python version, written to mirror the structure used in the Rust code:

import math
from typing import List, Tuple

def counts_to_trials(count_methyl: int, count_canonical: int) -> List[bool]:
    return [True] * count_methyl + [False] * count_canonical

class BernoulliData:
    def __init__(self, data: List[bool]):
        self.data = data

    def counts(self) -> Tuple[int, int]:
        successes = sum(self.data)
        failures = len(self.data) - successes
        return successes, failures

class Beta:
    def __init__(self, alpha: float, beta: float):
        self.alpha = alpha
        self.beta = beta

    @staticmethod
    def jeffreys():
        return Beta(0.5, 0.5)

    def posterior(self, data: BernoulliData):
        successes, failures = data.counts()
        return Beta(
            self.alpha + successes,
            self.beta + failures
        )

    def ln_m(self, data: BernoulliData) -> float:
        successes, failures = data.counts()
        return (
            ln_beta(self.alpha + successes, self.beta + failures)
            - ln_beta(self.alpha, self.beta)
        )

def ln_beta(a: float, b: float) -> float:
    return math.lgamma(a) + math.lgamma(b) - math.lgamma(a + b)

def beta_llk(count_methyl: int, count_canonical: int) -> float:
    trials = counts_to_trials(count_methyl, count_canonical)
    data = BernoulliData(trials)
    prior = Beta.jeffreys()
    posterior = prior.posterior(data)
    return posterior.ln_m(data)

def simplified_llr_beta(
    control_methyl: int,
    control_canonical: int,
    exp_methyl: int,
    exp_canonical: int,
) -> float:
    llk_control = beta_llk(control_methyl, control_canonical)
    llk_exp = beta_llk(exp_methyl, exp_canonical)
    llk_same = beta_llk(
        control_methyl + exp_methyl,
        control_canonical + exp_canonical
    )
    return llk_control + llk_exp - llk_same
The key lines are:

posterior = prior.posterior(data)
posterior.ln_m(data)

Here ln_m is called on the posterior, whose alpha and beta already include the successes and failures, and ln_m then adds the same counts again, so the data seem to be counted twice. Writing X for the number of modified calls and n for the total, this is equivalent to:

log B(alpha + 2X, beta + 2(n − X)) − log B(alpha + X, beta + (n − X))

This is clearly not a marginal likelihood anymore. Instead, it reuses the updated posterior to calculate the "likelihood". Is there any special use for this approach?
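To make the difference concrete, here is a small side-by-side check of my own (not modkit code): ln_marginal_from_prior is the standard beta-binomial log marginal likelihood that my first script computes, and ln_m_on_posterior is the posterior-reuse reading of beta_llk above. Both are evaluated with the counts from this region so the two scores can be compared.

from scipy.special import betaln

ALPHA, BETA = 0.5, 0.5  # Jeffreys prior

def ln_marginal_from_prior(x, k):
    # Standard beta-binomial log marginal (binomial coefficient omitted):
    # log B(alpha + x, beta + k) - log B(alpha, beta)
    return betaln(ALPHA + x, BETA + k) - betaln(ALPHA, BETA)

def ln_m_on_posterior(x, k):
    # Update to Beta(alpha + x, beta + k), then evaluate ln_m with the same data again:
    # log B(alpha + 2x, beta + 2k) - log B(alpha + x, beta + k)
    return betaln(ALPHA + 2 * x, BETA + 2 * k) - betaln(ALPHA + x, BETA + k)

def llr(beta_llk_fn, c_meth, c_canon, e_meth, e_canon):
    # llk(control) + llk(exp) - llk(pooled), matching the structure of llk_beta
    return (
        beta_llk_fn(c_meth, c_canon)
        + beta_llk_fn(e_meth, e_canon)
        - beta_llk_fn(c_meth + e_meth, c_canon + e_canon)
    )

# Counts from the modkit debug output above.
c_meth, c_canon = 268, 4143 - 268
e_meth, e_canon = 243, 2374 - 243
print("prior-based LLR:     ", llr(ln_marginal_from_prior, c_meth, c_canon, e_meth, e_canon))
print("posterior-reused LLR:", llr(ln_m_on_posterior, c_meth, c_canon, e_meth, e_canon))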
Hello @WeiHea and @shuiliunian, this issue looks pretty similar to https://github.com/nanoporetech/modkit/issues/438, could we consolidate them? I can work on getting a comparison together.
Yes. By the way, why does the likelihood ratio ignore the binomial coefficient, and why does it no longer work correctly once the coefficient is added?
from scipy.special import betaln, gammaln

def beta_binom_log_marginal(X, n):
    alpha_prior = 0.5
    beta_prior = 0.5
    if n == 0:
        return 0.0  # with no data the marginal likelihood is 1, so its log is 0
    log_comb = gammaln(n + 1) - gammaln(X + 1) - gammaln(n - X + 1)
    log_beta_num = betaln(2 * X + alpha_prior, 2 * (n - X) + beta_prior)
    log_beta_denom = betaln(X + alpha_prior, (n - X) + beta_prior)
    return log_comb + (log_beta_num - log_beta_denom)

def compute_score(condition_a, condition_b):
    # Determine if binomial or multinomial
    X_a, n_a = condition_a
    X_b, n_b = condition_b
    X_ab = X_a + X_b
    n_ab = n_a + n_b
    log_l_a = beta_binom_log_marginal(X_a, n_a)
    log_l_b = beta_binom_log_marginal(X_b, n_b)
    log_l_ab = beta_binom_log_marginal(X_ab, n_ab)
    return (log_l_a + log_l_b) - log_l_ab
For example, the code above is what I get when I derive the score according to the mathematical principles written in the modkit documentation. But it is wrong and cannot measure similarity correctly.
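For reproducibility, a minimal call of the compute_score defined above with the counts from this region (my own check; the counts are the modified and total calls from the modkit debug output):

# (modified calls X, total calls n) for each sample, taken from the debug output above
control = (268, 4143)
experiment = (243, 2374)
print(compute_score(control, experiment))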