goldfish
goldfish copied to clipboard
`C` and `R` engine results differ for rate model
Describe the bug
Default C
engine gives different result as R
version when windows effects are used.
To Reproduce
library(goldfish)
data("Social_Evolution")
callNetwork <- defineNetwork(nodes = actors, directed = TRUE) |> # 1
linkEvents(changeEvent = calls, nodes = actors) # 2
callsDependent <- defineDependentEvents(
events = calls, nodes = actors,
defaultNetwork = callNetwork
)
windowFormulaRate <-
callsDependent ~ 1 + indeg(callNetwork) + outdeg(callNetwork) +
indeg(callNetwork, window = 300) +
outdeg(callNetwork, window = 300) +
indeg(friendshipNetwork)
mod04Rate <- estimate(
windowFormulaRate,
model = "DyNAM", subModel = "rate",
estimationInit = list(engine = "default_c")
)
mod04RateC <- coef(mod04Rate)
mod04Rate <- estimate(
windowFormulaRate,
model = "DyNAM", subModel = "rate"
)
mod04RateR <- coef(mod04Rate)
table(abs(mod04RateC - mod04RateR) < 1e-2)
> FALSE
> 6
Desktop (please complete the following information):
- OS: Windows
- R Version 4.2.0
- Goldfish Version 1.6.2
Description Using your example, it seems clear that the issue comes from the right-censoring. If you remove the windowed effects but keep the frienship updates (that create one right-censored event) you get the same behavior. I tried to use the GLM library to figure out which one is right, to know where to start, but I'm a bit confused, because I get yet another result (see script below). But it still looks like the R engine is closer to the GLM library result. Maybe @auzaheta you can see what I'm missing here?!
To reproduce (after the previous one) `
First test with glm
1. we get the preprocessed stats for a simple model
FormulaSimple <- callsDependent ~ 1 + indeg(friendshipNetwork) prepData <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", preprocessingOnly = T )
2. we calculate the exposure timesand the stats for each dependent event
Ndep <- length(prepData$intervals) Nexo <- length(prepData$rightCensoredIntervals) Na <- nrow(actors)
we only need to calculate the update of the friendship net between the first and second event
updates_T1 <- prepData$dependentStatsChange[[2]][[1]] indegs_T1 <- rep(0,Na) for(a in 1:Na) { if(any(updates_T1[,1] == a)) indegs_T1[a] <- updates_T1[max(which(updates_T1[,1] == a)),3] }
times <- rep(0,(Ndep+Nexo)*Na ) indegs_friend <- rep(0,(Ndep+Nexo)*Na ) outcomes <- rep(0,(Ndep+Nexo)*Na ) cptdep <- 1 cptexo <- 1 for(e in 1:(Ndep+Nexo)){
dep or exo?
isdep <- prepData$orderEvents[e] == 1 if(isdep) { times[((e-1)Na+1) : (eNa)] <- prepData$intervals[cptdep] outcomes[(e-1)*Na + prepData$eventSender[e]] <- 1 cptdep <- cptdep + 1 } if(!isdep) { times[((e-1)Na+1) : (eNa)] <- prepData$rightCensoredIntervals[cptexo] cptexo <- cptexo + 1 } if(e>2){ indegs_friend[((e-1)Na+1) : (eNa)] <- indegs_T1 } }
3. check glm results
df <- data.frame(times, logtimes = log(times), indegs_friend, outcomes) df[1:(Na+1),1] <- 1 df[1:(Na+1),2] <- 0 #df <- df[(Na+1):((Ndep+Nexo)*Na),] modGLM <- glm(outcomes ~ offset(logtimes) + indegs_friend, family = poisson(link = "log"), data = df) coef(modGLM)
4. check previous models
mod04RateCSimple <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default_c", startTime = 1220733469) ) coef(mod04RateCSimple)
mod04RateRSimple <- estimate( FormulaSimple, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default", startTime = 1220733469) ) coef(mod04RateRSimple)
coef(mod04RateCSimple) - coef(mod04RateRSimple) # 0.0061807584 -0.0009279592 coef(mod04RateCSimple) - coef(modGLM) # 0.0055274138 -0.0008136509 coef(mod04RateRSimple) - coef(modGLM) # -0.0006533446 0.0001143083
Second test with glm
1. we get the preprocessed stats for another simple model with time windows
FormulaSimple2 <- callsDependent ~ 1 + indeg(callNetwork, window = 300) prepData2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", preprocessingOnly = T )
2. we calculate the exposure times and the stats for each dependent event
Nexo2 <- length(prepData2$rightCensoredIntervals)
this time we need to calculate window updates
times2 <- rep(0,(Ndep+Nexo2)*Na ) indegs_window <- rep(0,(Ndep+Nexo2)*Na ) outcomes2 <- rep(0,(Ndep+Nexo2)*Na ) cptdep <- 1 cptexo <- 1
updates_dep <- prepData2$dependentStatsChange updates_rc <- prepData2$dependentStatsChange current_stats <- rep(0,Na) for(e in 1:(Ndep+Nexo2)){
dep or exo?
isdep <- prepData2$orderEvents[e] == 1 if(isdep) { # update windowed stat up <- updates_dep[[cptdep]][[1]] for(a in 1:Na) { if(!is.null(up) && any(up[,1] == a)) current_stats[a] <- up[max(which(up[,1] == a)),3] } indegs_window[((e-1)Na+1) : (eNa)] <- current_stats times2[((e-1)Na+1) : (eNa)] <- prepData2$intervals[cptdep] outcomes2[(e-1)*Na + prepData2$eventSender[e]] <- 1 cptdep <- cptdep + 1 } if(!isdep) { # update windowed stat up <- updates_rc[[cptexo]][[1]] for(a in 1:Na) { if(!is.null(up) && any(up[,1] == a)) current_stats[a] <- up[max(which(up[,1] == a)),3] } indegs_window[((e-1)Na+1) : (eNa)] <- current_stats times2[((e-1)Na+1) : (eNa)] <- prepData2$rightCensoredIntervals[cptexo] cptexo <- cptexo + 1 } }
3. check glm results
df2 <- data.frame(times = times2, logtimes = log(times2), indegs_window, outcomes = outcomes2) df2[1:(Na+1),1] <- 1 df2[1:(Na+1),2] <- 0 df2 <- df2[1:((Ndep+Nexo2-2)*Na),] modGLM2 <- glm(outcomes ~ offset(logtimes) + indegs_window, family = poisson(link = "log"), data = df2) coef(modGLM2)
4. check previous models
mod04RateCSimple2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default_c", startTime = 1220733469) ) coef(mod04RateCSimple2)
mod04RateRSimple2 <- estimate( FormulaSimple2, model = "DyNAM", subModel = "rate", estimationInit = list(engine = "default", startTime = 1220733469) ) coef(mod04RateRSimple2)
coef(mod04RateCSimple2) - coef(mod04RateRSimple2) # 0.1557715 -0.9546496 coef(mod04RateCSimple2) - coef(modGLM2) # 0.1444659 -1.5702790 coef(mod04RateRSimple2) - coef(modGLM2) # -0.0113056 -0.6156294
`
Hopefully, this simulation helps with the discussion @stadtfeldethz and @marion-hoffman. It only has one exogenous event.
### small simulation
actDf <- data.frame(
label = LETTERS,
xVar = rbinom(length(LETTERS), 1, 0.2)
)
xVarChange <- rbinom(length(LETTERS), 1, 0.7)
X <- cbind(1, actDf$xVar)
endTime <- 30
exEventTime <- 15
parms <- c(log(1000 / endTime / length(LETTERS)), 1 )
expXb <- exp(X %*% parms)
sumRate <- sum(expXb)
time <- 0
events <- NULL
dfglm <- NULL
first <- TRUE
while (time < endTime) {
# cat("Time:", time, "\n")
timeEvent <- rexp(1, sumRate)
if (first && time + timeEvent > exEventTime) {
dfglm <- rbind(
dfglm,
data.frame(
time = time,
diffTime = exEventTime - time,
xVar = X[, 2],
outcomes = 0
)
)
X <- cbind(1, xVarChange)
expXb <- exp(X %*% parms)
sumRate <- sum(expXb)
time <- exEventTime
first <- FALSE
} else {
time <- time + timeEvent
sender <- sample(LETTERS, 1, prob = expXb)
receiver <- sample(LETTERS[sender != LETTERS], 1)
events <- rbind(
events,
data.frame(
time = time,
sender = sender,
receiver = receiver,
increment = 1
)
)
dfglm <- rbind(
dfglm,
data.frame(
time = time,
diffTime = timeEvent,
xVar = X[, 2],
outcomes = 1 * (LETTERS == sender)
)
)
}
}
changeAttr <- data.frame(
time = exEventTime,
node = LETTERS,
replace = xVarChange
)
actDf <- defineNodes(actDf) |>
linkEvents(changeEvents = changeAttr, attribute = "xVar")
netEvents <- defineNetwork(nodes = actDf) |>
linkEvents(changeEvents = events, nodes = actDf)
depEvents <- defineDependentEvents(events, nodes = actDf,
defaultNetwork = netEvents)
# model exclude last event
preproTest <- estimate(depEvents ~ 1 + ego(actDf$xVar),
model = "DyNAM", subModel = "rate",
estimationInit = list(startTime = 0, endTime = 30))
summary(modTest)
confint(modTest)
# model exclude last event
dfglm$logtimes <- log(dfglm$diffTime)
modTestGLM <- glm(outcomes ~ offset(logtimes) + xVar,
family = poisson(link = "log"),
data = dfglm[seq.int(1, (nrow(events)) * length(LETTERS)), ])
summary(modTestGLM)
confint(modTestGLM)
library(broom)
library(pixiedust)
tidy(modTestGLM, conf.int = TRUE)
tidy(modTest, conf.int = TRUE)
From one run
> library(broom)
> library(pixiedust)
> parms
[1] 0.2484614 1.0000000
> tidy(modTestGLM, conf.int = TRUE) |>
+ dust() |>
+ sprinkle(col = c(2:4, 6:7), round = 6) |>
+ sprinkle(col = 5, fn = quote(pvalString(value)))
term estimate std.error statistic p.value conf.low conf.high
1 (Intercept) 0.224695 0.045268 4.963687 < 0.001 0.134638 0.312124
2 xVar 1.011514 0.052867 19.133289 < 0.001 0.908709 1.115992
> tidy(modTest, conf.int = TRUE) |>
+ dust() |>
+ sprinkle(col = c(2:4, 6:7), round = 6) |>
+ sprinkle(col = 5, fn = quote(pvalString(value)))
term estimate std.error statistic p.value conf.low conf.high
1 Intercept 0.224695 0.045268 4.963685 < 0.001 0.135972 0.313419
2 actDf xVar ego 1.011514 0.052867 19.133283 < 0.001 0.907897 1.115131
> coef(modTestGLM) - coef(modTest)
(Intercept) xVar
-3.640075e-10 3.640201e-10
Seems like the last time interval during preprocessing is calculated wrong and produce the differences between glm and goldfish estimations.