DHARMa icon indicating copy to clipboard operation
DHARMa copied to clipboard

Add STAN example to documentation

Open florianhartig opened this issue 2 years ago • 0 comments

Example from Amael Le Squin for residual checks with STAN - to be integrated with the help

#### Example from https://cran.r-project.org/web/packages/DHARMa/vignettes/DHARMaForBayesians.html ran with Stan

library(data.table)
library(cmdstanr)
library(DHARMa)

#### Create data
set.seed(123)

dat = DHARMa::createData(200, overdispersion = 0.2)

Data = as.list(dat)
Data$nobs = nrow(dat)
Data$nGroups = length(unique(dat$group))

#### Fit data
## Common variables
n_chains = 3
maxIter = 3000

## Compile model
model = cmdstan_model("./florian_eg.stan")

## Run model
results = model$sample(data = Data, parallel_chains = n_chains, refresh = 50, chains = n_chains,
	iter_warmup = round(maxIter/2), iter_sampling = round(maxIter/2), save_warmup = FALSE)

n_sampling = results$metadata()$iter_sampling

#### Compute residuals
## Get parameters
intercept_array = as.vector(results$draws("intercept")) # dimensions: iter_sampling * n_chains * 1
env_array = as.vector(results$draws("env")) # dimensions: iter_sampling * n_chains * 1

## Posterior distributions
# Posterior lambda 
posteriorPredDistr = matrix(data = 0, nrow = n_sampling*n_chains, ncol = nrow(dat))
for (i in 1:nrow(dat))
	posteriorPredDistr[, i] = exp(intercept_array + env_array*Data$Environment1[i])

# Posterior response variable
posteriorPredSim = results$draws("observedResponseSim") # dimension: iter_sampling * n_chains * nobs

# Reshape posteriorPredSim as a matrix nobs x n_repetition
posteriorPredSim_matrix = matrix(data = 0, nrow = nrow(dat), ncol = n_sampling*n_chains)
for (obs in 1:nrow(posteriorPredSim_matrix))
{
	col_start = 1
	for (chain in 1:n_chains)
	{
		col_end = chain*n_sampling
		posteriorPredSim_matrix[obs, col_start:col_end] = posteriorPredSim[, chain, obs]
		col_start = col_end + 1
	}
}

## Create dharma data for the residuals
sim = createDHARMa(simulatedResponse = posteriorPredSim_matrix,
	observedResponse = dat$observedResponse,
	fittedPredictedResponse = apply(posteriorPredDistr, 2, median),
	integerResponse = TRUE)

## Plot residuals
plot(sim)

########################################################
###############    RANDOM EFFECT PART    ###############
########################################################
#### Add a random effect.
## Explanations:
# By default (see ?createData), the intercept is 0, the fixed effect is 1, the number of groups is 10, the random effect is on
# the intercept and its variance is 1. We create the expected result below and then run a stan model accounting for the random effect.

expected_results = c(intercept_mean = 0, env = 1, randEff_var = 1)

#### Create stan data
## Create indexing for groups (used in Stan to compute likelihood)
# Convert data to data table and order it by group
setDT(dat)
setorder(dat, group)

# Index
ls_groups = as.integer(dat[, unique(group)])
indices = integer(Data$nGroups + 1)

for (i in 1:Data$nGroups)
	indices[i] = min(which(dat[, group] == ls_groups[i]))
indices[Data$nGroups + 1] = Data$nobs + 1

## Data list for stan
Data = as.list(dat)
Data$nobs = nrow(dat)
Data$nGroups = length(unique(dat$group))
Data$indices = indices

#### Fit data
## Common variables
n_chains = 3
maxIter = 3000

## Compile model
model = cmdstan_model("./florian_eg_randEff.stan")

## Run model
results = model$sample(data = Data, parallel_chains = n_chains, refresh = 50, chains = n_chains,
	iter_warmup = round(maxIter/2), iter_sampling = round(maxIter/2), save_warmup = FALSE)

results$cmdstan_diagnose()

results$print(names(expected_results))
print(expected_results)

n_sampling = results$metadata()$iter_sampling

#### Compute residuals
## Posterior distributions
# Posterior lambda 
posteriorPredDistr = results$draws("lambda") # dimension: iter_sampling * n_chains * nobs
posteriorPredDistr_matrix = matrix(data = 0, nrow = n_sampling*n_chains, ncol = nrow(dat)) # One obs per column

for (i in 1:nrow(dat))
	posteriorPredDistr_matrix[, i] = as.vector(posteriorPredDistr[,,i])

# Posterior response variable
posteriorPredSim = results$draws("observedResponseSim") # dimension: iter_sampling * n_chains * nobs

# Reshape posteriorPredSim as a matrix nobs x n_repetition
posteriorPredSim_matrix = matrix(data = 0, nrow = nrow(dat), ncol = n_sampling*n_chains)
for (obs in 1:nrow(posteriorPredSim_matrix))
{
	col_start = 1
	for (chain in 1:n_chains)
	{
		col_end = chain*n_sampling
		posteriorPredSim_matrix[obs, col_start:col_end] = posteriorPredSim[, chain, obs]
		col_start = col_end + 1
	}
}

## Create dharma data for the residuals
sim = createDHARMa(simulatedResponse = posteriorPredSim_matrix,
	observedResponse = dat$observedResponse,
	fittedPredictedResponse = apply(posteriorPredDistr_matrix, 2, median),
	integerResponse = TRUE)

## Plot residuals
plot(sim)

florian_eg.stan

data {
	// Number of data
	int<lower = 0> nobs;

	// Observations
	int observedResponse [nobs];

	// Explanatory variables
	vector [nobs] Environment1;
}

parameters {
	real intercept;
	real env;
}

model {
	// Declare variables
	vector [nobs] lambda = exp(intercept + env*Environment1);

	// Priors
	target += normal_lpdf(intercept | 0, 1e4);
	target += normal_lpdf(env | 0, 1e4);

	// Likelihood
	target += poisson_lpmf(observedResponse | lambda);
}

generated quantities {
	int<lower = 0> observedResponseSim [nobs] = poisson_rng(exp(intercept + env*Environment1));
}

florian_eg_randEff.stan

data {
	// Number of data
	int<lower = 1> nobs;
	int<lower = 1> nGroups;

	// Indexing (index of the beginning of each group)
	array [nGroups + 1] int indices; // This syntax is compatible only from Stan 2.29!

	// Observations
	array [nobs] int observedResponse;

	// Explanatory variables
	vector [nobs] Environment1;
}

parameters {
	real intercept_mean;
	array [nGroups] real intercepts; // This syntax is compatible only from Stan 2.29!
	
	real env;
	
	// Random effect
	real<lower = 0> randEff_var;
}

model {
	// Declare variables
	vector [nobs] lambda;
	for (i in 1:nGroups)
		lambda[indices[i]:(indices[i + 1] - 1)] = exp(intercepts[i] + env*Environment1[indices[i]:(indices[i + 1] - 1)]);

	// Priors
	target += normal_lpdf(intercept_mean | 0, 1e4);
	target += normal_lpdf(intercepts | intercept_mean, randEff_var);
	target += normal_lpdf(env | 0, 1e4);

	target += gamma_lpdf(randEff_var | 1.0/1000, 1.0/1000); // Give an average of 1, and a variance of 1000

	// Likelihood
	target += poisson_lpmf(observedResponse | lambda);
}


generated quantities {
	array [nobs] int<lower = 0> observedResponseSim;
	vector<lower = 0> [nobs] lambda;

	for (i in 1:nGroups)
	{
		observedResponseSim[indices[i]:(indices[i + 1] - 1)] = poisson_rng(exp(intercepts[i] + env*Environment1[indices[i]:(indices[i + 1] - 1)]));
		lambda[indices[i]:(indices[i + 1] - 1)] = exp(intercepts[i] + env*Environment1[indices[i]:(indices[i + 1] - 1)]);
	}
}

florianhartig avatar Mar 03 '22 12:03 florianhartig