bambi
bambi copied to clipboard
Add support for multidimensional data
Would it be possible to add support for data with more than two dimensions?
My data already consists of three dimensions (exercise, session, observation).
When using Bambi I need to reshape this into a long format, losing all the dimensions.
This means that after doing inference, I need to manipulate the output xarray (inference_data) to once again match the source's dimensional structure.
For hierarchical models, this limits how I can visualize using Arviz.
I'm not sure I understand what you mean by dimensions. Is it that you want to have a multivariate response (e.g. a model with not a single, but multiple response variables)?
If you have a small example that shows the problem, that would be even better!
Not a small example I'm afraid. But perhaps you get the idea anyway ☺️
Data
import pymc as pm
import numpy as np
import xarray as xr
import pandas as pd
# Coordinate labels for the three dimensions of the example dataset:
# 3 exercises x 7 training dates x 6 observations per session.
coords = {
    'exercise': ['Squat', 'Overhead Press', 'Deadlift'],
    'date': ['2020-02-24', '2020-02-26', '2020-03-21', '2020-03-25',
             '2020-04-06', '2020-07-25', '2020-08-10'],
    'observation': [1, 2, 3, 4, 5, 6],
}
# Dimension names, in the same order as the array axes below.
dims = [name for name in coords]
# Measured barbell velocities, shape (3, 7, 6) matching `dims`:
# (exercise, date, observation). Presumably m/s — TODO confirm units.
velocities = np.array([[[1.198, 1.042, 0.875, 0.735, 0.574, 0.552],
                        [1.261, 0.993, 0.857, 0.828, 0.624, 0.599],
                        [1.224, 1.028, 0.990, 0.742, 0.595, 0.427],
                        [1.112, 0.999, 0.865, 0.751, 0.607, 0.404],
                        [1.157, 1.010, 0.849, 0.716, 0.593, 0.536],
                        [1.179, 1.060, 0.898, 0.783, 0.615, 0.501],
                        [1.209, 1.192, 0.979, 1.025, 0.875, 0.887]],
                       [[0.911, 0.933, 0.779, 0.629, 0.528, 0.409],
                        [1.004, 0.863, 0.770, 0.611, 0.511, 0.376],
                        [0.980, 0.828, 0.766, 0.611, 0.529, 0.349],
                        [1.024, 0.933, 0.841, 0.734, 0.551, 0.287],
                        [0.985, 0.852, 0.599, 0.522, 0.313, 0.234],
                        [0.996, 0.763, 0.758, 0.542, 0.463, 0.378],
                        [1.066, 0.915, 0.786, 0.686, 0.565, 0.429]],
                       [[0.654, 1.004, 0.727, 0.483, 0.344, 0.317],
                        [0.885, 0.738, 0.577, 0.495, 0.335, 0.291],
                        [0.749, 0.695, 0.539, 0.461, 0.395, 0.279],
                        [0.801, 0.548, 0.404, 0.424, 0.337, 0.338],
                        [0.770, 0.642, 0.602, 0.493, 0.394, 0.290],
                        [0.766, 0.545, 0.426, 0.431, 0.349, 0.329],
                        [0.787, 0.640, 0.480, 0.401, 0.395, 0.304]]])
# Lifted loads, same (3, 7, 6) shape and axis order as `velocities`.
# Presumably kilograms — TODO confirm units.
loads = np.array([[[20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
                   [20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
                   [20.0, 40.0, 60.0, 80.0, 100.0, 117.5],
                   [20.0, 40.0, 60.0, 80.0, 100.0, 115.0],
                   [20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
                   [20.0, 40.0, 60.0, 80.0, 100.0, 110.0],
                   [20.0, 30.0, 40.0, 50.0, 60.0, 70.0]],
                  [[20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
                   [20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
                   [20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
                   [20.0, 25.0, 30.0, 35.0, 40.0, 45.0],
                   [20.0, 30.0, 40.0, 45.0, 50.0, 52.5],
                   [20.0, 30.0, 35.0, 40.0, 42.5, 45.0],
                   [20.0, 25.0, 30.0, 35.0, 40.0, 45.0]],
                  [[60.0, 80.0, 100.0, 120.0, 140.0, 145.0],
                   [60.0, 80.0, 100.0, 120.0, 140.0, 145.0],
                   [60.0, 80.0, 100.0, 120.0, 127.5, 140.0],
                   [60.0, 100.0, 120.0, 125.0, 140.0, 145.0],
                   [60.0, 80.0, 100.0, 120.0, 130.0, 140.0],
                   [60.0, 100.0, 120.0, 125.0, 130.0, 132.5],
                   [70.0, 90.0, 120.0, 135.0, 140.0, 150.0]]])
# Randomly hide ~20% of the observations so the example contains missing data.
# Use a seeded Generator (modern numpy API) so the example is reproducible
# across runs — the legacy unseeded np.random.binomial gave a different mask
# every time the snippet was executed.
rng = np.random.default_rng(42)
mask = rng.binomial(n=1, p=0.8, size=velocities.shape)
# Assemble everything into one Dataset; `mask` is promoted to a coordinate
# so it accompanies the data without being treated as a data variable.
dataset = (xr.Dataset({'velocity': (dims, velocities),
                       'load': (dims, loads),
                       'mask': (dims, mask)},
                      coords=coords)
           .set_coords(['mask']))
# Standardize (z-score) each variable over the whole array, then blank out
# the masked entries — .where() turns mask == 0 cells into NaN.
dataset['velocity_std'] = (dataset['velocity'] - dataset['velocity'].mean()) / dataset['velocity'].std()
dataset['load_std'] = (dataset['load'] - dataset['load'].mean()) / dataset['load'].std()
dataset['velocity_std_masked'] = dataset['velocity_std'].where(dataset['mask'])
dataset['load_std_masked'] = dataset['load_std'].where(dataset['mask'])
I want the model to match the dimensional format, in this example three dimensions. This is my PyMC version:
# Create PyMC model inputs.
# Use .sizes for dimension lengths — mapping-style access via Dataset.dims
# is deprecated in recent xarray releases.
n_exercises = dataset.sizes['exercise']
n_dates = dataset.sizes['date']
n_observations = dataset.sizes['observation']
# Integer index grids for fancy-indexing the per-level parameter arrays
# (replaces the hand-rolled nested list comprehensions):
#   exercise_date_idx[i, j]    == i   (shape: exercises x dates)
#   exercise_idx[i, j, k]      == i   (full 3-D shape)
#   date_idx[i, j, k]          == j   (full 3-D shape)
exercise_date_idx, _ = np.indices((n_exercises, n_dates))
exercise_idx, date_idx, _ = np.indices((n_exercises, n_dates, n_observations))
# Masked (NaN where hidden), standardized observations.
velocity = dataset['velocity_std_masked']
load = dataset['load_std_masked']
# Hierarchical linear model load ~ velocity with partial pooling:
# global -> per-exercise -> per-(exercise, date) intercepts and slopes.
with pm.Model(coords = coords) as model:
    # Inputs
    # NOTE(review): this rebinds the name `velocity` (previously the masked
    # data array) to the RV; the data is supplied via `observed`. NaN entries
    # in `observed` would trigger PyMC's automatic imputation — confirm that
    # is the intent here.
    velocity = pm.Normal('velocity',
                         mu = 0.0,
                         sigma = 1.0,
                         observed = velocity,
                         dims = dims)
    # Global parameters (hyperpriors shared by all exercises)
    global_intercept = pm.Normal('global_intercept', mu = 0, sigma = 1)
    global_intercept_sd = pm.Exponential('global_intercept_sd', lam = 1)
    global_slope = pm.Normal('global_slope', mu = 0, sigma = 1)
    global_slope_sd = pm.Exponential('global_slope_sd', lam = 1)
    # Exercise parameters: one intercept/slope pair per exercise, drawn from
    # the global distribution.
    exercise_intercept = pm.Normal('exercise_intercept', mu = global_intercept, sigma = global_intercept_sd, dims = 'exercise')
    exercise_intercept_sd = pm.Exponential('exercise_intercept_sd', lam = 1, dims = 'exercise')
    exercise_slope = pm.Normal('exercise_slope', mu = global_slope, sigma = global_slope_sd, dims = 'exercise')
    exercise_slope_sd = pm.Exponential('exercise_slope_sd', lam = 1, dims = 'exercise')
    # Date parameters: one intercept/slope per (exercise, date), centred on
    # the parent exercise's parameters via the exercise_date_idx lookup.
    date_intercept = pm.Normal('date_intercept', mu = exercise_intercept[exercise_date_idx], sigma = exercise_intercept_sd[exercise_date_idx], dims = ['exercise', 'date'])
    date_slope = pm.Normal('date_slope', mu = exercise_slope[exercise_date_idx], sigma = exercise_slope_sd[exercise_date_idx], dims = ['exercise', 'date'])
    # Model error
    sigma = pm.Exponential('sigma', lam = 1)
    # Expected value: broadcast the (exercise, date) parameters up to the
    # full 3-D (exercise, date, observation) grid via the index arrays.
    mu = date_intercept[exercise_idx, date_idx] + date_slope[exercise_idx, date_idx]*velocity
    # Likelihood of observed values
    likelihood = pm.Normal('load',
                           mu = mu,
                           sigma = sigma,
                           observed = load,
                           dims = dims)