pymc-marketing
Is contribution curve (response curve) correct?
From here, the plot seems to suggest a 1:1 mapping between input and output. However, it doesn't account for the adstock effect (lagging). If I were to correct this in a toy example, it would give a different curve, like so:
Here the black line is the input, the blue line is the adstock-transformed input, and the green line is the adstock + saturation transformed input. Now, for the last plot, I'm not sure which response curve is correct.
Reproduction code
import numpy as np
import pandas as pd
import matplotlib.cm as mplcm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from scipy.optimize import curve_fit
from sklearn.preprocessing import MaxAbsScaler
def geometric_adstock(x, alpha: float = 0.0, l_max: int = 12):
    """Geometric adstock transformation."""
    # l_max = min(l_max, len(x))
    cycles = [
        np.concatenate(
            [np.zeros(i), x[: x.shape[0] - i]]
        )
        for i in range(l_max)
    ]
    x_cycle = np.stack(cycles)
    w = np.array([np.power(alpha, i) for i in range(l_max)])
    discounted = x.reshape(-1, 1) @ w.reshape(1, -1)
    contrib = np.zeros((len(discounted), len(discounted)))
    for i in range(len(discounted)):
        lim = min(len(discounted), i + l_max)
        contrib[i:lim, i] = discounted[i, :lim - i]
    return np.dot(w, x_cycle), contrib

def saturation(x, k, s):
    return 1/(1 + (x/k)**(-s))

def best_fit(x, y, n_points: int = 100):
    xScaler, yScaler = MaxAbsScaler(), MaxAbsScaler()
    xScaler.fit(x.reshape(-1, 1)), yScaler.fit(y.reshape(-1, 1))
    x_scaled = xScaler.transform(x.reshape(-1, 1)).reshape(-1)
    y_scaled = yScaler.transform(y.reshape(-1, 1)).reshape(-1)
    xs = np.linspace(x_scaled.min(), x_scaled.max(), n_points)
    soln, _ = curve_fit(saturation, x_scaled, y_scaled, bounds=[[0.1, 0.1], [10, 10]])
    ys = saturation(xs, *soln)
    return xScaler.inverse_transform(xs.reshape(-1, 1)).reshape(-1), yScaler.inverse_transform(ys.reshape(-1, 1)).reshape(-1)
np.random.seed(1335)
n_samples = 50
xs = np.linspace(0, 100, n_samples)/100
cm = plt.get_cmap('gist_rainbow')
cNorm = colors.Normalize(vmin=0, vmax=n_samples-1)
scalarMap = mplcm.ScalarMappable(norm=cNorm, cmap=cm)
zs = np.clip(np.linspace(1, 0.3, len(xs)) * np.random.randn(len(xs)) + np.linspace(3, -1, len(xs)), 0, 1e9).reshape(-1, 1)
z_scaler = MaxAbsScaler().fit(zs)
zs_scaled = z_scaler.transform(zs)
z_adstock, z_contrib = geometric_adstock(zs_scaled.reshape(-1), 0.7, l_max=12)
z_saturated = saturation(z_adstock, 0.5, 1)
fig, axes = plt.subplots(
    nrows=3,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(10, 12)
)
z_og = z_scaler.inverse_transform(zs_scaled.reshape(-1, 1)).reshape(-1)
z_adstock_og = z_scaler.inverse_transform(z_adstock.reshape(-1, 1)).reshape(-1)
z_sat_og = z_scaler.inverse_transform(z_saturated.reshape(-1, 1)).reshape(-1)
z_contrib_og = z_scaler.inverse_transform(z_contrib.reshape(-1, 1)).reshape(z_contrib.shape)
z_contrib_rescaled_og = z_contrib/((np.sum(z_contrib, axis=1)/z_scaler.inverse_transform(z_saturated.reshape(-1, 1)).reshape(-1)).reshape(-1, 1))
pd.DataFrame(z_contrib_og, index=xs).plot.area(
ax=axes[0], alpha=1, linewidth=0, color=plt.cm.Spectral(np.linspace(0, 1, n_samples + 1)), legend=False
)
axes[0].plot(xs, z_og, label="Input", color="k")
axes[0].plot(xs, z_adstock_og, label="Adstock", color="blue")
axes[0].plot(xs, z_sat_og, label="Adstock + Saturation", color="green")
axes[0].set_title("Adstock Contribution effect")
pd.DataFrame(z_contrib_rescaled_og, index=xs).plot.area(
ax=axes[1], alpha=1, linewidth=0, color=plt.cm.Spectral(np.linspace(0, 1, n_samples + 1)), legend=False
)
axes[1].plot(xs, z_og, color="k")
axes[1].plot(xs, z_adstock_og, label="Adstock", color="blue")
axes[1].plot(xs, z_sat_og, label="Adstock + Saturation", color="green")
axes[1].set_title("Rescaled Saturation + Adstock Contribution effect")
axes[2].scatter(z_og, z_sat_og, label="Original")
axes[2].plot(*best_fit(z_og, z_sat_og))
axes[2].scatter(z_og, np.sum(z_contrib_rescaled_og, axis=0), label="Correction (?)")
axes[2].plot(*best_fit(z_og, np.sum(z_contrib_rescaled_og, axis=0)))
axes[2].legend(loc="best")
axes[2].set_title("Response Curve")
plt.show()
Hi @thipokKub! Thanks for opening this issue! I see your point, and I agree our current implementation is too naive, as it takes just the direct contribution. Therefore the carryover contribution is misleading (note the high contribution at zero spending in the original plot). Do you want to open a PR? :)
A first brute-force approach shows we can apply the adstock transformation to the input vector, seen as a diagonal matrix, to recover the time-specific adstock contributions without having to modify the geometric_adstock method:
# from the data generating process of the mmm example notebook
plt.scatter(
    df["x1"].to_numpy(),
    beta_1
    * logistic_saturation(
        x=(
            geometric_adstock(
                x=np.diag(df["x1"]), alpha=alpha1, l_max=8, normalize=True  # apply to the extended vector
            )
        ),
        lam=lam1,
    )
    .eval()
    .sum(axis=0),  # sum to get the total contribution per spend date
)
I suspect the points which do not seem to fit the curve are the very last ones, which do not have enough data to have the complete contribution window of l_max = 8.
Actually, after some more thought (and discussions with @lucianopaz), I realize this is not that straightforward.
- At the moment we have these "cost" curves, which indeed show the immediate contribution but not the delayed one.
- Trying to get the delayed cumulative contribution is not that easy, as contributions from the past leak into the future. Specifically, note that we apply the saturation function to the aggregation. Because the saturation function is non-linear, this is not the same as taking the sum of the saturated contributions (which I believe is your approach in the code above; see the sketch below). Hence, it is very hard to reverse-engineer the contribution after the carryover and saturation composition this way.
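A quick numeric illustration of that non-linearity, using the toy saturation function from the reproduction code above (the spend values are arbitrary):

def saturation(x, k=0.5, s=1.0):
    # same Hill-type saturation as in the reproduction code above
    return 1 / (1 + (x / k) ** (-s))

# two adstocked spend contributions landing on the same time step
a, b = 0.4, 0.3

# saturating the aggregated signal ...
print(saturation(a + b))              # ~0.58
# ... is not the same as summing the individually saturated contributions
print(saturation(a) + saturation(b))  # ~0.82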
I think we could keep these direct cost curves and tackle the return on investment estimation via ROAS and mROAS, as described in https://research.google/pubs/pub46001/
Happy to keep up the discussion!
BTW: you might want to normalize the adstock transformation in your code so that the sum of the convoluted signal adds up to the original value, as we do in the model (as suggested by @lucianopaz).
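For reference, one way to do that on the toy geometric_adstock from the reproduction code (a sketch with a hypothetical function name, dividing the weights by their sum so the convolved signal preserves the total spend; this illustrates the idea rather than the library's normalize=True implementation):

import numpy as np

def geometric_adstock_normalized(x, alpha: float = 0.0, l_max: int = 12):
    """Geometric adstock whose weights sum to 1, so the total signal is preserved."""
    w = np.power(alpha, np.arange(l_max))
    w = w / w.sum()  # normalization step
    cycles = [
        np.concatenate([np.zeros(i), x[: x.shape[0] - i]]) for i in range(l_max)
    ]
    return np.dot(w, np.stack(cycles))

pulse = np.zeros(20)
pulse[0] = 1.0
print(geometric_adstock_normalized(pulse, alpha=0.7, l_max=12).sum())  # ~1.0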
You are right, the saturation effect is a bit messy. Considering that the adstock effect is the only effect in the model that carries past state information into the future, and the saturation curve rescales the cumulative effect non-linearly, I thought that adjusting the scale across time would be enough.
But come to think of it, the saturation level is directly affected by the previous state's lag effect, so a single response curve level might not make sense. For example, assume that at an input of 100 with no past lag effect the model predicts 100 as the outcome. But if the past state were something like 90, the model might predict 130.
I think I am going to look into the lightweight-mmm implementation and see how it is done there.
Also, using mROAS might be a better way to do things in the long run.
The devil is in the details! Thanks for bringing up this discussion! For decision-making I personally look at the ROAS and mROAS. Therefore we need to be clear and transparent about how (and whether it is a good idea) to look into these cost curves.
I decided to draft the idea of using the fitted model to generate "cost" curves that show the contribution as a function of the spend share. For example: if for 100 EUR total spend we estimate 1000 EUR, then what would we get if we increased the total spend by 20%? This is, in essence, the denominator of the mROAS.
Here is the code (to be prettified) and the resulting plot:
Click me to see the code
# Assumes the fitted `mmm` model and the variables from the MMM example notebook
# (df, amplitude, betas, geometric_adstock, logistic_saturation), plus the usual
# imports: arviz as az, numpy as np, matplotlib.pyplot as plt, seaborn as sns.

# Extract model parameters posterior samples
alpha_posterior = az.extract(
    mmm.fit_result, group="posterior", var_names=["alpha"]
).to_numpy()
lam_posterior = az.extract(
    mmm.fit_result, group="posterior", var_names=["lam"]
).to_numpy()
beta_channel_posterior = az.extract(
    mmm.fit_result, group="posterior", var_names=["beta_channel"]
).to_numpy()

# Set up grid of proportions
share_grid = np.linspace(start=0, stop=1.5, num=11)

# We compute the contributions of the channels when the input channel data is
# scaled by delta.
contributions = []

for delta in share_grid:
    channel_data = (
        delta
        * mmm.max_abs_scale_channel_data(data=mmm.data)[mmm.channel_columns].to_numpy()
    )

    # Forward pass
    channel_contribution_posterior = (
        np.expand_dims(a=beta_channel_posterior.T, axis=1)
        * logistic_saturation(
            x=(
                geometric_adstock(
                    x=channel_data,
                    alpha=alpha_posterior.T,
                    l_max=8,
                    normalize=True,
                    axis=0,
                )
            ),
            lam=np.expand_dims(a=lam_posterior.T, axis=1),
        ).eval()
    )

    channel_contribution_posterior_original_scale = np.vectorize(
        mmm.target_transformer.inverse_transform,
        excluded=[1, 2],
        signature="(m, n) -> (m, n)",
    )(channel_contribution_posterior)

    contributions.append(channel_contribution_posterior_original_scale)

contributions = np.array(contributions)

# Plot results
fig, ax = plt.subplots(figsize=(15, 7))

for i, x in enumerate(mmm.channel_columns):
    # Compute the HDI of the contribution per channel
    hdi_contribution = az.hdi(ary=contributions[:, :, :, i].sum(axis=-1).T)

    ax.fill_between(
        x=share_grid,
        y1=hdi_contribution[:, 0],
        y2=hdi_contribution[:, 1],
        color=f"C{i}",
        label=f"{x} $94%$ HDI contribution",
        alpha=0.4,
    )

    sns.lineplot(
        x=share_grid,
        y=contributions[:, :, :, i].sum(axis=-1).mean(axis=1),
        color=f"C{i}",
        marker="o",
        label=f"{x} contribution mean",
        ax=ax,
    )

    # Compare results with the true contribution for delta = 1
    ax.axhline(
        y=(amplitude * betas[i] * df[f"x{i + 1}_adstock_saturated"]).sum(),
        color=f"C{i}",
        linestyle="--",
        label=f"{x} true contribution",
    )

ax.axvline(x=1, color="black", linestyle="--", label=r"$\delta = 1$")
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
ax.set(
    title="Channel contribution as a function of cost share",
    xlabel=r"$\delta$",
    ylabel="contribution (sales)",
)
We can of course change the x-axis to spend units instead of the spend share.
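For example, one rough sketch of that conversion (assuming, as in the code above, that mmm.data holds the raw channel spends):

# total historical spend per channel (original units)
total_spend = mmm.data[mmm.channel_columns].sum().to_numpy()

# absolute-spend grid corresponding to each share delta
spend_grid = np.outer(share_grid, total_spend)  # shape: (n_deltas, n_channels)

# then plot channel i against spend_grid[:, i] instead of share_grid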
Okay, I think I've got what lightweight MMM does for its response curve prediction. From what I've found, it does not account for the adstock lag effect and only calculates the immediate effect after the training data.
For example, if the training data size is 200, it will append 1 data point and extend its inference to 201 data points before selecting the last data point (to get the correct adstock effect). Also, for that 1 extra data point, the Lightweight MMM code uses the mean of the exogenous variables per channel (if any).
In my opinion, the more correct approach is to extend it by the adstock window size and sum up the predicted values (a sketch of this is included after the code below).
The Lightweight MMM implementation is not exactly what I expected.
It is along these lines of code:
Setup Code
import numpy as np
import jax.numpy as jnp
from typing import Union
import matplotlib.pyplot as plt
from scipy.ndimage import convolve
from sklearn.preprocessing import MaxAbsScaler
def geometric_adstock(x, alpha: Union[float, np.ndarray] = 0.0, l_max: int = 12):
    l_max = min(l_max, len(x))
    if isinstance(alpha, np.ndarray):
        # one decay rate per channel
        assert len(x.shape) == 2
        assert len(alpha.shape) == 1
        assert x.shape[1] == alpha.shape[0]
        alpha = alpha.reshape(1, -1)
    else:
        alpha = np.array(alpha).reshape(1, -1)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    alpha = jnp.array(alpha)
    x = jnp.array(x)
    cycles = [
        jnp.concatenate(
            [jnp.zeros((i, x.shape[1])), x[: x.shape[0] - i]]
        )
        for i in range(l_max)
    ]
    x_cycle = jnp.stack(cycles)
    w = jnp.array([jnp.power(alpha, i) for i in range(l_max)])
    return (w * x_cycle).sum(axis=0)

def saturation(x, k, s):
    return jnp.where(x <= 0, 0, 1/(1 + (jnp.clip(x, 1e-3, jnp.inf)/k)**(-s)))
Mock Data Generation Code
np.random.seed(1464)
n_samples = 200
t = np.linspace(0, 1, n_samples)
xs = np.array([
np.clip(np.linspace(1, 0.3, len(t)) * np.random.randn(len(t)) + np.linspace(3, -1, len(t)), 0, np.inf),
3*np.clip(np.linspace(0.5, 1.2, len(t)) * np.sin(np.linspace(0, 2*np.pi, len(t)))**2 + 0.2*np.random.randn(len(t)), 0, np.inf)
]).T
exog = convolve(np.random.randn(n_samples).cumsum(), np.ones(9)/9, mode="reflect") + 0.5*np.random.randn(n_samples)
exog = (0.9*(exog - exog.min())/(exog.max() - exog.min()) + 0.1).reshape(-1, 1)
n_media = xs.shape[1]
enog_scaler = MaxAbsScaler().fit(xs)
exog_scaler = MaxAbsScaler().fit(exog)
xs_scaled_train = enog_scaler.transform(xs)
exog_scaled_train = exog_scaler.transform(exog)
def mock_model_pred(xs_scaled, exog_scaled):
    z_adstock = geometric_adstock(xs_scaled, [0.7, 0.3], l_max=12)
    z_saturated = saturation(z_adstock, 0.5, 1)
    return jnp.sum(jnp.array([1.5, 0.4]) * z_saturated, axis=1, keepdims=True) + 0.3 * exog_scaled

def mock_model_cat_pred(xs_new_scaled, exog_new_scaled):
    full_media = jnp.concatenate([xs_scaled_train, xs_new_scaled], axis=0)
    full_exogs = jnp.concatenate([exog_scaled_train, exog_new_scaled], axis=0)
    return mock_model_pred(full_media, full_exogs)[len(xs_scaled_train):]
sales = mock_model_pred(xs_scaled_train, exog_scaled_train)
(What I think) Lightweight MMM does for its response curve contribution plot
n_steps = 50
media_max = xs.max(axis=0) * 1.2
media_range = np.expand_dims(np.linspace(0, media_max, n_steps), axis=0)
diagonal = np.repeat(np.eye(n_media), n_steps, axis=0).reshape(n_media, n_steps, n_media)
mock_media = media_range * diagonal
extra_features = np.expand_dims(exog_scaled_train.mean(axis=0), axis=0)
prediction_offset = mock_model_cat_pred(np.zeros((1, n_media)), extra_features)
predictions = []
# In the original code, this is a nested vmap, which I'm not familiar with
# See https://github.com/google/lightweight_mmm/blob/main/lightweight_mmm/plot.py#L438
for i in range(len(mock_media)):
    acc = []
    for j in range(len(mock_media[i])):
        acc.append(
            mock_model_cat_pred(np.expand_dims(mock_media[i][j], axis=0), extra_features).mean(axis=0)
        )
    acc = jnp.array(acc)
    predictions.append(acc)
predictions = jnp.squeeze(jnp.array(predictions).transpose(1, 0, 2))
predictions = predictions - prediction_offset

for i_ch in range(n_media):
    plt.plot(jnp.squeeze(media_range)[:, i_ch], predictions[:, i_ch])
plt.title("Response curves")
plt.show()
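For comparison, here is a minimal sketch of the alternative I suggested above: extend the appended input by the full adstock window (l_max points) instead of a single data point, and sum the incremental predictions over that window. It reuses the mock helpers defined above (l_max = 12 matches mock_model_pred), and, as in the snippet above, the spend grid is passed in raw units:

l_max = 12
n_steps = 50
spend_grid = np.linspace(0, xs.max(axis=0) * 1.2, n_steps)  # (n_steps, n_media)
exog_mean = np.repeat(exog_scaled_train.mean(axis=0, keepdims=True), l_max, axis=0)

# baseline: no extra spend over the appended window
baseline = mock_model_cat_pred(np.zeros((l_max, n_media)), exog_mean)

response_windowed = np.zeros((n_steps, n_media))
for i_ch in range(n_media):
    for j in range(n_steps):
        # a single spend impulse at the start of the appended window
        new_media = np.zeros((l_max, n_media))
        new_media[0, i_ch] = spend_grid[j, i_ch]
        with_spend = mock_model_cat_pred(new_media, exog_mean)
        # sum the incremental response over the whole carry-over window
        response_windowed[j, i_ch] = float((with_spend - baseline).sum())

for i_ch in range(n_media):
    plt.plot(spend_grid[:, i_ch], response_windowed[:, i_ch])
plt.title("Response curves (summed over the adstock window)")
plt.show()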
Thanks for checking! PyMC-Marketing is a community-driven project, so we are free to develop it together! I'll create a PR regarding my approach to global cost curves (as above). I also plan to rename and add comments to the current direct-response curves for clarity (transparency). Feel free to open further PRs based on your learnings!
Another thing I did was to inspect the relationship of the response/incremental response curve to the offset (previous state). As shown in the graph below, the response curve changes non-linearly with the offset.
Here (a small sketch of how these ratios are computed in the toy example is included below):
- Response/incremental response curves focus on sales volume only,
- ROAS/mROAS focus on the sales-to-cost ratio, and
- CPA/CPIA focus on the cost-to-sales ratio (basically they are inverses of each other).
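As a small reference for how these ratios are computed in the toy example below (here response is the total response at a given spend and baseline is the response at zero additional spend; a sketch, not library code):

import numpy as np

def roas_metrics(spend, response, baseline):
    """Toy definitions mirroring the ratios plotted in the code below."""
    spend = np.clip(spend, 1e-2, None)   # avoid division by zero
    incremental = response - baseline
    roas = response / spend              # sales per unit of cost
    mroas = incremental / spend          # incremental sales per unit of cost
    cpa = spend / response               # cost per unit of sales
    cpia = spend / incremental           # cost per incremental unit of sales
    return roas, mroas, cpa, cpia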
Sub-Topic: Budget Optimization
From the graph, we can see that the blue channel dominates if the offset is low, but the orange channel dominates at higher offsets. So I don't think the offset state should be ignored. This problem is directly related to budget optimization (but I'm still formulating it, as I only summed up to the simulated horizon).
Given a time period of T, the current assumption is that we can spend on ads only once. However, this is not realistic. So I might look into the case where we spend on ads every t interval, e.g. an experimental horizon of 30 days, but spending on ads at days 0, 10, and 20, vs. spending at days 0 and 15, vs. spending at day 0 only (the minimum interval is daily spending, or t=1). A minimal sketch of this comparison is included after the toy example code below.
For the deterministic case, the solution lies on the curve, at a crossover point between ad media, or at the maximum allowed by the budget ceiling. So I think this should admit a reliable budget optimization solver. But notice that this is a toy example where I assume a deterministic curve; in practice the weights are sampled from the learnt distribution, so the logic here might be fuzzier.
Toy example code
import numpy as np
from typing import Union
import matplotlib.pyplot as plt
from scipy.ndimage import convolve
from sklearn.preprocessing import MaxAbsScaler
def geometric_adstock(x, alpha: Union[float, np.ndarray] = 0.0, l_max: int = 12):
    l_max = min(l_max, len(x))
    if isinstance(alpha, np.ndarray):
        # one decay rate per channel
        assert len(x.shape) == 2
        assert len(alpha.shape) == 1
        assert x.shape[1] == alpha.shape[0]
        alpha = alpha.reshape(1, -1)
    else:
        alpha = np.array(alpha).reshape(1, -1)
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    cycles = [
        np.concatenate(
            [np.zeros((i, x.shape[1])), x[: x.shape[0] - i]]
        )
        for i in range(l_max)
    ]
    x_cycle = np.stack(cycles)
    w = np.array([np.power(alpha, i) for i in range(l_max)])
    return (w * x_cycle).sum(axis=0)

def saturation(x, k, s):
    return np.where(x <= 0, 0, 1/(1 + (np.clip(x, 1e-3, np.inf)/k)**(-s)))
max_cost_range = np.array([1500, 2000]) * 1.5
n_steps = 100
input_costs = (np.linspace(0, 1, n_steps).reshape(-1, 1) * max_cost_range.reshape(1, -1))[1:]
in_scaler = MaxAbsScaler().fit(input_costs)
sim_steps = 120
multiplier = np.zeros((sim_steps, *input_costs.shape))
multiplier[0, :, :] = 1
offsets = np.linspace(0, 1500, 16)
colors = ["blue", "orange"]
fig, axes = plt.subplots(3, 2, figsize=(14, 12))
axes[0, 0].set_title("Total Response curve per channel")
axes[1, 0].set_title("ROAS per channel")
axes[2, 0].set_title("mROAS curve per channel")
axes[0, 1].set_title("Incremental/Marginal Response curve per channel")
axes[1, 1].set_title("CPA per channel")
axes[2, 1].set_title("CPIA curve per channel")
offsets_spread = []
baselines = []
for idx, offset in enumerate(offsets):
    baseline = multiplier[:, 0, :] * in_scaler.transform(np.zeros((1, input_costs.shape[-1])) + offset)
    spread_input_cost_scaled = multiplier * (np.expand_dims(in_scaler.transform(input_costs + offset), axis=0))
    baseline_adstock = geometric_adstock(baseline, np.array([0.4, 0.9]), l_max=60)
    baseline_saturated = saturation(baseline_adstock, np.array([0.3, 0.3]), np.array([0.6, 0.9]))
    spread_z_adstocks = np.zeros_like(spread_input_cost_scaled)
    spread_saturated = np.zeros_like(spread_input_cost_scaled)
    for i in range(len(input_costs)):
        spread_z_adstocks[:, i, :] = geometric_adstock(spread_input_cost_scaled[:, i, :], np.array([0.4, 0.9]), l_max=60)
        spread_saturated[:, i, :] = saturation(spread_z_adstocks[:, i, :], np.array([0.3, 0.3]), np.array([0.6, 0.9]))
    est_sales = (np.array([[[500, 133.5]]]) * spread_saturated).sum(axis=0)
    baseline_est_sales = (np.array([[500, 133.5]]) * baseline_saturated).sum(axis=0)
    offsets_spread.append(est_sales)
    baselines.append(baseline_est_sales)
    for i in range(offsets_spread[-1].shape[-1]):
        axes[0, 0].plot(input_costs[:, i], offsets_spread[-1][:, i], color=colors[i], alpha=0.9*((idx + 1)/len(offsets)) + 0.1)
        axes[0, 1].plot(input_costs[:, i], offsets_spread[-1][:, i] - baseline_est_sales[i], color=colors[i], alpha=0.9*((idx + 1)/len(offsets)) + 0.1)
        axes[1, 0].plot(input_costs[:, i], offsets_spread[-1][:, i]/np.clip(input_costs[:, i], 1e-2, np.inf), color=colors[i], alpha=0.9*((idx + 1)/len(offsets)) + 0.1)
        axes[1, 1].plot(input_costs[:, i], input_costs[:, i]/offsets_spread[-1][:, i], color=colors[i], alpha=0.9*((idx + 1)/len(offsets)) + 0.1)
        axes[2, 0].plot(input_costs[:, i], (offsets_spread[-1][:, i] - baseline_est_sales[i])/np.clip(input_costs[:, i], 1e-2, np.inf), color=colors[i], alpha=0.9*((idx + 1)/len(offsets)) + 0.1)
        axes[2, 1].plot(input_costs[:, i], input_costs[:, i]/(offsets_spread[-1][:, i] - baseline_est_sales[i]), color=colors[i], alpha=0.9*((idx + 1)/len(offsets)) + 0.1)

axes[1, 0].set_yscale('log')
axes[2, 0].set_yscale('log')
plt.tight_layout()
plt.show()
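As a follow-up to the spend-scheduling idea above, here is a purely illustrative sketch that compares a few schedules over a 30-day horizon with the same toy parameters as in the code above (the whole budget goes to channel 0 and is split evenly across the spend days):

horizon, total_budget = 30, 1500.0
schedules = {"days 0/10/20": [0, 10, 20], "days 0/15": [0, 15], "day 0 only": [0]}

for name, days in schedules.items():
    spend = np.zeros((horizon, 2))
    spend[days, 0] = total_budget / len(days)  # even split across spend days, channel 0 only
    spend_scaled = in_scaler.transform(spend)
    adstocked = geometric_adstock(spend_scaled, np.array([0.4, 0.9]), l_max=60)
    saturated = saturation(adstocked, np.array([0.3, 0.3]), np.array([0.6, 0.9]))
    total_response = (np.array([[500, 133.5]]) * saturated).sum()
    print(f"{name}: total response ~ {total_response:.1f}")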
@thipokKub I started working on the suggested improvements in https://github.com/pymc-labs/pymc-marketing/pull/291!