# Evaluating performance of contextual bandit agents in examples
I have been playing around with the DCBTrainer and found some potential inconsistencies.
- `StatlogDataBandit` example, found here:
```python
from genrl.utils import StatlogDataBandit

bandit = StatlogDataBandit(download=True)
context = bandit.reset()

from genrl.agents import NeuralLinearPosteriorAgent

agent = NeuralLinearPosteriorAgent(bandit)
context = bandit.reset()
action = agent.select_action(context)
new_context, reward = bandit.step(action)

from genrl.trainers import DCBTrainer

trainer = DCBTrainer(agent, bandit)
trainer.train(timesteps=1000, batch_size=32)
```
and the code used to evaluate:
```python
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import classification_report, accuracy_score

def _evaluate(trainer, bandit):
    y_true = bandit.df.iloc[:, -1].to_numpy()
    class_distribution = bandit.df.iloc[:, -1].value_counts()
    most_freq_class = class_distribution.idxmax()
    # Baseline: always predict the most frequent class.
    baseline_accuracy = accuracy_score(y_true, np.resize(most_freq_class, len(bandit.df))).round(2)
    tensor_matrix = torch.stack([torch.LongTensor(x).float() for x in bandit.df.iloc[:, :-1].to_numpy()])
    y_pred = []
    for i in tensor_matrix:
        y_pred.append(trainer.agent.select_action(i).item())
    print("Baseline accuracy score: {}%".format(baseline_accuracy))
    print("After {} steps accuracy is {}%".format(trainer.agent.t, accuracy_score(y_true, y_pred).round(2)))
    print("Classification report")
    print(classification_report(y_true, y_pred))
    fig = plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.plot(bandit.cum_reward_hist)
    plt.title("Cumulative reward")
    plt.subplot(122)
    plt.plot(bandit.cum_regret_hist)
    plt.title("Cumulative regret")
    plt.tight_layout()
    return y_true, y_pred

yt, yp = _evaluate(trainer, bandit)
```
Output:

```
Baseline accuracy score: 0.78%
After 44501 steps accuracy is 0.78%
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.78      0.99      0.88     34108
           2       0.00      0.00      0.00        37
           3       0.00      0.00      0.00       132
           4       0.14      0.00      0.00      6748
           5       0.17      0.00      0.00      2458
           6       0.00      0.00      0.00         6
           7       0.00      0.00      0.00        11

    accuracy                           0.78     43500
   macro avg       0.14      0.12      0.11     43500
weighted avg       0.65      0.78      0.69     43500
```
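The per-class recall above suggests the agent has collapsed onto the majority class rather than learned the context-to-action mapping. A quick way to check this (a minimal sketch, reusing the `yp` returned by `_evaluate` above) is to look at the prediction distribution directly:

```python
import numpy as np

# If the agent has collapsed onto one arm, nearly all predictions will
# land on a single class.
labels, counts = np.unique(yp, return_counts=True)
for label, count in zip(labels, counts):
    print("class {}: {:.1%} of predictions".format(label, count / len(yp)))
```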
- `WineDataBandit` example, found here:
Define the bandit:
```python
from typing import Tuple

import pandas as pd
import torch

from genrl.utils.data_bandits.base import DataBasedBandit
from genrl.utils.data_bandits.utils import download_data

URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

class WineDataBandit(DataBasedBandit):
    def __init__(self, **kwargs):
        super(WineDataBandit, self).__init__(**kwargs)
        path = kwargs.get("path", "./data/Wine/")
        download = kwargs.get("download", None)
        force_download = kwargs.get("force_download", None)
        url = kwargs.get("url", URL)
        if download:
            path = download_data(path, url, force_download)
        self._df = pd.read_csv(path, header=None)
        # The class label is the first column of the wine dataset.
        self.n_actions = len(self._df[0].unique())
        self.context_dim = self._df.shape[1] - 1
        self.len = len(self._df)

    def reset(self) -> torch.Tensor:
        self._reset()
        self.df = self._df.sample(frac=1).reset_index(drop=True)
        return self._get_context()

    def _compute_reward(self, action: int) -> Tuple[int, int]:
        # Labels are 1-indexed (1..3) while actions are 0-indexed, hence the +1.
        label = self._df.iloc[self.idx, 0]
        r = int(label == (action + 1))
        return r, 1

    def _get_context(self) -> torch.Tensor:
        return torch.tensor(
            self._df.iloc[self.idx, 1:].values,
            device=self.device,
            dtype=torch.float,
        )
```
Training:
```python
bandit = WineDataBandit(path='/path/to/data')

from genrl.agents import NeuralLinearPosteriorAgent

agent = NeuralLinearPosteriorAgent(bandit)
context = bandit.reset()
action = agent.select_action(context)
new_context, reward = bandit.step(action)

from genrl.trainers import DCBTrainer

trainer = DCBTrainer(agent, bandit)
trainer.train(timesteps=5000, batch_size=32)
```
and the evaluation:
```python
import numpy as np
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import classification_report, accuracy_score

def _evaluate(trainer, bandit):
    y_true = bandit.df.iloc[:, 0].to_numpy()
    class_distribution = bandit.df.iloc[:, 0].value_counts()
    most_freq_class = class_distribution.idxmax()
    # Baseline: always predict the most frequent class.
    baseline_accuracy = accuracy_score(y_true, np.resize(most_freq_class, len(bandit.df))).round(2)
    # Note: the LongTensor cast truncates the fractional wine features
    # before they are converted back to float.
    tensor_matrix = torch.stack([torch.LongTensor(x).float() for x in bandit.df.iloc[:, 1:].to_numpy()])
    y_pred = []
    for i in tensor_matrix:
        y_pred.append(trainer.agent.select_action(i).item())
    print("Baseline accuracy score: {}%".format(baseline_accuracy))
    print("After {} steps accuracy is {}%".format(trainer.agent.t, accuracy_score(y_true, y_pred).round(2)))
    print("Classification report")
    print(classification_report(y_true, y_pred))
    fig = plt.figure(figsize=(10, 4))
    plt.subplot(121)
    plt.plot(bandit.cum_reward_hist)
    plt.title("Cumulative reward")
    plt.subplot(122)
    plt.plot(bandit.cum_regret_hist)
    plt.title("Cumulative regret")
    plt.tight_layout()
    return y_true, y_pred

yt, yp = _evaluate(trainer, bandit)
```
Output:

```
Baseline accuracy score: 0.4%
After 5357 steps accuracy is 0.0%
Classification report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00      59.0
           2       0.00      0.00      0.00      71.0
           3       0.00      0.00      0.00      48.0

    accuracy                           0.00     178.0
   macro avg       0.00      0.00      0.00     178.0
weighted avg       0.00      0.00      0.00     178.0
```
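One detail worth flagging here (an observation about the snippets above, not a confirmed root cause): `_compute_reward` rewards `label == (action + 1)`, so actions are 0-indexed while the wine labels run 1-3, yet `_evaluate` compares the raw action to `y_true` directly, which would be consistent with class 0 showing up in the report with zero support. Mapping actions back to labels before scoring would look something like this (a sketch reusing `tensor_matrix` from `_evaluate`):

```python
# Mirror the +1 from _compute_reward when turning actions into labels.
y_pred = [trainer.agent.select_action(i).item() + 1 for i in tensor_matrix]
```

This would not explain the flat learning curves, but it does depress the reported accuracy.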
For both cases (and the third Titanic case referenced in #301), both reward and regret increase during training, which could point to no actual learning happening: the increase in reward may come purely from random guessing rather than from learning.
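One rough sanity check for this (a sketch, assuming `cum_reward_hist` holds one cumulative value per step, as the plots above suggest): under uniform random guessing the expected per-step reward is exactly `1 / n_actions` regardless of the class distribution, so the average slope of the cumulative reward curve should clearly beat that if any learning is happening.

```python
# Compare the agent's average per-step reward against the expected
# per-step reward of uniform random action selection (1 / n_actions).
random_rate = 1.0 / bandit.n_actions
observed_rate = bandit.cum_reward_hist[-1] / len(bandit.cum_reward_hist)
print("random baseline: {:.3f}, observed: {:.3f}".format(random_rate, observed_rate))
```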
Notice that for the Statlog data the label is the last column, while in the wine data it is the first column.
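If the two evaluation functions are ever merged, the label position could be passed in explicitly; a hypothetical helper (names are illustrative, not part of genrl):

```python
# Select label and feature columns by position so one evaluation routine
# covers both datasets.
def split_features_labels(df, label_col):
    y_true = df.iloc[:, label_col].to_numpy()
    X = df.drop(columns=df.columns[label_col]).to_numpy()
    return X, y_true

# Statlog: label in the last column (-1); Wine: label in the first column (0).
X, y = split_features_labels(bandit.df, 0)
```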
Is this still an issue?
It is. However, I think my baseline might be wrong.
I think the relevant baseline to compare performance against should be a Bayesian regression trained directly on the data, rather than the output of the neural network. Do you agree?
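For concreteness, a minimal sketch of what I have in mind, assuming `LinearPosteriorAgent` (Bayesian linear regression per arm, fit directly on the raw contexts) is exported from `genrl.agents` alongside `NeuralLinearPosteriorAgent`:

```python
from genrl.agents import LinearPosteriorAgent
from genrl.trainers import DCBTrainer

# Train the Bayesian-regression baseline with the same setup as above.
baseline_bandit = WineDataBandit(path='/path/to/data')
baseline_agent = LinearPosteriorAgent(baseline_bandit)
baseline_bandit.reset()

baseline_trainer = DCBTrainer(baseline_agent, baseline_bandit)
baseline_trainer.train(timesteps=5000, batch_size=32)

# Reuse the evaluation above to compare against the neural-linear agent.
yt_base, yp_base = _evaluate(baseline_trainer, baseline_bandit)
```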