[BUG] TorchLayer doesn't work with diff_method='parameter-shift'
Expected behavior
The TorchLayer demo should work with diff_method='parameter-shift'.
Actual behavior
It raises a NotImplementedError during the backward pass (see the traceback below).
Additional information
This question originated from Forum thread 4940.
Source code
# Demo with parameter-shift
import torch
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons

# Set random seeds
torch.manual_seed(42)
np.random.seed(42)

X, y = make_moons(n_samples=200, noise=0.1)
y_ = torch.unsqueeze(torch.tensor(y), 1)  # used for one-hot encoded labels
y_hot = torch.scatter(torch.zeros((200, 2)), 1, y_, 1)

c = ["#1f77b4" if y_ == 0 else "#ff7f0e" for y_ in y]  # colours for each class
plt.axis("off")
plt.scatter(X[:, 0], X[:, 1], c=c)
plt.show()

import pennylane as qml

n_qubits = 2
dev = qml.device("default.qubit", wires=n_qubits)

@qml.qnode(dev, diff_method="parameter-shift")
def qnode(inputs, weights):
    qml.AngleEmbedding(inputs, wires=range(n_qubits))
    qml.BasicEntanglerLayers(weights, wires=range(n_qubits))
    return [qml.expval(qml.PauliZ(wires=i)) for i in range(n_qubits)]

n_layers = 6
weight_shapes = {"weights": (n_layers, n_qubits)}
qlayer = qml.qnn.TorchLayer(qnode, weight_shapes)

clayer_1 = torch.nn.Linear(2, 2)
clayer_2 = torch.nn.Linear(2, 2)
softmax = torch.nn.Softmax(dim=1)
layers = [clayer_1, qlayer, clayer_2, softmax]
model = torch.nn.Sequential(*layers)

opt = torch.optim.SGD(model.parameters(), lr=0.2)
loss = torch.nn.L1Loss()

X = torch.tensor(X, requires_grad=True).float()
y_hot = y_hot.float()

batch_size = 5
batches = 200 // batch_size
data_loader = torch.utils.data.DataLoader(
    list(zip(X, y_hot)), batch_size=5, shuffle=True, drop_last=True
)

epochs = 6
for epoch in range(epochs):
    running_loss = 0
    for xs, ys in data_loader:
        opt.zero_grad()
        loss_evaluated = loss(model(xs), ys)
        loss_evaluated.backward()
        opt.step()
        running_loss += loss_evaluated
    avg_loss = running_loss / batches
    print("Average loss over epoch {}: {:.4f}".format(epoch + 1, avg_loss))

y_pred = model(X)
predictions = torch.argmax(y_pred, dim=1).detach().numpy()
correct = [1 if p == p_true else 0 for p, p_true in zip(predictions, y)]
accuracy = sum(correct) / len(correct)
print(f"Accuracy: {accuracy * 100}%")
Tracebacks
---------------------------------------------------------------------------
NotImplementedError Traceback (most recent call last)
<ipython-input-8-d039678a60e4> in <cell line: 13>()
19
20 loss_evaluated = loss(model(xs), ys)
---> 21 loss_evaluated.backward()
22
23 opt.step()
11 frames
/usr/local/lib/python3.10/dist-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
523 inputs=inputs,
524 )
--> 525 torch.autograd.backward(
526 self, gradient, retain_graph, create_graph, inputs=inputs
527 )
/usr/local/lib/python3.10/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
265 # some Python versions print out the first line of a multi-line function
266 # calls in the traceback and some print out the last line
--> 267 _engine_run_backward(
268 tensors,
269 grad_tensors_,
/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py in _engine_run_backward(t_outputs, *args, **kwargs)
742 unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
743 try:
--> 744 return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
745 t_outputs, *args, **kwargs
746 ) # Calls into the C++ engine to run the backward pass
/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py in apply(self, *args)
299 )
300 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 301 return user_fn(self, *args)
302
303 def apply_jvp(self, *args):
/usr/local/lib/python3.10/dist-packages/pennylane/workflow/interfaces/torch.py in new_backward(ctx, *flat_grad_outputs)
99 def new_backward(ctx, *flat_grad_outputs):
100 grad_outputs = pytree.tree_unflatten(flat_grad_outputs, ctx._out_struct)
--> 101 grad_inputs = orig_bw(ctx, *grad_outputs)
102 # None corresponds to the diff of out_struct_holder
103 return (None,) + tuple(grad_inputs)
/usr/local/lib/python3.10/dist-packages/pennylane/workflow/interfaces/torch.py in backward(ctx, *dy)
184 # dL/dz convention of PennyLane, autograd and jax. This converts between the formats
185 dy = _recursive_conj(dy)
--> 186 vjps = ctx.jpc.compute_vjp(ctx.tapes, dy)
187 # split tensor into separate entries
188 unpacked_vjps = []
/usr/local/lib/python3.10/dist-packages/pennylane/workflow/jacobian_products.py in compute_vjp(self, tapes, dy)
297 return _compute_vjps(jacs, dy, tapes)
298
--> 299 vjp_tapes, processing_fn = qml.gradients.batch_vjp(
300 tapes, dy, self._gradient_transform, gradient_kwargs=self._gradient_kwargs
301 )
/usr/local/lib/python3.10/dist-packages/pennylane/gradients/vjp.py in batch_vjp(tapes, dys, gradient_fn, reduction, gradient_kwargs)
500 # Loop through the tapes and dys vector
501 for tape, dy in zip(tapes, dys):
--> 502 g_tapes, fn = vjp(tape, dy, gradient_fn, gradient_kwargs=gradient_kwargs)
503 reshape_info.append(len(g_tapes))
504 processing_fns.append(fn)
/usr/local/lib/python3.10/dist-packages/pennylane/gradients/vjp.py in vjp(tape, dy, gradient_fn, gradient_kwargs)
361 pass
362
--> 363 gradient_tapes, fn = gradient_fn(tape, **gradient_kwargs)
364
365 def processing_fn(results, num=None):
/usr/local/lib/python3.10/dist-packages/pennylane/transforms/core/transform_dispatcher.py in __call__(self, *targs, **tkwargs)
98 start = 0
99 for tape in expanded_tapes:
--> 100 intermediate_tapes, post_processing_fn = self._transform(
101 tape, *targs, **tkwargs
102 )
/usr/local/lib/python3.10/dist-packages/pennylane/gradients/parameter_shift.py in param_shift(tape, argnum, shifts, gradient_recipes, fallback_fn, f0, broadcast)
1110 transform_name = "parameter-shift rule"
1111 assert_no_state_returns(tape.measurements, transform_name)
-> 1112 assert_no_trainable_tape_batching(tape, transform_name)
1113
1114 if argnum is None and not tape.trainable_params:
/usr/local/lib/python3.10/dist-packages/pennylane/gradients/gradient_transform.py in assert_no_trainable_tape_batching(tape, transform_name)
95 for idx in range(len(tape.trainable_params)):
96 if tape.get_operation(idx)[0].batch_size is not None:
---> 97 raise NotImplementedError(
98 "Computing the gradient of broadcasted tapes with respect to the broadcasted "
99 f"parameters using the {transform_name} gradient transform is currently not "
NotImplementedError: Computing the gradient of broadcasted tapes with respect to the broadcasted parameters using the parameter-shift rule gradient transform is currently not supported. See #4462 for details.
System information
Name: PennyLane
Version: 0.37.0
Summary: PennyLane is a cross-platform Python library for quantum computing, quantum machine learning, and quantum chemistry. Train a quantum computer the same way as a neural network.
Home-page: https://github.com/PennyLaneAI/pennylane
Author:
Author-email:
License: Apache License 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: appdirs, autograd, autoray, cachetools, networkx, numpy, packaging, pennylane-lightning, requests, rustworkx, scipy, semantic-version, toml, typing-extensions
Required-by: PennyLane_Lightning
Platform info: Linux-6.1.85+-x86_64-with-glibc2.35
Python version: 3.10.12
Numpy version: 1.26.4
Scipy version: 1.13.1
Installed devices:
- lightning.qubit (PennyLane_Lightning-0.37.0)
- default.clifford (PennyLane-0.37.0)
- default.gaussian (PennyLane-0.37.0)
- default.mixed (PennyLane-0.37.0)
- default.qubit (PennyLane-0.37.0)
- default.qubit.autograd (PennyLane-0.37.0)
- default.qubit.jax (PennyLane-0.37.0)
- default.qubit.legacy (PennyLane-0.37.0)
- default.qubit.tf (PennyLane-0.37.0)
- default.qubit.torch (PennyLane-0.37.0)
- default.qutrit (PennyLane-0.37.0)
- default.qutrit.mixed (PennyLane-0.37.0)
- default.tensor (PennyLane-0.37.0)
- null.qubit (PennyLane-0.37.0)
Existing GitHub issues
- [X] I have searched existing GitHub issues to make sure the issue does not already exist.
Given the error:
/usr/local/lib/python3.10/dist-packages/pennylane/gradients/gradient_transform.py in assert_no_trainable_tape_batching(tape, transform_name)
95 for idx in range(len(tape.trainable_params)):
96 if tape.get_operation(idx)[0].batch_size is not None:
---> 97 raise NotImplementedError(
98 "Computing the gradient of broadcasted tapes with respect to the broadcasted "
99 f"parameters using the {transform_name} gradient transform is currently not "
Would this imply there is some behavior not implemented for diff_method='parameter-shift'?
@mews6 yep that's correct! We never updated the parameter-shift rule to support broadcasting; however, it seems that TorchLayer is providing the parameter-shift rule with a broadcasted tape, which is what breaks here. @dwierichs may have more technical details here.
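For reference, a minimal sketch that hits the same restriction without TorchLayer (assuming PennyLane 0.37 with the torch interface; the single-qubit circuit is only illustrative). Passing a tensor that is both batched and trainable into a parameter-shift QNode produces a broadcasted tape with trainable broadcasted parameters, which param_shift rejects:

import torch
import pennylane as qml

dev = qml.device("default.qubit", wires=1)

@qml.qnode(dev, diff_method="parameter-shift")
def circuit(x):
    qml.RX(x, wires=0)
    return qml.expval(qml.PauliZ(0))

# Batched AND trainable argument -> broadcasted tape with trainable broadcasted parameters
x = torch.tensor([0.1, 0.2, 0.3], requires_grad=True)
circuit(x).sum().backward()  # expected to raise the same NotImplementedError as above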
As @josh146 and the error message say, if trainable parameters are batched/broadcasted, param_shift cannot handle that.
However, oftentimes the training data is batched while the trainable parameters are not. Could this be the case here, so that the batched parameters can actually be marked as non-trainable? Non-trainable batched parameters are supported by param_shift.