[pt2e to tosa] face AttributeError
Hi @Jerry-Ge ,
I have run the https://github.com/pytorch/executorch/blob/main/examples/arm/run.sh example successfully. Now I am trying to modify it to run a quantized int8 PyTorch model, which needs to pass through Vela, on the FVP with the Arm Ethos-U55.
I use a PyTorch MNIST classification CNN model and quantize it to int8 with convert_pt2e. The results of the int8 model look correct.
I then want to export it to ExecuTorch with the Arm U55 backend, but I hit AttributeError: 'ReshapeAttribute' object has no attribute 'NewshapeAsNumpy'. Did you mean: 'NewShapeAsNumpy'? while doing edge = edge.to_backend(ArmPartitioner).
How can I fix it?
The following is my export code.
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from torch.ao.quantization import get_default_qconfig_mapping
from torch.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import QuantStub, DeQuantStub
import cv2
import numpy as np
import logging
import torch._export as export
from executorch.backends.arm.arm_backend import ArmPartitioner
from executorch.exir import EdgeCompileConfig
from ..portable.utils import export_to_edge, save_pte_program
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 8, 3, 1)
self.conv2 = nn.Conv2d(8, 16, 3, 1)
self.conv3 = nn.Conv2d(16, 32, 5, 1)
self.fc1 = nn.Linear(32, 64)
self.fc2 = nn.Linear(64, 10)
def forward(self, x):
x = self.conv1(x)
x = F.max_pool2d(x, 2,stride=2)
x = self.conv2(x)
x = F.max_pool2d(x, 2,stride=2)
x = self.conv3(x)
x = torch.flatten(x, 1)
x = self.fc1(x)
x = self.fc2(x)
output = F.softmax(x, dim=1)
return output
def train(args, model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.cross_entropy(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
if args.dry_run:
break
def test(model, device, test_loader):
# model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
def calibrate(model, data_loader):
# model.eval()
with torch.no_grad():
for image, target in data_loader:
model(image)
def main():
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=14, metavar='N',
help='number of epochs to train (default: 14)')
parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
help='learning rate (default: 1.0)')
parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
help='Learning rate step gamma (default: 0.7)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--dry-run', action='store_true', default=False,
help='quickly check a single pass')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument('--save-model', action='store_true', default=False,
help='For Saving the current Model')
parser.add_argument(
"-d",
"--delegate",
action="store_true",
required=False,
default=False,
help="Flag for producing ArmBackend delegated model",
)
args = parser.parse_args()
use_cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
device = torch.device("cuda" if use_cuda else "cpu")
train_kwargs = {'batch_size': args.batch_size}
test_kwargs = {'batch_size': args.test_batch_size}
if use_cuda:
cuda_kwargs = {'num_workers': 1,
'pin_memory': True,
'shuffle': True}
train_kwargs.update(cuda_kwargs)
test_kwargs.update(cuda_kwargs)
transform=transforms.Compose([
transforms.ToTensor()
])
dataset1 = datasets.MNIST('./data', train=True, download=True,
transform=transform)
dataset2 = datasets.MNIST('./data', train=False,
transform=transform)
train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
float_model = Net().to(device)
float_model.load_state_dict(torch.load("./pytorch_mnist_cnn_floating.pt"))
float_model.eval()
model_to_quantize = Net().to(device)
model_to_quantize.load_state_dict(torch.load("./pytorch_mnist_cnn_floating.pt"))
model_to_quantize.eval()
from torch._export import capture_pre_autograd_graph
example_inputs = (torch.randn(1, 1, 28,28),)
exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs)
# or capture with dynamic dimensions
# from torch._export import dynamic_dim
# exported_model = capture_pre_autograd_graph(model_to_quantize, example_inputs, constraints=[dynamic_dim(example_inputs[0], 0)])
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
XNNPACKQuantizer,
get_symmetric_quantization_config,
)
quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config())
from torch.ao.quantization.quantize_pt2e import (
prepare_pt2e,
convert_pt2e,
)
prepared_model = prepare_pt2e(exported_model, quantizer)
print(prepared_model.graph)
calibrate(prepared_model, train_loader)
quantized_model = convert_pt2e(prepared_model)
################################################################
################################################################
# pre-autograd export. eventually this will become torch.export
# model = export.capture_pre_autograd_graph(quantized_model, example_inputs)
print("convert_pt2e(prepared_model)done ")
edge = export_to_edge(
quantized_model,
example_inputs,
edge_compile_config=EdgeCompileConfig(
_check_ir_validity=False,
),
)
print("export_to_edge done ")
logging.info(f"Exported graph:\n{edge.exported_program().graph}")
delegate = args.delegate
model_name = "pytorch_mnist_cnn_ptq_qnnpack"
if delegate is True:
edge = edge.to_backend(ArmPartitioner)
logging.info(f"Lowered graph:\n{edge.exported_program().graph}")
print("edge.to_backend(ArmPartitioner) done ")
exec_prog = edge.to_executorch()
print("edge.to_executorch() done ")
model_name = f"{model_name}" + (
"_arm_delegate" if delegate is True else ""
)
save_pte_program(exec_prog.buffer, model_name)
# delegate = args.delegate
# # model_name = args.model_name + str_qconfig_mapping
# model_name = args.model_name
# if delegate is True:
# edge = edge.to_backend(ArmPartitioner)
# logging.info(f"Lowered graph:\n{edge.exported_program().graph}")
# exec_prog = edge.to_executorch()
# model_name = f"{model_name}" + (
# "_arm_delegate" if delegate is True else ""
# )
# save_pte_program(exec_prog.buffer, model_name)
if __name__ == '__main__':
main()
Hi Kris,
Thanks for the issue! This is a known issue in Vela (https://pypi.org/project/ethos-u-vela/).
The TOSA->Vela pass is currently experimental only. We have a fix internally but haven't upstreamed it yet.
Please let us know whether this is urgent for production.
Thanks,
Jerry
cc @robell @eric-k256
Hi @Jerry-Ge ,
Thanks for your reply. The ExecuTorch flow is the better choice for us to deploy PyTorch models on our hardware chip with Cortex-M55 + Ethos-U55, so it may be urgent for me to figure out how to use the full flow to deploy a PyTorch toy model to that chip.
Since the TOSA->Vela flow is currently experimental only, I tried adding the following code to convert the PyTorch model to the Arm backend with TOSA output, without going through Vela, and finally succeeded in generating a .pte without Vela.
from executorch.exir.backend.compile_spec_schema import CompileSpec
compile_spec = [
CompileSpec("output_format", bytes("tosa", "utf8")),
]
ArmPartitioner.compile_spec = compile_spec
I then tried modifying https://github.com/pytorch/executorch/blob/main/examples/arm/run.sh to run the pytorch_mnist_cnn_ptq_qnnpack_arm_delegate.pte I generated on the FVP,
but it finally fails with Init failed for backend ArmBackend.
I have already increased the method_allocator_pool size, which I believe plays the same role as the tensor arena, in /executorch/examples/arm/ethos-u-scratch/ethos-u/core_platform/applications/executorch_tests/runner.cpp:
// __attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[4 * 1024U];
__attribute__((section(".sram.data"), aligned(16))) uint8_t method_allocator_pool[136 * 1024U];
But I still don't know where the mistake in my overall flow is.
Thanks, Kris
Hi Kris,
Yes, you can add the CompileSpec in the preprocess compilation to bypass Vela and generate a .pte file, but I don't think that will run on the hardware without a generated Vela command stream. I have more experience with the compilation stage; @robell has done more work on the later stages and may be able to share more comments here.
Jerry
Hi Kris,
It simply will not work to emit TOSA into the .pte and run it on the U55; if you turn on debug in the runtime you would see it reject the binary as not being valid input for the NPU.
You need to specify Vela output (currently the default), and if there is a failure then that needs to be resolved.
In this case AttributeError: 'ReshapeAttribute' object has no attribute 'NewshapeAsNumpy'. Did you mean: 'NewShapeAsNumpy'? is an internal compiler error in Vela.
That one issue is a simple fix, but as we have a linear layer in the graph, it will also fail until we revise the mapping of linear through to Vela compilation, which will take a bit longer.
Thanks, Rob
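For reference, a minimal sketch of what requesting Vela output could look like, under the assumption that the Arm backend accepts "vela" as an output_format value in the same way the "tosa" value is used elsewhere in this thread (Rob also describes Vela output as the default, so normally no CompileSpec override should be needed at all):

# Hedged sketch: ask the Arm backend for a Vela command stream instead of raw
# TOSA. Assumes "vela" is a valid output_format value, mirroring the "tosa"
# CompileSpec shown earlier in this thread; since Vela output is described as
# the default, this override is only illustrative.
from executorch.exir.backend.compile_spec_schema import CompileSpec

vela_compile_spec = [
    CompileSpec("output_format", bytes("vela", "utf8")),
]
ArmPartitioner.compile_spec = vela_compile_spec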
Hi @robell @Jerry-Ge ,
Thanks for the reply.
It simply will not work to emit TOSA into the .pte and run it on the U55; if you turn on debug in the runtime you would see it reject the binary as not being valid input for the NPU.
-> In my understanding, my code converts the model to TOSA by adding the following code, so I thought that even without going through Vela, I could still run the TOSA binary (whose output format is still a .pte) on the M55, but it fails.
Am I wrong? I am not really sure about the whole flow.
from executorch.exir.backend.compile_spec_schema import CompileSpec
import os
import tempfile
# Temp directory that any debug output is written to
DEBUG_OUTPUT_PATH = tempfile.mkdtemp(prefix="arm_tosa_")
# Debug output for TOSA
TOSA_OUT_PATH = os.path.join(DEBUG_OUTPUT_PATH, "tosa", "")
os.makedirs(TOSA_OUT_PATH, exist_ok=True)
compile_spec = [
CompileSpec("debug_tosa_path", bytes(TOSA_OUT_PATH, "utf8")),
CompileSpec("output_format", bytes("tosa", "utf8")),
]
ArmPartitioner.compile_spec = compile_spec
In my experience with TFLite, an op that cannot pass the Vela compiler falls back to run on the M55 CPU, both on the FVP and on our hardware chip. Is that different in ExecuTorch?
Or how could I modify /executorch/examples/arm/ethos-u-scratch/ethos-u/core_platform/applications/executorch_tests/runner.cpp to run this PyTorch model on the M55 first, while waiting for the Vela compiler fix?
Our product, called WE-II and using M55 + U55, already supports TFLite for Microcontrollers, and we urgently want to support PyTorch models on it sooner or later. Product website: https://himaxwiseeyeplus.github.io/
Sincerely, Kris
Hi Kris, please raise product-related requests with your Arm support channel; I'm not able to help with those on a public forum. I'll answer your technical questions below.
One thing to be clear about: the current code is alpha quality and will have performance issues and bugs; I'll explain this for your case below.
So I thought that even without going through Vela, I could still run the TOSA binary (whose output format is still a .pte) on the M55, but it fails.
No, this is wrong. The default to_edge and to_executorch will produce a .pte file that contains only "generic" ExecuTorch sequences which run on the CPU. TOSA is not used in this flow.
If you look at https://github.com/pytorch/executorch/blob/release/0.1/examples/arm/aot_arm_compiler.py and the flow that excludes the delegate, it produces binaries that run in generic ExecuTorch on the CPU.
an op that cannot pass the Vela compiler falls back to run on the M55 CPU
Yes, this is right. Unfortunately, what you are seeing is a case where the TOSA conversion believes Vela supports an op, but a bug in Vela's handling causes compilation to fail. Once the bug is fixed, the behavior will be as you expect.
To just run on M55 for now, follow the non-delegated flow in aot_arm_compiler.py as above.
I am not really sure about the whole flow.
The intent is that:
- TOSA output: use this for checking model correctness in the reference flow, and in future to aid quantization.
- Vela output: use this with the ArmPartitioner to generate a binary for a Cortex-M + Ethos-U.
- No delegate: use this if you only want to run on the CPU.
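To make the "no delegate" option above concrete, here is a minimal sketch of the CPU-only path, reusing export_to_edge, save_pte_program, EdgeCompileConfig, quantized_model and example_inputs from the export script posted at the top of this thread; it is simply the same flow with the to_backend(ArmPartitioner) step left out:

# Non-delegated (CPU-only) sketch: export the quantized model to edge and
# go straight to ExecuTorch without lowering anything to the Arm backend.
edge = export_to_edge(
    quantized_model,
    example_inputs,
    edge_compile_config=EdgeCompileConfig(_check_ir_validity=False),
)
exec_prog = edge.to_executorch()  # no edge.to_backend(ArmPartitioner) call
save_pte_program(exec_prog.buffer, "pytorch_mnist_cnn_ptq_cpu_only")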
Hi @robell ,
Thanks for the reply. So the delegate flow is: floating PyTorch model (.pth) -> PyTorch PTQ quantization to int8 with XNNPACKQuantizer -> export to edge and to_backend(ArmPartitioner) -> output TOSA (.tosa) and pass it through the Vela compiler -> edge.to_executorch -> output .pte binary file with the Vela command stream.
The no-delegate flow is: floating PyTorch model (.pth) -> PyTorch PTQ quantization to int8 with XNNPACKQuantizer -> export to edge and to_backend(ArmPartitioner) -> edge.to_executorch -> output .pte binary file.
The resulting .pte files can then be turned into "model_pte.h" with ./executorch/examples/xtensa/utils/gen_header.py and run on the M55 or U55 on the FVP. Am I correct?
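As a side note, turning a .pte into a C header like "model_pte.h" essentially means dumping its bytes as a C array; here is a hand-rolled sketch of that idea (an illustration only, not the actual gen_header.py script, whose interface may differ):

# Hypothetical helper: embed a .pte file as a C byte array so a bare-metal
# runner can link the model in. Illustrative only; not the real gen_header.py.
def pte_to_header(pte_path, header_path, var_name="model_pte"):
    with open(pte_path, "rb") as f:
        data = f.read()
    with open(header_path, "w") as f:
        f.write("unsigned char %s[] = {%s};\n" % (var_name, ",".join(str(b) for b in data)))
        f.write("unsigned int %s_len = %d;\n" % (var_name, len(data)))

pte_to_header("pytorch_mnist_cnn_ptq_qnnpack_arm_delegate.pte", "model_pte.h")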
But when I try the no-delegate flow to run on the CPU, I still hit the runtime error RuntimeError: Missing out variants: {'quantized_decomposed::dequantize_per_tensor', 'quantized_decomposed::quantize_per_tensor'}.
How can I fix it?
Thanks, Kris
RuntimeError: Missing out variants: {'quantized_decomposed::dequantize_per_tensor', 'quantized_decomposed::quantize_per_tensor'}.
You will have to link in the quantized op lib, and I expect perf to be pretty tragic without a delegate. @larryliu0820 do you know how to do this in oss builds?
Yeah, pretty much: we have to build the quantized op lib and link it in. Similar to here: https://github.com/pytorch/executorch/blob/main/examples/portable/custom_ops/test_custom_ops.sh#L101
Hi @robell and @Jerry-Ge,
I also have another question. PyTorch models are channels-first while TFLite is channels-last, and our Arm U55 hardware is TensorFlow Lite Micro based. My question is: at which point in the flow is the transpose done so that the PyTorch model becomes channels-last, without the U55 paying extra latency from a transpose op added at each layer?
By the way, is there any update on the TOSA and Vela compilers to make the conversion flow smoother?
Sincerely, Kris
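For clarity on the terminology in the question above, a tiny illustration of channels-first (NCHW, the PyTorch default) versus channels-last (NHWC, the TFLite-style layout); this only shows what the two layouts mean, not where the Arm flow actually inserts its transposes:

import torch

x_nchw = torch.randn(1, 1, 28, 28)                 # N, C, H, W (channels-first, PyTorch default)
x_nhwc = x_nchw.permute(0, 2, 3, 1).contiguous()   # N, H, W, C (channels-last, TFLite-style)
print(x_nchw.shape, x_nhwc.shape)                  # torch.Size([1, 1, 28, 28]) torch.Size([1, 28, 28, 1])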
Hi @Jerry-Ge
Would you recommend trying the latest Vela compiler to run the full flow for the MNIST classification model created with PyTorch?
Thanks, Kris
Hi Kris,
I would not. There are still a few pending issues/to-dos that need to be done before we can reach that milestone:
- [Done :white_check_mark:] https://github.com/pytorch/executorch/pull/1311
- [In progress] https://github.com/pytorch/executorch/issues/1110, with the pending PR upstreamed here: https://github.com/pytorch/executorch/pull/2371
Right now I'm pushing things through slowly to keep the code quality high. I'll let you know when we reach that milestone.
Sorry for the inconvenience right now, but it will definitely get better with more development cycles.
Jerry
Hi @Jerry-Ge ,
I saw you added the MobileNetV2 example to ExecuTorch. Will you write a README about it after you reach that milestone?
Thanks, Kris
Thanks Kris. Sorry, I'm just back from vacation and travel. I will get back to you with the README you requested soon!
Jerry
Hi Kris, sorry about the delays. You can find the related docs for MobileNetV2 here: https://pytorch.org/executorch/main/executorch-arm-delegate-tutorial.html#mobilenetv2-module
@kris-himax the code has moved a lot since last time :) Is this issue still valid?
@kris-himax Hi, I hope things work better now. OK to close?
