How to fix this issue in a C# WinForms project
Hi, I always get the same runtime errors:
Image tensor dimensions: 1, 1, 3, 480, 854
Image size tensor dimensions: 1, 2
Exception thrown: 'Microsoft.ML.OnnxRuntime.OnnxRuntimeException' in Microsoft.ML.OnnxRuntime.dll
[ErrorCode:RuntimeException] Non-zero status code returned while running Add node. Name:'/img_processor/vision_model/embeddings/Add' Status Message: D:\a_work\1\s\onnxruntime\core/providers/cpu/math/element_wise_ops.h:560 onnxruntime::BroadcastIterator::Append axis == 1 || axis == largest was false. Attempting to broadcast an axis by a dimension other than 1. 577 by 2075
=> I wrote a WinForms app project. I want to use the Phi-3 model to describe an image, but my image size is different: 854 by 480 pixels.
Here is my Form1.cs code:
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System.IO;
namespace TestPhi3Form
{
public partial class Form1 : Form
{
private string visionModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
private string textModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
private string tokenPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";
private Dictionary<int, string> tokenIdToTextMap;
private InferenceSession visionSession;
private InferenceSession textSession;
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
}
private void LoadTokenizerMapping(string tokenizerJsonPath)
{
// Read the tokenizer.json file
var json = File.ReadAllText(tokenizerJsonPath);
var tokenizerConfig = JObject.Parse(json);
// Extract the "added_tokens" section
var addedTokens = tokenizerConfig["added_tokens"];
// Create a dictionary to map token IDs to their text
tokenIdToTextMap = new Dictionary<int, string>();
foreach (var token in addedTokens)
{
int id = token["id"].ToObject<int>();
string content = token["content"].ToString();
tokenIdToTextMap[id] = content;
}
}
private Tensor<float> PreprocessImage(Image image)
{
// Resize the image to the required input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
// Convert the image to a tensor with an additional dimension
var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, 0, y, x] = pixel.R / 255f;
tensor[0, 0, 1, y, x] = pixel.G / 255f;
tensor[0, 0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image
using var bitmap = new Bitmap(imagePath);
// Ensure the image is of size 854x480
const int targetWidth = 854;
const int targetHeight = 480;
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = bitmap.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
// Add an additional dimension to match the expected rank of 5
var expandedTensor = tensor.Reshape(new[] { 1, 1, 3, targetHeight, targetWidth });
return expandedTensor;
}
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
{
// Create a dummy attention mask (adjust based on your model's requirements)
var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });
// Create inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
};
return inputs;
}
private string GenerateResponse(Tensor<float> visionOutput)
{
// Prepare inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", visionOutput)
};
// Run the text model
using (var results = textSession.Run(inputs))
{
var output = results.First().AsTensor<float>();
return ProcessTextOutput(output);
}
}
private string ProcessTextOutput(Tensor<float> output)
{
// Convert the float array to a byte array
var floatArray = output.ToArray();
var byteArray = new byte[floatArray.Length * sizeof(float)];
Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);
// Convert the byte array to a string
var text = Encoding.UTF8.GetString(byteArray);
return text;
}
private string DecodeTokenIds(int[] tokenIds)
{
// Decode the token IDs into text using the tokenizer mapping
var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
return text;
}
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
private string RunTextModel(List<NamedOnnxValue> inputs)
{
using (var results = textSession.Run(inputs))
{
var logits = results.First().AsTensor<float>();
// Decode the logits into text (adjust based on your tokenizer)
var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
var text = DecodeTokenIds(tokenIds);
return text;
}
}
private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
{
using (var results = visionSession.Run(inputs))
{
var visualFeatures = results.First().AsTensor<float>();
return visualFeatures;
}
}
private Tensor<float> ConvertImageToTensor(Image image)
{
// Resize the image to the model's input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, y, x] = pixel.R / 255f;
tensor[0, 1, y, x] = pixel.G / 255f;
tensor[0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
{
var boxes = new List<Rectangle>();
var descriptions = new List<string>();
// Example logic for processing output (adjust based on your model)
int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
for (int i = 0; i < numObjects; i++)
{
int offset = i * 6;
float x = output[offset] * PBImage.Width;
float y = output[offset + 1] * PBImage.Height;
float width = output[offset + 2] * PBImage.Width;
float height = output[offset + 3] * PBImage.Height;
int classId = (int)output[offset + 4];
float confidence = output[offset + 5];
if (confidence > 0.5) // Confidence threshold
{
boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
descriptions.Add($"Object {classId} with confidence {confidence:P}");
}
}
return (boxes, descriptions);
}
private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
{
if (PBImage.Image == null)
return;
Image image = (Image)PBImage.Image.Clone();
using (Graphics g = Graphics.FromImage(image))
{
Pen pen = new Pen(Color.Red, 2);
Font font = new Font("Arial", 10);
Brush brush = new SolidBrush(Color.Yellow);
for (int i = 0; i < boxes.Count; i++)
{
g.DrawRectangle(pen, boxes[i]);
g.DrawString(descriptions[i], font, brush, boxes[i].Location);
}
}
PBImage.Image = image;
}
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
private void BTNFinish_Click(object sender, EventArgs e)
{
Console.Beep();
Environment.Exit(0);
}
}
}
=> I want to know how to run my program so that I can click the BTNInfer button, open any image file on my PC, and have Phi-3 describe what it sees in the image. Please advise. Thanks,
Here are some steps and tips to help you resolve this issue:
1. Tensor Dimensions
The error message indicates a mismatch in tensor dimensions. Specifically, the model expects a tensor of dimensions [1, 3, 480, 854] but is receiving [1, 1, 3, 480, 854]. You need to ensure that the tensor dimensions match the model's expectations.
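Before reshaping anything, it is worth printing what the model actually declares for each input; ONNX Runtime exposes this through the session's InputMetadata. A minimal sketch, using the visionSession field from your code:
foreach (var input in visionSession.InputMetadata)
{
    // Dimensions reports -1 for dynamic axes; the count and order are what matter
    Debug.Print($"{input.Key}: {input.Value.ElementType} [{string.Join(", ", input.Value.Dimensions)}]");
}
Run this once after creating the session and compare the declared shapes against the tensors you build.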
2. Preprocess Image Function
Update your PreprocessImage function to ensure it returns a tensor with the correct dimensions:
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image and resize it to the expected 854x480 input size
// (GetPixel would throw if the source image were smaller than the target)
using var bitmap = new Bitmap(imagePath);
const int targetWidth = 854;
const int targetHeight = 480;
using var resized = new Bitmap(bitmap, new Size(targetWidth, targetHeight));
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = resized.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
return tensor;
}
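A side note on performance: GetPixel is very slow, and at 854x480 the double loop touches roughly 410,000 pixels per image. If preprocessing becomes a bottleneck, a LockBits-based loop is much faster. This is a minimal sketch assuming the same [1, 3, H, W] tensor layout as above; the helper name FillTensor is made up, and it needs using System.Drawing.Imaging; and using System.Runtime.InteropServices; at the top of the file:
private static void FillTensor(Bitmap bmp, DenseTensor<float> tensor)
{
    var rect = new Rectangle(0, 0, bmp.Width, bmp.Height);
    var data = bmp.LockBits(rect, ImageLockMode.ReadOnly, PixelFormat.Format24bppRgb);
    try
    {
        var row = new byte[data.Stride];
        for (int y = 0; y < bmp.Height; y++)
        {
            // Copy one scanline out of unmanaged memory
            Marshal.Copy(IntPtr.Add(data.Scan0, y * data.Stride), row, 0, data.Stride);
            for (int x = 0; x < bmp.Width; x++)
            {
                // Format24bppRgb stores each pixel as B, G, R
                tensor[0, 0, y, x] = row[x * 3 + 2] / 255f; // Red
                tensor[0, 1, y, x] = row[x * 3 + 1] / 255f; // Green
                tensor[0, 2, y, x] = row[x * 3] / 255f;     // Blue
            }
        }
    }
    finally
    {
        bmp.UnlockBits(data);
    }
}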
3. Prepare Vision Inputs
Ensure that the PrepareVisionInputs function does not add an extra dimension:
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
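To catch a rank mismatch before ONNX Runtime throws, you can also compare the tensor you built against the model's declared input inside PrepareVisionInputs, just before returning the inputs list. A small sketch; it assumes the input really is named pixel_values, as in your code:
// Sanity check: the rank we built must equal the rank the model declares
var declared = visionSession.InputMetadata["pixel_values"].Dimensions;
Debug.Assert(imageTensor.Dimensions.Length == declared.Length,
    $"pixel_values rank {imageTensor.Dimensions.Length} != model rank {declared.Length}");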
4. Debugging
Add debugging statements to ensure the dimensions of your tensors are correct before passing them to the model:
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
5. Running the Application
Ensure that your BTNInfer_Click method is correctly set up to handle the image file selection and model inference:
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png;*.jpg;*.jpeg";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
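One WinForms detail worth knowing here: Image.FromFile keeps the file handle open for the lifetime of the Image, so reassigning PBImage.Image without disposing the old image leaks handles and can lock files. A small guard (sketch):
string filePath = openFileDialog.FileName;
PBImage.Image?.Dispose(); // release the previous image and its file handle
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);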
6. Exception Handling
Ensure you have proper exception handling to catch and debug any runtime errors:
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
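One refinement to the catch block: ex.Message drops the stack trace and most of the ONNX Runtime status text, so logging the full exception makes these failures far easier to diagnose:
catch (Exception ex)
{
    Debug.Print(ex.ToString()); // includes the stack trace and inner exceptions
    MessageBox.Show($"Error during inference: {ex.Message}");
}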
By following these steps, you should be able to resolve the tensor dimension mismatch and successfully run your application to describe images using the Phi-3 model.
Hi, thank you very much for your help and code. I have now changed my code; it looks like this:
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System.IO;
namespace TestPhi3Form
{
public partial class Form1 : Form
{
private string visionModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
private string textModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
private string tokenPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";
private Dictionary<int, string> tokenIdToTextMap;
private InferenceSession visionSession;
private InferenceSession textSession;
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
}
private void LoadTokenizerMapping(string tokenizerJsonPath)
{
// Read the tokenizer.json file
var json = File.ReadAllText(tokenizerJsonPath);
var tokenizerConfig = JObject.Parse(json);
// Extract the "added_tokens" section
var addedTokens = tokenizerConfig["added_tokens"];
// Create a dictionary to map token IDs to their text
tokenIdToTextMap = new Dictionary<int, string>();
foreach (var token in addedTokens)
{
int id = token["id"].ToObject<int>();
string content = token["content"].ToString();
tokenIdToTextMap[id] = content;
}
}
private Tensor<float> PreprocessImage(Image image)
{
// Resize the image to the required input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
// Convert the image to a tensor with an additional dimension
var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, 0, y, x] = pixel.R / 255f;
tensor[0, 0, 1, y, x] = pixel.G / 255f;
tensor[0, 0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image
using var bitmap = new Bitmap(imagePath);
// Ensure the image is of size 854x480
const int targetWidth = 854;
const int targetHeight = 480;
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = bitmap.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
return tensor;
}
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
{
// Create a dummy attention mask (adjust based on your model's requirements)
var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });
// Create inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
};
return inputs;
}
private string GenerateResponse(Tensor<float> visionOutput)
{
// Prepare inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", visionOutput)
};
// Run the text model
using (var results = textSession.Run(inputs))
{
var output = results.First().AsTensor<float>();
return ProcessTextOutput(output);
}
}
private string ProcessTextOutput(Tensor<float> output)
{
// Convert the float array to a byte array
var floatArray = output.ToArray();
var byteArray = new byte[floatArray.Length * sizeof(float)];
Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);
// Convert the byte array to a string
var text = Encoding.UTF8.GetString(byteArray);
return text;
}
private string DecodeTokenIds(int[] tokenIds)
{
// Decode the token IDs into text using the tokenizer mapping
var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
return text;
}
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
private string RunTextModel(List<NamedOnnxValue> inputs)
{
using (var results = textSession.Run(inputs))
{
var logits = results.First().AsTensor<float>();
// Decode the logits into text (adjust based on your tokenizer)
var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
var text = DecodeTokenIds(tokenIds);
return text;
}
}
private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
{
using (var results = visionSession.Run(inputs))
{
var visualFeatures = results.First().AsTensor<float>();
return visualFeatures;
}
}
private Tensor<float> ConvertImageToTensor(Image image)
{
// Resize the image to the model's input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, y, x] = pixel.R / 255f;
tensor[0, 1, y, x] = pixel.G / 255f;
tensor[0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
{
var boxes = new List<Rectangle>();
var descriptions = new List<string>();
// Example logic for processing output (adjust based on your model)
int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
for (int i = 0; i < numObjects; i++)
{
int offset = i * 6;
float x = output[offset] * PBImage.Width;
float y = output[offset + 1] * PBImage.Height;
float width = output[offset + 2] * PBImage.Width;
float height = output[offset + 3] * PBImage.Height;
int classId = (int)output[offset + 4];
float confidence = output[offset + 5];
if (confidence > 0.5) // Confidence threshold
{
boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
descriptions.Add($"Object {classId} with confidence {confidence:P}");
}
}
return (boxes, descriptions);
}
private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
{
if (PBImage.Image == null)
return;
Image image = (Image)PBImage.Image.Clone();
using (Graphics g = Graphics.FromImage(image))
{
Pen pen = new Pen(Color.Red, 2);
Font font = new Font("Arial", 10);
Brush brush = new SolidBrush(Color.Yellow);
for (int i = 0; i < boxes.Count; i++)
{
g.DrawRectangle(pen, boxes[i]);
g.DrawString(descriptions[i], font, brush, boxes[i].Location);
}
}
PBImage.Image = image;
}
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
private void BTNFinish_Click(object sender, EventArgs e)
{
Console.Beep();
Environment.Exit(0);
}
}
}
=> But when I run the code, I get a different kind of error. I don't fully understand it, but it seems something is still not correct; please look at the screenshot to see why I still have issues. I am using an image size of 854 by 480 pixels, not 224 by 224.
Thanks,
The error message indicates that the input tensor's rank is incorrect. The model expects a 5-dimensional tensor, but it is receiving a 4-dimensional tensor. You need to adjust the input tensor to match the expected dimensions.
Here's how you can modify the code to reshape the input tensor correctly:
- Ensure the input image is resized to the expected dimensions.
- Add an extra dimension to the input tensor to match the expected rank.
Here's an example of how you can do this:
// ...existing code...
// (Tensor<float> and DenseTensor<float> come from Microsoft.ML.OnnxRuntime.Tensors,
// which is already imported, so no additional using directive is needed here.)
// ...existing code...
private Tensor<float> PreprocessImage(Bitmap image)
{
// Resize the image to 854x480
Bitmap resizedImage = new Bitmap(image, new Size(854, 480));
// Convert the image to a float tensor
Tensor<float> inputTensor = new DenseTensor<float>(new[] { 1, 3, 480, 854, 1 });
for (int y = 0; y < 480; y++)
{
for (int x = 0; x < 854; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
inputTensor[0, 0, y, x, 0] = pixel.R / 255.0f;
inputTensor[0, 1, y, x, 0] = pixel.G / 255.0f;
inputTensor[0, 2, y, x, 0] = pixel.B / 255.0f;
}
}
return inputTensor;
}
private void RunInference(Bitmap image)
{
var inputTensor = PreprocessImage(image);
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", inputTensor)
};
using (var results = visionSession.Run(inputs))
{
// Process the results
}
}
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
// Example usage
Bitmap image = new Bitmap("path_to_image.jpg");
RunInference(image);
}
// ...existing code...
This code resizes the input image to 854x480 pixels and converts it into a 5-dimensional tensor with the shape [1, 3, 480, 854, 1], which should match the expected input dimensions of the model. Adjust the preprocessing as needed based on your model's requirements.
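If the model still rejects this layout, note that your original error occurred with the extra axis at the front ([1, 1, 3, 480, 854]) and got past the rank check, so the model may declare the extra dimension before the channels rather than after the width. The declared order can be read directly from the session (a quick check, assuming the input is named pixel_values as in your code):
var meta = visionSession.InputMetadata["pixel_values"];
// -1 entries are dynamic axes; the count and order are what matter here
Debug.Print($"pixel_values: [{string.Join(", ", meta.Dimensions)}]");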
New solution
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System.IO;
namespace TestPhi3Form
{
public partial class Form1 : Form
{
private string visionModelPath =
@"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
private string textModelPath =
@"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
private string tokenPath =
@"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";
private Dictionary<int, string> tokenIdToTextMap;
private InferenceSession visionSession;
private InferenceSession textSession;
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
// Example usage
Bitmap image = new Bitmap("path_to_image.jpg");
RunInference(image);
}
private void LoadTokenizerMapping(string tokenizerJsonPath)
{
// Read the tokenizer.json file
var json = File.ReadAllText(tokenizerJsonPath);
var tokenizerConfig = JObject.Parse(json);
// Extract the "added_tokens" section
var addedTokens = tokenizerConfig["added_tokens"];
// Create a dictionary to map token IDs to their text
tokenIdToTextMap = new Dictionary<int, string>();
foreach (var token in addedTokens)
{
int id = token["id"].ToObject<int>();
string content = token["content"].ToString();
tokenIdToTextMap[id] = content;
}
}
private Tensor<float> PreprocessImage(Image image)
{
// Resize the image to the required input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
// Convert the image to a tensor with an additional dimension
var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, 0, y, x] = pixel.R / 255f;
tensor[0, 0, 1, y, x] = pixel.G / 255f;
tensor[0, 0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image and resize it to the expected 854x480 input size
// (GetPixel would throw if the source image were smaller than the target)
using var bitmap = new Bitmap(imagePath);
const int targetWidth = 854;
const int targetHeight = 480;
using var resized = new Bitmap(bitmap, new Size(targetWidth, targetHeight));
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = resized.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
return tensor;
}
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
{
// Create a dummy attention mask (adjust based on your model's requirements)
var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });
// Create inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
};
return inputs;
}
private string GenerateResponse(Tensor<float> visionOutput)
{
// Prepare inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", visionOutput)
};
// Run the text model
using (var results = textSession.Run(inputs))
{
var output = results.First().AsTensor<float>();
return ProcessTextOutput(output);
}
}
private string ProcessTextOutput(Tensor<float> output)
{
// Convert the float array to a byte array
var floatArray = output.ToArray();
var byteArray = new byte[floatArray.Length * sizeof(float)];
Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);
// Convert the byte array to a string
var text = Encoding.UTF8.GetString(byteArray);
return text;
}
private string DecodeTokenIds(int[] tokenIds)
{
// Decode the token IDs into text using the tokenizer mapping
var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
return text;
}
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
private string RunTextModel(List<NamedOnnxValue> inputs)
{
using (var results = textSession.Run(inputs))
{
var logits = results.First().AsTensor<float>();
// Decode the logits into text (adjust based on your tokenizer)
var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
var text = DecodeTokenIds(tokenIds);
return text;
}
}
private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
{
using (var results = visionSession.Run(inputs))
{
var visualFeatures = results.First().AsTensor<float>();
return visualFeatures;
}
}
private Tensor<float> ConvertImageToTensor(Image image)
{
// Resize the image to the model's input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, y, x] = pixel.R / 255f;
tensor[0, 1, y, x] = pixel.G / 255f;
tensor[0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
{
var boxes = new List<Rectangle>();
var descriptions = new List<string>();
// Example logic for processing output (adjust based on your model)
int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
for (int i = 0; i < numObjects; i++)
{
int offset = i * 6;
float x = output[offset] * PBImage.Width;
float y = output[offset + 1] * PBImage.Height;
float width = output[offset + 2] * PBImage.Width;
float height = output[offset + 3] * PBImage.Height;
int classId = (int)output[offset + 4];
float confidence = output[offset + 5];
if (confidence > 0.5) // Confidence threshold
{
boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
descriptions.Add($"Object {classId} with confidence {confidence:P}");
}
}
return (boxes, descriptions);
}
private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
{
if (PBImage.Image == null)
return;
Image image = (Image)PBImage.Image.Clone();
using (Graphics g = Graphics.FromImage(image))
{
Pen pen = new Pen(Color.Red, 2);
Font font = new Font("Arial", 10);
Brush brush = new SolidBrush(Color.Yellow);
for (int i = 0; i < boxes.Count; i++)
{
g.DrawRectangle(pen, boxes[i]);
g.DrawString(descriptions[i], font, brush, boxes[i].Location);
}
}
PBImage.Image = image;
}
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
private void BTNFinish_Click(object sender, EventArgs e)
{
Console.Beep();
Environment.Exit(0);
}
private Tensor<float> PreprocessImage(Bitmap image)
{
// Resize the image to 854x480
Bitmap resizedImage = new Bitmap(image, new Size(854, 480));
// Convert the image to a float tensor
Tensor<float> inputTensor = new DenseTensor<float>(new[] { 1, 3, 480, 854, 1 });
for (int y = 0; y < 480; y++)
{
for (int x = 0; x < 854; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
inputTensor[0, 0, y, x, 0] = pixel.R / 255.0f;
inputTensor[0, 1, y, x, 0] = pixel.G / 255.0f;
inputTensor[0, 2, y, x, 0] = pixel.B / 255.0f;
}
}
return inputTensor;
}
private void RunInference(Bitmap image)
{
var inputTensor = PreprocessImage(image);
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", inputTensor)
};
using (var results = visionSession.Run(inputs))
{
// Process the results
}
}
}
}