How to fix this issue in a C# WinForms project
Hi, I always get the same runtime errors:
Image tensor dimensions: 1, 1, 3, 480, 854
Image size tensor dimensions: 1, 2
Exception thrown: 'Microsoft.ML.OnnxRuntime.OnnxRuntimeException' in Microsoft.ML.OnnxRuntime.dll
[ErrorCode:RuntimeException] Non-zero status code returned while running Add node. Name:'/img_processor/vision_model/embeddings/Add' Status Message: D:\a_work\1\s\onnxruntime\core/providers/cpu/math/element_wise_ops.h:560 onnxruntime::BroadcastIterator::Append axis == 1 || axis == largest was false. Attempting to broadcast an axis by a dimension other than 1. 577 by 2075
=> I wrote a WinForms app project. I want to use the Phi-3 model to describe an image, but my image size is different: 854 by 480 pixels.
Here is my Form1.cs code:
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System.IO;
namespace TestPhi3Form
{
public partial class Form1 : Form
{
private string visionModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
private string textModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
private string tokenPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";
private Dictionary<int, string> tokenIdToTextMap;
private InferenceSession visionSession;
private InferenceSession textSession;
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
}
private void LoadTokenizerMapping(string tokenizerJsonPath)
{
// Read the tokenizer.json file
var json = File.ReadAllText(tokenizerJsonPath);
var tokenizerConfig = JObject.Parse(json);
// Extract the "added_tokens" section
var addedTokens = tokenizerConfig["added_tokens"];
// Create a dictionary to map token IDs to their text
tokenIdToTextMap = new Dictionary<int, string>();
foreach (var token in addedTokens)
{
int id = token["id"].ToObject<int>();
string content = token["content"].ToString();
tokenIdToTextMap[id] = content;
}
}
private Tensor<float> PreprocessImage(Image image)
{
// Resize the image to the required input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
// Convert the image to a tensor with an additional dimension
var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, 0, y, x] = pixel.R / 255f;
tensor[0, 0, 1, y, x] = pixel.G / 255f;
tensor[0, 0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image
using var bitmap = new Bitmap(imagePath);
// Ensure the image is of size 854x480
const int targetWidth = 854;
const int targetHeight = 480;
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = bitmap.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
// Add an additional dimension to match the expected rank of 5
var expandedTensor = tensor.Reshape(new[] { 1, 1, 3, targetHeight, targetWidth });
return expandedTensor;
}
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
{
// Create a dummy attention mask (adjust based on your model's requirements)
var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });
// Create inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
};
return inputs;
}
private string GenerateResponse(Tensor<float> visionOutput)
{
// Prepare inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", visionOutput)
};
// Run the text model
using (var results = textSession.Run(inputs))
{
var output = results.First().AsTensor<float>();
return ProcessTextOutput(output);
}
}
private string ProcessTextOutput(Tensor<float> output)
{
// Convert the float array to a byte array
var floatArray = output.ToArray();
var byteArray = new byte[floatArray.Length * sizeof(float)];
Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);
// Convert the byte array to a string
var text = Encoding.UTF8.GetString(byteArray);
return text;
}
private string DecodeTokenIds(int[] tokenIds)
{
// Decode the token IDs into text using the tokenizer mapping
var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
return text;
}
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
private string RunTextModel(List<NamedOnnxValue> inputs)
{
using (var results = textSession.Run(inputs))
{
var logits = results.First().AsTensor<float>();
// Decode the logits into text (adjust based on your tokenizer)
var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
var text = DecodeTokenIds(tokenIds);
return text;
}
}
private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
{
using (var results = visionSession.Run(inputs))
{
var visualFeatures = results.First().AsTensor<float>();
return visualFeatures;
}
}
private Tensor<float> ConvertImageToTensor(Image image)
{
// Resize the image to the model's input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, y, x] = pixel.R / 255f;
tensor[0, 1, y, x] = pixel.G / 255f;
tensor[0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
{
var boxes = new List<Rectangle>();
var descriptions = new List<string>();
// Example logic for processing output (adjust based on your model)
int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
for (int i = 0; i < numObjects; i++)
{
int offset = i * 6;
float x = output[offset] * PBImage.Width;
float y = output[offset + 1] * PBImage.Height;
float width = output[offset + 2] * PBImage.Width;
float height = output[offset + 3] * PBImage.Height;
int classId = (int)output[offset + 4];
float confidence = output[offset + 5];
if (confidence > 0.5) // Confidence threshold
{
boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
descriptions.Add($"Object {classId} with confidence {confidence:P}");
}
}
return (boxes, descriptions);
}
private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
{
if (PBImage.Image == null)
return;
Image image = (Image)PBImage.Image.Clone();
using (Graphics g = Graphics.FromImage(image))
{
Pen pen = new Pen(Color.Red, 2);
Font font = new Font("Arial", 10);
Brush brush = new SolidBrush(Color.Yellow);
for (int i = 0; i < boxes.Count; i++)
{
g.DrawRectangle(pen, boxes[i]);
g.DrawString(descriptions[i], font, brush, boxes[i].Location);
}
}
PBImage.Image = image;
}
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
private void BTNFinish_Click(object sender, EventArgs e)
{
Console.Beep();
Environment.Exit(0);
}
}
}
=> I want to know how to run my program so that I can click the BTNInfer button, open any image file on my PC, and have Phi-3 describe what it sees in the image. Please advise. Thanks,
Here are some steps and tips to help you resolve this issue:
1. Tensor Dimensions
The error message indicates a mismatch in tensor dimensions. Specifically, the model expects a tensor of dimensions [1, 3, 480, 854] but is receiving [1, 1, 3, 480, 854]. You need to ensure that the tensor dimensions match the model's expectations.
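Before reshaping anything, it is worth printing what the model actually declares for each input; ONNX Runtime exposes this through the session's InputMetadata. A minimal sketch, using the visionSession field from your code:
foreach (var input in visionSession.InputMetadata)
{
    // Dimensions reports -1 for dynamic axes; the count and order are what matter
    Debug.Print($"{input.Key}: {input.Value.ElementType} [{string.Join(", ", input.Value.Dimensions)}]");
}
Run this once after creating the session and compare the declared shapes against the tensors you build.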
2. Preprocess Image Function
Update your PreprocessImage function to ensure it returns a tensor with the correct dimensions:
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image and resize it to the expected 854x480 input size
// (GetPixel would throw if the source image were smaller than the target)
using var bitmap = new Bitmap(imagePath);
const int targetWidth = 854;
const int targetHeight = 480;
using var resized = new Bitmap(bitmap, new Size(targetWidth, targetHeight));
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = resized.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
return tensor;
}
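A side note on performance: GetPixel is very slow, and at 854x480 the double loop touches roughly 410,000 pixels per image. If preprocessing becomes a bottleneck, a LockBits-based loop is much faster. This is a minimal sketch assuming the same [1, 3, H, W] tensor layout as above; the helper name FillTensor is made up, and it needs using System.Drawing.Imaging; and using System.Runtime.InteropServices; at the top of the file:
private static void FillTensor(Bitmap bmp, DenseTensor<float> tensor)
{
    var rect = new Rectangle(0, 0, bmp.Width, bmp.Height);
    var data = bmp.LockBits(rect, ImageLockMode.ReadOnly, PixelFormat.Format24bppRgb);
    try
    {
        var row = new byte[data.Stride];
        for (int y = 0; y < bmp.Height; y++)
        {
            // Copy one scanline out of unmanaged memory
            Marshal.Copy(IntPtr.Add(data.Scan0, y * data.Stride), row, 0, data.Stride);
            for (int x = 0; x < bmp.Width; x++)
            {
                // Format24bppRgb stores each pixel as B, G, R
                tensor[0, 0, y, x] = row[x * 3 + 2] / 255f; // Red
                tensor[0, 1, y, x] = row[x * 3 + 1] / 255f; // Green
                tensor[0, 2, y, x] = row[x * 3] / 255f;     // Blue
            }
        }
    }
    finally
    {
        bmp.UnlockBits(data);
    }
}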
3. Prepare Vision Inputs
Ensure that the PrepareVisionInputs function does not add an extra dimension:
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
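To catch a rank mismatch before ONNX Runtime throws, you can also compare the tensor you built against the model's declared input inside PrepareVisionInputs, just before returning the inputs list. A small sketch; it assumes the input really is named pixel_values, as in your code:
// Sanity check: the rank we built must equal the rank the model declares
var declared = visionSession.InputMetadata["pixel_values"].Dimensions;
Debug.Assert(imageTensor.Dimensions.Length == declared.Length,
    $"pixel_values rank {imageTensor.Dimensions.Length} != model rank {declared.Length}");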
4. Debugging
Add debugging statements to ensure the dimensions of your tensors are correct before passing them to the model:
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
5. Running the Application
Ensure that your BTNInfer_Click method is correctly set up to handle the image file selection and model inference:
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png;*.jpg;*.jpeg";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
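One WinForms detail worth knowing here: Image.FromFile keeps the file handle open for the lifetime of the Image, so reassigning PBImage.Image without disposing the old image leaks handles and can lock files. A small guard (sketch):
string filePath = openFileDialog.FileName;
PBImage.Image?.Dispose(); // release the previous image and its file handle
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);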
6. Exception Handling
Ensure you have proper exception handling to catch and debug any runtime errors:
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
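One refinement to the catch block: ex.Message drops the stack trace and most of the ONNX Runtime status text, so logging the full exception makes these failures far easier to diagnose:
catch (Exception ex)
{
    Debug.Print(ex.ToString()); // includes the stack trace and inner exceptions
    MessageBox.Show($"Error during inference: {ex.Message}");
}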
By following these steps, you should be able to resolve the tensor dimension mismatch and successfully run your application to describe images using the Phi-3 model.
Hi, thank you very much for your help and code. I have now changed my code; it looks like this:
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System.IO;
namespace TestPhi3Form
{
public partial class Form1 : Form
{
private string visionModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
private string textModelPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
private string tokenPath = @"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";
private Dictionary<int, string> tokenIdToTextMap;
private InferenceSession visionSession;
private InferenceSession textSession;
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
}
private void LoadTokenizerMapping(string tokenizerJsonPath)
{
// Read the tokenizer.json file
var json = File.ReadAllText(tokenizerJsonPath);
var tokenizerConfig = JObject.Parse(json);
// Extract the "added_tokens" section
var addedTokens = tokenizerConfig["added_tokens"];
// Create a dictionary to map token IDs to their text
tokenIdToTextMap = new Dictionary<int, string>();
foreach (var token in addedTokens)
{
int id = token["id"].ToObject<int>();
string content = token["content"].ToString();
tokenIdToTextMap[id] = content;
}
}
private Tensor<float> PreprocessImage(Image image)
{
// Resize the image to the required input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
// Convert the image to a tensor with an additional dimension
var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, 0, y, x] = pixel.R / 255f;
tensor[0, 0, 1, y, x] = pixel.G / 255f;
tensor[0, 0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image
using var bitmap = new Bitmap(imagePath);
// Ensure the image is of size 854x480
const int targetWidth = 854;
const int targetHeight = 480;
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = bitmap.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
return tensor;
}
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
{
// Create a dummy attention mask (adjust based on your model's requirements)
var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });
// Create inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
};
return inputs;
}
private string GenerateResponse(Tensor<float> visionOutput)
{
// Prepare inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", visionOutput)
};
// Run the text model
using (var results = textSession.Run(inputs))
{
var output = results.First().AsTensor<float>();
return ProcessTextOutput(output);
}
}
private string ProcessTextOutput(Tensor<float> output)
{
// Convert the float array to a byte array
var floatArray = output.ToArray();
var byteArray = new byte[floatArray.Length * sizeof(float)];
Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);
// Convert the byte array to a string
var text = Encoding.UTF8.GetString(byteArray);
return text;
}
private string DecodeTokenIds(int[] tokenIds)
{
// Decode the token IDs into text using the tokenizer mapping
var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
return text;
}
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
private string RunTextModel(List<NamedOnnxValue> inputs)
{
using (var results = textSession.Run(inputs))
{
var logits = results.First().AsTensor<float>();
// Decode the logits into text (adjust based on your tokenizer)
var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
var text = DecodeTokenIds(tokenIds);
return text;
}
}
private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
{
using (var results = visionSession.Run(inputs))
{
var visualFeatures = results.First().AsTensor<float>();
return visualFeatures;
}
}
private Tensor<float> ConvertImageToTensor(Image image)
{
// Resize the image to the model's input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, y, x] = pixel.R / 255f;
tensor[0, 1, y, x] = pixel.G / 255f;
tensor[0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
{
var boxes = new List<Rectangle>();
var descriptions = new List<string>();
// Example logic for processing output (adjust based on your model)
int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
for (int i = 0; i < numObjects; i++)
{
int offset = i * 6;
float x = output[offset] * PBImage.Width;
float y = output[offset + 1] * PBImage.Height;
float width = output[offset + 2] * PBImage.Width;
float height = output[offset + 3] * PBImage.Height;
int classId = (int)output[offset + 4];
float confidence = output[offset + 5];
if (confidence > 0.5) // Confidence threshold
{
boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
descriptions.Add($"Object {classId} with confidence {confidence:P}");
}
}
return (boxes, descriptions);
}
private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
{
if (PBImage.Image == null)
return;
Image image = (Image)PBImage.Image.Clone();
using (Graphics g = Graphics.FromImage(image))
{
Pen pen = new Pen(Color.Red, 2);
Font font = new Font("Arial", 10);
Brush brush = new SolidBrush(Color.Yellow);
for (int i = 0; i < boxes.Count; i++)
{
g.DrawRectangle(pen, boxes[i]);
g.DrawString(descriptions[i], font, brush, boxes[i].Location);
}
}
PBImage.Image = image;
}
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
private void BTNFinish_Click(object sender, EventArgs e)
{
Console.Beep();
Environment.Exit(0);
}
}
}
=> But when I run the code, I get a different kind of error. I don't fully understand it, but it seems something is still not correct; please look at the screenshot to see why I still have issues. I am using an image size of 854 by 480 pixels, not 224 by 224.
Thanks,
The error message indicates that the input tensor's rank is incorrect. The model expects a 5-dimensional tensor, but it is receiving a 4-dimensional tensor. You need to adjust the input tensor to match the expected dimensions.
Here's how you can modify the code to reshape the input tensor correctly:
- Ensure the input image is resized to the expected dimensions.
- Add an extra dimension to the input tensor to match the expected rank.
Here's an example of how you can do this:
// ...existing code...
// (Tensor<float> and DenseTensor<float> come from Microsoft.ML.OnnxRuntime.Tensors,
// which is already imported, so no additional using directive is needed here.)
// ...existing code...
private Tensor<float> PreprocessImage(Bitmap image)
{
// Resize the image to 854x480
Bitmap resizedImage = new Bitmap(image, new Size(854, 480));
// Convert the image to a float tensor
Tensor<float> inputTensor = new DenseTensor<float>(new[] { 1, 3, 480, 854, 1 });
for (int y = 0; y < 480; y++)
{
for (int x = 0; x < 854; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
inputTensor[0, 0, y, x, 0] = pixel.R / 255.0f;
inputTensor[0, 1, y, x, 0] = pixel.G / 255.0f;
inputTensor[0, 2, y, x, 0] = pixel.B / 255.0f;
}
}
return inputTensor;
}
private void RunInference(Bitmap image)
{
var inputTensor = PreprocessImage(image);
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", inputTensor)
};
using (var results = visionSession.Run(inputs))
{
// Process the results
}
}
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
// Example usage
Bitmap image = new Bitmap("path_to_image.jpg");
RunInference(image);
}
// ...existing code...
This code resizes the input image to 854x480 pixels and converts it into a 5-dimensional tensor with the shape [1, 3, 480, 854, 1], which should match the expected input dimensions of the model. Adjust the preprocessing as needed based on your model's requirements.
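If the model still rejects this layout, note that your original error occurred with the extra axis at the front ([1, 1, 3, 480, 854]) and got past the rank check, so the model may declare the extra dimension before the channels rather than after the width. The declared order can be read directly from the session (a quick check, assuming the input is named pixel_values as in your code):
var meta = visionSession.InputMetadata["pixel_values"];
// -1 entries are dynamic axes; the count and order are what matter here
Debug.Print($"pixel_values: [{string.Join(", ", meta.Dimensions)}]");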
New solution
using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json.Linq;
using System.Collections.Generic;
using System.IO;
namespace TestPhi3Form
{
public partial class Form1 : Form
{
private string visionModelPath =
@"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-vision.onnx";
private string textModelPath =
@"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\phi-3-v-128k-instruct-text.onnx";
private string tokenPath =
@"D:\Phi3\Phi3-vision-128k-instruct-onnx-cpu\Phi3-vision-128k-instruct-onnx-cpu\tokenizer.json";
private Dictionary<int, string> tokenIdToTextMap;
private InferenceSession visionSession;
private InferenceSession textSession;
public Form1()
{
InitializeComponent();
visionSession = new InferenceSession(visionModelPath);
textSession = new InferenceSession(textModelPath);
LoadTokenizerMapping(tokenPath);
// Example usage
Bitmap image = new Bitmap("path_to_image.jpg");
RunInference(image);
}
private void LoadTokenizerMapping(string tokenizerJsonPath)
{
// Read the tokenizer.json file
var json = File.ReadAllText(tokenizerJsonPath);
var tokenizerConfig = JObject.Parse(json);
// Extract the "added_tokens" section
var addedTokens = tokenizerConfig["added_tokens"];
// Create a dictionary to map token IDs to their text
tokenIdToTextMap = new Dictionary<int, string>();
foreach (var token in addedTokens)
{
int id = token["id"].ToObject<int>();
string content = token["content"].ToString();
tokenIdToTextMap[id] = content;
}
}
private Tensor<float> PreprocessImage(Image image)
{
// Resize the image to the required input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
// Convert the image to a tensor with an additional dimension
var tensor = new DenseTensor<float>(new[] { 1, 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, 0, y, x] = pixel.R / 255f;
tensor[0, 0, 1, y, x] = pixel.G / 255f;
tensor[0, 0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private Tensor<float> PreprocessImage(string imagePath)
{
// Load the image and resize it to the expected 854x480 input size
// (GetPixel would throw if the source image were smaller than the target)
using var bitmap = new Bitmap(imagePath);
const int targetWidth = 854;
const int targetHeight = 480;
using var resized = new Bitmap(bitmap, new Size(targetWidth, targetHeight));
// Convert to a Tensor<float> in NCHW format (batch, channels, height, width)
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth }); // Batch, Channels, Height, Width
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
var pixel = resized.GetPixel(x, y);
// Normalize pixel values to [0, 1] and populate the tensor
tensor[0, 0, y, x] = pixel.R / 255f; // Red channel
tensor[0, 1, y, x] = pixel.G / 255f; // Green channel
tensor[0, 2, y, x] = pixel.B / 255f; // Blue channel
}
}
return tensor;
}
private List<NamedOnnxValue> PrepareVisionInputs(string imagePath)
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare the image size tensor (Height and Width)
var imageSizeTensor = new DenseTensor<long>(new[] { 1, 2 });
imageSizeTensor[0, 0] = 480; // Height
imageSizeTensor[0, 1] = 854; // Width
// Debugging: Print the tensor dimensions and types
Debug.Print($"Image tensor dimensions: {string.Join(", ", imageTensor.Dimensions.ToArray())}");
Debug.Print($"Image size tensor dimensions: {string.Join(", ", imageSizeTensor.Dimensions.ToArray())}");
// Create inputs for the vision model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", imageTensor),
NamedOnnxValue.CreateFromTensor("image_sizes", imageSizeTensor)
};
return inputs;
}
private List<NamedOnnxValue> PrepareTextInputs(Tensor<float> visualFeatures)
{
// Create a dummy attention mask (adjust based on your model's requirements)
var attentionMask = new DenseTensor<float>(new[] { 1, visualFeatures.Dimensions[1] });
// Create inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("inputs_embeds", visualFeatures),
NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
};
return inputs;
}
private string GenerateResponse(Tensor<float> visionOutput)
{
// Prepare inputs for the text model
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("input", visionOutput)
};
// Run the text model
using (var results = textSession.Run(inputs))
{
var output = results.First().AsTensor<float>();
return ProcessTextOutput(output);
}
}
private string ProcessTextOutput(Tensor<float> output)
{
// Convert the float array to a byte array
var floatArray = output.ToArray();
var byteArray = new byte[floatArray.Length * sizeof(float)];
Buffer.BlockCopy(floatArray, 0, byteArray, 0, byteArray.Length);
// Convert the byte array to a string
var text = Encoding.UTF8.GetString(byteArray);
return text;
}
private string DecodeTokenIds(int[] tokenIds)
{
// Decode the token IDs into text using the tokenizer mapping
var text = string.Join("", tokenIds.Select(id => tokenIdToTextMap.ContainsKey(id) ? tokenIdToTextMap[id] : $"<unk:{id}>"));
return text;
}
private void DetectObjects(string imagePath)
{
try
{
// Preprocess the image
var imageTensor = PreprocessImage(imagePath);
// Prepare inputs for the vision model
var visionInputs = PrepareVisionInputs(imagePath);
// Run the vision model
var visualFeatures = RunVisionModel(visionInputs);
// Prepare inputs for the text model
var textInputs = PrepareTextInputs(visualFeatures);
// Run the text model
var response = RunTextModel(textInputs);
// Display the response
MessageBox.Show($"Model Response: {response}");
}
catch (Exception ex)
{
Debug.Print(ex.Message);
MessageBox.Show($"Error during inference: {ex.Message}");
}
}
private string RunTextModel(List<NamedOnnxValue> inputs)
{
using (var results = textSession.Run(inputs))
{
var logits = results.First().AsTensor<float>();
// Decode the logits into text (adjust based on your tokenizer)
var tokenIds = logits.ToArray().Select(x => (int)x).ToArray();
var text = DecodeTokenIds(tokenIds);
return text;
}
}
private Tensor<float> RunVisionModel(List<NamedOnnxValue> inputs)
{
using (var results = visionSession.Run(inputs))
{
var visualFeatures = results.First().AsTensor<float>();
return visualFeatures;
}
}
private Tensor<float> ConvertImageToTensor(Image image)
{
// Resize the image to the model's input size (e.g., 224x224)
const int targetWidth = 224;
const int targetHeight = 224;
Bitmap resizedImage = new Bitmap(image, new Size(targetWidth, targetHeight));
var tensor = new DenseTensor<float>(new[] { 1, 3, targetHeight, targetWidth });
for (int y = 0; y < targetHeight; y++)
{
for (int x = 0; x < targetWidth; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
// Normalize RGB values to [0, 1] range
tensor[0, 0, y, x] = pixel.R / 255f;
tensor[0, 1, y, x] = pixel.G / 255f;
tensor[0, 2, y, x] = pixel.B / 255f;
}
}
return tensor;
}
private (List<Rectangle>, List<string>) ProcessOutput(float[] output)
{
var boxes = new List<Rectangle>();
var descriptions = new List<string>();
// Example logic for processing output (adjust based on your model)
int numObjects = output.Length / 6; // Assuming 6 values per object: x, y, width, height, classId, confidence
for (int i = 0; i < numObjects; i++)
{
int offset = i * 6;
float x = output[offset] * PBImage.Width;
float y = output[offset + 1] * PBImage.Height;
float width = output[offset + 2] * PBImage.Width;
float height = output[offset + 3] * PBImage.Height;
int classId = (int)output[offset + 4];
float confidence = output[offset + 5];
if (confidence > 0.5) // Confidence threshold
{
boxes.Add(new Rectangle((int)x, (int)y, (int)width, (int)height));
descriptions.Add($"Object {classId} with confidence {confidence:P}");
}
}
return (boxes, descriptions);
}
private void DrawBoundingBoxes(List<Rectangle> boxes, List<string> descriptions)
{
if (PBImage.Image == null)
return;
Image image = (Image)PBImage.Image.Clone();
using (Graphics g = Graphics.FromImage(image))
{
Pen pen = new Pen(Color.Red, 2);
Font font = new Font("Arial", 10);
Brush brush = new SolidBrush(Color.Yellow);
for (int i = 0; i < boxes.Count; i++)
{
g.DrawRectangle(pen, boxes[i]);
g.DrawString(descriptions[i], font, brush, boxes[i].Location);
}
}
PBImage.Image = image;
}
private void BTNInfer_Click(object sender, EventArgs e)
{
using (OpenFileDialog openFileDialog = new OpenFileDialog())
{
openFileDialog.Filter = "Image Files|*.png";
if (openFileDialog.ShowDialog() == DialogResult.OK)
{
string filePath = openFileDialog.FileName;
PBImage.Image = Image.FromFile(filePath);
DetectObjects(filePath);
}
}
}
private void BTNFinish_Click(object sender, EventArgs e)
{
Console.Beep();
Environment.Exit(0);
}
private Tensor<float> PreprocessImage(Bitmap image)
{
// Resize the image to 854x480
Bitmap resizedImage = new Bitmap(image, new Size(854, 480));
// Convert the image to a float tensor
Tensor<float> inputTensor = new DenseTensor<float>(new[] { 1, 3, 480, 854, 1 });
for (int y = 0; y < 480; y++)
{
for (int x = 0; x < 854; x++)
{
Color pixel = resizedImage.GetPixel(x, y);
inputTensor[0, 0, y, x, 0] = pixel.R / 255.0f;
inputTensor[0, 1, y, x, 0] = pixel.G / 255.0f;
inputTensor[0, 2, y, x, 0] = pixel.B / 255.0f;
}
}
return inputTensor;
}
private void RunInference(Bitmap image)
{
var inputTensor = PreprocessImage(image);
var inputs = new List<NamedOnnxValue>
{
NamedOnnxValue.CreateFromTensor("pixel_values", inputTensor)
};
using (var results = visionSession.Run(inputs))
{
// Process the results
}
}
}
}