Fix: Tensor device mismatch error (CPU vs MPS) when running on Apple Silicon
Issue
When running the transformers example for the Moondream2 model on Apple Silicon Macs, it failed with:
RuntimeError: torch.cat(): all input tensors must be on the same device. Received cpu and mps:0
The error occurred when the model tried to concatenate tensors that were on different devices — one tensor was on CPU while another was on MPS (Apple's Metal Performance Shaders GPU acceleration).
Root cause
By default, the model doesn't explicitly specify which device to use. On Apple Silicon, some operations were automatically using MPS acceleration while others remained on CPU, causing the tensor device mismatch.
Solution
Added explicit device detection and model placement:
- Check for available hardware (MPS for Apple Silicon, CUDA for NVIDIA GPUs, or fallback to CPU)
- Move the entire model to the selected device using
model.to(device)
This ensures all model components operate on the same device, preventing the tensor device mismatch error.
# Select the best available accelerator, then load the model onto it.
# Order of preference: MPS (Apple Silicon) -> CUDA (NVIDIA) -> CPU fallback.
if torch.backends.mps.is_available():
    device_name, banner = "mps", "Using MPS device"
elif torch.cuda.is_available():
    device_name, banner = "cuda", "Using CUDA device"
else:
    device_name, banner = "cpu", "Using CPU"
device = torch.device(device_name)
print(banner)

# Load the model from the Hugging Face hub.
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-01-09",
    trust_remote_code=True,
)

# Move every parameter and buffer to the selected device so no component
# is left behind on CPU — this is what prevents the torch.cat() mismatch.
model = model.to(device)
Before
# Original (pre-fix) script, reproduced for reference. No device is chosen
# explicitly, which on Apple Silicon lets some ops run on MPS while others
# stay on CPU — the cause of the torch.cat() device-mismatch error above.
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Load the model — note: no device placement here.
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-01-09",
    trust_remote_code=True,  # Uncomment for GPU acceleration & pip install accelerate # device_map={"": "cuda"}
)

# Load your image
image = Image.open("path/to/your/image.jpg")

# 1. Image Captioning
print("Short caption:")
print(model.caption(image, length="short")["caption"])
print("Detailed caption:")
for t in model.caption(image, length="normal", stream=True)["caption"]:
    print(t, end="", flush=True)

# 2. Visual Question Answering
print("Asking questions about the image:")
print(model.query(image, "How many people are in the image?")["answer"])

# 3. Object Detection
print("Detecting objects:")
objects = model.detect(image, "face")["objects"]
print(f"Found {len(objects)} face(s)")

# 4. Visual Pointing
print("Locating objects:")
points = model.point(image, "person")["points"]
print(f"Found {len(points)} person(s)")
After:
"""Fixed Moondream2 example for Apple Silicon.

Detects the best available accelerator (MPS on Apple Silicon, CUDA on
NVIDIA, CPU otherwise) and moves the whole model there, so torch.cat()
never receives tensors from mixed devices.
"""
import torch
# Fix: AutoTokenizer was imported but never used in this script.
from transformers import AutoModelForCausalLM
from PIL import Image

# Check for available devices
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-01-09",
    trust_remote_code=True,
)
# The actual fix: place every parameter/buffer on the selected device.
model = model.to(device)

# Load your image
image = Image.open("image.png")

# 1. Image Captioning
print("Short caption:")
print(model.caption(image, length="short")["caption"])
print("Detailed caption:")
for t in model.caption(image, length="normal", stream=True)["caption"]:
    print(t, end="", flush=True)

# 2. Visual Question Answering
print("Asking questions about the image:")
print(model.query(image, "How many people are in the image?")["answer"])

# 3. Object Detection
print("Detecting objects:")
objects = model.detect(image, "face")["objects"]
print(f"Found {len(objects)} face(s)")

# 4. Visual Pointing
print("Locating objects:")
points = model.point(image, "person")["points"]
print(f"Found {len(points)} person(s)")
Hi, thank you for getting this to our notice. We'll investigate this.
cc: @EthanReid
It actually worked for me!
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

# Check for available devices
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device")
else:
    device = torch.device("cpu")
    print("Using CPU")

model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2",
    revision="2025-01-09",
    trust_remote_code=True,
    # Uncomment for GPU acceleration & pip install accelerate
    # device_map={"": "cuda"}
    device_map={"": "mps"},  # CHANGE THIS PART TO "MPS"
)