[multimodal-rag-gemini] Generating image summaries throws error
When running the code from the codelab step for generating image summaries as-is, the API doesn't seem to be able to parse the image. The image itself is a valid base64-encoded JPEG (I verified this by decoding the image string locally).
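For reference, this is roughly how I verified the encoding; a minimal sketch with a hypothetical file name, assuming the string comes out of the codelab's `encode_image` helper:

```python
import base64
import io

from PIL import Image

# Re-encode one of the extracted figures the way the codelab's
# encode_image() does, then decode it and let PIL confirm it parses.
with open("./cj/figure-1.jpg", "rb") as f:  # hypothetical file name
    b64 = base64.b64encode(f.read()).decode("utf-8")

img = Image.open(io.BytesIO(base64.b64decode(b64)))
print(img.format)  # JPEG -- so the base64 string itself is fine
```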
It raises the following error:

```
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f97edde71a0>
```
Full stack trace:

```
---------------------------------------------------------------------------
UnidentifiedImageError Traceback (most recent call last)
<ipython-input-99-e1b8336de171> in <cell line: 56>()
54
55 # Image summaries
---> 56 img_base64_list, image_summaries = generate_img_summaries("./cj")
57
58 len(img_base64_list)
<ipython-input-99-e1b8336de171> in generate_img_summaries(path)
48 base64_image = encode_image(img_path)
49 img_base64_list.append(base64_image)
---> 50 image_summaries.append(image_summarize(base64_image, prompt))
51
52 return img_base64_list, image_summaries
<ipython-input-99-e1b8336de171> in image_summarize(img_base64, prompt)
9 model = ChatVertexAI(model_name="gemini-1.5-pro", max_output_tokens=1024)
10
---> 11 msg = model(
12 [
13 HumanMessage(
/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py in warning_emitting_wrapper(*args, **kwargs)
168 warned = True
169 emit_warning()
--> 170 return wrapped(*args, **kwargs)
171
172 async def awarning_emitting_wrapper(*args: Any, **kwargs: Any) -> Any:
/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/chat_models.py in __call__(self, messages, stop, callbacks, **kwargs)
1028 **kwargs: Any,
1029 ) -> BaseMessage:
-> 1030 generation = self.generate(
1031 [messages], stop=stop, callbacks=callbacks, **kwargs
1032 ).generations[0][0]
/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/chat_models.py in generate(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)
646 if run_managers:
647 run_managers[i].on_llm_error(e, response=LLMResult(generations=[]))
--> 648 raise e
649 flattened_outputs = [
650 LLMResult(generations=[res.generations], llm_output=res.llm_output) # type: ignore[list-item]
/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/chat_models.py in generate(self, messages, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)
636 try:
637 results.append(
--> 638 self._generate_with_cache(
639 m,
640 stop=stop,
/usr/local/lib/python3.10/dist-packages/langchain_core/language_models/chat_models.py in _generate_with_cache(self, messages, stop, run_manager, **kwargs)
858 else:
859 if inspect.signature(self._generate).parameters.get("run_manager"):
--> 860 result = self._generate(
861 messages, stop=stop, run_manager=run_manager, **kwargs
862 )
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/chat_models.py in _generate(self, messages, stop, run_manager, stream, **kwargs)
1135 if not self._is_gemini_model:
1136 return self._generate_non_gemini(messages, stop=stop, **kwargs)
-> 1137 return self._generate_gemini(
1138 messages=messages,
1139 stop=stop,
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/chat_models.py in _generate_gemini(self, messages, stop, run_manager, **kwargs)
1291 **kwargs: Any,
1292 ) -> ChatResult:
-> 1293 request = self._prepare_request_gemini(messages=messages, stop=stop, **kwargs)
1294 response = _completion_with_retry(
1295 self.prediction_client.generate_content,
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/chat_models.py in _prepare_request_gemini(self, messages, stop, stream, tools, functions, tool_config, safety_settings, cached_content, tool_choice, **kwargs)
1205 **kwargs,
1206 ) -> GenerateContentRequest:
-> 1207 system_instruction, contents = _parse_chat_history_gemini(messages)
1208 formatted_tools = self._tools_gemini(tools=tools, functions=functions)
1209 if tool_config:
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/chat_models.py in _parse_chat_history_gemini(history, project, convert_system_message_to_human)
280 prev_ai_message = None
281 role = "user"
--> 282 parts = _convert_to_parts(message)
283 if system_parts is not None:
284 if i != 1:
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/chat_models.py in _convert_to_parts(message)
245 result = []
246 for raw_part in raw_content:
--> 247 part = _convert_to_prompt(raw_part)
248 if part:
249 result.append(part)
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/chat_models.py in _convert_to_prompt(part)
211 if part["type"] == "image_url":
212 path = part["image_url"]["url"]
--> 213 return ImageBytesLoader(project=project).load_gapic_part(path)
214
215 # Handle media type like LangChain.js
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/_image_utils.py in load_gapic_part(self, image_string)
110
111 def load_gapic_part(self, image_string: str) -> GapicPart:
--> 112 part = self.load_part(image_string)
113 return part._raw_part
114
/usr/local/lib/python3.10/dist-packages/langchain_google_vertexai/_image_utils.py in load_part(self, image_string)
107 bytes_ = self._bytes_from_file(image_string)
108
--> 109 return Part.from_image(Image.from_bytes(bytes_))
110
111 def load_gapic_part(self, image_string: str) -> GapicPart:
/usr/local/lib/python3.10/dist-packages/vertexai/generative_models/_generative_models.py in from_image(image)
2023 @staticmethod
2024 def from_image(image: "Image") -> "Part":
-> 2025 return Part.from_data(data=image.data, mime_type=image._mime_type)
2026
2027 @staticmethod
/usr/local/lib/python3.10/dist-packages/vertexai/generative_models/_generative_models.py in _mime_type(self)
2443 """Returns the MIME type of the image."""
2444 if PIL_Image:
-> 2445 return _FORMAT_TO_MIME_TYPE[self._pil_image.format.lower()]
2446 else:
2447 # Fall back to jpeg
/usr/local/lib/python3.10/dist-packages/vertexai/generative_models/_generative_models.py in _pil_image(self)
2436 "The PIL module is not available. Please install the Pillow package."
2437 )
-> 2438 self._loaded_image = PIL_Image.open(io.BytesIO(self._image_bytes))
2439 return self._loaded_image
2440
/usr/local/lib/python3.10/dist-packages/PIL/Image.py in open(fp, mode, formats)
3281 raise TypeError(msg) from e
3282 else:
-> 3283 rawmode = mode
3284 if mode in ["1", "L", "I", "P", "F"]:
3285 ndmax = 2
UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7f97edd30040>
```
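The last two frames look like the telling ones: the Vertex AI SDK hands PIL whatever bytes it ended up with, and PIL cannot identify them as an image. That failure mode is easy to reproduce on its own (a standalone sketch, unrelated to the codelab's actual data):

```python
import io

from PIL import Image, UnidentifiedImageError

# PIL raises UnidentifiedImageError whenever a buffer does not start with
# a recognizable image signature -- which is what happens if the bytes
# reaching the SDK are not the decoded JPEG.
try:
    Image.open(io.BytesIO(b"definitely not image bytes"))
except UnidentifiedImageError as e:
    print(e)  # cannot identify image file <_io.BytesIO object at ...>
```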
Swapping out the `image_summarize` function with this:

```python
def image_summarize(base64_image, prompt):
    """Make image summary"""
    model = ChatVertexAI(model_name="gemini-1.5-flash", max_output_tokens=1024)
    return model.invoke([('human', [prompt, base64_image])]).content
```
That seems to have fixed the issue for me (but I'm a newbie to these APIs, so I could be doing the wrong thing).

Actually, my "fix" only makes the errors go away: the summaries it generates are lovely descriptions of images that have nothing to do with the images in the PDF. I suspect the model is hallucinating them from the base64-encoded text, since a bare string in the content list is sent to Gemini as text rather than as an image.
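If that's right, the image needs to go through as an explicit image part. The traceback above shows that `_convert_to_prompt` only routes parts with `"type": "image_url"` through the image loader, so something like the following sketch should send real image bytes (model name and token limit carried over from the snippet above; the `data:` URI format is my assumption about what `ImageBytesLoader` expects for an inline image):

```python
from langchain_core.messages import HumanMessage
from langchain_google_vertexai import ChatVertexAI


def image_summarize(img_base64, prompt):
    """Summarize a base64-encoded JPEG by sending it as an image part."""
    model = ChatVertexAI(model_name="gemini-1.5-flash", max_output_tokens=1024)
    msg = model.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        # The data URI prefix lets the image loader base64-decode
                        # the payload into image bytes instead of treating the
                        # string as text or a file path.
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                    },
                ]
            )
        ]
    )
    return msg.content
```

That way Gemini receives the decoded JPEG itself instead of a wall of base64 text, so the summaries should describe the actual figures.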