Difference in caching behavior between `Anthropic` and `AnthropicBedrock`
Here is an example I took from one of the cookbooks, modified slightly:
```python
import time

from anthropic import Anthropic, AnthropicBedrock

from app.core.config import settings

client_ant = Anthropic(api_key=settings.ANTHROPIC_API_KEY)
client_bedrock = AnthropicBedrock(
    aws_access_key=settings.AWS_ACCESS_KEY_ID,
    aws_secret_key=settings.AWS_SECRET_ACCESS_KEY,
    aws_region=settings.AWS_BEDROCK_PRIMARY_REGION,
)

bedrock_sonnet37 = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
sonnet37 = "claude-3-7-sonnet-20250219"


class ConversationHistory:
    def __init__(self):
        self.turns = []

    def add_turn_assistant(self, content):
        self.turns.append(
            {"role": "assistant", "content": [{"type": "text", "text": content}]}
        )

    def add_turn_user(self, content):
        self.turns.append(
            {"role": "user", "content": [{"type": "text", "text": content}]}
        )

    def get_turns(self):
        # Attach a cache breakpoint to the most recent user turn only;
        # older turns are passed through unchanged.
        result = []
        user_turns_processed = 0
        for turn in reversed(self.turns):
            if turn["role"] == "user" and user_turns_processed < 1:
                result.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": turn["content"][0]["text"],
                                "cache_control": {"type": "ephemeral"},
                            }
                        ],
                    }
                )
                user_turns_processed += 1
            else:
                result.append(turn)
        return list(reversed(result))


conversation_history = ConversationHistory()

# Repeat the dummy text so the system prompt is long enough to be cacheable.
system_message = "The user's name is Li Yang and he is 7 years old." * 100
questions = ["What is my name?", "What is my age?"]


def simulate_conversation(client, model_name):
    for i, question in enumerate(questions, 1):
        print(f"\nTurn {i}:")
        print(f"User: {question}")
        conversation_history.add_turn_user(question)

        start_time = time.time()
        response = client.messages.create(
            model=model_name,
            max_tokens=100,
            system=[
                {
                    "type": "text",
                    "text": system_message,
                },
            ],
            messages=conversation_history.get_turns(),
        )
        end_time = time.time()

        assistant_reply = response.content[0].text
        print(f"Assistant: {assistant_reply}")

        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens
        # The cache usage fields may be absent; fall back to "---" markers.
        input_tokens_cache_read = getattr(
            response.usage, "cache_read_input_tokens", "---"
        )
        input_tokens_cache_create = getattr(
            response.usage, "cache_creation_input_tokens", "---"
        )
        print(f"User input tokens: {input_tokens}")
        print(f"Output tokens: {output_tokens}")
        print(f"Input tokens (cache read): {input_tokens_cache_read}")
        print(f"Input tokens (cache write): {input_tokens_cache_create}")

        elapsed_time = end_time - start_time
        total_input_tokens = input_tokens + (
            int(input_tokens_cache_read) if input_tokens_cache_read != "---" else 0
        )
        percentage_cached = (
            int(input_tokens_cache_read) / total_input_tokens * 100
            if input_tokens_cache_read != "---" and total_input_tokens > 0
            else 0
        )
        print(
            f"{percentage_cached:.1f}% of input prompt cached ({total_input_tokens} tokens)"
        )
        print(f"Time taken: {elapsed_time:.2f} seconds")

        conversation_history.add_turn_assistant(assistant_reply)
```
The changes are:
- Removed the cache control from the system prompt (see the sketch after this list)
- Shortened the system prompt to some dummy text
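
For reference, the removed part is the `cache_control` entry on the system block; the cookbook variant marks the system prompt itself as a cache breakpoint, roughly like this (a sketch; the exact cookbook code may differ):

```python
# Cookbook-style call: the system prompt carries its own cache breakpoint.
# Names (client, model_name, system_message, conversation_history) refer to
# the example above.
response = client.messages.create(
    model=model_name,
    max_tokens=100,
    system=[
        {
            "type": "text",
            "text": system_message,
            "cache_control": {"type": "ephemeral"},  # the line removed above
        },
    ],
    messages=conversation_history.get_turns(),
)
```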
Running it with

```python
simulate_conversation(client_ant, sonnet37)
```

yields
```
Turn 1:
User: What is my name?
Assistant: Your name is Li Yang, as mentioned in the text I provided.
User input tokens: 4
Output tokens: 17
Input tokens (cache read): 0
Input tokens (cache write): 1608
0.0% of input prompt cached (4 tokens)
Time taken: 1.64 seconds

Turn 2:
User: What is my age?
Assistant: Your age is 7 years old, as mentioned in the text I provided.
User input tokens: 4
Output tokens: 20
Input tokens (cache read): 1608
Input tokens (cache write): 25
99.8% of input prompt cached (1612 tokens)
Time taken: 1.28 seconds
```
and

```python
simulate_conversation(client_bedrock, bedrock_sonnet37)
```

yields
```
Turn 1:
User: What is my name?
Assistant: Your name is Li Yang, according to the information provided in our conversation.
User input tokens: 4
Output tokens: 18
Input tokens (cache read): 0
Input tokens (cache write): 1661
0.0% of input prompt cached (4 tokens)
Time taken: 2.40 seconds

Turn 2:
User: What is my age?
Assistant: According to the information in our conversation, you are 7 years old.
User input tokens: 4
Output tokens: 19
Input tokens (cache read): 0
Input tokens (cache write): 1687
0.0% of input prompt cached (4 tokens)
Time taken: 1.36 seconds
```
The Bedrock run never registers a cache hit, even though the prefix is written to the cache on every turn; on the official Anthropic API, the second turn reads the cached prefix as expected.
I have not seen anything in either platform's documentation that would suggest divergent behavior. It seems that Bedrock requires you to keep a cache checkpoint on a previous message, and without one it does not hit the cache even when a prefix matches.
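
To make the moving-breakpoint effect visible, here is a small debugging sketch (not part of the original example; `show_breakpoints` is a name I made up) that prints which messages carry a marker:

```python
def show_breakpoints(turns):
    # Debugging aid: report which messages carry a cache_control marker.
    for i, turn in enumerate(turns, 1):
        marked = any("cache_control" in block for block in turn["content"])
        print(f"message {i} ({turn['role']}): cache_control={marked}")

show_breakpoints(conversation_history.get_turns())
```

With the `get_turns` above, only the newest user message is ever marked, so the prefix that turn 1 wrote has no explicit read point on the turn-2 request.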
Is this behavior expected?
Thanks for flagging this! We do not yet have Bedrock support for the prompt caching improvement mentioned here: https://docs.anthropic.com/en/release-notes/api#january-15th-2025
If you want something that will work on Bedrock, you will need to use the cache control header on each read point.
@bradabrams Could you point me to where I can find the exact name and value of the cache control header I need to pass?
Sorry, not a header -- you would need to explicitly add a `cache_control: {"type": "ephemeral"}` field.
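
Applied to the example above, a minimal sketch of that fix might be to keep an ephemeral breakpoint on the previous user turn as well as the current one, so the prefix written on one request is still explicitly marked as a read point on the next. (The `< 2` cutoff is my reading of "each read point"; the thread does not pin down the exact placement.)

```python
def get_turns(self):  # drop-in replacement for ConversationHistory.get_turns
    # Sketch: mark the last two user turns (current request + previous read
    # point) with cache_control so Bedrock can match the prefix written on
    # the prior turn. Only the `< 2` cutoff differs from the original.
    result = []
    user_turns_processed = 0
    for turn in reversed(self.turns):
        if turn["role"] == "user" and user_turns_processed < 2:
            result.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": turn["content"][0]["text"],
                            "cache_control": {"type": "ephemeral"},
                        }
                    ],
                }
            )
            user_turns_processed += 1
        else:
            result.append(turn)
    return list(reversed(result))
```

Restoring the `cache_control` block on the system prompt (as in the cookbook original) presumably matters for the same reason, so the system prefix itself keeps an explicit read point; keep in mind the documented limit of four cache breakpoints per request.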