xgrammar
xgrammar copied to clipboard
Incorrect mask with "additionalProperties": true and non-required properties
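With the following schema, xgrammar accepts {"a": "wrong"} even though it should reject it: "a" is declared under "properties" as an integer, and "additionalProperties" only applies to properties that are not listed there.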
{
"type": "object",
"properties": {
"a": {"type": "integer"}
},
"additionalProperties": true,
"required": []
}
Full repro:
import xgrammar as xgr
import torch
import numpy as np
import json
from transformers import AutoTokenizer, AutoConfig
# Repro: compile a JSON schema whose only declared property "a" is an
# optional integer while additional properties are allowed, then feed the
# matcher a response in which "a" is a string.

# Get tokenizer info
model_id = "unsloth/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
config = AutoConfig.from_pretrained(model_id)
# This can be larger than tokenizer.vocab_size due to paddings
full_vocab_size = config.vocab_size
tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=full_vocab_size)
compiler = xgr.GrammarCompiler(tokenizer_info, max_threads=1)

schema = {
    "type": "object",
    "properties": {
        "a": {"type": "integer"},
    },
    "additionalProperties": True,
    "required": [],
}
compiled_grammar = compiler.compile_json_schema(json.dumps(schema))
matcher = xgr.GrammarMatcher(compiled_grammar)
token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)

# Simulated model output that violates the schema: "a" must be an integer.
sim_sampled_response = '{"a": "wrong"}'
sim_sampled_token_ids = tokenizer.encode(sim_sampled_response, add_special_tokens=False)

# Replay the response token by token. As reported, every assertion below
# passes, i.e. the matcher accepts the schema-violating response.
for sim_token_id in sim_sampled_token_ids:
    matcher.fill_next_token_bitmask(token_bitmask)
    assert matcher.accept_token(sim_token_id)
This of course affects a great many test cases if you enable strict=False (that is, JSON Schema-compliant mode).