outlines
outlines copied to clipboard
JSON logits processor allows two consecutive spaces when whitespace_pattern=r"[\n ]?"
Describe the issue as clearly as possible:
When using the JSON logits processor, the processor does not mask out the token `SPIECE_UNDERLINE * 2` (two consecutive `▁` whitespace markers).
Steps/code to reproduce the bug:
import math
from collections import defaultdict
from typing import TYPE_CHECKING, DefaultDict, Dict, List, Optional, Type, Union
import json
import torch
from outlines.integrations.utils import adapt_tokenizer
from outlines.fsm.json_schema import build_regex_from_schema
from outlines.fsm.guide import RegexGuide, Guide
from transformers import AutoTokenizer
from transformers import SPIECE_UNDERLINE
import torch.nn.functional as F
class BaseLogitsProcessor:
    """Bias next-token logits so that sampling follows a guide (FSM).

    The processor tracks one FSM state per token prefix (keyed by the hash of
    the prefix) and caches one additive mask per FSM state.
    """

    def __init__(self, guide: "Guide"):
        """Store the guide and set up the per-state and per-sequence caches.

        Parameters
        ----------
        guide
            Object exposing ``get_next_state(state=..., token_id=...)`` and
            ``get_next_instruction(state=...)``; the latter must return an
            object with a ``tokens`` attribute listing the allowed token ids.
        """
        # FSM state id -> additive mask (0 for allowed tokens, -inf otherwise).
        self.mask_cache: Dict[int, torch.Tensor] = {}
        self.fsm: "Guide" = guide
        # hash(prefix) -> FSM state id; unseen prefixes default to state 0.
        self._fsm_state: DefaultDict[int, int] = defaultdict(int)

    def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor:
        """Use the FSM to bias the logits before sampling the next token.

        Parameters
        ----------
        input_ids
            The tokens of the current sentence.
        scores
            The logits of the current sentence.

        Returns
        -------
        torch.Tensor
            The biased logits.
        """
        seq_id = hash(tuple(input_ids))

        # An empty prefix needs no transition: the defaultdict hands out
        # state 0 for it. Otherwise advance from the state recorded for the
        # prefix without its last token.
        if len(input_ids) > 0:
            last_token = input_ids[-1]
            last_seq_id = hash(tuple(input_ids[:-1]))
            self._fsm_state[seq_id] = self.fsm.get_next_state(
                state=self._fsm_state[last_seq_id], token_id=last_token
            )

        state_id = self._fsm_state[seq_id]
        if state_id not in self.mask_cache:
            allowed_tokens = self.fsm.get_next_instruction(state=state_id).tokens
            mask = torch.full((scores.shape[-1],), -math.inf)
            mask[allowed_tokens] = 0
            # Pinned host memory only helps the later host-to-GPU copy, and
            # pin_memory() can raise on hosts without CUDA, so pin only when
            # a CUDA device is actually available.
            if torch.cuda.is_available():
                mask = mask.pin_memory()
            self.mask_cache[state_id] = mask
        else:
            mask = self.mask_cache[state_id]

        mask = mask.to(device=scores.device, non_blocking=True)
        biased_scores = scores + mask
        return biased_scores
class RegexLogitsProcessor(BaseLogitsProcessor):
    """Logits processor whose guide is compiled from a regular expression."""

    @classmethod
    def _get_guide(cls, regex_string: str, tokenizer) -> Guide:
        """Return a ``RegexGuide`` for ``regex_string`` over the adapted tokenizer."""
        adapted = adapt_tokenizer(tokenizer)
        return RegexGuide(regex_string, adapted)

    def __init__(self, regex_string: str, tokenizer):
        """Compile the FSM that drives the regex-structured generation.

        Parameters
        ----------
        regex_string
            A string that represents a regular expression
        tokenizer
            The model's tokenizer
        """
        # Called through the class name on purpose, exactly as before, so the
        # guide is always built by this class's own implementation.
        guide = RegexLogitsProcessor._get_guide(regex_string, tokenizer)
        super().__init__(guide)
class JSONLogitsProcessor(RegexLogitsProcessor):
    """Logits processor whose regex is derived from a JSON schema."""

    def __init__(self, schema, tokenizer, whitespace_pattern=None):
        r"""Compile the FSM that drives the JSON-guided generation.

        Parameters
        ----------
        schema
            A JSON schema that encodes the structure we want the model to
            generate
        tokenizer
            The model's tokenizer
        whitespace_pattern
            Pattern to use for JSON syntactic whitespace (doesn't impact
            string literals)
            Example: allow only a single space or newline with
            `whitespace_pattern=r"[\n ]?"`
        """
        schema_regex = build_regex_from_schema(schema, whitespace_pattern)
        super().__init__(schema_regex, tokenizer)
# Reproduction: feed '{' and then the double-space token through the processor
# and print the five most probable tokens after masking at each step.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf"
)
tokens = ['{', SPIECE_UNDERLINE * 2]
token_indices = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
schema = {
    "type": "object",
    "additionalProperties": True
}
schema = json.dumps(schema)
# The closing quote of the raw string was missing here, which made the whole
# script a syntax error.
processor = JSONLogitsProcessor(schema, tokenizer, whitespace_pattern=r"[\n ]?")
for i, token in enumerate(tokens):
    print(f"Step {i}:")
    # NOTE(review): 32000 is presumably this tokenizer's vocabulary size - confirm.
    logits = torch.zeros(32000)
    # Put almost all probability mass on the token under test, so that it can
    # only stay on top if the processor leaves it unmasked.
    logits[token_indices[i]] = 100
    logits = processor(token_indices[:i], logits)
    probs = F.softmax(logits, dim=-1)
    top_probs, top_indices = torch.topk(probs, 5)
    # Separate names so the outer loop's `tokens` / `token` are not overwritten.
    top_tokens = tokenizer.convert_ids_to_tokens(top_indices.squeeze().tolist())
    for top_token, prob in zip(top_tokens, top_probs.squeeze().tolist()):
        print(f"Token: {top_token}, Probability: {prob:.4f}")
Expected result:
Current output is
Step 0:
Token: {, Probability: 1.0000
Token: {}, Probability: 0.0000
Token: {", Probability: 0.0000
Token: <0x7B>, Probability: 0.0000
Token: <0x00>, Probability: 0.0000
Step 1:
Token: ▁▁, Probability: 1.0000
Token: <0x7D>, Probability: 0.0000
Token: <0x22>, Probability: 0.0000
Token: ▁", Probability: 0.0000
Token: <0x20>, Probability: 0.0000
I expect the probability of ▁▁ at step 1 to be zero, because the whitespace pattern allows at most a single space or newline.
Error message:
No response
Outlines/Python version information:
Version information
```
0.0.43
Python 3.11.2 (main, May 2 2024, 11:59:08) [GCC 12.2.0]
```
Context for the issue:
No response
Does this bug exist when you use the following?
# Same processor, built from the packaged `outlines.processors` class and the
# tokenizer of an outlines transformers model; `...` is a placeholder for the
# model arguments, which are not specified here.
logits_processor = outlines.processors.JSONLogitsProcessor(
schema,
outlines.models.transformers(...).tokenizer
)
@lapp0 Yes. The output is the same.
I'm unable to reproduce using logits processors imported from outlines.processors. Could you please help me understand what I'm missing here?
from transformers import AutoTokenizer
from transformers.file_utils import SPIECE_UNDERLINE
import json
from outlines.processors import JSONLogitsProcessor
from outlines.models.transformers import TransformerTokenizer
import torch
import torch.nn.functional as F
# Second reproduction attempt, using the processor shipped in
# `outlines.processors` and the outlines tokenizer wrapper.
tokenizer = TransformerTokenizer(AutoTokenizer.from_pretrained(
    "NousResearch/Llama-2-7b-hf"
))
tokens = ['{', SPIECE_UNDERLINE * 2]
# NOTE(review): this used to be `sorted(tokenizer.vocabulary.values())`, i.e.
# ids 0, 1, ..., so the processor was driven with unrelated tokens instead of
# '{' followed by the double-space token. Any output recorded from that
# version does not exercise the reported case.
token_indices = [tokenizer.vocabulary[token] for token in tokens]
double_space_id = tokenizer.vocabulary[SPIECE_UNDERLINE * 2]
schema = {
    "type": "object",
    "additionalProperties": True
}
schema = json.dumps(schema)
processor = JSONLogitsProcessor(schema, tokenizer, whitespace_pattern=r"[\n ]?")
for i, token in enumerate(tokens):
    # NOTE(review): 32000 is presumably this tokenizer's vocabulary size - confirm.
    logits = torch.zeros(32000)
    # Put almost all probability mass on the token under test.
    logits[token_indices[i]] = 100
    logits = processor(token_indices[:i], logits)
    probs = F.softmax(logits, dim=-1)
    print("probability of SPIECE_UNDERLINE * 2:", probs[double_space_id])
    top_probs, top_indices = torch.topk(probs, 5)
    top_ids = set(top_indices.squeeze().tolist())
    # Kept under its own name so the outer loop's `tokens` is not overwritten,
    # and printed so the top-5 lookup is not dead code.
    top_tokens = {tok: tid for tok, tid in tokenizer.vocabulary.items() if tid in top_ids}
    print("top tokens:", top_tokens)
Output:
probability of SPIECE_UNDERLINE * 2:
tensor(0.)
probability of SPIECE_UNDERLINE * 2:
tensor(0.)