outlines: JSON Processor two spaces allowed when whitespace_pattern=r"[\n ]?"

Describe the issue as clearly as possible:

When using the JSON logits processor with `whitespace_pattern=r"[\n ]?"`, the processor does not mask out the token `SPIECE_UNDERLINE * 2` (two consecutive whitespace markers).

Steps/code to reproduce the bug:

import math
from collections import defaultdict
from typing import TYPE_CHECKING, DefaultDict, Dict, List, Optional, Type, Union
import json

import torch
from outlines.integrations.utils import adapt_tokenizer
from outlines.fsm.json_schema import build_regex_from_schema
from outlines.fsm.guide import RegexGuide, Guide

from transformers import AutoTokenizer
from transformers import SPIECE_UNDERLINE
import torch.nn.functional as F

class BaseLogitsProcessor:
    """Bias next-token logits so that only tokens allowed by a guide keep their score.

    The processor tracks, per token prefix, the FSM state the guide is in and
    adds a mask (0 for allowed tokens, ``-inf`` for all others) to the logits.
    """

    def __init__(self, guide: Guide):
        """Store the guide and set up the per-state and per-prefix caches.

        Parameters
        ----------
        guide
            Object providing ``get_next_state(state=..., token_id=...)`` and
            ``get_next_instruction(state=...)`` (whose result exposes
            ``.tokens``).
        """
        # Additive masks keyed by FSM state, built lazily on first use.
        self.mask_cache: Dict[int, torch.Tensor] = {}
        self.fsm: Guide = guide
        # FSM state reached after a token prefix, keyed by hash(tuple(prefix)).
        # Prefixes never seen before (including the empty one) default to 0.
        self._fsm_state: DefaultDict[int, int] = defaultdict(int)

    def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor:
        """Use the FSM to bias the logits before sampling the next token.

        Parameters
        ----------
        input_ids
            The tokens of the current sentence.
        scores
            The logits of the current sentence.

        Returns
        -------
        torch.Tensor
            The biased logits.
        """
        seq_id = hash(tuple(input_ids))

        # An empty prefix keeps the default state (0). For a non-empty prefix,
        # advance the state recorded for the prefix without its last token by
        # feeding that last token to the guide.
        if len(input_ids) > 0:
            last_token = input_ids[-1]
            last_seq_id = hash(tuple(input_ids[:-1]))
            self._fsm_state[seq_id] = self.fsm.get_next_state(
                state=self._fsm_state[last_seq_id], token_id=last_token
            )

        state_id = self._fsm_state[seq_id]
        mask = self.mask_cache.get(state_id)
        if mask is None:
            allowed_tokens = self.fsm.get_next_instruction(state=state_id).tokens
            mask = torch.full((scores.shape[-1],), -math.inf)
            mask[allowed_tokens] = 0
            # Pinning only benefits host-to-GPU copies, and pin_memory() can
            # raise on hosts without CUDA, so only pin when CUDA is usable.
            if torch.cuda.is_available():
                mask = mask.pin_memory()
            self.mask_cache[state_id] = mask

        mask = mask.to(device=scores.device, non_blocking=True)
        return scores + mask


class RegexLogitsProcessor(BaseLogitsProcessor):
    """Logits processor whose guide is compiled from a regular expression."""

    @classmethod
    def _get_guide(cls, regex_string: str, tokenizer) -> Guide:
        """Return a ``RegexGuide`` for ``regex_string`` over the adapted tokenizer."""
        adapted_tokenizer = adapt_tokenizer(tokenizer)
        guide = RegexGuide(regex_string, adapted_tokenizer)
        return guide

    def __init__(self, regex_string: str, tokenizer):
        """Compile the FSM that drives the regex-structured generation.

        Parameters
        ----------
        regex_string
            A string that represents a regular expression
        tokenizer
            The model's tokenizer; it is passed through ``adapt_tokenizer``
            before the guide is built.

        """
        regex_guide = RegexLogitsProcessor._get_guide(regex_string, tokenizer)
        super().__init__(regex_guide)


class JSONLogitsProcessor(RegexLogitsProcessor):
    """Logits processor that constrains generation to a JSON schema.

    The schema is converted to a regular expression, which is then handled
    by ``RegexLogitsProcessor``.
    """

    def __init__(self, schema, tokenizer, whitespace_pattern=None):
        r"""Compile the FSM that drives the JSON-guided generation.

        Parameters
        ----------
        schema
            A JSON schema that encodes the structure we want the model to
            generate
        tokenizer
            The model's tokenizer
        whitespace_pattern
            Pattern to use for JSON syntactic whitespace (doesn't impact
            string literals)
            Example: allow only a single space or newline with
            `whitespace_pattern=r"[\n ]?"`
        """
        schema_regex = build_regex_from_schema(schema, whitespace_pattern)
        super().__init__(regex_string=schema_regex, tokenizer=tokenizer)

# Tokenizer whose vocabulary supplies the token ids used below.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf"
)

# Sequence to force step by step: an opening brace, then the two-space token.
tokens = ['{', SPIECE_UNDERLINE * 2]
token_indices = [tokenizer.convert_tokens_to_ids(token) for token in tokens]

schema = {
    "type": "object",
    "additionalProperties": True
}
schema = json.dumps(schema)
# At most one space or newline is allowed as JSON syntactic whitespace.
processor = JSONLogitsProcessor(schema, tokenizer, whitespace_pattern=r"[\n ]?")

for i, token_id in enumerate(token_indices):

    print(f"Step {i}:")
    # NOTE(review): 32000 is hard-coded; presumably the model's vocabulary
    # size — confirm against the tokenizer.
    logits = torch.zeros(32000)
    # Strongly favor the token we want to test at this step.
    logits[token_id] = 100

    # The prefix passed in is every forced token before this step.
    logits = processor(token_indices[:i], logits)
    probs = F.softmax(logits, dim=-1)
    top_probs, top_indices = torch.topk(probs, 5)
    # Separate names so the `tokens` list defined above is not rebound.
    top_tokens = tokenizer.convert_ids_to_tokens(top_indices.squeeze().tolist())
    for top_token, prob in zip(top_tokens, top_probs.squeeze().tolist()):
        print(f"Token: {top_token}, Probability: {prob:.4f}")

Expected result:

Current output is

Step 0:
Token: {, Probability: 1.0000
Token: {}, Probability: 0.0000
Token: {", Probability: 0.0000
Token: <0x7B>, Probability: 0.0000
Token: <0x00>, Probability: 0.0000
Step 1:
Token: ▁▁, Probability: 1.0000
Token: <0x7D>, Probability: 0.0000
Token: <0x22>, Probability: 0.0000
Token: ▁", Probability: 0.0000
Token: <0x20>, Probability: 0.0000

I expect the probability of ▁▁ at Step 1 to be zero.

Error message:

No response

Outlines/Python version information:

Version information

```
0.0.43
Python 3.11.2 (main, May 2 2024, 11:59:08) [GCC 12.2.0]
```

Context for the issue:

No response

Jul 12 '24 03:07 wzhao18

Does this bug exist when you use the following?

# Builds the processor from the packaged `outlines.processors` module, using
# the tokenizer of an outlines transformers model.
# `...` is presumably a placeholder for the model arguments — fill in before running.
logits_processor = outlines.processors.JSONLogitsProcessor(
    schema, 
    outlines.models.transformers(...).tokenizer
)

Jul 13 '24 21:07 lapp0

@lapp0 Yes. The output is the same.

Jul 15 '24 20:07 wzhao18

I'm unable to reproduce using logits processors imported from outlines.processors. Could you please help me understand what I'm missing here?

from transformers import AutoTokenizer
from transformers.file_utils import SPIECE_UNDERLINE
import json
from outlines.processors import JSONLogitsProcessor
from outlines.models.transformers import TransformerTokenizer
import torch
import torch.nn.functional as F

# Wrap the Hugging Face tokenizer in the outlines tokenizer adapter.
tokenizer = TransformerTokenizer(AutoTokenizer.from_pretrained(
    "NousResearch/Llama-2-7b-hf"
))

tokens = ['{', SPIECE_UNDERLINE * 2]
# NOTE(review): this is the sorted list of ALL vocabulary ids, so
# token_indices[0] and token_indices[1] are the two smallest ids, not the ids
# of the entries in `tokens`. Presumably
# `[tokenizer.vocabulary[t] for t in tokens]` was intended — confirm, since
# the recorded output depends on which ids are fed to the processor.
token_indices = sorted(tokenizer.vocabulary.values())

schema = {
    "type": "object",
    "additionalProperties": True
}
schema = json.dumps(schema)
processor = JSONLogitsProcessor(schema, tokenizer, whitespace_pattern=r"[\n ]?")

# `token` is not used in the loop body; only the step index `i` is.
for i, token in enumerate(tokens):

    logits = torch.zeros(32000)
    # Boost the id at position i of `token_indices` (see the note above).
    logits[token_indices[i]] = 100

    # The prefix passed in is the first i entries of `token_indices`.
    logits = processor(token_indices[:i], logits)
    probs = F.softmax(logits, dim=-1)
    # The same probability is printed twice: once with a label, once bare.
    print("probability of SPIECE_UNDERLINE * 2:", probs[tokenizer.vocabulary[SPIECE_UNDERLINE * 2]])
    print(probs[tokenizer.vocabulary[SPIECE_UNDERLINE * 2]])
    top_probs, top_indices = torch.topk(probs, 5)
    # Rebinds `tokens` to a {token: id} dict of the top-5 entries; the result
    # is never read afterwards in this snippet.
    tokens = {tok: tid for tok, tid in tokenizer.vocabulary.items() if tid in top_indices.squeeze().tolist()}

Output:

probability of SPIECE_UNDERLINE * 2:
tensor(0.)
probability of SPIECE_UNDERLINE * 2:
tensor(0.)

Sep 15 '24 02:09 lapp0

outlines
outlines copied to clipboard

JSON Processor two spaces allowed when whitespace_pattern=r"[\n ]?"

Describe the issue as clearly as possible:

Steps/code to reproduce the bug:

Expected result:

Error message:

Outlines/Python version information:

Context for the issue:

outlines outlines copied to clipboard

JSON Processor two spaces allowed when whitespace_pattern=r"[\n ]?"

Describe the issue as clearly as possible:

Steps/code to reproduce the bug:

Expected result:

Error message:

Outlines/Python version information:

Context for the issue:

outlines
outlines copied to clipboard