
SpanCat - Custom SpanCat NER model with whitespace issues

david-kld opened this issue on Jun 05 '25

How to reproduce the behaviour

This is going to be difficult to reproduce, as the behaviour shows up during inference with a custom-trained model. I have trained a multilingual span categorizer NER model on a combination of publicly available datasets; the config I used for training is pasted below. I used spaCy's convert command to convert the IOB2 text files to spaCy's binary format for the train, dev and test sets.
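
(For reference, the conversion step looked something like the commands below; the file names are placeholders rather than the actual paths I used.)

python -m spacy convert train.iob ./corpus/ --converter iob
python -m spacy convert dev.iob ./corpus/ --converter iob
python -m spacy convert test.iob ./corpus/ --converter iob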

When testing the model on new data I am observing two issues around whitespace:

  1. In some cases, preceding whitespace is included in the detected entity span.
  2. More significantly, whether an entity is detected at all is sensitive to the number of preceding whitespace characters. For example:
import spacy

model_path = ""
spacy_model = spacy.load(model_path)

# single space
output = spacy_model("4:30 Jerry Smith")
print(f"Output with a single space: {output.spans['ents']}")
# two spaces
output = spacy_model("4:30  Jerry Smith")
print(f"Output with two spaces: {output.spans['ents']}")
# three spaces
output = spacy_model("4:30   Jerry Smith")
print(f"Output with three spaces: {output.spans['ents']}")
# four spaces
output = spacy_model("4:30    Jerry Smith")
print(f"Output with four spaces: {output.spans['ents']}")
# five spaces
output = spacy_model("4:30     Jerry Smith")
print(f"Output with five spaces: {output.spans['ents']}")
# six spaces
output = spacy_model("4:30      Jerry Smith")
print(f"Output with six spaces: {output.spans['ents']}")

The above outputs:

Output with a single space: [Jerry Smith]
Output with two spaces: [Jerry Smith]
Output with three spaces: []
Output with four spaces: [Jerry Smith]
Output with five spaces: [    Jerry Smith]
Output with six spaces: []
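
One detail that may be relevant: spaCy's default tokenizer keeps any run of two or more spaces as a standalone whitespace token, so each extra space changes the token sequence the ngram suggester sees. This is easy to check with a blank pipeline (a quick sketch; spacy.blank("xx") uses the same default spacy.Tokenizer.v1 as in the config below):

import spacy

nlp = spacy.blank("xx")
for n in range(1, 7):
    doc = nlp(f"4:30{' ' * n}Jerry Smith")
    # For n >= 2, the extra spaces appear as a separate whitespace token
    print(n, [t.text for t in doc])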

I understand these are short sentences with little context, but I wouldn't expect this sensitivity to preceding whitespace. Has this been seen before, and is there anything that can be done during model training to avoid it?
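
For what it's worth, the workaround I would try in the meantime (a sketch; normalise_whitespace and strip_edge_whitespace are helper names I made up, not spaCy API) is to collapse whitespace runs before inference and/or trim whitespace tokens from the edges of predicted spans:

import re

from spacy.tokens import Span

def normalise_whitespace(text: str) -> str:
    # Collapse any run of whitespace into a single space before inference
    return re.sub(r"\s+", " ", text)

def strip_edge_whitespace(doc, key: str = "ents"):
    # Trim whitespace tokens from both ends of each predicted span,
    # keeping the span's original label
    cleaned = []
    for span in doc.spans[key]:
        start, end = span.start, span.end
        while start < end and doc[start].is_space:
            start += 1
        while end > start and doc[end - 1].is_space:
            end -= 1
        if start < end:
            cleaned.append(Span(doc, start, end, label=span.label_))
    doc.spans[key] = cleaned
    return doc

# e.g. doc = strip_edge_whitespace(spacy_model(normalise_whitespace(text)))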

Your Environment

  • Operating System:
  • spaCy version: 3.8.6
  • Platform: Linux-6.6.87.1-microsoft-standard-WSL2-x86_64-with-glibc2.36
  • Python version: 3.12.10

Config

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "xx"
pipeline = ["spancat"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 512
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.spancat]
factory = "spancat"
max_positive = null
spans_key = "ents"
threshold = 0.5

[components.spancat.model]
@architectures = "spacy.SpanCategorizer.v1"

[components.spancat.model.reducer]
@layers = "spacy.mean_max_reducer.v1"
hidden_size = 128

[components.spancat.model.scorer]
@layers = "spacy.LinearLogistic.v1"
nO = null
nI = null

[components.spancat.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.spancat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.spancat.model.tok2vec.encode:width}
rows = [5000,2000,1000,1000]
attrs = ["ORTH","PREFIX","SUFFIX","SHAPE"]
include_static_vectors = false

[components.spancat.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

[components.spancat.suggester]
@misc = "spacy.ngram_range_suggester.v1"
min_size = 1
max_size = 5

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system:seed}
gpu_allocator = ${system:gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 20000
max_epochs = 0
max_steps = 0
eval_frequency = 10000
frozen_components = []
before_to_disk = null
annotating_components = []

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
spans_ents_f = 1.0
spans_ents_p = 0.0
spans_ents_r = 0.0
spans_sc_f = null
spans_sc_p = null
spans_sc_r = null

[pretraining]

[initialize]
vocab_data = null
vectors = null
init_tok2vec = ${paths.init_tok2vec}
before_init = null
after_init = null
lookups = null

[initialize.components]

[initialize.tokenizer]
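
One more check that might help narrow this down: inspecting the candidate spans the trained pipeline's suggester actually produces for a failing input (a sketch, assuming the "spancat" pipe name and "ents" spans key from the config above):

import spacy

model_path = ""  # same placeholder as above
spacy_model = spacy.load(model_path)
spancat = spacy_model.get_pipe("spancat")
doc = spacy_model.make_doc("4:30   Jerry Smith")
# The ngram suggester returns a Ragged array of (start, end) token offsets
candidates = spancat.suggester([doc])
for start, end in candidates.dataXd.tolist():
    print(repr(doc[start:end].text))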
