python-crfsuite icon indicating copy to clipboard operation
python-crfsuite copied to clipboard

feature.possible_transitions generating invalid transitions?

Open davidsbatista opened this issue 6 years ago • 0 comments

I've trained my CRF model with the following parameters configuration:

trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        'feature.possible_transitions': False,
})
I was expecting that the CRF would only learn transitions found in the training data, and not all possible transitions, and indeed when I print the info.transitions from the Tagger object it only shows features from the training data.

But when I predict tags, I get transitions which are 'invalid', i.e., they never occurred in the training data. I noticed this when working with my own dataset: the CRF was predicting sequences of tags like B-PER, I-LOC, I-LOC, although it happens for very few cases.

I wrote a small script based on your tutorial and on the CONLL dataset, to replicate the problem:

import nltk
import sklearn
import pycrfsuite

from itertools import chain
from collections import Counter, defaultdict

def word2features(sent, i):
    """Build the crfsuite feature strings for token *i* of *sent*.

    *sent* is a sequence of (token, postag, ...) tuples.  Features describe
    the current token plus a one-token window on each side; the sentinel
    features 'BOS'/'EOS' mark the sentence boundaries.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the current token itself.
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
    ]

    # Left-context features, or the beginning-of-sentence marker.
    if i == 0:
        feats.append('BOS')
    else:
        prev_tok, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats += [
            '-1:word.lower=' + prev_tok.lower(),
            '-1:word.istitle=%s' % prev_tok.istitle(),
            '-1:word.isupper=%s' % prev_tok.isupper(),
            '-1:postag=' + prev_tag,
            '-1:postag[:2]=' + prev_tag[:2],
        ]

    # Right-context features, or the end-of-sentence marker.
    if i == len(sent) - 1:
        feats.append('EOS')
    else:
        next_tok, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats += [
            '+1:word.lower=' + next_tok.lower(),
            '+1:word.istitle=%s' % next_tok.istitle(),
            '+1:word.isupper=%s' % next_tok.isupper(),
            '+1:postag=' + next_tag,
            '+1:postag[:2]=' + next_tag[:2],
        ]

    return feats

def sent2features(sent):
    """Return the per-token feature lists for every position in *sent*."""
    all_feats = []
    for idx in range(len(sent)):
        all_feats.append(word2features(sent, idx))
    return all_feats

def sent2labels(sent):
    """Return the gold label (third field) of each (token, postag, label) tuple."""
    labels = []
    for _, _, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the surface token (first field) of each (token, postag, label) tuple."""
    tokens = []
    for tok, _, _ in sent:
        tokens.append(tok)
    return tokens

def print_transitions(trans_features):
    """Print ((label_from, label_to), weight) entries, one aligned line each."""
    for pair, weight in trans_features:
        src, dst = pair
        print("%-6s -> %-7s %0.6f" % (src, dst, weight))

def main():
    """Train a CRF on CoNLL-2002 NER data, then compare the transitions the
    model learned against the transitions it actually produces at prediction
    time (to demonstrate the feature.possible_transitions question)."""
    print("Loading data")

    train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
    test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

    # Swap in the CoNLL-2000 corpus to reproduce the same effect there:
    #train_sents = list(nltk.corpus.conll2000.iob_sents('test.txt'))
    #test_sents = list(nltk.corpus.conll2000.iob_sents('train.txt'))

    print("Extracting features")
    X_train = [sent2features(s) for s in train_sents]
    y_train = [sent2labels(s) for s in train_sents]

    X_test = [sent2features(s) for s in test_sents]
    y_test = [sent2labels(s) for s in test_sents]

    print("Training")
    trainer = pycrfsuite.Trainer(verbose=False)
    for features, labels in zip(X_train, y_train):
        trainer.append(features, labels)
    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 1e-3,  # coefficient for L2 penalty
        'max_iterations': 50,  # stop earlier
        'feature.possible_transitions': False,
    })
    # Train and persist the model to disk.
    trainer.train('learned-model.crfsuite')

    # Reload the saved model and tag the held-out sentences.
    tagger = pycrfsuite.Tagger()
    tagger.open('learned-model.crfsuite')

    y_pred = [tagger.tag(seq) for seq in X_test]
    info = tagger.info()

    # Transitions the model stored (should mirror the training data when
    # feature.possible_transitions is off).
    print("\nTransitions learned:")
    transitions_learned = set()
    for pair in Counter(info.transitions):
        transitions_learned.add(pair)
    print_transitions(Counter(info.transitions).most_common())

    # Count every adjacent tag pair actually emitted on the test set.
    print("\nTransitions predicted")
    counts = defaultdict(int)
    for tags in y_pred:
        for left, right in zip(tags, tags[1:]):
            counts[(left, right)] += 1

    transitions_predicted = set()
    for pair in Counter(counts):
        print(pair, counts[pair])
        transitions_predicted.add(pair)

    # Any pair here was predicted despite never being learned.
    print("\nDifference")
    print(transitions_predicted.difference(transitions_learned))

if __name__ == "__main__":
    main()

Commenting/uncommenting the following pairs of lines allows to test the code with different datasets:

   train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
   test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

   #train_sents = list(nltk.corpus.conll2000.iob_sents('test.txt'))
   #test_sents = list(nltk.corpus.conll2000.iob_sents('train.txt'))

In the CoNLL-2002 corpus for named-entity recognition, the following transitions are tagged in prediction mode, although they were never seen in training, and feature.possible_transitions is set to False:

('I-LOC', 'B-MISC'), ('I-LOC', 'B-PER'), ('I-LOC', 'B-ORG')

In the CoNLL-2000 corpus for chunking, a similar thing happens: the following transitions, never seen in training, are assigned when the CRF is used in prediction mode:

('B-ADJP', 'I-NP'), ('I-ADJP', 'I-NP'), ('I-PP', 'B-ADJP'), ('B-VP', 'B-CONJP'), ('B-PRT', 'B-CONJP'), ('I-CONJP', 'B-PP'), ('I-PP', 'B-ADVP'), ('B-SBAR', 'B-SBAR'), ('B-NP', 'B-ADVP'), ('B-PRT', 'B-ADJP'), ('I-NP', 'B-LST'), ('I-ADVP', 'B-ADVP'), ('I-LST', 'O'), ('I-ADVP', 'B-ADJP'), ('B-PRT', 'B-VP'), ('B-PRT', 'B-SBAR'), ('B-SBAR', 'B-ADJP'), ('B-ADJP', 'B-PRT'), ('I-CONJP', 'O'), ('B-ADVP', 'B-ADVP'), ('I-VP', 'B-CONJP'), ('B-NP', 'B-CONJP'), ('I-ADJP', 'B-CONJP'), ('B-ADVP', 'B-PRT'), ('I-CONJP', 'B-VP')

davidsbatista avatar Oct 17 '17 20:10 davidsbatista