dak ブログ

python、rubyなどのプログラミング、MySQL、サーバーの設定などの備忘録。レゴの写真も。

CRF で名詞句の判定を試してみた

2022-02-16 23:49:49 | 自然言語処理
conll2000 のデータで、CRF で名詞句の判定をやってみたメモ。

データは以下のように [(表記, 品詞, 名詞句ラベル), ...] の形式です。
[('Confidence', 'NN', 'B-NP'), ('in', 'IN', 'O'), ('the', 'DT', 'B-NP'), ('pound', 'NN', 'I-NP'), ('is', 'VBZ', 'O'), ('widely', 'RB', 'O'), ('expected', 'VBN', 'O'), ('to', 'TO', 'O'), ('take', 'VB', 'O'), ('another', 'DT', 'B-NP'), ('sharp', 'JJ', 'I-NP'), ('dive', 'NN', 'I-NP'), ('if', 'IN', 'O'), ('trade', 'NN', 'B-NP'), ('figures', 'NNS', 'I-NP'), ('for', 'IN', 'O'), ('September', 'NNP', 'B-NP'), (',', ',', 'O'), ('due', 'JJ', 'O'), ('for', 'IN', 'O'), ('release', 'NN', 'B-NP'), ('tomorrow', 'NN', 'B-NP'), (',', ',', 'O'), ('fail', 'VB', 'O'), ('to', 'TO', 'O'), ('show', 'VB', 'O'), ('a', 'DT', 'B-NP'), ('substantial', 'JJ', 'I-NP'), ('improvement', 'NN', 'I-NP'), ('from', 'IN', 'O'), ('July', 'NNP', 'B-NP'), ('and', 'CC', 'I-NP'), ('August', 'NNP', 'I-NP'), ("'s", 'POS', 'B-NP'), ('near-record', 'JJ', 'I-NP'), ('deficits', 'NNS', 'I-NP'), ('.', '.', 'O')]

学習データ(train.txt)で学習を行い、テストデータ(test.txt)で精度を評価します。
プログラムは以下の通り。
import nltk
from nltk.corpus import conll2000
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import metrics

def create_data(file):
    """Load noun-phrase chunked sentences from the CoNLL-2000 corpus.

    Args:
        file: Corpus file id handed to ``conll2000.chunked_sents``
            (this script passes 'train.txt' and 'test.txt').

    Returns:
        A list with one entry per sentence; each entry is the output of
        ``nltk.chunk.tree2conlltags``, i.e. [(token, pos, label), ...].
    """
    tagged_sentences = []
    # Only NP chunks are requested from the corpus reader.
    for tree in conll2000.chunked_sents(file, chunk_types=['NP']):
        tagged_sentences.append(nltk.chunk.tree2conlltags(tree))
    return tagged_sentences

def word_featureh(sent, i):
    """Build the feature dict for the word at position ``i`` of ``sent``.

    Args:
        sent: Sequence of word entries; element 0 of an entry is the token
            string and element 1 is its part-of-speech tag.
        i: Index of the word to describe.

    Returns:
        A dict with a constant bias, the lower-cased token, three boolean
        shape flags of the token, and the part-of-speech tag.
    """
    entry = sent[i]
    surface, tag = entry[0], entry[1]

    features = {'bias': 1.0}
    features['token'] = surface.lower()
    # NOTE: the key below is spelled 'isuppser' in the original feature set;
    # it is kept unchanged so feature names stay identical.
    features['token.isuppser()'] = surface.isupper()
    features['token.istitle()'] = surface.istitle()
    features['token.isdigit()'] = surface.isdigit()
    features['pos'] = tag
    return features

def sent_features(sent):
    """Return the per-word feature dicts for one sentence, in word order.

    Args:
        sent: Sequence of word entries as accepted by ``word_featureh``.

    Returns:
        A list holding ``word_featureh(sent, i)`` for every index ``i``.
    """
    features = []
    for position in range(len(sent)):
        features.append(word_featureh(sent, position))
    return features

def sent_labels(sent):
    """Return the label (element 2 of each word entry) for one sentence.

    Args:
        sent: Iterable of word entries with the label at index 2.

    Returns:
        A list of labels in the same order as the words.
    """
    labels = []
    for entry in sent:
        labels.append(entry[2])
    return labels

def main():
    """Train a CRF on 'train.txt', score it on 'test.txt', print the F1.

    The printed score is ``metrics.flat_f1_score`` with weighted averaging,
    computed over every label the model learned except 'O'.

    Returns:
        0, which the script entry point uses as the exit status.
    """
    # Training split: features and gold labels per sentence.
    training = create_data('train.txt')
    train_x = [sent_features(sentence) for sentence in training]
    train_y = [sent_labels(sentence) for sentence in training]

    # Held-out split, prepared the same way.
    held_out = create_data('test.txt')
    test_x = [sent_features(sentence) for sentence in held_out]
    test_y = [sent_labels(sentence) for sentence in held_out]

    hyperparams = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 100,
        'all_possible_transitions': True,
    }
    model = sklearn_crfsuite.CRF(**hyperparams)
    model.fit(train_x, train_y)

    # Exclude the 'O' label from the score.
    scored_labels = list(model.classes_)
    scored_labels.remove('O')

    predictions = model.predict(test_x)
    score = metrics.flat_f1_score(
        test_y,
        predictions,
        average='weighted',
        labels=scored_labels,
    )
    print(score)

    return 0

if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    # SystemExit is raised directly instead of calling the bare ``exit``
    # helper: that name is installed by the ``site`` module for interactive
    # sessions and is not available when the interpreter runs with ``-S``.
    raise SystemExit(main())

実行結果('O' 以外のラベルを対象にした、トークン単位の weighted F1 スコア)は以下の通り。
0.9441077916505949

単純な特徴量でそれなりの結果がでています。