conll2000 のデータで、CRF で名詞句の判定をやってみたメモ。
データは以下のように [(表記, 品詞, 名詞句ラベル), ...] の形式です。
[('Confidence', 'NN', 'B-NP'), ('in', 'IN', 'O'), ('the', 'DT', 'B-NP'), ('pound', 'NN', 'I-NP'), ('is', 'VBZ', 'O'), ('widely', 'RB', 'O'), ('expected', 'VBN', 'O'), ('to', 'TO', 'O'), ('take', 'VB', 'O'), ('another', 'DT', 'B-NP'), ('sharp', 'JJ', 'I-NP'), ('dive', 'NN', 'I-NP'), ('if', 'IN', 'O'), ('trade', 'NN', 'B-NP'), ('figures', 'NNS', 'I-NP'), ('for', 'IN', 'O'), ('September', 'NNP', 'B-NP'), (',', ',', 'O'), ('due', 'JJ', 'O'), ('for', 'IN', 'O'), ('release', 'NN', 'B-NP'), ('tomorrow', 'NN', 'B-NP'), (',', ',', 'O'), ('fail', 'VB', 'O'), ('to', 'TO', 'O'), ('show', 'VB', 'O'), ('a', 'DT', 'B-NP'), ('substantial', 'JJ', 'I-NP'), ('improvement', 'NN', 'I-NP'), ('from', 'IN', 'O'), ('July', 'NNP', 'B-NP'), ('and', 'CC', 'I-NP'), ('August', 'NNP', 'I-NP'), ("'s", 'POS', 'B-NP'), ('near-record', 'JJ', 'I-NP'), ('deficits', 'NNS', 'I-NP'), ('.', '.', 'O')]
学習データ(train.txt)で学習を行い、テストデータ(test.txt)に対する F1 スコアで評価します。
プログラムは以下の通り。
import nltk
from nltk.corpus import conll2000
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import metrics
def create_data(file):
    """Load noun-phrase chunk data from the NLTK conll2000 corpus.

    Args:
        file: Corpus file id handed to ``conll2000.chunked_sents``
            (``main`` passes 'train.txt' and 'test.txt').

    Returns:
        A list with one entry per sentence; each entry is the output of
        ``nltk.chunk.tree2conlltags``, i.e. ``[(token, pos, label), ...]``.
    """
    tagged_sents = []
    # Only NP chunks are kept; everything else ends up outside a chunk.
    for tree in conll2000.chunked_sents(file, chunk_types=['NP']):
        tagged_sents.append(nltk.chunk.tree2conlltags(tree))
    return tagged_sents
def word_featureh(sent, i):
    """Build the feature dict for the word at position ``i`` of a sentence.

    Args:
        sent: Sequence of word tuples; element 0 of each tuple is the
            token text and element 1 is its part-of-speech tag.
        i: Index of the word within ``sent``.

    Returns:
        A dict with a constant bias term, the lower-cased token, three
        boolean surface-form flags (all upper case, title case, all
        digits) and the POS tag.
    """
    word = sent[i]
    tkn, pos = word[0], word[1]
    return {
        'bias': 1.0,
        'token': tkn.lower(),
        # This key was previously misspelled as 'token.isuppser()'.
        'token.isupper()': tkn.isupper(),
        'token.istitle()': tkn.istitle(),
        'token.isdigit()': tkn.isdigit(),
        'pos': pos,
    }
def sent_features(sent):
    """Return the per-word feature dicts for one sentence, in word order.

    Args:
        sent: Sequence of word tuples, as accepted by ``word_featureh``.

    Returns:
        A list with one ``word_featureh`` result per position in ``sent``.
    """
    features = []
    for position in range(len(sent)):
        features.append(word_featureh(sent, position))
    return features
def sent_labels(sent):
    """Return the label (element 2 of each word tuple) for every word.

    Args:
        sent: Sequence of word tuples whose third element is the label.

    Returns:
        A list of labels in the same order as the words in ``sent``.
    """
    labels = []
    for entry in sent:
        labels.append(entry[2])
    return labels
def main():
    """Train a CRF on train.txt, score it on test.txt and print the F1.

    The printed value is the weighted flat F1 computed over every label
    the model learned except 'O'.

    Returns:
        0, which the script entry point uses as the exit status.
    """
    def featurize(sents):
        # Features first, then labels, for the given sentences.
        feats = [sent_features(s) for s in sents]
        tags = [sent_labels(s) for s in sents]
        return feats, tags

    X_train, y_train = featurize(create_data('train.txt'))
    X_test, y_test = featurize(create_data('test.txt'))

    crf_params = {
        'algorithm': 'lbfgs',
        'c1': 0.1,
        'c2': 0.1,
        'max_iterations': 100,
        'all_possible_transitions': True,
    }
    crf = sklearn_crfsuite.CRF(**crf_params)
    crf.fit(X_train, y_train)

    # Score only the chunk labels; 'O' is dropped from the label set.
    labels = list(crf.classes_)
    labels.remove('O')

    y_pred = crf.predict(X_test)
    score = metrics.flat_f1_score(
        y_test, y_pred, average='weighted', labels=labels
    )
    print(score)
    return 0
if __name__ == '__main__':
    # Raise SystemExit directly instead of calling the `exit` builtin:
    # `exit` is injected by the `site` module and is not available when
    # Python runs with -S or in some embedded/frozen environments.
    raise SystemExit(main())
実行結果('O' ラベルを除いた重み付き F1 スコア)は以下の通り。
0.9441077916505949
単純な特徴量でそれなりの結果がでています。