Elasticsearch で形態素解析を行い、各 token の品詞などの情報を取得することができます。
■インデックス定義(create_test1.json)
{
"settings": {
"analysis": {
"analyzer": {
"ja_text_analyzer1": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer",
"filter": [
"icu_normalizer",
"kuromoji_baseform",
"to_katakana"
]
}
},
"filter": {
"to_katakana": {
"type": "icu_transform",
"id": "Hiragana-Katakana"
}
}
}
},
"mappings": {
"dynamic": "strict",
"properties": {
"text": {"type": "text", "store": "true", "analyzer": "ja_text_analyzer1"}
}
}
}
■インデックス作成
curl "http://localhost:9200/test1?pretty" \
-X PUT \
-H 'Content-Type: application/json' \
-T create_test1.json
■解析(詳細情報なし)
curl "http://localhost:9200/test1/_analyze?pretty" \
-XGET \
-H 'Content-Type: application/json' \
-v \
--data '
{
"analyzer": "ja_text_analyzer1",
"text": "私は日本人です"
}'
■解析結果
{
"tokens" : [
{
"token" : "私",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0
},
{
"token" : "ハ",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1
},
{
"token" : "日本人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 2
},
{
"token" : "デス",
"start_offset" : 5,
"end_offset" : 7,
"type" : "word",
"position" : 3
}
]
}
■解析(詳細情報あり)
"explain": "true" を指定すると、token の詳細情報を取得できます。
curl "http://localhost:9200/test1/_analyze?pretty" \
-XGET \
-H 'Content-Type: application/json' \
-v \
--data '
{
"analyzer": "ja_text_analyzer1",
"explain": "true",
"text": "私は日本人です"
}'
■解析結果
{
"detail" : {
"custom_analyzer" : true,
"charfilters" : [ ],
"tokenizer" : {
"name" : "kuromoji_tokenizer",
"tokens" : [
{
"token" : "私",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0,
"baseForm" : null,
"bytes" : "[e7 a7 81]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"partOfSpeech" : "名詞-代名詞-一般",
"partOfSpeech (en)" : "noun-pronoun-misc",
"positionLength" : 1,
"pronunciation" : "ワタシ",
"pronunciation (en)" : "watashi",
"reading" : "ワタシ",
"reading (en)" : "watashi",
"termFrequency" : 1
},
{
"token" : "は",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1,
"baseForm" : null,
"bytes" : "[e3 81 af]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"partOfSpeech" : "助詞-係助詞",
"partOfSpeech (en)" : "particle-dependency",
"positionLength" : 1,
"pronunciation" : "ワ",
"pronunciation (en)" : "wa",
"reading" : "ハ",
"reading (en)" : "ha",
"termFrequency" : 1
},
{
"token" : "日本人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 2,
"baseForm" : null,
"bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"partOfSpeech" : "名詞-一般",
"partOfSpeech (en)" : "noun-common",
"positionLength" : 1,
"pronunciation" : "ニッポンジン",
"pronunciation (en)" : "nipponjin",
"reading" : "ニッポンジン",
"reading (en)" : "nipponjin",
"termFrequency" : 1
},
{
"token" : "です",
"start_offset" : 5,
"end_offset" : 7,
"type" : "word",
"position" : 3,
"baseForm" : null,
"bytes" : "[e3 81 a7 e3 81 99]",
"inflectionForm" : "基本形",
"inflectionForm (en)" : "base",
"inflectionType" : "特殊・デス",
"inflectionType (en)" : "special-desu",
"partOfSpeech" : "助動詞",
"partOfSpeech (en)" : "auxiliary-verb",
"positionLength" : 1,
"pronunciation" : "デス",
"pronunciation (en)" : "desu",
"reading" : "デス",
"reading (en)" : "desu",
"termFrequency" : 1
}
]
},
"tokenfilters" : [
{
"name" : "icu_normalizer",
"tokens" : [
{
"token" : "私",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0,
"baseForm" : null,
"bytes" : "[e7 a7 81]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"partOfSpeech" : "名詞-代名詞-一般",
"partOfSpeech (en)" : "noun-pronoun-misc",
"positionLength" : 1,
"pronunciation" : "ワタシ",
"pronunciation (en)" : "watashi",
"reading" : "ワタシ",
"reading (en)" : "watashi",
"termFrequency" : 1
},
{
"token" : "は",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1,
"baseForm" : null,
"bytes" : "[e3 81 af]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"partOfSpeech" : "助詞-係助詞",
"partOfSpeech (en)" : "particle-dependency",
"positionLength" : 1,
"pronunciation" : "ワ",
"pronunciation (en)" : "wa",
"reading" : "ハ",
"reading (en)" : "ha",
"termFrequency" : 1
},
{
"token" : "日本人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 2,
"baseForm" : null,
"bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"partOfSpeech" : "名詞-一般",
"partOfSpeech (en)" : "noun-common",
"positionLength" : 1,
"pronunciation" : "ニッポンジン",
"pronunciation (en)" : "nipponjin",
"reading" : "ニッポンジン",
"reading (en)" : "nipponjin",
"termFrequency" : 1
},
{
"token" : "です",
"start_offset" : 5,
"end_offset" : 7,
"type" : "word",
"position" : 3,
"baseForm" : null,
"bytes" : "[e3 81 a7 e3 81 99]",
"inflectionForm" : "基本形",
"inflectionForm (en)" : "base",
"inflectionType" : "特殊・デス",
"inflectionType (en)" : "special-desu",
"partOfSpeech" : "助動詞",
"partOfSpeech (en)" : "auxiliary-verb",
"positionLength" : 1,
"pronunciation" : "デス",
"pronunciation (en)" : "desu",
"reading" : "デス",
"reading (en)" : "desu",
"termFrequency" : 1
}
]
},
{
"name" : "kuromoji_baseform",
"tokens" : [
{
"token" : "私",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0,
"baseForm" : null,
"bytes" : "[e7 a7 81]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"keyword" : false,
"partOfSpeech" : "名詞-代名詞-一般",
"partOfSpeech (en)" : "noun-pronoun-misc",
"positionLength" : 1,
"pronunciation" : "ワタシ",
"pronunciation (en)" : "watashi",
"reading" : "ワタシ",
"reading (en)" : "watashi",
"termFrequency" : 1
},
{
"token" : "は",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1,
"baseForm" : null,
"bytes" : "[e3 81 af]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"keyword" : false,
"partOfSpeech" : "助詞-係助詞",
"partOfSpeech (en)" : "particle-dependency",
"positionLength" : 1,
"pronunciation" : "ワ",
"pronunciation (en)" : "wa",
"reading" : "ハ",
"reading (en)" : "ha",
"termFrequency" : 1
},
{
"token" : "日本人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 2,
"baseForm" : null,
"bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"keyword" : false,
"partOfSpeech" : "名詞-一般",
"partOfSpeech (en)" : "noun-common",
"positionLength" : 1,
"pronunciation" : "ニッポンジン",
"pronunciation (en)" : "nipponjin",
"reading" : "ニッポンジン",
"reading (en)" : "nipponjin",
"termFrequency" : 1
},
{
"token" : "です",
"start_offset" : 5,
"end_offset" : 7,
"type" : "word",
"position" : 3,
"baseForm" : null,
"bytes" : "[e3 81 a7 e3 81 99]",
"inflectionForm" : "基本形",
"inflectionForm (en)" : "base",
"inflectionType" : "特殊・デス",
"inflectionType (en)" : "special-desu",
"keyword" : false,
"partOfSpeech" : "助動詞",
"partOfSpeech (en)" : "auxiliary-verb",
"positionLength" : 1,
"pronunciation" : "デス",
"pronunciation (en)" : "desu",
"reading" : "デス",
"reading (en)" : "desu",
"termFrequency" : 1
}
]
},
{
"name" : "to_katakana",
"tokens" : [
{
"token" : "私",
"start_offset" : 0,
"end_offset" : 1,
"type" : "word",
"position" : 0,
"baseForm" : null,
"bytes" : "[e7 a7 81]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"keyword" : false,
"partOfSpeech" : "名詞-代名詞-一般",
"partOfSpeech (en)" : "noun-pronoun-misc",
"positionLength" : 1,
"pronunciation" : "ワタシ",
"pronunciation (en)" : "watashi",
"reading" : "ワタシ",
"reading (en)" : "watashi",
"termFrequency" : 1
},
{
"token" : "ハ",
"start_offset" : 1,
"end_offset" : 2,
"type" : "word",
"position" : 1,
"baseForm" : null,
"bytes" : "[e3 83 8f]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"keyword" : false,
"partOfSpeech" : "助詞-係助詞",
"partOfSpeech (en)" : "particle-dependency",
"positionLength" : 1,
"pronunciation" : "ワ",
"pronunciation (en)" : "wa",
"reading" : "ハ",
"reading (en)" : "ha",
"termFrequency" : 1
},
{
"token" : "日本人",
"start_offset" : 2,
"end_offset" : 5,
"type" : "word",
"position" : 2,
"baseForm" : null,
"bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
"inflectionForm" : null,
"inflectionForm (en)" : null,
"inflectionType" : null,
"inflectionType (en)" : null,
"keyword" : false,
"partOfSpeech" : "名詞-一般",
"partOfSpeech (en)" : "noun-common",
"positionLength" : 1,
"pronunciation" : "ニッポンジン",
"pronunciation (en)" : "nipponjin",
"reading" : "ニッポンジン",
"reading (en)" : "nipponjin",
"termFrequency" : 1
},
{
"token" : "デス",
"start_offset" : 5,
"end_offset" : 7,
"type" : "word",
"position" : 3,
"baseForm" : null,
"bytes" : "[e3 83 87 e3 82 b9]",
"inflectionForm" : "基本形",
"inflectionForm (en)" : "base",
"inflectionType" : "特殊・デス",
"inflectionType (en)" : "special-desu",
"keyword" : false,
"partOfSpeech" : "助動詞",
"partOfSpeech (en)" : "auxiliary-verb",
"positionLength" : 1,
"pronunciation" : "デス",
"pronunciation (en)" : "desu",
"reading" : "デス",
"reading (en)" : "desu",
"termFrequency" : 1
}
]
}
]
}
}