Elasticsearch で形態素解析を行い、各 token の品詞などの情報を取得することができます。
■インデックス定義(create_test1.json)
■インデックス作成
■解析(詳細情報なし)
■解析結果
■解析(詳細情報あり)
"explain": "true" を指定すると、token の詳細情報を取得できます。
■インデックス定義(create_test1.json)
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ja_text_analyzer1": {
          "type": "custom",
          "tokenizer": "kuromoji_tokenizer",
          "filter": [
            "icu_normalizer",
            "kuromoji_baseform",
            "to_katakana"
          ]
        }
      },
      "filter": {
        "to_katakana": {
          "type": "icu_transform",
          "id": "Hiragana-Katakana"
        }
      }
    }
  },
  "mappings": {
    "dynamic": "strict",
    "properties": {
      "text": {
        "type": "text",
        "store": true,
        "analyzer": "ja_text_analyzer1"
      }
    }
  }
}
■インデックス作成
# Create the "test1" index; -T uploads create_test1.json as the PUT request body.
curl "http://localhost:9200/test1?pretty" \
  -X PUT \
  -H 'Content-Type: application/json' \
  -T create_test1.json
■解析(詳細情報なし)
# Run ja_text_analyzer1 on the sample text via the _analyze API (token list only).
curl "http://localhost:9200/test1/_analyze?pretty" \
  -XGET \
  -H 'Content-Type: application/json' \
  -v \
  --data '
{
  "analyzer": "ja_text_analyzer1",
  "text": "私は日本人です"
}'
■解析結果
{ "tokens" : [ { "token" : "私", "start_offset" : 0, "end_offset" : 1, "type" : "word", "position" : 0 }, { "token" : "ハ", "start_offset" : 1, "end_offset" : 2, "type" : "word", "position" : 1 }, { "token" : "日本人", "start_offset" : 2, "end_offset" : 5, "type" : "word", "position" : 2 }, { "token" : "デス", "start_offset" : 5, "end_offset" : 7, "type" : "word", "position" : 3 } ] }
■解析(詳細情報あり)
"explain": "true" を指定すると、token の詳細情報を取得できます。
curl "http://localhost:9200/test1/_analyze?pretty" \
  -XGET \
  -H 'Content-Type: application/json' \
  -v \
  --data ' { "analyzer": "ja_text_analyzer1", "explain": "true", "text": "私は日本人です" }'
■解析結果
{ "detail" : { "custom_analyzer" : true, "charfilters" : [ ], "tokenizer" : { "name" : "kuromoji_tokenizer", "tokens" : [ { "token" : "私", "start_offset" : 0, "end_offset" : 1, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[e7 a7 81]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-代名詞-一般", "partOfSpeech (en)" : "noun-pronoun-misc", "positionLength" : 1, "pronunciation" : "ワタシ", "pronunciation (en)" : "watashi", "reading" : "ワタシ", "reading (en)" : "watashi", "termFrequency" : 1 }, { "token" : "は", "start_offset" : 1, "end_offset" : 2, "type" : "word", "position" : 1, "baseForm" : null, "bytes" : "[e3 81 af]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "助詞-係助詞", "partOfSpeech (en)" : "particle-dependency", "positionLength" : 1, "pronunciation" : "ワ", "pronunciation (en)" : "wa", "reading" : "ハ", "reading (en)" : "ha", "termFrequency" : 1 }, { "token" : "日本人", "start_offset" : 2, "end_offset" : 5, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ニッポンジン", "pronunciation (en)" : "nipponjin", "reading" : "ニッポンジン", "reading (en)" : "nipponjin", "termFrequency" : 1 }, { "token" : "です", "start_offset" : 5, "end_offset" : 7, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e3 81 a7 e3 81 99]", "inflectionForm" : "基本形", "inflectionForm (en)" : "base", "inflectionType" : "特殊・デス", "inflectionType (en)" : "special-desu", "partOfSpeech" : 
"助動詞", "partOfSpeech (en)" : "auxiliary-verb", "positionLength" : 1, "pronunciation" : "デス", "pronunciation (en)" : "desu", "reading" : "デス", "reading (en)" : "desu", "termFrequency" : 1 } ] }, "tokenfilters" : [ { "name" : "icu_normalizer", "tokens" : [ { "token" : "私", "start_offset" : 0, "end_offset" : 1, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[e7 a7 81]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-代名詞-一般", "partOfSpeech (en)" : "noun-pronoun-misc", "positionLength" : 1, "pronunciation" : "ワタシ", "pronunciation (en)" : "watashi", "reading" : "ワタシ", "reading (en)" : "watashi", "termFrequency" : 1 }, { "token" : "は", "start_offset" : 1, "end_offset" : 2, "type" : "word", "position" : 1, "baseForm" : null, "bytes" : "[e3 81 af]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "助詞-係助詞", "partOfSpeech (en)" : "particle-dependency", "positionLength" : 1, "pronunciation" : "ワ", "pronunciation (en)" : "wa", "reading" : "ハ", "reading (en)" : "ha", "termFrequency" : 1 }, { "token" : "日本人", "start_offset" : 2, "end_offset" : 5, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ニッポンジン", "pronunciation (en)" : "nipponjin", "reading" : "ニッポンジン", "reading (en)" : "nipponjin", "termFrequency" : 1 }, { "token" : "です", "start_offset" : 5, "end_offset" : 7, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e3 81 a7 e3 81 99]", "inflectionForm" : "基本形", "inflectionForm (en)" : "base", "inflectionType" : "特殊・デス", "inflectionType (en)" : "special-desu", "partOfSpeech" : "助動詞", "partOfSpeech (en)" : "auxiliary-verb", 
"positionLength" : 1, "pronunciation" : "デス", "pronunciation (en)" : "desu", "reading" : "デス", "reading (en)" : "desu", "termFrequency" : 1 } ] }, { "name" : "kuromoji_baseform", "tokens" : [ { "token" : "私", "start_offset" : 0, "end_offset" : 1, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[e7 a7 81]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-代名詞-一般", "partOfSpeech (en)" : "noun-pronoun-misc", "positionLength" : 1, "pronunciation" : "ワタシ", "pronunciation (en)" : "watashi", "reading" : "ワタシ", "reading (en)" : "watashi", "termFrequency" : 1 }, { "token" : "は", "start_offset" : 1, "end_offset" : 2, "type" : "word", "position" : 1, "baseForm" : null, "bytes" : "[e3 81 af]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "助詞-係助詞", "partOfSpeech (en)" : "particle-dependency", "positionLength" : 1, "pronunciation" : "ワ", "pronunciation (en)" : "wa", "reading" : "ハ", "reading (en)" : "ha", "termFrequency" : 1 }, { "token" : "日本人", "start_offset" : 2, "end_offset" : 5, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ニッポンジン", "pronunciation (en)" : "nipponjin", "reading" : "ニッポンジン", "reading (en)" : "nipponjin", "termFrequency" : 1 }, { "token" : "です", "start_offset" : 5, "end_offset" : 7, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e3 81 a7 e3 81 99]", "inflectionForm" : "基本形", "inflectionForm (en)" : "base", "inflectionType" : "特殊・デス", "inflectionType (en)" : "special-desu", "keyword" : false, "partOfSpeech" : "助動詞", "partOfSpeech (en)" : "auxiliary-verb", 
"positionLength" : 1, "pronunciation" : "デス", "pronunciation (en)" : "desu", "reading" : "デス", "reading (en)" : "desu", "termFrequency" : 1 } ] }, { "name" : "to_katakana", "tokens" : [ { "token" : "私", "start_offset" : 0, "end_offset" : 1, "type" : "word", "position" : 0, "baseForm" : null, "bytes" : "[e7 a7 81]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-代名詞-一般", "partOfSpeech (en)" : "noun-pronoun-misc", "positionLength" : 1, "pronunciation" : "ワタシ", "pronunciation (en)" : "watashi", "reading" : "ワタシ", "reading (en)" : "watashi", "termFrequency" : 1 }, { "token" : "ハ", "start_offset" : 1, "end_offset" : 2, "type" : "word", "position" : 1, "baseForm" : null, "bytes" : "[e3 83 8f]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "助詞-係助詞", "partOfSpeech (en)" : "particle-dependency", "positionLength" : 1, "pronunciation" : "ワ", "pronunciation (en)" : "wa", "reading" : "ハ", "reading (en)" : "ha", "termFrequency" : 1 }, { "token" : "日本人", "start_offset" : 2, "end_offset" : 5, "type" : "word", "position" : 2, "baseForm" : null, "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]", "inflectionForm" : null, "inflectionForm (en)" : null, "inflectionType" : null, "inflectionType (en)" : null, "keyword" : false, "partOfSpeech" : "名詞-一般", "partOfSpeech (en)" : "noun-common", "positionLength" : 1, "pronunciation" : "ニッポンジン", "pronunciation (en)" : "nipponjin", "reading" : "ニッポンジン", "reading (en)" : "nipponjin", "termFrequency" : 1 }, { "token" : "デス", "start_offset" : 5, "end_offset" : 7, "type" : "word", "position" : 3, "baseForm" : null, "bytes" : "[e3 83 87 e3 82 b9]", "inflectionForm" : "基本形", "inflectionForm (en)" : "base", "inflectionType" : "特殊・デス", "inflectionType (en)" : "special-desu", "keyword" : false, "partOfSpeech" : "助動詞", "partOfSpeech (en)" : "auxiliary-verb", 
"positionLength" : 1, "pronunciation" : "デス", "pronunciation (en)" : "desu", "reading" : "デス", "reading (en)" : "desu", "termFrequency" : 1 } ] } ] } }