dak ブログ

python、rubyなどのプログラミング、MySQL、サーバーの設定などの備忘録。レゴの写真も。

Elasticsearch での形態素解析

2022-10-16 23:37:55 | elasticsearch
Elasticsearch で形態素解析を行い、各 token の品詞などの情報を取得することができます。
■インデックス定義(create_test1.json)
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ja_text_analyzer1": {
          "type": "custom",
          "tokenizer": "kuromoji_tokenizer",
          "filter": [
            "icu_normalizer",
            "kuromoji_baseform",
            "to_katakana"
          ]
        }
      },
      "filter": {
        "to_katakana": {
          "type": "icu_transform",
          "id": "Hiragana-Katakana"
        }
      }
    }
  },
  "mappings": {
    "dynamic": "strict",
    "properties": {
      "text": {"type": "text", "store": true, "analyzer": "ja_text_analyzer1"}
    }
  }
}

■インデックス作成
# Create the test1 index, uploading create_test1.json as the PUT request body.
curl --request PUT \
     --header 'Content-Type: application/json' \
     --upload-file create_test1.json \
     "http://localhost:9200/test1?pretty"

■解析(詳細情報なし)
# Send the sample sentence to the test1 index's _analyze endpoint using ja_text_analyzer1.
curl --request GET \
     --header 'Content-Type: application/json' \
     --verbose \
     --data '
{
     "analyzer": "ja_text_analyzer1",
     "text": "私は日本人です"
}' \
     "http://localhost:9200/test1/_analyze?pretty"

■解析結果
{
  "tokens" : [
    {
      "token" : "私",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "ハ",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "日本人",
      "start_offset" : 2,
      "end_offset" : 5,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "デス",
      "start_offset" : 5,
      "end_offset" : 7,
      "type" : "word",
      "position" : 3
    }
  ]
}

■解析(詳細情報あり)
"explain": "true" を指定すると、token の詳細情報を取得できます。
# Same _analyze request as before, with "explain" set so the response includes per-token details.
curl --request GET \
     --header 'Content-Type: application/json' \
     --verbose \
     --data '
{
     "analyzer": "ja_text_analyzer1",
     "explain": "true",
     "text": "私は日本人です"
}' \
     "http://localhost:9200/test1/_analyze?pretty"
■解析結果
{
  "detail" : {
    "custom_analyzer" : true,
    "charfilters" : [ ],
    "tokenizer" : {
      "name" : "kuromoji_tokenizer",
      "tokens" : [
        {
          "token" : "私",
          "start_offset" : 0,
          "end_offset" : 1,
          "type" : "word",
          "position" : 0,
          "baseForm" : null,
          "bytes" : "[e7 a7 81]",
          "inflectionForm" : null,
          "inflectionForm (en)" : null,
          "inflectionType" : null,
          "inflectionType (en)" : null,
          "partOfSpeech" : "名詞-代名詞-一般",
          "partOfSpeech (en)" : "noun-pronoun-misc",
          "positionLength" : 1,
          "pronunciation" : "ワタシ",
          "pronunciation (en)" : "watashi",
          "reading" : "ワタシ",
          "reading (en)" : "watashi",
          "termFrequency" : 1
        },
        {
          "token" : "は",
          "start_offset" : 1,
          "end_offset" : 2,
          "type" : "word",
          "position" : 1,
          "baseForm" : null,
          "bytes" : "[e3 81 af]",
          "inflectionForm" : null,
          "inflectionForm (en)" : null,
          "inflectionType" : null,
          "inflectionType (en)" : null,
          "partOfSpeech" : "助詞-係助詞",
          "partOfSpeech (en)" : "particle-dependency",
          "positionLength" : 1,
          "pronunciation" : "ワ",
          "pronunciation (en)" : "wa",
          "reading" : "ハ",
          "reading (en)" : "ha",
          "termFrequency" : 1
        },
        {
          "token" : "日本人",
          "start_offset" : 2,
          "end_offset" : 5,
          "type" : "word",
          "position" : 2,
          "baseForm" : null,
          "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
          "inflectionForm" : null,
          "inflectionForm (en)" : null,
          "inflectionType" : null,
          "inflectionType (en)" : null,
          "partOfSpeech" : "名詞-一般",
          "partOfSpeech (en)" : "noun-common",
          "positionLength" : 1,
          "pronunciation" : "ニッポンジン",
          "pronunciation (en)" : "nipponjin",
          "reading" : "ニッポンジン",
          "reading (en)" : "nipponjin",
          "termFrequency" : 1
        },
        {
          "token" : "です",
          "start_offset" : 5,
          "end_offset" : 7,
          "type" : "word",
          "position" : 3,
          "baseForm" : null,
          "bytes" : "[e3 81 a7 e3 81 99]",
          "inflectionForm" : "基本形",
          "inflectionForm (en)" : "base",
          "inflectionType" : "特殊・デス",
          "inflectionType (en)" : "special-desu",
          "partOfSpeech" : "助動詞",
          "partOfSpeech (en)" : "auxiliary-verb",
          "positionLength" : 1,
          "pronunciation" : "デス",
          "pronunciation (en)" : "desu",
          "reading" : "デス",
          "reading (en)" : "desu",
          "termFrequency" : 1
        }
      ]
    },
    "tokenfilters" : [
      {
        "name" : "icu_normalizer",
        "tokens" : [
          {
            "token" : "私",
            "start_offset" : 0,
            "end_offset" : 1,
            "type" : "word",
            "position" : 0,
            "baseForm" : null,
            "bytes" : "[e7 a7 81]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "partOfSpeech" : "名詞-代名詞-一般",
            "partOfSpeech (en)" : "noun-pronoun-misc",
            "positionLength" : 1,
            "pronunciation" : "ワタシ",
            "pronunciation (en)" : "watashi",
            "reading" : "ワタシ",
            "reading (en)" : "watashi",
            "termFrequency" : 1
          },
          {
            "token" : "は",
            "start_offset" : 1,
            "end_offset" : 2,
            "type" : "word",
            "position" : 1,
            "baseForm" : null,
            "bytes" : "[e3 81 af]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "partOfSpeech" : "助詞-係助詞",
            "partOfSpeech (en)" : "particle-dependency",
            "positionLength" : 1,
            "pronunciation" : "ワ",
            "pronunciation (en)" : "wa",
            "reading" : "ハ",
            "reading (en)" : "ha",
            "termFrequency" : 1
          },
          {
            "token" : "日本人",
            "start_offset" : 2,
            "end_offset" : 5,
            "type" : "word",
            "position" : 2,
            "baseForm" : null,
            "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "partOfSpeech" : "名詞-一般",
            "partOfSpeech (en)" : "noun-common",
            "positionLength" : 1,
            "pronunciation" : "ニッポンジン",
            "pronunciation (en)" : "nipponjin",
            "reading" : "ニッポンジン",
            "reading (en)" : "nipponjin",
            "termFrequency" : 1
          },
          {
            "token" : "です",
            "start_offset" : 5,
            "end_offset" : 7,
            "type" : "word",
            "position" : 3,
            "baseForm" : null,
            "bytes" : "[e3 81 a7 e3 81 99]",
            "inflectionForm" : "基本形",
            "inflectionForm (en)" : "base",
            "inflectionType" : "特殊・デス",
            "inflectionType (en)" : "special-desu",
            "partOfSpeech" : "助動詞",
            "partOfSpeech (en)" : "auxiliary-verb",
            "positionLength" : 1,
            "pronunciation" : "デス",
            "pronunciation (en)" : "desu",
            "reading" : "デス",
            "reading (en)" : "desu",
            "termFrequency" : 1
          }
        ]
      },
      {
        "name" : "kuromoji_baseform",
        "tokens" : [
          {
            "token" : "私",
            "start_offset" : 0,
            "end_offset" : 1,
            "type" : "word",
            "position" : 0,
            "baseForm" : null,
            "bytes" : "[e7 a7 81]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "keyword" : false,
            "partOfSpeech" : "名詞-代名詞-一般",
            "partOfSpeech (en)" : "noun-pronoun-misc",
            "positionLength" : 1,
            "pronunciation" : "ワタシ",
            "pronunciation (en)" : "watashi",
            "reading" : "ワタシ",
            "reading (en)" : "watashi",
            "termFrequency" : 1
          },
          {
            "token" : "は",
            "start_offset" : 1,
            "end_offset" : 2,
            "type" : "word",
            "position" : 1,
            "baseForm" : null,
            "bytes" : "[e3 81 af]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "keyword" : false,
            "partOfSpeech" : "助詞-係助詞",
            "partOfSpeech (en)" : "particle-dependency",
            "positionLength" : 1,
            "pronunciation" : "ワ",
            "pronunciation (en)" : "wa",
            "reading" : "ハ",
            "reading (en)" : "ha",
            "termFrequency" : 1
          },
          {
            "token" : "日本人",
            "start_offset" : 2,
            "end_offset" : 5,
            "type" : "word",
            "position" : 2,
            "baseForm" : null,
            "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "keyword" : false,
            "partOfSpeech" : "名詞-一般",
            "partOfSpeech (en)" : "noun-common",
            "positionLength" : 1,
            "pronunciation" : "ニッポンジン",
            "pronunciation (en)" : "nipponjin",
            "reading" : "ニッポンジン",
            "reading (en)" : "nipponjin",
            "termFrequency" : 1
          },
          {
            "token" : "です",
            "start_offset" : 5,
            "end_offset" : 7,
            "type" : "word",
            "position" : 3,
            "baseForm" : null,
            "bytes" : "[e3 81 a7 e3 81 99]",
            "inflectionForm" : "基本形",
            "inflectionForm (en)" : "base",
            "inflectionType" : "特殊・デス",
            "inflectionType (en)" : "special-desu",
            "keyword" : false,
            "partOfSpeech" : "助動詞",
            "partOfSpeech (en)" : "auxiliary-verb",
            "positionLength" : 1,
            "pronunciation" : "デス",
            "pronunciation (en)" : "desu",
            "reading" : "デス",
            "reading (en)" : "desu",
            "termFrequency" : 1
          }
        ]
      },
      {
        "name" : "to_katakana",
        "tokens" : [
          {
            "token" : "私",
            "start_offset" : 0,
            "end_offset" : 1,
            "type" : "word",
            "position" : 0,
            "baseForm" : null,
            "bytes" : "[e7 a7 81]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "keyword" : false,
            "partOfSpeech" : "名詞-代名詞-一般",
            "partOfSpeech (en)" : "noun-pronoun-misc",
            "positionLength" : 1,
            "pronunciation" : "ワタシ",
            "pronunciation (en)" : "watashi",
            "reading" : "ワタシ",
            "reading (en)" : "watashi",
            "termFrequency" : 1
          },
          {
            "token" : "ハ",
            "start_offset" : 1,
            "end_offset" : 2,
            "type" : "word",
            "position" : 1,
            "baseForm" : null,
            "bytes" : "[e3 83 8f]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "keyword" : false,
            "partOfSpeech" : "助詞-係助詞",
            "partOfSpeech (en)" : "particle-dependency",
            "positionLength" : 1,
            "pronunciation" : "ワ",
            "pronunciation (en)" : "wa",
            "reading" : "ハ",
            "reading (en)" : "ha",
            "termFrequency" : 1
          },
          {
            "token" : "日本人",
            "start_offset" : 2,
            "end_offset" : 5,
            "type" : "word",
            "position" : 2,
            "baseForm" : null,
            "bytes" : "[e6 97 a5 e6 9c ac e4 ba ba]",
            "inflectionForm" : null,
            "inflectionForm (en)" : null,
            "inflectionType" : null,
            "inflectionType (en)" : null,
            "keyword" : false,
            "partOfSpeech" : "名詞-一般",
            "partOfSpeech (en)" : "noun-common",
            "positionLength" : 1,
            "pronunciation" : "ニッポンジン",
            "pronunciation (en)" : "nipponjin",
            "reading" : "ニッポンジン",
            "reading (en)" : "nipponjin",
            "termFrequency" : 1
          },
          {
            "token" : "デス",
            "start_offset" : 5,
            "end_offset" : 7,
            "type" : "word",
            "position" : 3,
            "baseForm" : null,
            "bytes" : "[e3 83 87 e3 82 b9]",
            "inflectionForm" : "基本形",
            "inflectionForm (en)" : "base",
            "inflectionType" : "特殊・デス",
            "inflectionType (en)" : "special-desu",
            "keyword" : false,
            "partOfSpeech" : "助動詞",
            "partOfSpeech (en)" : "auxiliary-verb",
            "positionLength" : 1,
            "pronunciation" : "デス",
            "pronunciation (en)" : "desu",
            "reading" : "デス",
            "reading (en)" : "desu",
            "termFrequency" : 1
          }
        ]
      }
    ]
  }
}