dak ブログ

python、rubyなどのプログラミング、MySQL、サーバーの設定などの備忘録。レゴの写真も。

chevrotain で簡易な正規表現の構文解析

2022-04-03 17:51:00 | Node.js
chevrotain での簡易な正規表現の構文解析のメモ。
以下のような簡易な正規表現の構文解析を行います。
root -> exprs
exprs -> expr+
expr -> or_expr
or_expr -> '(' str or_strs ')' '?'?
or_expr -> str
or_strs -> ('|' or_str)+
str -> STR

上記の文法は、以下のような文字列を受理します。
abc
(abc|def)
(abc|def)?
(abc|def)(ghi|jkl|mno)?(pqr|stu)

プログラムは以下の通りです。
import { CstParser, Lexer, createToken, Rule } from 'chevrotain'

// lexer
const STR = createToken({ name: "STR", pattern: /[^()|?]+/ });
const LP = createToken({ name: "LP", pattern: /[(]/ });
const RP = createToken({ name: "RP", pattern: /[)]/ });
const PIPE = createToken({ name: "PIPE", pattern: /[|]/ });
const QM = createToken({ name: "QM", pattern: /[?]/ });

const allTokens = [
  STR,
  LP,
  RP,
  PIPE,
  QM,
];

const lexer = new Lexer(allTokens, { positionTracking: "onlyOffset" });
  

// parser
class MatchOrParser extends CstParser {
  public value_stack: any[] = [];
  
  constructor() {
    super(allTokens);
    this.performSelfAnalysis();
  }
  
  public root = this.RULE("root", () => {
    this.SUBRULE1(this.exprs);
  });
  
  public exprs = this.RULE("exprs", () => {
    this.SUBRULE1(this.expr);
    this.MANY(() => {
      this.SUBRULE2(this.expr);
    });
  });
  
  public expr = this.RULE("expr", () => {
    this.SUBRULE(this.or_expr);
  });
  
  public or_expr = this.RULE("or_expr", () => {
    this.OR([
      { ALT: () => {
	this.CONSUME(LP);
	this.SUBRULE1(this.str);
	this.SUBRULE2(this.or_strs);
	this.CONSUME(RP);
	this.OPTION(() => { this.CONSUME(QM); });
      }},
      { ALT: () => {
	this.SUBRULE3(this.str);
      }},
    ]);
  });
  
  public or_strs = this.RULE("or_strs", () => {
    this.MANY(() => {
      this.CONSUME(PIPE);
      this.SUBRULE(this.str);
    });
  });
  
  public str = this.RULE("str", () => {
    this.CONSUME(STR);
  });
}


const parser = new MatchOrParser();
const BaseCstVisitor = parser.getBaseCstVisitorConstructor();


class MatchOrVisitor extends BaseCstVisitor {
  public constructor() {
    super();
    this.validateVisitor();
  }

  root(ctx: any) {
    const v = this.visit(ctx.exprs);
    return {
      type: "root",
      exprs: v.exprs,
    };
  }
  
  exprs(ctx: any) {
    const ret = {
      type: "exprs",
      exprs: [],
    };
    
    for (let e of ctx.expr) {
      let v: any = this.visit(e);
      //console.log(JSON.stringify(v));
      ret.exprs.push(v as never);
    }

    return ret;
  }
  
  expr(ctx: any) {
    if (ctx.or_expr) {
      const v1 = this.visit(ctx.or_expr);
      return v1;
    }
    else {
      const v2 = this.visit(ctx.str);
      return v2;
    }
  }
  
  or_expr(ctx: any) {
    if (ctx.or_strs) {
      const v1 = this.visit(ctx.str);
      const v2 = this.visit(ctx.or_strs);
      return {
	type: "or_expr",
	exprs: [v1.str].concat(v2.strs),
	qm: ctx.QM ? true : false,
      };
    }
    else {
      const v = this.visit(ctx.str);
      return {
	type: "str",
	str: v.str,
      };
    }
  }

  or_strs(ctx: any) {
    const ret = {
      type: "or_strs",
      strs: [],
    };

    for (let e of ctx.str) {
      let v = this.visit(ctx.str);
      ret.strs.push(v.str as never);
    }

    return ret;
  }

  str(ctx: any) {
    return {
      type: "str",
      str: ctx.STR[0].image,
    };
  }
}


const visitor = new MatchOrVisitor();

const texts = [
  'あ',
  '(あ|ア)',
  '(あ|ア)?',
  '(あい|アイ)(う|ウ)?(えお|エオ)',
];

for (let text of texts) {
  console.log(text);
  
  let lex_result = lexer.tokenize(text);
  parser.input = lex_result.tokens;
  let cst = parser.root();
  //console.log(JSON.stringify(cst));
  let res = visitor.visit(cst);
  console.log(JSON.stringify(res));
}

■実行結果
あ
{"type":"root","exprs":[{"type":"str","str":"あ"}]}

(あ|ア)
{"type":"root","exprs":[{"type":"or_expr","exprs":["あ","ア"],"qm":false}]}

(あ|ア)?
{"type":"root","exprs":[{"type":"or_expr","exprs":["あ","ア"],"qm":true}]}

(あい|アイ)(う|ウ)?(えお|エオ)
{"type":"root","exprs":[{"type":"or_expr","exprs":["あい","アイ"],"qm":false},{"type":"or_expr","exprs":["う","ウ"],"qm":true},{"type":"or_expr","exprs":["えお","エオ"],"qm":false}]}


この記事についてブログを書く
« chevrotain での加減算の構文... | トップ | node.js 版の kuromoji での... »

Node.js」カテゴリの最新記事