chevrotain での簡易な正規表現の構文解析のメモ。
以下のような簡易な正規表現の構文解析を行います。
root -> exprs
exprs -> expr+
expr -> or_expr
or_expr -> '(' str or_strs ')' '?'?
or_expr -> str
or_strs -> ('|' or_str)+
str -> STR
上記の文法は、以下のような文字列を受理します。
abc
(abc|def)
(abc|def)?
(abc|def)(ghi|jkl|mno)?(pqr|stu)
プログラムは以下の通りです。
■実行結果
以下のような簡易な正規表現の構文解析を行います。
root -> exprs
exprs -> expr+
expr -> or_expr
or_expr -> '(' str or_strs ')' '?'?
or_expr -> str
or_strs -> ('|' or_str)+
str -> STR
上記の文法は、以下のような文字列を受理します。
abc
(abc|def)
(abc|def)?
(abc|def)(ghi|jkl|mno)?(pqr|stu)
プログラムは以下の通りです。
import { CstParser, Lexer, createToken, Rule } from 'chevrotain' // lexer const STR = createToken({ name: "STR", pattern: /[^()|?]+/ }); const LP = createToken({ name: "LP", pattern: /[(]/ }); const RP = createToken({ name: "RP", pattern: /[)]/ }); const PIPE = createToken({ name: "PIPE", pattern: /[|]/ }); const QM = createToken({ name: "QM", pattern: /[?]/ }); const allTokens = [ STR, LP, RP, PIPE, QM, ]; const lexer = new Lexer(allTokens, { positionTracking: "onlyOffset" }); // parser class MatchOrParser extends CstParser { public value_stack: any[] = []; constructor() { super(allTokens); this.performSelfAnalysis(); } public root = this.RULE("root", () => { this.SUBRULE1(this.exprs); }); public exprs = this.RULE("exprs", () => { this.SUBRULE1(this.expr); this.MANY(() => { this.SUBRULE2(this.expr); }); }); public expr = this.RULE("expr", () => { this.SUBRULE(this.or_expr); }); public or_expr = this.RULE("or_expr", () => { this.OR([ { ALT: () => { this.CONSUME(LP); this.SUBRULE1(this.str); this.SUBRULE2(this.or_strs); this.CONSUME(RP); this.OPTION(() => { this.CONSUME(QM); }); }}, { ALT: () => { this.SUBRULE3(this.str); }}, ]); }); public or_strs = this.RULE("or_strs", () => { this.MANY(() => { this.CONSUME(PIPE); this.SUBRULE(this.str); }); }); public str = this.RULE("str", () => { this.CONSUME(STR); }); } const parser = new MatchOrParser(); const BaseCstVisitor = parser.getBaseCstVisitorConstructor(); class MatchOrVisitor extends BaseCstVisitor { public constructor() { super(); this.validateVisitor(); } root(ctx: any) { const v = this.visit(ctx.exprs); return { type: "root", exprs: v.exprs, }; } exprs(ctx: any) { const ret = { type: "exprs", exprs: [], }; for (let e of ctx.expr) { let v: any = this.visit(e); //console.log(JSON.stringify(v)); ret.exprs.push(v as never); } return ret; } expr(ctx: any) { if (ctx.or_expr) { const v1 = this.visit(ctx.or_expr); return v1; } else { const v2 = this.visit(ctx.str); return v2; } } or_expr(ctx: any) { if (ctx.or_strs) { const v1 = this.visit(ctx.str); const v2 = this.visit(ctx.or_strs); return { type: "or_expr", exprs: [v1.str].concat(v2.strs), qm: ctx.QM ? true : false, }; } else { const v = this.visit(ctx.str); return { type: "str", str: v.str, }; } } or_strs(ctx: any) { const ret = { type: "or_strs", strs: [], }; for (let e of ctx.str) { let v = this.visit(ctx.str); ret.strs.push(v.str as never); } return ret; } str(ctx: any) { return { type: "str", str: ctx.STR[0].image, }; } } const visitor = new MatchOrVisitor(); const texts = [ 'あ', '(あ|ア)', '(あ|ア)?', '(あい|アイ)(う|ウ)?(えお|エオ)', ]; for (let text of texts) { console.log(text); let lex_result = lexer.tokenize(text); parser.input = lex_result.tokens; let cst = parser.root(); //console.log(JSON.stringify(cst)); let res = visitor.visit(cst); console.log(JSON.stringify(res)); }
■実行結果
あ {"type":"root","exprs":[{"type":"str","str":"あ"}]} (あ|ア) {"type":"root","exprs":[{"type":"or_expr","exprs":["あ","ア"],"qm":false}]} (あ|ア)? {"type":"root","exprs":[{"type":"or_expr","exprs":["あ","ア"],"qm":true}]} (あい|アイ)(う|ウ)?(えお|エオ) {"type":"root","exprs":[{"type":"or_expr","exprs":["あい","アイ"],"qm":false},{"type":"or_expr","exprs":["う","ウ"],"qm":true},{"type":"or_expr","exprs":["えお","エオ"],"qm":false}]}