Luaでパーサジェネレータを作る

LALR(1)だと思いますが理論的なことはあまり厳密に考えていません。

LR(0)とかLALR(1)とかいうのは、CFG (文脈自由文法) のサブセットです。
CFGなら何でもパースできるのではなく、CFGの一部の範囲だけをカバーします。

使い方

src.lua の構文木がほしいとき
tree = parse(lex(io.lines("src.lua"))) とする。

ただし、簡単のため文法を改変しているのでうまくいかないかもしれない。

概略

文法

Lua 5.1 Reference Manual にあるものを改変した。

local start_symbol = "s"
local eof_symbol = "$"
local cdata = [=[
s block $
block stat*
block stat* laststat
block stat* laststat ;
stat*
stat* stat* stat
stat* stat* stat ;
laststat return
laststat return explist
laststat break
stat varlist = explist
stat functioncall
stat do block end
stat while exp do block end
stat repeat block until exp
stat if exp then block elseif* end
stat if exp then block elseif* else block end
stat for Name = exp , exp do block end
stat for Name = exp , exp , exp do block end
stat for namelist in explist do block end
stat function funcname funcbody
stat local function Name funcbody
stat local namelist
stat local namelist = explist
elseif*
elseif* elseif* elseif exp then block
funcname Name .Name*
funcname Name .Name* : Name
.Name*
.Name* .Name* . Name
varlist var
varlist varlist , var
var Name
var prefixexp [ exp ]
var prefixexp . Name
namelist Name
namelist namelist , Name
explist exp,* exp
exp,*
exp,* exp,* exp ,
exp andexp
exp exp or andexp
andexp relexp
andexp andexp and relexp
relexp catexp
relexp relexp relop catexp
relop <
relop >
relop <=
relop >=
relop ~=
relop ==
catexp addexp
catexp catexp .. addexp
addexp mulexp
addexp addexp addop mulexp
addop +
addop -
mulexp unexp
mulexp mulexp mulop unexp
mulop *
mulop /
mulop %
unexp expexp
unexp unop unexp
unop not
unop #
unop -
expexp _exp
expexp expexp ^ _exp
_exp nil
_exp false
_exp true
_exp Number
_exp String
_exp ...
_exp function_literal
_exp prefixexp
_exp tableconstructor
_exp ( exp )
prefixexp var
prefixexp functioncall
functioncall prefixexp args
functioncall prefixexp : Name args
args ( )
args ( explist )
args tableconstructor
args String
function_literal function funcbody
funcbody ( ) block end
funcbody ( parlist ) block end
parlist namelist
parlist ...
parlist namelist , ...
tableconstructor { }
tableconstructor { fieldlist }
fieldlist field,* field
fieldlist field,* field fieldsep
field,*
field,* field,* field fieldsep
fieldsep ,
fieldsep ;
field [ exp ] = exp
field Name = exp
field exp
]=]

-- grammar[nonterminal_symbol] = {rule, ...}
-- r_id[rule] = number
local grammar, r_id = {}, {}
do
  local id = 1
  for line in string.gmatch(cdata, "[^\r\n]+") do
    local rule = {}
    for symbol in string.gmatch(line, "%S+") do
      table.insert(rule, symbol)
    end
    assert(#rule > 0)
    r_id[rule] = id; id = id+1

    local nonterminal_symbol = rule[1]
    if not grammar[nonterminal_symbol] then
      grammar[nonterminal_symbol] = {}
    end
    table.insert(grammar[nonterminal_symbol], rule)
  end
end

オートマトン

uniqueなconsで状態オブジェクトを作る

ここでuniqueなconsといっているのは
(x.car == y.car and x.cdr == y.cdr) ならば (x == y) であるようなもの。

caars は for l, caar, cdar in caars(list) do ... end のように使う。

local cons, caars
do
  local mt = {__mode = "kv"} -- weak references to the keys and the values
  local memo = setmetatable({}, mt)
  local empty = {}
  cons = function(x, y)
    local car, cdr = x, y
    if car == nil then car = empty end
    if cdr == nil then cdr = empty end

    local t = memo[car]
    if not t then
      t = setmetatable({}, mt)
      memo[car] = t
    end

    if not t[cdr] then
      t[cdr] = {car = x, cdr = y, ref = t}
    end
    return t[cdr]
  end

  caars = function(xs)
    return function()
      if xs == nil then return nil end
      local temp = xs
      local x = xs.car; xs = xs.cdr
      return temp, x.car, x.cdr
    end
  end
end

状態をprintする関数を先に書いておく。

状態はLR(0)項のリストで、LR(0)項は rule と整数のペアである。
たとえば A → B C ・ D というLR(0)項を cons(3, {A, B, C, D}) と表す。

local print_state = function(state)
  for l, pos, rule in caars(state) do
    io.write(string.format("(%d) %s", r_id[rule], rule[1]))
    for i = 2, pos do io.write("   " .. rule[i]) end
    io.write(" _")
    if pos < #rule then
      io.write(" " .. rule[pos+1])
      for i = pos+2, #rule do io.write("   " .. rule[i]) end
    end
    io.write("\n")
  end
end

LR(0)項は、とりあえずテーブルに入れておいてから make_state でconsのリストに変換する。

local make_state = function(s)
  assert(type(s) == "table")
  local t = {}
  for k,v in pairs(s) do
    assert(v == true)
    table.insert(t, k)
  end
  local comp = function(x, y) -- x < y
    if r_id[x.cdr] == r_id[y.cdr] then
      return x.car < y.car
    else
      return r_id[x.cdr] < r_id[y.cdr]
    end
  end
  table.sort(t, comp)

  local xs = nil
  for i = #t, 1, -1 do xs = cons(t[i], xs) end
  return xs
end

状態はLR(0)項の集合の ε-closure である。 集合 P の ε-closure は次のような集合である。
{ q | あるPの要素pが存在してpからε遷移だけでqに到達可能 }

ここでいうε遷移とは たとえば A → B C ・ D から D → ・ E F への遷移など。

次のコードは ε-closure を計算して 最後に make_state する。

local epsilon = function(s)
  assert(type(s) == "table")
  local todo = {}
  for k,v in pairs(s) do
    assert(v == true)
    table.insert(todo, k)
  end
  while #todo > 0 do
    local k = table.remove(todo)
    local pos, rule = k.car, k.cdr
    local rules = rule[pos+1] and grammar[rule[pos+1]]
    for i = 1, rules and #rules or 0 do
      local k = cons(1, rules[i])
      if not s[k] then
        s[k] = true
        table.insert(todo, k)
      end
    end
  end
  return make_state(s)
end

初期状態を求める

local start_state
do
  assert(grammar[start_symbol])
  local s = {}
  for i,v in ipairs(grammar[start_symbol]) do
    s[cons(1, v)] = true
  end
  start_state = epsilon(s)
end
--print_state(start_state)

次の状態を求める

試しに print(delta_f(start_state, "stat*") == delta_f(start_state, "stat*")) などとしてみるとよい。

local delta_f = function(state, symbol)
  local s = {}
  for l, pos, rule in caars(state) do
    if rule[pos+1] == symbol then s[cons(pos+1, rule)] = true end
  end
  return epsilon(s)
end

全ての状態を求める

delta_f を何度も呼び出すのはさすがに効率が悪いのでテーブルを作る。
まず、テーブルを返す関数 node を用意して、 全ての state に対して delta[state] = node(state) とする。

local node = function(state)
  local t = {}
  for l, pos, rule in caars(state) do
    local symbol = rule[pos+1]
    if symbol and not t[symbol] then
      t[symbol] = delta_f(state, symbol)
    end
  end
  return t
end

-- delta[state][symbol] = next_state
-- s_id[state] = number
local delta = {[start_state] = node(start_state)}
local s_id = {[start_state] = 1}
do
  local id = 2
  local todo = {start_state}
  while #todo > 0 do
    local state = table.remove(todo)
    for symbol, next_state in pairs(delta[state]) do
      if not delta[next_state] then
        s_id[next_state] = id; id = id+1
        delta[next_state] = node(next_state)
        table.insert(todo, next_state)
      end
    end
  end
end

テスト

yaccのy.output的なものを出力する。

local print_node0 = function(state)
  io.write(string.format("[state %d]\n", s_id[state]))
  print_state(state)
  io.write("{{{\n")
  for k,v in pairs(delta[state]) do
    io.write(string.format("%s => %d\n", k, s_id[v]))
  end
  io.write("}}}\n")
end

local test = function(print_node)
  local t = {}
  for k,v in pairs(s_id) do t[v] = k end
  print_node(t[1])
  for i = 2, #t do
    io.write("\n")
    print_node(t[i])
  end
end
--test(print_node0)

context, lookahead, reduce_tab

先読みが必要な文法を扱えるようにする。

contextは、reduceでどの状態に戻る (可能性がある) かを調べるための表。

-- context[state][rule] = {x, ...}
local context = {}
do
  local insert = function(state, rule, x)
    if not context[state] then
      context[state] = {[rule] = {x}}
    elseif not context[state][rule] then
      context[state][rule] = {x}
    else
      table.insert(context[state][rule], x)
    end
  end

  for state, id in pairs(s_id) do
    for l, pos, rule in caars(state) do
      if pos == 1 then
        local x = state
        for i = 2, #rule do
          x = delta[x][rule[i]]
          assert(x)
        end
        insert(x, rule, state)
      end
    end
  end
end

lookahead_fは、reduceで戻った後最初にshiftされる可能性のある 終端記号を全て探索してテーブルに入れる。

local function lookahead_f(state, rule, t, visited)
  assert(visited[cons(state, rule)])
  assert(context[state] and context[state][rule])
  for i,v in ipairs(context[state][rule]) do
    local state = delta[v][rule[1]]
    assert(state ~= nil or rule[1] == start_symbol)
    for l, pos, rule in caars(state) do
      if pos < #rule and not grammar[rule[pos+1]] then -- terminal symbol
        t[rule[pos+1]] = true
      elseif pos == #rule then
        local x = cons(state, rule)
        if not visited[x] then
          visited[x] = true
          lookahead_f(state, rule, t, visited)
        end
      end
    end
  end
  return t
end

lookahead_fを何度も呼ばなくて済むようにテーブルを作る。

-- lookahead[state][rule][symbol] = true
local lookahead = {}
for state, t in pairs(context) do
  for rule, t in pairs(t) do
    local t = lookahead_f(state, rule, {}, {[cons(state, rule)] = true})
    if not lookahead[state] then
      lookahead[state] = {[rule] = t}
    else
      lookahead[state][rule] = t
    end
  end
end

ここでまたテストしてみる。

local print_node1 = function(state)
  print_node0(state)
  if not lookahead[state] then return end
  for rule, t in pairs(lookahead[state]) do
    io.write(string.format("(%d)", r_id[rule]))
    io.write("\nlookahead:")
    for k,v in pairs(t) do io.write(" " .. k) end
    io.write("\ncontext:")
    for i,v in ipairs(context[state][rule]) do io.write(" " .. s_id[v]) end
    io.write("\n")
  end
end
--test(print_node1)

最後に reduce_tab を作る。ここで conflict が見つかれば警告を出す。

-- reduce_tab[state][symbol] = rule
local reduce_tab = {}
for state, val in pairs(lookahead) do
  for rule, val in pairs(val) do
    for symbol, val in pairs(val) do
      assert(val == true)
      if delta[state][symbol] then
        print("shift/reduce conflict: state " .. s_id[state])
        print("symbol: " .. symbol)
      end
      if not reduce_tab[state] then
        reduce_tab[state] = {[symbol] = rule}
      elseif reduce_tab[state][symbol] then
        print("reduce/reduce conflict: state " .. s_id[state])
        print("symbol: " .. symbol)
        reduce_tab[state][symbol] = cons(rule, reduce_tab[state][symbol])
      else
        reduce_tab[state][symbol] = rule
      end
    end
  end
end

parserとlexer

以下のコードは、とりあえず構文木を作るだけのparserと、 かなりてきとうに書いたLua言語のlexerである。

parser

たとえば A → B C D を reduce するとき {{A, B, C, D}, value1, value2, value3} のようなノードを作る。
それ以外の特別な処理を指定するような機能はまだない。

local parse = function(lex)
  local state = start_state
  local s_stack, v_stack = {}, {}
  for symbol, value in lex do
    --print("lex: " .. symbol, value)
    while not delta[state][symbol] do
      local rule = reduce_tab[state] and reduce_tab[state][symbol]
      if not rule then -- error
        print("syntax error")
        return nil
      elseif not r_id[rule] then -- reduce/reduce conflict
        print("reduce/reduce conflict")
        return nil
      end
      -- reduce
      local t = {}
      for i = #rule, 2, -1 do
        t[i] = table.remove(v_stack)
        state = table.remove(s_stack)
      end
      t[1] = rule
      --print("reduce: " .. rule[1])
      table.insert(s_stack, state)
      table.insert(v_stack, t)
      state = delta[state][rule[1]]
      assert(state ~= nil)
      --print(s_id[state])
    end
    -- shift
    if symbol == eof_symbol then return v_stack[#v_stack] end
    --print("shift: " .. symbol)
    table.insert(s_stack, state)
    table.insert(v_stack, value)
    state = delta[state][symbol]
    assert(state ~= nil)
    --print(s_id[state])
  end
end

lexer

lex = function(readline)
  local line
  local function loop()
    if not readline then return end
    if not line or #line == 0 then
      line = readline()
      if not line then
        readline = nil
        return "$", nil
      end
    end
    do -- 空白文字を読み飛ばす
      local i = string.find(line, "%S")
      if not i then line = nil; return loop() end
      line = string.sub(line, i)
    end
    -- コメント
    if string.match(line, "^%-%-") then
      line = string.sub(line, 3)
      local m = string.match(line, "^%[(=*)%[")
      if not m then line = nil; return loop() end
      m = "]" .. m .. "]"
      while true do
        local i, j = string.find(line, m, 1, true) -- plain
        if j then line = string.sub(line, j+1); return loop() end
        line = readline()
        if not line then print("unexpected EOF"); return end
      end
    end
    do -- 文字列 エスケープなし
      local m = string.match(line, "^%[(=*)%[")
      if m then
        m = "]" .. m .. "]"
        local value = ""
        if #line == #m then -- 最初の改行は無視する
          line = readline()
          if not line then print("unexpected EOF"); return end
        else
          line = string.sub(line, #m+1)
        end
        while true do
          local i, j = string.find(line, m, 1, true) -- plain
          if i then
            value = value .. string.sub(line, 1, i-1)
            line = string.sub(line, j+1)
            return "String", value
          end
          value = value .. line .. "\n"
          line = readline()
          if not line then print("unexpected EOF"); return end
        end -- while
      end -- if
    end -- do
    do -- 文字列 エスケープあり
      local m = string.sub(line, 1, 1)
      if m == [["]] or m == [[']] then
        line = string.sub(line, 2)
        local escape = {
          a = "\a", b = "\b", f = "\f", n = "\n", r = "\r", t = "\t", v = "\v",
          ["\\"] = "\\", ["\""] = "\"", ["'"] = "'"
        }
        local value = ""
        local pattern = string.format("^[^%s\\]+", m)
        local function loop()
          if not line or line == "" then print(m .. [=[ が無い]=]); return end
          local i, j = string.find(line, pattern)
          if i then
            value = value .. string.sub(line, i, j)
            line = string.sub(line, j+1)
            return loop()
          elseif string.sub(line, 1, 1) == m then
            line = string.sub(line, 2)
            return "String", value
          elseif line == [[\]] then
            value = value .. "\n"
            line = readline()
            return loop()
          end
          local m = string.match(line, "^\\([0-9][0-9]?[0-9]?)")
          if m then
            value = value .. string.char(tonumber(m))
            line = string.sub(line, #m+1)
            return loop()
          end
          m = string.match(line, "^\\([abfnrtv\\\"'])")
          if not m then print("invalid escape sequence"); return end
          value = value .. escape[m]
          line = string.sub(line, 3)
          return loop()
        end
        return loop()
      end -- if
    end -- do
    do -- 識別子
      local cdata = [[
      and break do else elseif end false for function if in local nil not or
      repeat return then true until while]]
      local keyword = {}
      for m in string.gmatch(cdata, "[a-z]+") do
        keyword[m] = true
      end
      local m = string.match(line, "^[_a-zA-Z][_a-zA-Z0-9]*")
      if m then
        line = string.sub(line, #m+1)
        if keyword[m] then
          return m, nil
        else
          return "Name", m
        end
      end
    end
    do -- 16進
      local m = string.match(line, "^0[xX][0-9a-fA-F]*")
      if m then
        line = string.sub(line, #m+1)
        local value = tonumber(m)
        if not value then print("0x"); return end
        return "Number", tonumber(m)
      end
    end
    do -- 10進
      local m = string.match(line, "^[0-9]*%.?[0-9]*[eE]?[+-]?[0-9]*")
      if string.match(m, "^[+-]") then m = nil end
      local value = m and tonumber(m)
      if value then
        line = string.sub(line, #m+1)
        return "Number", value
      end
    end
    do -- その他
      local m = string.match(line, "^%.%.?%.?")
      m = m or string.match(line, "^[=~<>]=")
      m = m or string.match(line, "^[][(){};:,<>=#^%%*/+-]")
      if m then
        line = string.sub(line, #m+1)
        return m, nil
      end
    end
    print("unknown error")
  end
  return loop
end
inserted by FC2 system