LALR(1)だと思いますが理論的なことはあまり厳密に考えていません。
LR(0)とかLALR(1)とかいうのは、CFG (文脈自由文法) のサブセットです。
CFGなら何でもパースできるのではなく、CFGの一部の範囲だけをカバーします。
src.lua の構文木がほしいとき
tree = parse(lex(io.lines("src.lua")))
とする。
ただし、簡単のため文法を改変しているのでうまくいかないかもしれない。
Lua 5.1 Reference Manual にあるものを改変した。
local start_symbol = "s" local eof_symbol = "$" local cdata = [=[ s block $ block stat* block stat* laststat block stat* laststat ; stat* stat* stat* stat stat* stat* stat ; laststat return laststat return explist laststat break stat varlist = explist stat functioncall stat do block end stat while exp do block end stat repeat block until exp stat if exp then block elseif* end stat if exp then block elseif* else block end stat for Name = exp , exp do block end stat for Name = exp , exp , exp do block end stat for namelist in explist do block end stat function funcname funcbody stat local function Name funcbody stat local namelist stat local namelist = explist elseif* elseif* elseif* elseif exp then block funcname Name .Name* funcname Name .Name* : Name .Name* .Name* .Name* . Name varlist var varlist varlist , var var Name var prefixexp [ exp ] var prefixexp . Name namelist Name namelist namelist , Name explist exp,* exp exp,* exp,* exp,* exp , exp andexp exp exp or andexp andexp relexp andexp andexp and relexp relexp catexp relexp relexp relop catexp relop < relop > relop <= relop >= relop ~= relop == catexp addexp catexp catexp .. addexp addexp mulexp addexp addexp addop mulexp addop + addop - mulexp unexp mulexp mulexp mulop unexp mulop * mulop / mulop % unexp expexp unexp unop unexp unop not unop # unop - expexp _exp expexp expexp ^ _exp _exp nil _exp false _exp true _exp Number _exp String _exp ... _exp function_literal _exp prefixexp _exp tableconstructor _exp ( exp ) prefixexp var prefixexp functioncall functioncall prefixexp args functioncall prefixexp : Name args args ( ) args ( explist ) args tableconstructor args String function_literal function funcbody funcbody ( ) block end funcbody ( parlist ) block end parlist namelist parlist ... parlist namelist , ... tableconstructor { } tableconstructor { fieldlist } fieldlist field,* field fieldlist field,* field fieldsep field,* field,* field,* field fieldsep fieldsep , fieldsep ; field [ exp ] = exp field Name = exp field exp ]=] -- grammar[nonterminal_symbol] = {rule, ...} -- r_id[rule] = number local grammar, r_id = {}, {} do local id = 1 for line in string.gmatch(cdata, "[^\r\n]+") do local rule = {} for symbol in string.gmatch(line, "%S+") do table.insert(rule, symbol) end assert(#rule > 0) r_id[rule] = id; id = id+1 local nonterminal_symbol = rule[1] if not grammar[nonterminal_symbol] then grammar[nonterminal_symbol] = {} end table.insert(grammar[nonterminal_symbol], rule) end end
ここでuniqueなconsといっているのは
(x.car == y.car and x.cdr == y.cdr) ならば (x == y) であるようなもの。
caars は
for l, caar, cdar in caars(list) do ... end
のように使う。
local cons, caars do local mt = {__mode = "kv"} -- weak references to the keys and the values local memo = setmetatable({}, mt) local empty = {} cons = function(x, y) local car, cdr = x, y if car == nil then car = empty end if cdr == nil then cdr = empty end local t = memo[car] if not t then t = setmetatable({}, mt) memo[car] = t end if not t[cdr] then t[cdr] = {car = x, cdr = y, ref = t} end return t[cdr] end caars = function(xs) return function() if xs == nil then return nil end local temp = xs local x = xs.car; xs = xs.cdr return temp, x.car, x.cdr end end end
状態をprintする関数を先に書いておく。
状態はLR(0)項のリストで、LR(0)項は rule と整数のペアである。
たとえば A → B C ・ D というLR(0)項を cons(3, {A, B, C, D}) と表す。
local print_state = function(state) for l, pos, rule in caars(state) do io.write(string.format("(%d) %s", r_id[rule], rule[1])) for i = 2, pos do io.write(" " .. rule[i]) end io.write(" _") if pos < #rule then io.write(" " .. rule[pos+1]) for i = pos+2, #rule do io.write(" " .. rule[i]) end end io.write("\n") end end
LR(0)項は、とりあえずテーブルに入れておいてから make_state でconsのリストに変換する。
local make_state = function(s) assert(type(s) == "table") local t = {} for k,v in pairs(s) do assert(v == true) table.insert(t, k) end local comp = function(x, y) -- x < y if r_id[x.cdr] == r_id[y.cdr] then return x.car < y.car else return r_id[x.cdr] < r_id[y.cdr] end end table.sort(t, comp) local xs = nil for i = #t, 1, -1 do xs = cons(t[i], xs) end return xs end
状態はLR(0)項の集合の ε-closure である。
集合 P の ε-closure は次のような集合である。
{ q | あるPの要素pが存在してpからε遷移だけでqに到達可能 }
ここでいうε遷移とは たとえば A → B C ・ D から D → ・ E F への遷移など。
次のコードは ε-closure を計算して 最後に make_state する。
local epsilon = function(s) assert(type(s) == "table") local todo = {} for k,v in pairs(s) do assert(v == true) table.insert(todo, k) end while #todo > 0 do local k = table.remove(todo) local pos, rule = k.car, k.cdr local rules = rule[pos+1] and grammar[rule[pos+1]] for i = 1, rules and #rules or 0 do local k = cons(1, rules[i]) if not s[k] then s[k] = true table.insert(todo, k) end end end return make_state(s) end
local start_state do assert(grammar[start_symbol]) local s = {} for i,v in ipairs(grammar[start_symbol]) do s[cons(1, v)] = true end start_state = epsilon(s) end --print_state(start_state)
試しに
print(delta_f(start_state, "stat*") == delta_f(start_state, "stat*"))
などとしてみるとよい。
local delta_f = function(state, symbol) local s = {} for l, pos, rule in caars(state) do if rule[pos+1] == symbol then s[cons(pos+1, rule)] = true end end return epsilon(s) end
delta_f を何度も呼び出すのはさすがに効率が悪いのでテーブルを作る。
まず、テーブルを返す関数 node を用意して、
全ての state に対して delta[state] = node(state) とする。
local node = function(state) local t = {} for l, pos, rule in caars(state) do local symbol = rule[pos+1] if symbol and not t[symbol] then t[symbol] = delta_f(state, symbol) end end return t end -- delta[state][symbol] = next_state -- s_id[state] = number local delta = {[start_state] = node(start_state)} local s_id = {[start_state] = 1} do local id = 2 local todo = {start_state} while #todo > 0 do local state = table.remove(todo) for symbol, next_state in pairs(delta[state]) do if not delta[next_state] then s_id[next_state] = id; id = id+1 delta[next_state] = node(next_state) table.insert(todo, next_state) end end end end
yaccのy.output的なものを出力する。
local print_node0 = function(state) io.write(string.format("[state %d]\n", s_id[state])) print_state(state) io.write("{{{\n") for k,v in pairs(delta[state]) do io.write(string.format("%s => %d\n", k, s_id[v])) end io.write("}}}\n") end local test = function(print_node) local t = {} for k,v in pairs(s_id) do t[v] = k end print_node(t[1]) for i = 2, #t do io.write("\n") print_node(t[i]) end end --test(print_node0)
先読みが必要な文法を扱えるようにする。
contextは、reduceでどの状態に戻る (可能性がある) かを調べるための表。
-- context[state][rule] = {x, ...} local context = {} do local insert = function(state, rule, x) if not context[state] then context[state] = {[rule] = {x}} elseif not context[state][rule] then context[state][rule] = {x} else table.insert(context[state][rule], x) end end for state, id in pairs(s_id) do for l, pos, rule in caars(state) do if pos == 1 then local x = state for i = 2, #rule do x = delta[x][rule[i]] assert(x) end insert(x, rule, state) end end end end
lookahead_fは、reduceで戻った後最初にshiftされる可能性のある 終端記号を全て探索してテーブルに入れる。
local function lookahead_f(state, rule, t, visited) assert(visited[cons(state, rule)]) assert(context[state] and context[state][rule]) for i,v in ipairs(context[state][rule]) do local state = delta[v][rule[1]] assert(state ~= nil or rule[1] == start_symbol) for l, pos, rule in caars(state) do if pos < #rule and not grammar[rule[pos+1]] then -- terminal symbol t[rule[pos+1]] = true elseif pos == #rule then local x = cons(state, rule) if not visited[x] then visited[x] = true lookahead_f(state, rule, t, visited) end end end end return t end
lookahead_fを何度も呼ばなくて済むようにテーブルを作る。
-- lookahead[state][rule][symbol] = true local lookahead = {} for state, t in pairs(context) do for rule, t in pairs(t) do local t = lookahead_f(state, rule, {}, {[cons(state, rule)] = true}) if not lookahead[state] then lookahead[state] = {[rule] = t} else lookahead[state][rule] = t end end end
ここでまたテストしてみる。
local print_node1 = function(state) print_node0(state) if not lookahead[state] then return end for rule, t in pairs(lookahead[state]) do io.write(string.format("(%d)", r_id[rule])) io.write("\nlookahead:") for k,v in pairs(t) do io.write(" " .. k) end io.write("\ncontext:") for i,v in ipairs(context[state][rule]) do io.write(" " .. s_id[v]) end io.write("\n") end end --test(print_node1)
最後に reduce_tab を作る。ここで conflict が見つかれば警告を出す。
-- reduce_tab[state][symbol] = rule local reduce_tab = {} for state, val in pairs(lookahead) do for rule, val in pairs(val) do for symbol, val in pairs(val) do assert(val == true) if delta[state][symbol] then print("shift/reduce conflict: state " .. s_id[state]) print("symbol: " .. symbol) end if not reduce_tab[state] then reduce_tab[state] = {[symbol] = rule} elseif reduce_tab[state][symbol] then print("reduce/reduce conflict: state " .. s_id[state]) print("symbol: " .. symbol) reduce_tab[state][symbol] = cons(rule, reduce_tab[state][symbol]) else reduce_tab[state][symbol] = rule end end end end
以下のコードは、とりあえず構文木を作るだけのparserと、 かなりてきとうに書いたLua言語のlexerである。
たとえば A → B C D を reduce するとき
{{A, B, C, D}, value1, value2, value3}
のようなノードを作る。
それ以外の特別な処理を指定するような機能はまだない。
local parse = function(lex) local state = start_state local s_stack, v_stack = {}, {} for symbol, value in lex do --print("lex: " .. symbol, value) while not delta[state][symbol] do local rule = reduce_tab[state] and reduce_tab[state][symbol] if not rule then -- error print("syntax error") return nil elseif not r_id[rule] then -- reduce/reduce conflict print("reduce/reduce conflict") return nil end -- reduce local t = {} for i = #rule, 2, -1 do t[i] = table.remove(v_stack) state = table.remove(s_stack) end t[1] = rule --print("reduce: " .. rule[1]) table.insert(s_stack, state) table.insert(v_stack, t) state = delta[state][rule[1]] assert(state ~= nil) --print(s_id[state]) end -- shift if symbol == eof_symbol then return v_stack[#v_stack] end --print("shift: " .. symbol) table.insert(s_stack, state) table.insert(v_stack, value) state = delta[state][symbol] assert(state ~= nil) --print(s_id[state]) end end
lex = function(readline) local line local function loop() if not readline then return end if not line or #line == 0 then line = readline() if not line then readline = nil return "$", nil end end do -- 空白文字を読み飛ばす local i = string.find(line, "%S") if not i then line = nil; return loop() end line = string.sub(line, i) end -- コメント if string.match(line, "^%-%-") then line = string.sub(line, 3) local m = string.match(line, "^%[(=*)%[") if not m then line = nil; return loop() end m = "]" .. m .. "]" while true do local i, j = string.find(line, m, 1, true) -- plain if j then line = string.sub(line, j+1); return loop() end line = readline() if not line then print("unexpected EOF"); return end end end do -- 文字列 エスケープなし local m = string.match(line, "^%[(=*)%[") if m then m = "]" .. m .. "]" local value = "" if #line == #m then -- 最初の改行は無視する line = readline() if not line then print("unexpected EOF"); return end else line = string.sub(line, #m+1) end while true do local i, j = string.find(line, m, 1, true) -- plain if i then value = value .. string.sub(line, 1, i-1) line = string.sub(line, j+1) return "String", value end value = value .. line .. "\n" line = readline() if not line then print("unexpected EOF"); return end end -- while end -- if end -- do do -- 文字列 エスケープあり local m = string.sub(line, 1, 1) if m == [["]] or m == [[']] then line = string.sub(line, 2) local escape = { a = "\a", b = "\b", f = "\f", n = "\n", r = "\r", t = "\t", v = "\v", ["\\"] = "\\", ["\""] = "\"", ["'"] = "'" } local value = "" local pattern = string.format("^[^%s\\]+", m) local function loop() if not line or line == "" then print(m .. [=[ が無い]=]); return end local i, j = string.find(line, pattern) if i then value = value .. string.sub(line, i, j) line = string.sub(line, j+1) return loop() elseif string.sub(line, 1, 1) == m then line = string.sub(line, 2) return "String", value elseif line == [[\]] then value = value .. "\n" line = readline() return loop() end local m = string.match(line, "^\\([0-9][0-9]?[0-9]?)") if m then value = value .. string.char(tonumber(m)) line = string.sub(line, #m+1) return loop() end m = string.match(line, "^\\([abfnrtv\\\"'])") if not m then print("invalid escape sequence"); return end value = value .. escape[m] line = string.sub(line, 3) return loop() end return loop() end -- if end -- do do -- 識別子 local cdata = [[ and break do else elseif end false for function if in local nil not or repeat return then true until while]] local keyword = {} for m in string.gmatch(cdata, "[a-z]+") do keyword[m] = true end local m = string.match(line, "^[_a-zA-Z][_a-zA-Z0-9]*") if m then line = string.sub(line, #m+1) if keyword[m] then return m, nil else return "Name", m end end end do -- 16進 local m = string.match(line, "^0[xX][0-9a-fA-F]*") if m then line = string.sub(line, #m+1) local value = tonumber(m) if not value then print("0x"); return end return "Number", tonumber(m) end end do -- 10進 local m = string.match(line, "^[0-9]*%.?[0-9]*[eE]?[+-]?[0-9]*") if string.match(m, "^[+-]") then m = nil end local value = m and tonumber(m) if value then line = string.sub(line, #m+1) return "Number", value end end do -- その他 local m = string.match(line, "^%.%.?%.?") m = m or string.match(line, "^[=~<>]=") m = m or string.match(line, "^[][(){};:,<>=#^%%*/+-]") if m then line = string.sub(line, #m+1) return m, nil end end print("unknown error") end return loop end