今天有点空闲,想用Ruby写个NFA试试。从正则表达式构造NFA采用经典的Thompson算法:正则表达式 -> 后缀表达式 -> 构造NFA。构造出NFA之后,再用它来匹配字符串。一句话,就是写了个玩具级的正则表达式引擎,支持concatenation、alternation以及*、?、+量词,不支持反向引用和转义符。测试了一下与Ruby自带的正则表达式引擎的性能对比,慢了3倍。
构造NFA没什么问题,主要是匹配运行部分写得烂,有空再改改。
nfa.rb
module NFA
# Runs an NFA over the bytes of a string.
#
# The automaton is a graph of state objects (responding to +out1+, +out2+ and
# +is_matched?+) joined by transition objects (responding to +c+ and +end+).
# A transition whose +c+ is -1 consumes no input; any other +c+ is the byte the
# transition consumes. A transition whose +end+ is nil leads nowhere and is
# ignored.
class NFA
  # Label of a transition that consumes no input.
  EPSILON = -1

  # @param state [State] start state of the automaton
  def initialize(state)
    @state = state
  end

  # Advances the simulation by one input byte.
  #
  # Epsilon transitions in +clist+ are followed first (each state at most
  # once, so epsilon cycles terminate); only then is +c+ compared against the
  # byte-consuming transitions. Successors of epsilon transitions are therefore
  # never mistaken for states reached by consuming +c+.
  #
  # @param clist [Array] transitions leaving the currently active states
  #   (nil entries are ignored)
  # @param c [Integer] the input byte to consume
  # @return [Array] transitions leaving the states reached by consuming +c+;
  #   empty when no active state can consume it
  def step(clist, c)
    reached = {}
    nlist = []
    closure(clist).first.each do |t|
      next unless t.c == c

      target = t.end
      next if reached.key?(target.object_id) # enter each state once per step

      reached[target.object_id] = true
      if target.is_matched?
        # A match state has no outgoing transitions, so remember that it was
        # reached as an epsilon transition into it, which is_match? detects.
        marker = t.dup
        marker.c = EPSILON
        nlist << marker
      else
        nlist.concat(live_outs(target))
      end
    end
    nlist
  end

  # Tests +s+ against the automaton this object was created with.
  #
  # @param s [String] input; it is processed byte by byte
  # @return [Boolean] true when all of +s+ is consumed and a match state is
  #   reachable afterwards
  def test?(s)
    match(@state, s)
  end

  # Runs the automaton starting at +state+ over every byte of +s+.
  #
  # @param state [State] state to start from
  # @param s [String] input; it is processed byte by byte
  # @return [Boolean] see #test?
  def match(state, s)
    clist = [state.out1, state.out2]
    s.each_byte do |c|
      clist = step(clist, c)
      return false if clist.empty? # no active state left, cannot recover
    end
    is_match?(clist)
  end

  # @param clist [Array] transitions leaving the currently active states
  # @return [Boolean] true when a match state can be reached from +clist+
  #   through epsilon transitions alone
  def is_match?(clist)
    closure(clist).last
  end

  private

  # Follows epsilon transitions outward from +clist+.
  #
  # @return [Array] a pair: the byte-consuming transitions that can be taken
  #   next, and whether a match state is reachable without consuming input
  def closure(clist)
    consuming = []
    accepting = false
    visited = {}
    pending = clist.compact
    until pending.empty?
      t = pending.pop
      target = t.end
      next if target.nil?

      if t.c != EPSILON
        consuming << t
        next
      end
      next if visited.key?(target.object_id) # guards against epsilon cycles

      visited[target.object_id] = true
      if target.is_matched?
        accepting = true
      else
        pending.concat(live_outs(target))
      end
    end
    [consuming, accepting]
  end

  # Outgoing transitions of +state+ that exist and lead somewhere.
  def live_outs(state)
    [state.out1, state.out2].select { |t| t && t.end }
  end
end
# Plain record holding two counters, both nil until assigned.
class Paren
  # Saved alternation counter.
  attr_accessor :n_alt
  # Saved atom counter.
  attr_accessor :n_atom
end
# A graph node with two outgoing slots and a numeric type tag.
class State
  attr_accessor :out1, :out2, :type

  # @param out1 first outgoing slot
  # @param out2 second outgoing slot
  # The type tag starts at 1.
  def initialize(out1, out2)
    @type = 1
    @out1, @out2 = out1, out2
  end

  # @return [Boolean] true when the type tag equals 0
  def is_matched?
    @type == 0
  end
end
# A labelled edge: +c+ is its label and +end+ its destination.
class Transition
  # Label given at construction time.
  attr_accessor :c
  # Destination; nil until a caller assigns it.
  attr_accessor :end

  # @param c label stored on the transition
  def initialize(c)
    @c = c
  end
end
# Pairs a start value with a collection of outs.
class Frame
  attr_accessor :start
  attr_accessor :outs

  # @param start value exposed as +start+
  # @param outs value exposed as +outs+
  def initialize(start, outs)
    @start, @outs = start, outs
  end
end
# Helpers for working with arrays of transitions.
class ListUitls
  class << self
    # Points every element of +list+ at +state+ by assigning its +end+.
    #
    # @return [Array] +list+ itself
    def link(list, state)
      list.each do |transition|
        transition.end = state
      end
    end

    # @return [Array] a new array holding the elements of +list1+ followed by
    #   those of +list2+
    def append(list1, list2)
      list1 + list2
    end

    # @return [Array] a new one-element array containing +out+
    def new_list(out)
      [out]
    end
  end
end
# Builds a matcher for the regular expression +re+.
#
# The expression is first converted with re2post, the result is handed to
# post2nfa, and the start state that comes back is wrapped in an NFA.
#
# @param re [String] the regular expression source
# @return [NFA]
# @raise [ArgumentError] when re2post returns nil
# @raise [RuntimeError] when post2nfa returns nil
def self.compile(re)
  postfix = re2post(re)
  if postfix.nil?
    raise ArgumentError, "bad regexp!"
  end

  start = post2nfa(postfix)
  if start.nil?
    raise RuntimeError, "construct nfa from postfix fail!"
  end

  NFA.new(start)
end
# Builds an NFA graph from a postfix regular expression.
#
# In +postfix+ the bytes '.', '|', '?', '*' and '+' are operators ('.' is
# explicit concatenation); every other byte is a literal. Fragments are kept on
# a stack as Frame objects: +start+ is the fragment's entry state and +outs+
# the transitions that still have to be pointed at whatever follows.
#
# @param postfix [String, nil] postfix expression
# @return [State, nil] the start state, or nil when +postfix+ is nil, empty,
#   or malformed (an operator lacks operands, or more than one fragment
#   remains at the end)
def self.post2nfa(postfix)
  return nil if postfix.nil?

  stack = []
  postfix.each_byte do |byte|
    case byte.chr
    when '.'
      # Concatenation: run e1, then e2.
      e2 = stack.pop
      e1 = stack.pop
      return nil if e1.nil? || e2.nil?

      ListUitls.link(e1.outs, e2.start)
      stack.push(Frame.new(e1.start, e2.outs))
    when '|'
      # Alternation: a new state branches into e1 and e2 without consuming input.
      e2 = stack.pop
      e1 = stack.pop
      return nil if e1.nil? || e2.nil?

      t1 = Transition.new(-1)
      t2 = Transition.new(-1)
      t1.end = e1.start
      t2.end = e2.start
      stack.push(Frame.new(State.new(t1, t2), ListUitls.append(e1.outs, e2.outs)))
    when '?'
      # Zero or one: branch into e, or skip it through the open transition t2.
      e = stack.pop
      return nil if e.nil?

      t1 = Transition.new(-1)
      t2 = Transition.new(-1)
      t1.end = e.start
      split = State.new(t1, t2)
      stack.push(Frame.new(split, ListUitls.append(e.outs, ListUitls.new_list(t2))))
    when '*'
      # Zero or more: e loops back to the branching state, which is the entry.
      e = stack.pop
      return nil if e.nil?

      t1 = Transition.new(-1)
      t2 = Transition.new(-1)
      t1.end = e.start
      split = State.new(t1, t2)
      ListUitls.link(e.outs, split)
      stack.push(Frame.new(split, ListUitls.new_list(split.out2)))
    when '+'
      # One or more: same loop as '*', but the entry is e itself.
      e = stack.pop
      return nil if e.nil?

      t1 = Transition.new(-1)
      t2 = Transition.new(-1)
      t1.end = e.start
      split = State.new(t1, t2)
      ListUitls.link(e.outs, split)
      stack.push(Frame.new(e.start, ListUitls.new_list(t2)))
    else
      # Literal byte: one state whose first transition consumes it.
      literal = State.new(Transition.new(byte), Transition.new(-1))
      stack.push(Frame.new(literal, ListUitls.new_list(literal.out1)))
    end
  end

  e = stack.pop
  # Exactly one fragment must remain for the expression to be well formed.
  return nil if e.nil? || !stack.empty?

  end_state = State.new(nil, nil)
  end_state.type = 0
  e.outs.each do |tran|
    if tran.c != -1
      # Byte-consuming transitions reach the end state through an extra state,
      # so the end state is only ever entered by transitions labelled -1.
      t1 = Transition.new(-1)
      t2 = Transition.new(-1)
      bridge = State.new(t1, t2)
      tran.end = bridge
      bridge.out1.end = end_state
      bridge.out2.end = end_state
    else
      tran.end = end_state
    end
  end
  e.start
end
# Converts an infix regular expression to postfix, inserting '.' as an explicit
# concatenation operator.
#
# Recognised syntax: literal bytes, grouping with '(' ')', alternation '|' and
# the postfix operators '*', '+' and '?'. The input is processed byte by byte
# and the result is built as a binary string, so bytes above 127 are copied
# through unchanged.
#
# @param re [String] regular expression source
# @return [String, nil] the postfix form, or nil when +re+ is malformed:
#   unbalanced parentheses, an operator with nothing to apply to, an empty
#   group, or a literal '.' (which cannot be told apart from the concatenation
#   operator in the output)
def self.re2post(re)
  n_alt = 0  # alternations still open at the current nesting level
  n_atom = 0 # fragments not yet joined by '.' at the current nesting level
  # String.new yields a binary buffer, so "<< Integer" appends a raw byte
  # rather than a re-encoded code point.
  result = String.new
  paren = []
  re.each_byte do |c|
    case c.chr
    when '('
      if n_atom > 1
        n_atom -= 1
        result << "."
      end
      # Save the outer counters and start counting afresh inside the group.
      saved = Paren.new
      saved.n_alt = n_alt
      saved.n_atom = n_atom
      paren.push(saved)
      n_alt = n_atom = 0
    when '|'
      return nil if n_atom == 0

      result << "." while (n_atom -= 1) > 0
      n_alt += 1
    when ')'
      return nil if paren.empty?
      return nil if n_atom == 0

      result << "." while (n_atom -= 1) > 0
      while n_alt > 0
        result << "|"
        n_alt -= 1
      end
      # Back in the enclosing level, the whole group counts as one fragment.
      saved = paren.pop
      n_alt = saved.n_alt
      n_atom = saved.n_atom + 1
    when '*', '+', '?'
      return nil if n_atom == 0

      result << c
    when '.'
      # '.' is the concatenation operator of the postfix form; emitting it as a
      # literal would produce an expression with a missing operand.
      return nil
    else
      if n_atom > 1
        n_atom -= 1
        result << "."
      end
      result << c
      n_atom += 1
    end
  end
  return nil unless paren.empty?

  result << "." while (n_atom -= 1) > 0
  while n_alt > 0
    n_alt -= 1
    result << "|"
  end
  result
end
end
使用:
# Usage example: compile a pattern once, then test strings against it.
nfa = NFA.compile("a(bb)+a(cdf)*")

# Each pair is [input, whether test? is expected to accept it]; the checks run
# in the order listed.
[
  ["abba", true],
  ["abbbba", true],
  ["a", false],
  ["aa", false],
  ["abbacdf", true],
  ["abbbbacdfcdf", true],
  ["bbbbacdfcdf", false],
  ["abbbacdfcdf", false],
  ["abbbbacdfdf", false],
  ["abbbbacdfdfg", false]
].each do |input, expected|
  assert(expected ? nfa.test?(input) : !nfa.test?(input))
end