2011年8月1日 星期一

[編譯器的研究]parser的實作(2)

一般 parse tree 是一個有序(ordered)、有唯一根(rooted)的樹。
因此,之前的 grammar 要再改變一下,加上<log> ::= <msg> {<msg>} 在最前面。

<log> ::= <msg> {<msg>}
<msg> ::= timestamp <secs_type_msg>
          | timestamp <host_type_msg>
<secs_type_msg> ::= <desc> secs_comment <secs_msg>
<host_type_msg> ::= eap_log_1 {xml_transaction}
<desc> ::= <secs_msg_name> colon <secs_SxFy_sym>
          | normal_word {normal_word}
<secs_SxFy_sym> ::= secs_SxFy {<secs_w_bit>}
<secs_msg_name> ::= normal_word
<secs_w_bit> ::= normal_word
<secs_msg> ::= {<secs_msg_body>} secs_end
<secs_msg_body> ::= <secs_msg_item> {secs_comment }
<secs_msg_item> ::= secs_item_a
                    | secs_item_u
                    | secs_item_b
                    | secs_item_bool
                    | secs_item_f
                    | secs_item_i
                    | secs_list_start {secs_comment} {<secs_msg_item>} secs_list_end

接下來是設計 tree node 存放物件。

之前的程式實際上只有判斷 input 是否為可接受。
所以每一個函式傳回 true 的地方都改為傳回 node,傳回 false 的地方則改為傳回 None。

最基本的一個 node,就是只有一個 name 屬性的物件。
把它當做抽象類別。為了方便序列化,我們把其他的屬性都用一個 dict 裝起來,
為了讓屬性按加入順序序列化,把屬性名字加到一個 list 中。

class c_node:
  """Base parse-tree node that keeps its attributes in insertion order.

  Values are stored in the dict ``keyvalue``; ``keyarray`` records each key
  the first time it is added so that ``__str__`` can serialize the
  attributes in that order.
  """

  def __init__(self):
    self.keyvalue = {}  # attribute name -> value
    self.keyarray = []  # attribute names in first-insertion order

  def set_attr(self, k, v):
    """Store v under k, remembering k's position the first time it is seen."""
    # The ``in`` test replaces dict.has_key(), which was removed in Python 3.
    if k not in self.keyvalue:
      self.keyarray.append(k)
    self.keyvalue[k] = v

  # add_attr used to be a line-for-line copy of set_attr; keep it as an alias.
  add_attr = set_attr

  def get_attr(self, k):
    """Return the value stored under k, or None if k was never set."""
    return self.keyvalue.get(k)

  def __str__(self):
    """Serialize as ``[key:value,...]``; list values become ``[a,b,...]``."""
    parts = []
    for key in self.keyarray:
      value = self.keyvalue[key]
      if isinstance(value, list):
        text = "[" + ",".join(str(item) for item in value) + "]"
      else:
        # str(None) is "None", which is what a missing value serializes to.
        text = str(value)
      parts.append(key + ":" + text)
    return "[" + ",".join(parts) + "]"

我們為 log 設計一個 c_log 的類別。
class c_log(c_node):
  """Root node of the tree, for the grammar rule <log> ::= <msg> {<msg>}."""

  def __init__(self):
    c_node.__init__(self)
    # Attributes are registered in the order they should be serialized.
    for key, value in (('name', 'msgs'), ('msg', None)):
      self.add_attr(key, value)

其他的類別也很單純,只要按 grammar 中的順序把 symbol (terminal 及 non-terminal) 當做屬性加入即可。
但是到了 c_secs_msg_item,就不太一樣,因為他有可能是 terminal 也可能是 non-terminal 的 symbol,
所以屬性除了當做 terminal 存放值的 value,也有當做 non-terminal 存放值的 children。

class c_secs_msg_item(c_node):
  """Node for <secs_msg_item>, which is either a single item or a list.

  'value' is the slot for a terminal item; 'list_start', 'children' and
  'list_end' are the slots for the list form.
  """

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'secs_msg_item')
    # Every slot starts empty; the tuple order is the serialization order.
    for slot in ('list_start', 'secs_comment', 'children', 'value', 'list_end'):
      self.add_attr(slot, None)

在 parser 的程式裡,遇到 secs_msg_item 的時候,
如果不是遇到 secs_list_start,就把值放到 value 去。
否則就把值放到 children 裡面,就算該值是空的。(因為允許空的 list)

(secs_msg_item 的程式碼請參考底下)

老實說,我也沒辦法一開始就知道需要 c_node,也沒辦法知道 c_node 就要長這個樣子。
我是邊寫邊改的。因為要 debug 才生出 __str__ 這個方法。
因為要 __str__ 才知道要把屬性放到 list 及 dict 裡面方便列舉。
在寫的過程中,參考了 google code 裡面的 traceur。(它是用 javascript 寫的)
因為難以預料之後會遇到什麼問題,所以花了一點時間研究,才會隔了兩個星期都沒有產出新的文章(有一點找藉口)。
完整的程式碼我分成兩個,一個是 node 們的定義。一個是 parser 的程式。

node_def.py

class c_node:
  """Base parse-tree node that keeps its attributes in insertion order.

  Values are stored in the dict ``keyvalue``; ``keyarray`` records each key
  the first time it is added so that ``__str__`` can serialize the
  attributes in that order.
  """

  def __init__(self):
    self.keyvalue = {}  # attribute name -> value
    self.keyarray = []  # attribute names in first-insertion order

  def set_attr(self, k, v):
    """Store v under k, remembering k's position the first time it is seen."""
    # The ``in`` test replaces dict.has_key(), which was removed in Python 3.
    if k not in self.keyvalue:
      self.keyarray.append(k)
    self.keyvalue[k] = v

  # add_attr used to be a line-for-line copy of set_attr; keep it as an alias.
  add_attr = set_attr

  def get_attr(self, k):
    """Return the value stored under k, or None if k was never set."""
    return self.keyvalue.get(k)

  def __str__(self):
    """Serialize as ``[key:value,...]``; list values become ``[a,b,...]``."""
    parts = []
    for key in self.keyarray:
      value = self.keyvalue[key]
      if isinstance(value, list):
        text = "[" + ",".join(str(item) for item in value) + "]"
      else:
        # str(None) is "None", which is what a missing value serializes to.
        text = str(value)
      parts.append(key + ":" + text)
    return "[" + ",".join(parts) + "]"

class c_log(c_node):
  """Root node of the tree, for the grammar rule <log> ::= <msg> {<msg>}."""

  def __init__(self):
    c_node.__init__(self)
    # Attributes are registered in the order they should be serialized.
    for key, value in (('name', 'msgs'), ('msg', None)):
      self.add_attr(key, value)

class c_msg(c_node):
  """Node for <msg>: a timestamp followed by a host-type or SECS-type message."""

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'msg')
    # Every slot starts empty; the tuple order is the serialization order.
    for slot in ('timestamp', 'host_type_msg', 'secs_type_msg'):
      self.add_attr(slot, None)

class c_secs_type_msg(c_node):
  """Node for <secs_type_msg> ::= <desc> secs_comment <secs_msg>."""

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'secs_type_msg')
    # One empty slot per grammar symbol, in grammar order.
    for slot in ('desc', 'secs_comment', 'secs_msg'):
      self.add_attr(slot, None)
class c_secs_SxFy_sym(c_node):
  """Node for <secs_SxFy_sym> ::= secs_SxFy {<secs_w_bit>}."""

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'secs_SxFy_sym')
    # One empty slot per grammar symbol, in grammar order.
    for slot in ('secs_SxFy', 'secs_w_bit'):
      self.add_attr(slot, None)

class c_desc(c_node):
  """Node for <desc>.

  Holds the slots of both alternatives of the rule: the
  <secs_msg_name> colon <secs_SxFy_sym> form and the normal_word form.
  """

  def __init__(self):
    c_node.__init__(self)
    # A plain instance attribute in addition to the 'name' entry registered
    # below; it is kept exactly as in the original definition.
    self.name = 'desc'
    self.add_attr('name', 'desc')
    # Every slot starts empty; the tuple order is the serialization order.
    for slot in ('secs_msg_name', 'colon', 'secs_SxFy_sym', 'normal_word'):
      self.add_attr(slot, None)

class c_secs_msg_name(c_node):
  """Node for <secs_msg_name> ::= normal_word."""

  def __init__(self):
    c_node.__init__(self)
    # Attributes are registered in the order they should be serialized.
    for key, value in (('name', 'secs_msg_name'), ('normal_word', None)):
      self.add_attr(key, value)

class c_colon(c_node):
  """Node for the colon terminal; its 'value' is pre-set to ':'."""

  def __init__(self):
    c_node.__init__(self)
    # Attributes are registered in the order they should be serialized.
    for key, value in (('name', 'colon'), ('value', ':')):
      self.add_attr(key, value)

class c_secs_w_bit(c_node):
  """Node for <secs_w_bit> ::= normal_word."""

  def __init__(self):
    c_node.__init__(self)
    # Plain instance attributes, kept exactly as in the original definition;
    # the same values are also registered through add_attr below.
    self.name = 'secs_w_bit'
    self.normal_word = None
    for key, value in (('name', self.name), ('normal_word', self.normal_word)):
      self.add_attr(key, value)

class c_secs_msg(c_node):
  """Node for <secs_msg> ::= {<secs_msg_body>} secs_end."""

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'secs_msg')
    # One empty slot per grammar symbol, in grammar order.
    for slot in ('secs_msg_body', 'secs_end'):
      self.add_attr(slot, None)

class c_secs_msg_body(c_node):
  """Node for <secs_msg_body> ::= <secs_msg_item> {secs_comment}."""

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'secs_msg_body')
    # One empty slot per grammar symbol, in grammar order.
    for slot in ('secs_msg_item', 'secs_comment'):
      self.add_attr(slot, None)
class c_secs_msg_item(c_node):
  """Node for <secs_msg_item>, which is either a single item or a list.

  'value' is the slot for a terminal item; 'list_start', 'children' and
  'list_end' are the slots for the list form.
  """

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'secs_msg_item')
    # Every slot starts empty; the tuple order is the serialization order.
    for slot in ('list_start', 'secs_comment', 'children', 'value', 'list_end'):
      self.add_attr(slot, None)

class c_host_type_msg(c_node):
  """Node for <host_type_msg> ::= eap_log_1 {xml_transaction}."""

  def __init__(self):
    c_node.__init__(self)
    self.add_attr('name', 'host_type_msg')
    # One empty slot per grammar symbol, in grammar order.
    for slot in ('eap_log_1', 'xml_transaction'):
      self.add_attr(slot, None)

------------------------------------------------------------------------------
ll_parser_v4.py  # 看到這個 v4 就知道我大改了4次。

import node_def

# Read the whole token dump up front. The with-block closes the file even
# when read() raises, which the explicit open()/close() pair did not.
with open("tokens.txt", "r") as t_file:
  t_file_content = t_file.read()

# Raw tokens are separated by the three-newline sequence "\n\n\n".
tokens = t_file_content.split("\n\n\n")
# (The former module-level ``global tokenindex`` statement had no effect.)
tokenindex = 0

# Values of accepted tokens: accept() pushes, the parse functions pop.
global_stack = []

def token_cut(token):
  """Split one raw token string into its six fields.

  A raw token looks like ``tokens(41):timestamp,1,True,416,8,<value>``.
  Everything up to and including the first ":" is discarded, and the rest is
  split on the first five commas only, so the last field (the value) may
  itself contain commas, colons or newlines.

  Returns a list of six strings: the token type, four further
  comma-separated fields, and the value. If the token has fewer fields,
  the missing ones are returned as "" so the present fields keep their
  position. The previous find()-based slicing dropped characters and
  shifted fields in that case.
  """
  _, colon, rest = token.partition(":")
  if not colon:
    # No prefix to strip: treat the whole string as the field list.
    rest = token
  fields = rest.split(",", 5)
  # Pad so callers can always index fields[0] .. fields[5].
  fields.extend([""] * (6 - len(fields)))
  return fields

# Index of the current token in tokens; starts at -1 so that the first
# getsym() call moves to index 0.
input_index = -1
# Fields of the current token as returned by token_cut(), or None when there
# is no current token.
sym = None
# Number of entries in tokens.
tokencount = len(tokens)
def getsym():
  """Advance to the next token and publish its fields in the global sym.

  sym is set to None when the input is exhausted or when the next raw token
  is an empty string.
  """
  global input_index, sym
  input_index += 1
  if input_index < tokencount and len(tokens[input_index]) != 0:
    sym = token_cut(tokens[input_index])
  else:
    sym = None

def lookahead():
  """Return the fields of the token after the current one without consuming it.

  Returns None when there is no further token. Unlike getsym(), an empty raw
  token is still passed to token_cut() rather than reported as None.
  """
  peek_index = input_index + 1
  if peek_index < tokencount:
    return token_cut(tokens[peek_index])
  return None
def accept(s):
  """Consume the current token if its type (field 0) equals s.

  On a match, the token's value (field 5) is pushed onto global_stack, the
  next token is loaded with getsym(), and True is returned. Otherwise
  nothing is consumed and False is returned.
  """
  if sym is None or sym[0] != s:
    return False
  global_stack.append(sym[5])
  getsym()
  return True

def expect(s):
  """Like accept(s), but report a mismatch and stop the parse.

  Returns True when the token was accepted. Otherwise it prints the expected
  type and the token actually encountered, sets the global sym to None, and
  returns False.
  """
  global sym
  if accept(s):
    return True
  print("expect: unexpected symbol")
  print("expect: %s" % (s,))
  print("encounter: %s" % (sym,))
  sym = None
  return False

def secs_w_bit():
  """Parse <secs_w_bit> ::= normal_word, treating the word as optional.

  Always returns a c_secs_w_bit node; its 'normal_word' stays None when the
  current token is not a normal_word.
  """
  print('secs_w_bit()')
  node = node_def.c_secs_w_bit()
  if accept("normal_word"):
    node.set_attr('normal_word', global_stack.pop())
  print('secs_w_bit() complete')
  return node

def secs_SxFy_sym():
  """Parse <secs_SxFy_sym> ::= secs_SxFy {<secs_w_bit>}.

  Returns a c_secs_SxFy_sym node. 'secs_SxFy' is filled only when the
  expected token is present; 'secs_w_bit' always receives the node returned
  by secs_w_bit().
  """
  print('secs_SxFy_sym()')
  node = node_def.c_secs_SxFy_sym()
  if expect("secs_SxFy"):
    node.set_attr("secs_SxFy", global_stack.pop())
  node.set_attr("secs_w_bit", secs_w_bit())
  print('secs_SxFy_sym() complete')
  return node

def secs_msg_name():
  """Parse <secs_msg_name> ::= normal_word.

  Returns a c_secs_msg_name node whose 'normal_word' attribute holds the
  word, or None when the expected token is missing (expect() reports that).
  """
  print('secs_msg_name()')
  my = node_def.c_secs_msg_name()
  if expect("normal_word"):
    # Store through set_attr: assigning my.normal_word only created a plain
    # instance attribute, so the name was serialized as normal_word:None.
    my.set_attr('normal_word', global_stack.pop())
  print('secs_msg_name() complete')
  return my

def desc():
  """Parse <desc>.

  <desc> ::= <secs_msg_name> colon <secs_SxFy_sym>
           | normal_word {normal_word}

  The alternative is chosen by peeking one token ahead: a colon after the
  current token selects the first form. In the second form 'normal_word'
  holds a single string for one word and a list of strings for several.
  Returns a c_desc node.
  """
  print('desc()')
  my = node_def.c_desc()
  next_sym = lookahead()
  # lookahead() returns None at the end of input; indexing it would raise.
  if next_sym is not None and next_sym[0] == "colon":
    my.set_attr("secs_msg_name", secs_msg_name())
    if expect("colon"):
      my.set_attr("colon", global_stack.pop())
    my.set_attr("secs_SxFy_sym", secs_SxFy_sym())
  else:
    words = []
    while accept("normal_word"):
      words.append(global_stack.pop())
    # Keep the original shape: a bare string for one word, a list for more.
    if len(words) == 1:
      my.set_attr("normal_word", words[0])
    elif words:
      my.set_attr("normal_word", words)
  print('desc() complete')
  return my

def secs_msg_item():
  """Parse one <secs_msg_item>.

  A scalar item (secs_item_a/u/b/bool/f/i) stores its token value in 'value'.
  A list (secs_list_start ... secs_list_end) stores the nested items in
  'children', which is an empty list for an empty SECS list. Both forms
  accept one optional secs_comment directly after the opening token.

  Returns a c_secs_msg_item node, or None when the current token does not
  start an item.
  """
  print('secs_msg_item()')
  my = node_def.c_secs_msg_item()
  scalar_types = ("secs_item_a", "secs_item_u", "secs_item_b",
                  "secs_item_bool", "secs_item_f", "secs_item_i")
  is_scalar = False
  for token_type in scalar_types:
    if accept(token_type):
      is_scalar = True
      break
  if is_scalar:
    my.set_attr("value", global_stack.pop())
    if accept("secs_comment"):
      my.set_attr("secs_comment", global_stack.pop())
  elif accept("secs_list_start"):
    my.set_attr("list_start", global_stack.pop())
    if accept("secs_comment"):
      my.set_attr("secs_comment", global_stack.pop())
    # Always create the list so an empty SECS list yields children == []
    # rather than None.
    children = []
    my.set_attr("children", children)
    child = secs_msg_item()
    while child is not None:
      children.append(child)
      child = secs_msg_item()
    if expect("secs_list_end"):
      my.set_attr("list_end", global_stack.pop())
  else:
    print('secs_msg_item() complete')
    return None
  print('secs_msg_item() complete')
  return my

def secs_msg_body():
  """Parse <secs_msg_body> ::= <secs_msg_item> {secs_comment}.

  Returns a c_secs_msg_body node. 'secs_msg_item' holds the result of
  secs_msg_item() (None when no item was found) and 'secs_comment' holds one
  optional trailing comment.
  """
  print('secs_msg_body()')
  my = node_def.c_secs_msg_body()
  # Store through set_attr so the item is serialized; a plain attribute
  # assignment left the 'secs_msg_item' entry at None.
  my.set_attr('secs_msg_item', secs_msg_item())
  if accept("secs_comment"):
    # The original wrote to ``self`` here, which is undefined in a function
    # and raised NameError whenever a trailing comment was present.
    my.set_attr('secs_comment', global_stack.pop())
  print('secs_msg_body() complete')
  return my

def secs_msg():
  """Parse <secs_msg> ::= {<secs_msg_body>} secs_end.

  At most one body is parsed before the closing secs_end. Returns a
  c_secs_msg node; 'secs_end' stays None when the closing token is missing.
  """
  print('secs_msg()')
  my = node_def.c_secs_msg()
  if accept("secs_end"):
    # Message without a body.
    my.set_attr("secs_end", global_stack.pop())
  else:
    # Store through set_attr so the body is serialized; a plain attribute
    # assignment left the 'secs_msg_body' entry at None.
    my.set_attr("secs_msg_body", secs_msg_body())
    if accept("secs_end"):
      my.set_attr("secs_end", global_stack.pop())
  print('secs_msg() complete')
  return my

def secs_type_msg():
  """Parse <secs_type_msg> ::= <desc> secs_comment <secs_msg>.

  The secs_comment is taken with accept(), so a missing comment is not
  reported as an error. Returns a c_secs_type_msg node.
  """
  print('secs_type_msg()')
  node = node_def.c_secs_type_msg()
  node.set_attr('desc', desc())
  if accept("secs_comment"):
    node.set_attr('secs_comment', global_stack.pop())
  node.set_attr('secs_msg', secs_msg())
  print('secs_type_msg() complete')
  return node

def host_type_msg():
  """Parse a host-type message: one or more eap_log_1, then an optional xml_transaction.

  'eap_log_1' holds a single string when one log token was read and a list
  of strings when several consecutive ones were read. Returns a
  c_host_type_msg node.
  """
  print('host_type_msg()')
  node = node_def.c_host_type_msg()
  if expect("eap_log_1"):
    log_lines = [global_stack.pop()]
    while accept("eap_log_1"):
      log_lines.append(global_stack.pop())
    # A lone entry is stored bare; several entries are stored as a list.
    node.set_attr('eap_log_1', log_lines[0] if len(log_lines) == 1 else log_lines)
  if accept("xml_transaction"):
    node.set_attr('xml_transaction', global_stack.pop())
  print('host_type_msg() complete')
  return node
def msg():
  """Parse <msg> ::= timestamp <secs_type_msg> | timestamp <host_type_msg>.

  After the timestamp, an eap_log_1 token selects the host-type form and
  anything else the SECS-type form. Returns a c_msg node; when the timestamp
  is missing (expect() reports that) the node is returned with all slots
  empty.
  """
  print('msg()')
  my = node_def.c_msg()
  if expect("timestamp"):
    my.add_attr('timestamp', global_stack.pop())
    if sym is None:
      # The input ended right after the timestamp; indexing sym would raise
      # TypeError, so report it and return the node without a body.
      print("msg: input ended after timestamp")
    elif sym[0] == "eap_log_1":
      my.add_attr('host_type_msg', host_type_msg())
    else:
      my.add_attr('secs_type_msg', secs_type_msg())
  print('msg() complete')
  return my

# Driver: parse messages until the token stream is exhausted (or a parse
# error sets sym to None), then write the serialized tree to v4output.txt.
ms = node_def.c_log()
getsym()  # load the first token into sym
while sym is not None:
  print('token count %s' % (input_index,))
  a = msg()
  # The list is created on the first message, so an input without any
  # message still serializes 'msg' as None.
  if not isinstance(ms.get_attr("msg"), list):
    ms.set_attr("msg", [])
  ms.get_attr("msg").append(a)

# The with-block closes the output file even if serializing or writing fails.
with open("v4output.txt", "w") as f:
  f.write(str(ms))

-------------------------
tokens_for_dev.txt  # 測試用的輸入檔(程式實際讀取的檔名是 tokens.txt,執行前請改名或修改程式中的檔名)

tokens(248743):timestamp,1,True,3302688,62420,2011-06-22-23:59:55

tokens(248745):normal_word,3,True,3302708,62420,AYT_Host

tokens(248746):colon,4,True,3302716,62420,:

tokens(248748):secs_SxFy,5,True,3302718,62420,'S1F1'

tokens(248750):normal_word,3,True,3302725,62420,W

tokens(248752):secs_comment,6,True,3302727,62420,/* Name=AreYouThere_Host Dir=2 Header=[00 00 81 01 00 00 00 01 CA 55] Rcvd=2 Time=23:59:55 TID=31094 */

tokens(248754):secs_end,8,True,3302831,62421,.

tokens(248756):timestamp,1,True,3302833,62422,2011-06-22-23:59:55

tokens(248758):normal_word,3,True,3302853,62422,OLD

tokens(248759):colon,4,True,3302856,62422,:

tokens(248761):secs_SxFy,5,True,3302858,62422,'S1F2'

tokens(248763):secs_comment,6,True,3302865,62422,/* Name=OnlineData Dir=1 Header=[00 00 01 02 00 00 00 01 CA 55] Rcvd=1 Time=23:59:55 TID=31094 */

tokens(248766):secs_list_start,9,True,3302967,62423,<L [2]

tokens(248769):secs_item_a,10,True,3302982,62424,<A [0] >

tokens(248771):secs_comment,6,True,3302991,62424,/* Name=MDLN Keyword=EquipmentModelType */

tokens(248774):secs_item_a,10,True,3303042,62425,<A [0] >

tokens(248776):secs_comment,6,True,3303051,62425,/* Name=SOFTREV Keyword=SoftwareRevision */

tokens(248779):secs_list_end,11,True,3303100,62426,>

tokens(248781):secs_end,8,True,3303102,62427,.

tokens(41):timestamp,1,True,416,8,2011-06-22-07:56:17

tokens(43):eap_log_1,12,True,436,8,Receiving PP_Upload request from TCS...

tokens(45):xml_transaction,13,True,476,9,<?xml version="1.0"?>
<Transaction TxName="PP_Upload" Type="Request" MessageKey="0629">
    <Tool ToolID="CLB02" fromOPI="true" Type=""/>
    <Recipes>
        <Recipe RecipeID="APM_6MIN" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_00" FilePath="" FormatFlag="1" Type="" RecipeLevel="1" Option=""/>
        <Recipe RecipeID="Production;MXIC_SC1_360;1" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_01" FilePath="" FormatFlag="1" Type="" RecipeLevel="2" Option=""/>
        <Recipe RecipeID="Production;POR QDR-HCL;1" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_02" FilePath="" FormatFlag="1" Type="" RecipeLevel="2" Option=""/>
        <Recipe RecipeID="Production;POR-philic;1" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_03" FilePath="" FormatFlag="1" Type="" RecipeLevel="2" Option=""/>
    </Recipes>
</Transaction>

tokens(47):timestamp,1,True,1442,20,2011-06-22-07:56:17

tokens(49):eap_log_1,12,True,1462,20,[S7F5] Sent by Host

--------------------------------------------------------------
v4output.txt # 最後的結果給大家參考,有點像是 json。也許有天我會改成 json。

[name:msgs,msg:[[name:msg,timestamp:2011-06-22-23:59:55,host_type_msg:None,secs_type_msg:[name:secs_type_msg,desc:[name:desc,secs_msg_name:[name:secs_msg_name,normal_word:None],colon::,secs_SxFy_sym:[name:secs_SxFy_sym,secs_SxFy:'S1F1',secs_w_bit:[name:secs_w_bit,normal_word:W]],normal_word:None],secs_comment:/* Name=AreYouThere_Host Dir=2 Header=[00 00 81 01 00 00 00 01 CA 55] Rcvd=2 Time=23:59:55 TID=31094 */,secs_msg:[name:secs_msg,secs_msg_body:None,secs_end:.]]],[name:msg,timestamp:2011-06-22-23:59:55,host_type_msg:None,secs_type_msg:[name:secs_type_msg,desc:[name:desc,secs_msg_name:[name:secs_msg_name,normal_word:None],colon::,secs_SxFy_sym:[name:secs_SxFy_sym,secs_SxFy:'S1F2',secs_w_bit:[name:secs_w_bit,normal_word:None]],normal_word:None],secs_comment:/* Name=OnlineData Dir=1 Header=[00 00 01 02 00 00 00 01 CA 55] Rcvd=1 Time=23:59:55 TID=31094 */,secs_msg:[name:secs_msg,secs_msg_body:None,secs_end:.]]],[name:msg,timestamp:2011-06-22-07:56:17,host_type_msg:[name:host_type_msg,eap_log_1:Receiving PP_Upload request from TCS...,xml_transaction:<?xml version="1.0"?>
<Transaction TxName="PP_Upload" Type="Request" MessageKey="0629">
    <Tool ToolID="CLB02" fromOPI="true" Type=""/>
    <Recipes>
        <Recipe RecipeID="APM_6MIN" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_00" FilePath="" FormatFlag="1" Type="" RecipeLevel="1" Option=""/>
        <Recipe RecipeID="Production;MXIC_SC1_360;1" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_01" FilePath="" FormatFlag="1" Type="" RecipeLevel="2" Option=""/>
        <Recipe RecipeID="Production;POR QDR-HCL;1" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_02" FilePath="" FormatFlag="1" Type="" RecipeLevel="2" Option=""/>
        <Recipe RecipeID="Production;POR-philic;1" MachineRecipeID="CLB02.APM_6MIN.AA" LocalFilePath="D:\share\110622_075617_304_0082706463_03" FilePath="" FormatFlag="1" Type="" RecipeLevel="2" Option=""/>
    </Recipes>
</Transaction>],secs_type_msg:None],[name:msg,timestamp:2011-06-22-07:56:17,host_type_msg:[name:host_type_msg,eap_log_1:[S7F5] Sent by Host

,xml_transaction:None],secs_type_msg:None]]]

--------------------------------------------------------------
能看到這裡的人不是正常人,恭喜你。
正常的編譯器接下來進到 code generation。
而我們這次的題目,接下來呢?
應該要上色了。

沒有留言:

張貼留言