Martin已经看过了我的代码,并且提出了意见,我们最担心的问题——性能问题还是凸显出来了,并不奇怪,erwin导出的文件确实是非常庞大的。
我在python邮件列表里面查找了一下,找到了几个常见的解决方案:
- 使用4Suite提供的cDomlette
- 借助pywin32,经由COM调用微软的MSXML解析器
- PIRXX,一个与Xerces绑定的DOM实现
4Suite提供的cDomlette看起来是个比较不错的选择,我对解析的脚本作了一点小的改动:
1 import logging,logging.handlers
# Route every record (DEBUG and above) to 'trans2.log'; mode 'w' truncates
# the file, so each run starts with an empty log.
logging.basicConfig(
    filename='trans2.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s')

# Logger used by the parsing classes and helpers in this script.
logger = logging.getLogger('trans2')
7
class entity:
    """One ``Entity`` element of the XML export, parsed into plain values.

    Attributes set by the constructor:
      id, name       -- the ``id`` / ``Name`` XML attributes of the element
      physical_name  -- text of the ``Physical_Name`` property, '' if absent
      attrs          -- one dict per ``Attribute`` element below the entity
      pk             -- dict describing the key group typed 'PK' ({} if none)
      fks            -- list of dicts for every other key group
    """

    # (child tag, key in the attribute dict) for the properties that are
    # optional on an ``Attribute`` element.
    _OPTIONAL_ATTRIBUTE_FIELDS = (
        ('Physical_Name', 'pysical_name'),
        ('Parent_Attribute', 'parent_attr_id'),
        ('Parent_Relationship', 'parent_relation_id'),
        ('Master_Attribute', 'master_attr_id'),
        ('Null_Option', 'nullable'),
    )

    def _parse_entity_properties(self, root):
        """Read id, name and physical name from the entity element ``root``.

        Raises KeyError when the element lacks an ``id`` or ``Name``
        attribute.
        """
        entity_attrs_map = _get_attributes_as_dict(root)
        self.id = entity_attrs_map['id']
        self.name = entity_attrs_map['Name']

        entity_properties_map = _get_child_nodes_as_dict(root.firstChild)
        if 'Physical_Name' in entity_properties_map:
            logger.debug('found Physical_Name in entity(%s)', self.id)
            # Store the text, not the DOM node, so the value has the same
            # type as the '' default and as the attribute-level values.
            self.physical_name = \
                entity_properties_map['Physical_Name'].firstChild.data
        else:
            self.physical_name = ''
        logger.info('entity id = %s, name=%s, physical_name=%s',
                    self.id, self.name, self.physical_name)

    def _parse_entity_attributes(self, root):
        """Collect one dict per ``Attribute`` element below ``root``.

        An attribute without a ``Datatype`` child is reported (stdout and
        log) and recorded as an empty dict.
        """
        self.attrs = []
        # './/' searches below this entity only.  A leading '//' is an
        # absolute path: it would return every Attribute of the document
        # for every entity.
        for attr_node in root.xpath('.//Attribute'):
            xml_attrs = _get_attributes_as_dict(attr_node)
            attr_id = xml_attrs['id']
            name = xml_attrs['Name']
            child_map = _get_child_nodes_as_dict(attr_node.firstChild)

            values = {'pysical_name': '', 'parent_attr_id': '',
                      'parent_relation_id': '', 'master_attr_id': '',
                      'nullable': 1}
            for tag, key in self._OPTIONAL_ATTRIBUTE_FIELDS:
                if tag in child_map:
                    logger.debug('found %s element in Attribute(%s)',
                                 tag, attr_id)
                    values[key] = child_map[tag].firstChild.data

            try:
                # Datatype is the only mandatory property.
                data_type = child_map['Datatype'].firstChild.data
            except KeyError as msg:
                text = ("warring, maybe missing some attribute's "
                        "infomation:%s of entity %s" % (msg, self.name))
                print(text)
                logger.warning(text)
                attr_map = {}
            else:
                attr_map = {'attr_id': attr_id, 'name': name,
                            'data_type': data_type}
                attr_map.update(values)
            self.attrs.append(attr_map)

    def _parse_entity_keys(self, root):
        """Collect the ``Key_Group`` elements below ``root``.

        The group whose type is 'PK' becomes ``self.pk``; every other group
        is appended to ``self.fks``.
        """
        self.pk = {}
        self.fks = []
        # Relative path for the same reason as in _parse_entity_attributes.
        for key_node in root.xpath('.//Key_Group'):
            xml_attrs = _get_attributes_as_dict(key_node)
            key_id = xml_attrs['id']
            key_name = xml_attrs['Name']
            # First child holds the key group properties (key type).
            key_props = _get_child_nodes_as_dict(key_node.firstChild)
            key_type = key_props['Key_Group_Type'].firstChild.data
            # Properties of the first member of the key group (key column).
            member_props = _get_child_nodes_as_dict(
                key_node.lastChild.firstChild.firstChild)
            if 'Key_Group_Member_Column' in member_props:
                key_attr_id = \
                    member_props['Key_Group_Member_Column'].firstChild.data
            else:
                logger.error("error, can't find the key defination %s for %s",
                             'Key_Group_Member_Column', self.name)
                key_attr_id = ''

            key_conf = {'key_id': key_id, 'key_name': key_name,
                        'key_type': key_type, 'key_attr_id': key_attr_id}
            if key_type == 'PK':
                self.pk = key_conf
            else:
                self.fks.append(key_conf)
            logger.debug('key_props for Key_Group(%s):%s:%s:%s',
                         key_id, key_name, key_type, key_attr_id)

    def _reset(self):
        """Put every field back to its empty default."""
        self.id = ''
        self.name = ''
        self.physical_name = ''
        self.attrs = []
        self.pk = {}
        self.fks = []

    def __init__(self, entity_element):
        """Parse ``entity_element`` (a DOM ``Entity`` element)."""
        self._reset()
        self._parse_entity_properties(entity_element)
        self._parse_entity_attributes(entity_element)
        self._parse_entity_keys(entity_element)

    def __eq__(self, other):
        """Entities are equal when their ids are equal."""
        if not isinstance(other, entity):
            return NotImplemented
        return self.id == other.id

    def __ne__(self, other):
        """Inverse of __eq__ (Python 2 does not derive it automatically)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        """Hash on the id so that equal entities hash alike."""
        return hash(self.id)

    def __repr__(self):
        return 'entity with {id:%(id)s,name:%(name)s,pk:%(pk)s}' \
            % self.__dict__
124
class relationship:
    """One ``Relationship`` element of the XML export.

    Attributes: ``id`` and ``name`` (XML attributes of the element),
    ``parent_id`` and ``child_id`` (text of the
    ``Relationship_Parent_Entity`` / ``Relationship_Child_Entity``
    properties).
    """

    def __init__(self, relation_element):
        """Parse ``relation_element`` (a DOM ``Relationship`` element)."""
        self._reset()
        self._parse_relationship(relation_element)

    def _reset(self):
        """Put every field back to its empty default."""
        self.id = ''
        self.parent_id = ''
        self.child_id = ''
        self.name = ''

    def _parse_relationship(self, relations_element):
        """Fill the fields from ``relations_element``.

        Raises KeyError when the ``id`` / ``Name`` attributes or either of
        the parent/child entity properties are missing.
        """
        attr_map = _get_attributes_as_dict(relations_element)
        self.id = attr_map['id']
        self.name = attr_map['Name']

        # The first child element carries the relationship properties.
        rel_props = _get_child_nodes_as_dict(relations_element.childNodes[0])
        self.parent_id = \
            rel_props['Relationship_Parent_Entity'].firstChild.data
        self.child_id = \
            rel_props['Relationship_Child_Entity'].firstChild.data
        # Lazy formatting: repr() only runs when DEBUG is enabled.
        logger.debug('parsed relation:%s:', self)

    def __repr__(self):
        return 'relationship with {id:%(id)s,name:%(name)s,parent_id:%(parent_id)s,child_id:%(child_id)s}' \
            % self.__dict__

    def __eq__(self, other):
        """Relationships are equal when their ids are equal."""
        if not isinstance(other, relationship):
            return NotImplemented
        return self.id == other.id

    def __ne__(self, other):
        """Inverse of __eq__ (Python 2 does not derive it automatically)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        """Hash on the id so that equal relationships hash alike."""
        return hash(self.id)
152
153 def _get_attributes_as_dict(element):
154 attrs = {}
155 if element.attributes:
156 for attr in element.attributes.values():
157 attrs[attr.name.strip()] = attr.value
158 return attrs
159
160 def _get_child_nodes_as_dict(element):
161 child_nodes_map = {}
162 if element.childNodes:
163 for e in element.childNodes:
164 if not e.nodeType == e.TEXT_NODE:
165 child_nodes_map[e.tagName.strip()] = e
166 else:
167 child_nodes_map[e.parentNode.tagName.strip()] = e.data
168 return child_nodes_map
169
def parseXmlFile(file_name):
    """Parse the XML file ``file_name`` with 4Suite's cDomlette.

    Returns the document object produced by ``Parse``.  Errors from opening
    or reading the file propagate unchanged.
    """
    from Ft.Xml.InputSource import InputSourceFactory
    from Ft.Xml.cDomlette import Parse

    # Open before the try block: if the open fails there is nothing to
    # close, and the original I/O error is not masked.
    f = open(file_name, 'rb')
    try:
        # Read the bytes as they are.  Joining readlines() with '\n' would
        # double every line break, since each line keeps its terminator.
        docContent = f.read()
    finally:
        f.close()
    return Parse(InputSourceFactory().fromString(docContent))
184
185 def _startParse(root):
186 #entities = root.getElementsByTagName('Entity')
187 #relations = root.getElementsByTagName('Relationship')
188 entities = root.xpath(u'//Entity')
189 relations = root.xpath(u'//Relationship')
190 parsed_entities = [entity(item) for item in entities]
191 parsed_relations = [relationship(item) for item in relations]
192 return parsed_entities,parsed_relations
193
if __name__ == '__main__':
    # Command-line entry point: parse the file named by the first argument
    # and report the wall-clock time spent.
    import sys
    import time

    if len(sys.argv) < 2:
        # Fail with a usage message instead of a bare IndexError.
        sys.exit('usage: %s <xml-file>' % sys.argv[0])

    start = time.time()
    print('start@%s' % start)
    root = parseXmlFile(sys.argv[1])
    entities, relations = _startParse(root.documentElement)
    end = time.time()
    print('stop@%s' % end)
    elapsed = end - start
    logger.info('cost %ss', elapsed)
    print('cost %s' % elapsed)
204
除了对解析器的初始化代码改动了之外,dom的getElementsByTagName被一个Xpath表达式取代了,这个变化其实也不是太大,另外,我把一些依靠异常处理的代码路径截掉,主要是考虑构造异常对象是一个比较耗时的操作。 经过测试,最终的结果是大约加速了4倍,不知道这个结果能不能令人满意,但是从主观感觉来看,解析的时间确实大大缩短了。我使用的xpath表达式可能也影响了效率,这种问题还是交给Martin来帮我看看吧。