这个东西是我的朋友
Martin在处理一个老系统转换erwin的数据库表结构定义的xml文件到hibernate配置文件, VO和DAO时做的一个实验,我们本来有一个牛人写的程序来处理这些乱七八糟的表设计,可惜源代码都丢失了,而且他的转换是基于hibernate 1.x,有很多的先进特性无法利用,我们的目标是:
1,erwin版本独立,erwin这个xml文件格式随不同的版本略有不同,希望我们的东西可以做到隔离版本带来的问题。
2,高效,系统中的表定义超过2000,导出的xml文件非常庞大,动辄超过三四十兆,我们希望对于表结构变动带来的变化能够及时响应,不能(过于)影响开发。
初步的设想是先解析文件,得到最基本的表结构信息,主外键信息,然后进一步加工成我们想要得文件格式,我选择的工具是python。
现在准备工作已经完成了,下面是把xml文件解析成为python对象的代码。
下面是几个testcase
点击下载import logging,logging.handlers
logging.basicConfig(level=logging.DEBUG,\
format='%(asctime)s %(levelname)s %(message)s',\
filename='trans2.log',\
filemode='w')
logger = logging.getLogger('trans2')
class entity:
def _parse_entity_properties(self, root):
entity_attrs_map = _get_attributes_as_dict(root)
self.id = entity_attrs_map['id']
self.name = entity_attrs_map['Name']
entity_properties_map = _get_child_nodes_as_dict(root.firstChild)
if 'Physical_Name' in entity_properties_map:
logger.debug('found Physical_Name in entity(%s)' % self.id)
self.physical_name = entity_properties_map['Physical_Name']
else:
self.physical_name = ''
logger.debug('entity id = %s, name=%s, physical_name=%s'\
% (self.id, self.name,self.physical_name))
def _parse_entity_attributes(self, root):
self.attrs = []
attr_list = root.getElementsByTagName('Attribute')
for a in attr_list:
attr_map = {}
id = _get_attributes_as_dict(a)['id']
name = _get_attributes_as_dict(a)['Name']
child_map = _get_child_nodes_as_dict(a.firstChild)
pysical_name = ''
parent_attr_id = ''
parent_relation_id = ''
master_attr_id = ''
nullable = 1
try:
#for process some special cases
if 'Physical_Name' in child_map:
logger.debug('found Physical_Name element in Attribute(%s)' % id)
pysical_name = child_map['Physical_Name'].firstChild.data
if 'Parent_Attribute' in child_map:
logger.debug('found Parent_Attribute element in Attribute(%s)' % id)
parent_attr_id = child_map['Parent_Attribute'].firstChild.data
if 'Parent_Relationship' in child_map:
logger.debug('found Parent_Relationship element in Attribute(%s)' % id)
parent_relation_id = child_map['Parent_Relationship'].firstChild.data
if 'Master_Attribute' in child_map:
logger.debug('found Master_Attribute element in Attribute(%s)' % id)
master_attr_id = child_map['Master_Attribute'].firstChild.data
if 'Null_Option' in child_map:
logger.debug('found Null_Option element in Attribute(%s)' % id)
nullable = child_map['Null_Option'].firstChild.data
data_type = child_map['Datatype'].firstChild.data
attr_map = {'attr_id':id,'name':name,'pysical_name':pysical_name,\
'nullable':nullable,'data_type':data_type,'parent_attr_id':parent_attr_id\
,'parent_relation_id':parent_relation_id ,'master_attr_id':master_attr_id }
except KeyError,msg:
print 'warring, maybe missing some attribute\'s infomation:%s of entity %s'\
% (msg, self.name)
logger.warn('warring, maybe missing some attribute\'s infomation:%s of entity %s'\
% (msg, self.name))
self.attrs.append(attr_map)
def _parse_entity_keys(self, root):
self.pk = {}
self.fks = []
key_list = root.getElementsByTagName('Key_Group')
for k in key_list:
key_id = _get_attributes_as_dict(k)['id']
key_name = _get_attributes_as_dict(k)['Name']
# process Key_GroupProps, get the key type infomation
key_type = _get_child_nodes_as_dict(k.firstChild)['Key_Group_Type'].firstChild.data
# process Key_Group_MemberProps, get the key column property
try:
key_attr_id = _get_child_nodes_as_dict(k.lastChild.firstChild.firstChild)\
['Key_Group_Member_Column'].firstChild.data
key_conf = {'key_id':key_id,'key_name':key_name,\
'key_type':key_type,'key_attr_id':key_attr_id}
if key_conf['key_type'] == 'PK':
self.pk = key_conf
else:
self.fks.append(key_conf)
logger.debug('key_props for Key_Group(%s):%s:%s:%s'\
% (key_id,key_name,key_type,key_attr_id))
except KeyError, msg:
print 'error, can\'t find the key defination %s for %s'\
% (msg, self.name)
logger.error('error, can\'t find the key defination %s for %s'\
% (msg, self.name))
def _reset(self):
self.id = ''
self.name = ''
self.attrs = []
self.pk = {}
self.fks = []
def __init__(self,entity_element):
self._reset()
self._parse_entity_properties(entity_element)
self._parse_entity_attributes(entity_element)
self._parse_entity_keys(entity_element)
def __eq__(a,b):
return a.id == b.id
def __repr__(self):
#print self.__dict__
return 'entity with {id:%(id)s,name:%(name)s,pk:%(pk)s' \
% self.__dict__
class relationship:
def __init__(self,relation_element):
self._reset()
self._parse_relationship(relation_element)
def _reset(self):
self.id = ''
self.parent_id = ''
self.child_id = ''
self.name = ''
def _parse_relationship(self, relations_element):
attr_map = _get_attributes_as_dict(relations_element)
self.id = attr_map['id']
self.name = attr_map['Name']
rel_props = _get_child_nodes_as_dict(relations_element.childNodes[0])
self.parent_id = rel_props['Relationship_Parent_Entity'].firstChild.data
self.child_id = rel_props['Relationship_Child_Entity'].firstChild.data
logger.debug('parsed relation:%s:' % self)
def __repr__(self):
return 'relationship with {id:%(id)s,name:%(name)s,parent_id:%(parent_id)s,child_id:%(child_id)s}' \
% self.__dict__
def __eq__(a, b):
return a.id == b.id
def _get_attributes_as_dict(element):
attributes = {}
if element.attributes:
for attr in element.attributes:
attributes[attr.name.strip()] = attr.value
return attributes
def _get_child_nodes_as_dict(element):
child_nodes_map = {}
if element.childNodes:
for e in element.childNodes:
if not e.nodeType == e.TEXT_NODE:
child_nodes_map[e.tagName.strip()] = e
else:
child_nodes_map[e.parentNode.tagName.strip()] = e.data
return child_nodes_map
def parseXmlFile(file_name):
from xml.dom.ext.reader import Sax2
doc = Sax2.FromXmlFile(file_name)
return doc
def _startParse(root):
entities = root.getElementsByTagName('Entity')
relations = root.getElementsByTagName('Relationship')
parsed_entities = [entity(item) for item in entities]
parsed_relations = [relationship(item) for item in relations]
return parsed_entities,parsed_relations
if __name__ == '__main__':
import sys,time
start = time.time()
print 'start@%s' % start
root = parseXmlFile(sys.argv[1])
entities,relations = _startParse(root.documentElement)
end = time.time()
print 'stop@%s' % end
print 'cost %s' % (end - start)