文章/答案/技术大牛

发布

社区首页 >问答首页 >解析xml文件，设计我的代码的想法

问解析xml文件，设计我的代码的想法
EN

Code Review用户

提问于 2019-11-04 15:14:25

回答 1查看 134关注 0票数 1

我开始为一家没有足够时间进行代码评审的公司工作。作为一名初级程序员，我想通过实际工作案例来提高自己的技能。我被要求把“Tableau 工作簿”文件（它们其实就是 XML 文件）中的法文文本替换为英文，以方便翻译。

我分4个步骤工作(因此我编写了4个可以独立运行的文件)：

从“.twb”文件加载文本字段，将其转储到数据库中
从数据库中加载文本字段，并将它们写入excel工作表中，由“语言技术团队”翻译。
在数据库中转储已翻译的字段
根据已翻译的字段编写一个新的“.twb”文件。

我的大部分工作是从 .twb 文件（XML）中检索感兴趣的字段。因此，我编写了一个助手文件，其中包含用于检索/获取感兴趣节点的函数，以及用于更新 XML 文件中节点的其他函数（XML 文件可以看作一棵由节点组成的树）。下面是我的助手文件的摘录，让您了解我想要完成的任务。

我觉得我的设计很糟糕，如果你能给我一些建议（面向对象设计等），我会很高兴。也欢迎任何能帮助我提高设计技能的建议（书籍等）。

from lxml import etree
import os
from copy import deepcopy
from collections import namedtuple
from ast import literal_eval
import re

####### Global variables ######
# Record describing one extracted text: its identifier, the text itself and
# the kind of node it was read from.
textual_node = namedtuple('textual_node', ['id', 'label', 'type'])
# get_tooltip_label and get_zone skip a run when its stripped text equals one
# of these entries.
stop_words = ['<', '>', 'Æ', '', '\n', ':', '()']


####### Parsing xml #######
def get_root_tree(file_source):
    """Parse ``file_source`` with lxml and return ``(tree, root)``."""
    parsed = etree.parse(file_source)
    return parsed, parsed.getroot()

####### Utilities #########
def get_parent(node,depth):
    """Return the ancestor found by calling ``getparent()`` ``depth`` times.

    A ``depth`` of 0 (or less) returns ``node`` itself.
    """
    current = node
    remaining = depth
    while remaining > 0:
        current = current.getparent()
        remaining -= 1
    return current

def get_brute_path(root,tree,xpath):
    """Return the distinct index-free paths of the nodes matched by ``xpath``.

    Every matched node's path (``tree.getpath``) is stripped of its
    positional predicates with ``clean_path``. Only the first node seen for
    each stripped path is kept.

    Returns:
        ``(brute_paths, paths)``: two parallel lists, the stripped paths and
        the full path of the first node that produced each of them, in
        document order of first appearance.
    """
    brute_paths = []
    paths = []
    # Set membership keeps the de-duplication linear; the lists keep the order.
    seen = set()
    for node in root.xpath(xpath):
        path = tree.getpath(node)
        brute_path = clean_path(path)
        if brute_path in seen:
            continue
        seen.add(brute_path)
        paths.append(path)
        brute_paths.append(brute_path)
    return brute_paths,paths

def clean_path(path):
    """Strip everything from the first ``[`` onward in each ``/``-separated segment."""
    segments = []
    for segment in path.split('/'):
        name, _, _ = segment.partition('[')
        segments.append(name)
    return '/'.join(segments)

def get_ws_name(root, tree):
    """Map worksheet path segments to worksheet names, and back.

    Returns:
        ``(d_wsname, d_inv_wsname)`` where ``d_wsname`` maps the last segment
        of each worksheet's path to its ``name`` attribute and
        ``d_inv_wsname`` is the reverse mapping.
    """
    by_segment = {
        tree.getpath(worksheet).split('/')[-1]: worksheet.get('name')
        for worksheet in root.xpath('worksheets/worksheet[@name]')
    }
    by_name = {name: segment for segment, name in by_segment.items()}
    return by_segment, by_name

####### Get nodes of interest #########
def get_alias(root):
    """Collect every datasource column alias as a ``textual_node``.

    The node id is ``(datasource name, column name, alias key)``, the label
    is the alias ``value`` attribute and the type is ``'alias'``.
    """
    aliases = set()
    for alias in root.xpath("/workbook/datasources/datasource/column/aliases/alias"):
        datasource_name = get_parent(alias, 3).attrib['name']
        column_name = get_parent(alias, 2).attrib['name']
        identifier = (datasource_name, column_name, alias.attrib['key'])
        aliases.add(textual_node(identifier, alias.attrib['value'], 'alias'))
    return aliases

def get_member(root):
    """Collect every column member that has an ``alias`` attribute.

    The node id is ``(datasource name, column name, member value)``, the
    label is the member's ``alias`` attribute and the type is ``'member'``.
    """
    members = set()
    for member in root.xpath("/workbook/datasources/datasource/column/members/member[@alias]"):
        datasource_name = get_parent(member, 3).attrib['name']
        column_name = get_parent(member, 2).attrib['name']
        identifier = (datasource_name, column_name, member.attrib['value'])
        members.add(textual_node(identifier, member.attrib['alias'], 'member'))
    return members

def get_title_format(tree,root):
    """Collect the ``value`` of every ``format`` node whose ``attr`` is ``title``.

    The node id is ``(name of the 4th ancestor, field attribute)`` and the
    type is ``'title_'`` followed by the node tag. ``tree`` is unused.
    """
    titles = set()
    for fmt in root.xpath('//format[@attr="title"][@value]'):
        identifier = (get_parent(fmt, 4).attrib['name'], fmt.attrib['field'])
        titles.add(textual_node(identifier, fmt.get('value'), 'title_' + fmt.tag))
    return titles





def get_formatted_format(tree,root):
    """Collect the quoted prefix/suffix texts of ``text-format`` values.

    Each ``format`` node whose ``attr`` is ``text-format`` has its ``value``
    matched against a pattern that captures an optional quoted prefix and an
    optional quoted suffix. Every non-empty prefix or suffix other than the
    euro sign is returned as a ``textual_node`` typed ``<tag>_prefix`` or
    ``<tag>_suffix``, identified by
    ``(name of the 4th ancestor, field attribute)``. ``tree`` is unused.
    """
    raw = r"(?P<type>^.)(?:\"(?P<prefix>[^\"]*)\"){0,1}(?P<format>[^\"]*)(?:\"(?P<suffix>[^\"]*)\"){0,1};(?P<negformat>.)(?:\"(?P=prefix)\"){0,1}(?P=format)(?:\"(?P=suffix)\"){0,1}"
    pattern = re.compile(raw)
    found = set()
    for fmt in root.xpath('//format[@attr="text-format"]'):
        matches = pattern.search(fmt.attrib['value'])
        if not matches:
            continue
        identifier = (get_parent(fmt, 4).attrib['name'], fmt.attrib['field'])
        groups = matches.groupdict()
        for part in ('prefix', 'suffix'):
            fragment = groups[part]
            # Skip missing/empty fragments and bare currency signs.
            if not fragment or fragment.strip() == '€':
                continue
            found.add(textual_node(identifier, fragment, "{}_{}".format(fmt.tag, part)))
    return found

def get_caption(root):
    """Collect the ``caption`` of every datasource column that has one.

    The node id is the column ``name`` attribute and the type is the node tag.
    """
    return {
        textual_node(column.get('name'), column.get('caption'), column.tag)
        for column in root.xpath('/workbook/datasources/datasource/column[@caption]')
    }


def get_tooltip_label(root, tree, d_wsname):
    """Collect the text runs of customized tooltips and labels.

    Args:
        root: root element of the workbook.
        tree: element tree owning ``root`` (used for ``getpath``).
        d_wsname: mapping from worksheet path segment to worksheet name, as
            returned first by ``get_ws_name``.

    Returns:
        A set of ``textual_node`` whose id is
        ``(worksheet name, pane id, run index)``. The pane id is ``''`` when
        the pane has no ``id`` attribute. The run index is the positional
        index at the end of the run's path, or ``'u'`` when the path carries
        no index.

    Raises:
        KeyError: if the run's worksheet segment is missing from ``d_wsname``.
    """
    s_tooltip_label = set()
    for wanted_type in ('customized-tooltip', 'customized-label'):
        ancestor_path = '//worksheets/*/table/panes/pane/{node_type}/formatted-text/run/ancestor::pane'.format(node_type=wanted_type)
        run_path = "{node_type}/formatted-text/run".format(node_type=wanted_type)
        for ancestor in root.xpath(ancestor_path):
            pane_id = ancestor.attrib.get("id", '')
            for node in ancestor.xpath(run_path):
                node_text = node.text
                # Runs without text have nothing to translate.
                if node_text is None or node_text.strip() in stop_words:
                    continue
                path = tree.getpath(node)
                ws_name = d_wsname[path.split('/')[3]]
                # Read the whole trailing index so runs 10 and above keep
                # all their digits; 'u' is kept for paths without an index.
                index_match = re.search(r'\[(\d+)\]$', path)
                run_index = index_match.group(1) if index_match else 'u'
                node_id = (ws_name, pane_id, run_index)
                # Use the requested type directly: the path segment may carry
                # a positional index, and the loop variable must stay intact.
                s_tooltip_label.add(textual_node(node_id, node_text, wanted_type))
    return s_tooltip_label

def get_zone(root,tree):
    """Collect the text runs of dashboard zones.

    Returns:
        A set of ``textual_node`` whose id is ``(zone id, run index)`` and
        whose type is the index-free tag found two levels above the run in
        its path. The run index is the positional index at the end of the
        run's path, or ``'u'`` when the path carries no index.

    Raises:
        KeyError: if the zone holding a run has no ``id`` attribute.
    """
    s_zone = set()
    xpath_run = "/workbook/dashboards/dashboard/zones//zone/formatted-text/run"
    for run_node in root.xpath(xpath_run):
        node_text = run_node.text
        # Runs without text have nothing to translate.
        if node_text is None or node_text.strip() in stop_words:
            continue
        zone_id = get_parent(run_node, 2).attrib['id']
        path = tree.getpath(run_node)
        # Read the whole trailing index so runs 10 and above keep all their
        # digits; 'u' is kept for paths without an index.
        index_match = re.search(r'\[(\d+)\]$', path)
        run_index = index_match.group(1) if index_match else 'u'
        node_type = path.split('/')[-3].split('[')[0]
        s_zone.add(textual_node((zone_id, run_index), node_text, node_type))
    return s_zone

def get_node_info(tree,node):
    """Print the path, attributes and text of ``node`` (debugging aid)."""
    template = " tree.getpath(node): {path} \n node.items(): {attr} \n node.text: {text}"
    location = tree.getpath(node)
    attributes = node.items()
    print(template.format(path=location, attr=attributes, text=node.text))

def get_ancestor_by_name(root,tree,node,namefield):
    """Return the topmost element on ``node``'s path whose tag is ``namefield``.

    The search runs over the index-free path of ``node`` (which includes
    ``node`` itself). Returns ``False`` when no segment matches. ``root`` is
    unused.
    """
    tags = clean_path(tree.getpath(node)).split('/')[1:]
    if namefield not in tags:
        return False
    levels_up = len(tags) - tags.index(namefield) - 1
    return get_parent(node, levels_up)

############### Update Nodes #############################
def update_wblocal(root,lang):
    """Set the ``locale`` attribute of ``root`` from a language code.

    ``lang`` must be ``'FR'``, ``'EN'`` or ``'ES'``; any other value raises
    ``KeyError``. Returns ``root``.
    """
    locale_code = {'FR':'fr_FR','EN':'en_GB','ES':'es_ES'}[lang]
    root.attrib['locale'] = locale_code
    return root

def update_nodes(df, root):
    """Apply every row of ``df`` to a deep copy of ``root``.

    Each row's ``node_type`` selects the update function to call. The
    original ``root`` is left untouched.

    Returns:
        ``(root_trad, tree_trad)``: the updated copy and a tree wrapping it.

    Raises:
        KeyError: if a row has a ``node_type`` with no registered updater.
    """
    root_trad = deepcopy(root)
    tree_trad = etree.ElementTree(root_trad)
    d_wsname, d_inv_wsname = get_ws_name(root_trad, tree_trad)
    with_worksheets = (d_inv_wsname, root_trad, tree_trad)
    plain = (root_trad, tree_trad)
    # node_type -> (updater, extra positional arguments after the row)
    dispatch = {
        "customized-tooltip": (update_customized_node, with_worksheets),
        "customized-label": (update_customized_node, with_worksheets),
        "column": (update_caption_node, plain),
        "zone": (update_zone_node, plain),
        "alias": (update_alias_node, plain),
        "member": (update_member_node, plain),
        "format_prefix": (update_format_node, plain),
        "format_suffix": (update_format_node, plain),
        "title_format": (update_title_format_node, plain),
    }
    for _, row in df.iterrows():
        updater, extra_args = dispatch[row['node_type']]
        updater(row, *extra_args)
    return root_trad, tree_trad

def update_title_format_node(row,root,tree):
    """Overwrite the ``value`` of the title ``format`` nodes selected by ``row``.

    ``row['node_id']`` is the literal of a ``(worksheet name, field)`` pair;
    matching nodes receive ``row.to_replace``. Returns ``(root, tree)``.
    """
    candidate_paths, _ = get_brute_path(root, tree, '//format[@attr="title"][@value]')
    ws_id, fm_id = literal_eval(row['node_id'])
    filters = {'worksheet': ('name', ws_id), 'format': ('field', fm_id)}
    for candidate in candidate_paths:
        for target in iterpath(root, tree, candidate, filters):
            print("Avant title_format: ", target.attrib['value'])
            target.attrib['value'] = row.to_replace
            print("Après title_format:", target.attrib['value'])
    return root, tree

def update_format_node(row,root,tree):
    """Replace the quoted prefix or suffix of the ``text-format`` nodes selected by ``row``.

    Args:
        row: record with ``node_id`` (literal of a ``(worksheet name, field)``
            pair), ``node_type`` (``'format_prefix'`` or ``'format_suffix'``)
            and ``to_replace`` (the new text).
        root: root element to search from.
        tree: element tree owning ``root``.

    Returns:
        ``(root, tree)``.

    Raises:
        KeyError: if ``row.node_type`` is neither ``'format_prefix'`` nor
            ``'format_suffix'``.
    """
    node_id = literal_eval(row['node_id'])
    raw = r"(?P<type>^.)(?:\"(?P<prefix>[^\"]*)\"){0,1}(?P<format>[^\"]*)(?:\"(?P<suffix>[^\"]*)\"){0,1};(?P<negformat>.)(?:\"(?P=prefix)\"){0,1}(?P=format)(?:\"(?P=suffix)\"){0,1}"
    regex = re.compile(raw)
    xpath = '//format[@attr="text-format"][@field]'
    brute_paths,_ = get_brute_path(root,tree,xpath)
    ws_id,fm_id = node_id
    filters = {'worksheet':('name',ws_id),'format':('field',fm_id)}
    group_name = {'format_prefix': 'prefix', 'format_suffix': 'suffix'}[row.node_type]
    for brute_path in brute_paths:
        # The regex and the filter keep their own names for the whole loop,
        # so every node and every path sees the same ones.
        for node in iterpath(root,tree,brute_path,filters):
            node_text = node.attrib.get('value')
            if node_text is None:
                continue
            match = regex.search(node_text)
            if match is None:
                continue
            old_text = match.group(group_name)
            if not old_text:
                # Nothing quoted on that side of the format: nothing to swap.
                continue
            print("Avant format: ", node_text)
            # The pattern requires the same quoted text in both halves of the
            # format, so replacing every occurrence updates both.
            node.attrib['value'] = node_text.replace(old_text, row.to_replace)
            print("Après format:", node.attrib['value'])
    return root,tree

def update_alias_node(row,root,tree):
    """Overwrite the ``value`` of the ``alias`` nodes selected by ``row``.

    ``row['node_id']`` is the literal of a
    ``(datasource name, column name, alias key)`` triple; matching nodes
    receive ``row.to_replace``. Returns ``(root, tree)``.
    """
    ds_id, col_id, alias_id = literal_eval(row['node_id'])
    candidate_paths, _ = get_brute_path(root, tree, "//alias")
    filters = {
        'datasource': ('name', ds_id),
        'datasource-dependencies': ('datasource', ds_id),
        'column': ('name', col_id),
        'alias': ('key', alias_id),
    }
    for candidate in candidate_paths:
        for target in iterpath(root, tree, candidate, filters):
            print("Avant alias: ", target.attrib['value'])
            target.attrib['value'] = row.to_replace
            print("Après Alias:", target.attrib['value'])
    return root,tree

def iterpath(node,tree,rightpath,dic,nodes=None):
    """Collect the nodes at the end of ``rightpath`` that satisfy ``dic``.

    The tree is walked top-down, one path segment per recursion level. A
    branch is abandoned as soon as a node whose tag is a key of ``dic`` does
    not carry the expected attribute value.

    Args:
        node: element to start from; its tag must be the first segment of
            ``rightpath``.
        tree: element tree owning ``node`` (used for ``getpath``).
        rightpath: index-free, ``/``-separated path still to be walked.
        dic: mapping ``tag -> (attribute name, expected value)``. It must
            contain the tag of the last path segment.
        nodes: accumulator used by the recursion; leave it unset when calling.

    Returns:
        The list of matching end nodes (possibly empty).

    Raises:
        KeyError: if the last tag of the path is not a key of ``dic``.
    """
    if nodes is None:
        nodes = []
    node_tag = node.tag
    if len(rightpath.split('/')) > 1:
        # Split on the first occurrence only: a path that nests the same tag
        # (e.g. zone/zone/run) otherwise yields more than two pieces.
        _, rightpath = rightpath.split("{}/".format(node_tag), 1)
        if rightpath:
            next_tag, *_ = rightpath.split('/')
            xpath = "{}/{}".format(tree.getpath(node), next_tag)
            if node_tag in dic:
                key, value = dic[node_tag]
                # A missing attribute counts as a mismatch, as at the leaf.
                if node.attrib.get(key) != value:
                    return nodes
            for child_node in node.xpath(xpath):
                iterpath(child_node, tree, rightpath, dic, nodes)
    else:
        # Last segment of the path given to the first call.
        key, key_id = dic[node_tag]
        try:
            if node.attrib[key] == key_id:
                nodes.append(node)
        except KeyError:
            print("KeyError:", node.items())
    return nodes


def update_caption_node(row, root_trad, tree_trad):
    """Overwrite the ``caption`` of every element named ``row['node_id']``.

    The new caption is the last value of ``row``, read by position. A value
    that the element rejects is reported on stdout and skipped.

    Returns:
        ``(root_trad, tree_trad)``.
    """
    node_id = row['node_id']
    # Read by position explicitly: integer keys on a label-indexed Series are
    # not a reliable way to reach the last value.
    replacement = row.iloc[-1]
    # Pass the name as an XPath variable so quotes inside it cannot break the
    # expression.
    for node in root_trad.xpath('//*[@name=$name][@caption]', name=node_id):
        print("Avant caption:", node.get('caption'))
        try:
            node.attrib["caption"] = replacement
        except (TypeError, ValueError) as e:
            print("Exception", e)
            continue
        print("Après caption:", node.get('caption'))
    return root_trad, tree_trad

def update_customized_node(row, d_inv_wsname, root_trad, tree_trad):
    """Overwrite the text of the tooltip/label runs selected by ``row``.

    Uses the last column of the dataframe row as the replacement text (to be
    improved).

    Args:
        row: record with ``node_type`` (the customized tag) and ``node_id``
            (literal of a ``(worksheet name, pane id, run index)`` triple).
            An empty pane id selects every pane; a run index of ``'u'``
            selects every run.
        d_inv_wsname: mapping from worksheet name to worksheet path segment.
        root_trad: root element to update.
        tree_trad: element tree owning ``root_trad``.

    Returns:
        ``(root_trad, tree_trad)``.

    Raises:
        KeyError: if the worksheet name is missing from ``d_inv_wsname``.
    """
    node_type = row['node_type']
    wsname, pane_id, run_index = literal_eval(row['node_id'])[:3]
    ws = d_inv_wsname[wsname]
    if pane_id:
        xpath_pane = '//worksheets/{worksheet}/table/panes/pane[@id={pane_id}]'.format(worksheet=ws,pane_id=pane_id)
    else:
        xpath_pane = '//worksheets/{worksheet}/table/panes/pane'.format(worksheet=ws)
    if run_index != 'u':
        xpath_run = '//{node_type}/formatted-text/run[{run_index}]'.format(node_type=node_type,run_index=run_index)
    else:
        xpath_run = '//{node_type}/formatted-text/run'.format(node_type=node_type)
    # Read by position explicitly: integer keys on a label-indexed Series are
    # not a reliable way to reach the last value.
    replacement = row.iloc[-1]
    for run_node in root_trad.xpath(xpath_pane+xpath_run):
        print('Avant run:', run_node.text)
        try:
            run_node.text = replacement
        except (TypeError, ValueError) as e:
            print(">",repr(e))
            continue
        print('Après run:', run_node.text)
    return root_trad, tree_trad

def update_zone_node(row, root_trad, tree_trad):
    """Overwrite the text of the dashboard zone runs selected by ``row``.

    ``row['node_id']`` is the literal of a ``(zone id, run index)`` pair; a
    run index of ``'u'`` selects every run of the zone. The new text is the
    last value of ``row``, read by position.

    Returns:
        ``(root_trad, tree_trad)``.
    """
    zone_id, run_index = literal_eval(row['node_id'])[:2]
    if run_index != 'u':
        xpath_run = '/workbook/dashboards/dashboard/zones//zone[@id={zone_id}]/formatted-text/run[{run_index}]'.format(zone_id=zone_id,run_index=run_index)
    else:
        xpath_run = '/workbook/dashboards/dashboard/zones//zone[@id={zone_id}]/formatted-text/run'.format(zone_id=zone_id)
    # Read by position explicitly: integer keys on a label-indexed Series are
    # not a reliable way to reach the last value.
    replacement = row.iloc[-1]
    for run_node in root_trad.xpath(xpath_run):
        print('Avant run:', run_node.text)
        try:
            run_node.text = replacement
        except (TypeError, ValueError) as e:
            print(">", repr(e))
            continue
        print('Après run:', run_node.text)
    return root_trad, tree_trad

def update_member_node(row, root, tree):
    """Overwrite the ``alias`` of the ``member`` nodes selected by ``row``.

    ``row['node_id']`` is the literal of a
    ``(datasource name, column name, member value)`` triple; matching nodes
    receive ``row.to_replace``. Returns ``(root, tree)``.
    """
    ds_id, col_id, member_id = literal_eval(row['node_id'])
    candidate_paths, _ = get_brute_path(root, tree, "//member")
    filters = {
        'datasource': ('name', ds_id),
        'column': ('name', col_id),
        'member': ('value', member_id),
    }
    for candidate in candidate_paths:
        for target in iterpath(root, tree, candidate, filters):
            print("Avant member: ", target.attrib['alias'])
            target.attrib['alias'] = row.to_replace
            print("Après member:", target.attrib['alias'])
    return root,tree



# Write to file

def replace_apostroph(filepath):
    """Write a re-quoted copy of ``filepath`` next to it.

    The copy is named by replacing the ``.twb`` ending with ``2.twb``. On
    every line, in this order: apostrophes become ``&apos;``, double quotes
    become apostrophes, and ``/>`` becomes `` />``.
    """
    filepath2 = filepath.split('.twb')[0]+'2.twb'
    # rootToXml writes UTF-8, so read and write with that encoding instead of
    # the platform default.
    with open(filepath, encoding='utf-8') as fp_in, \
            open(filepath2, 'w', encoding='utf-8') as fp_out:
        for line in fp_in:
            # Escape existing apostrophes first so that they are not confused
            # with the attribute delimiters introduced by the next step.
            fp_out.write(line.replace("'", '&apos;').replace('"', "'").replace('/>',' />'))

def rootToXml(view_name,output_dir, language_target, tree_trad):
    """Write ``tree_trad`` to ``<output_dir>/<view_name>_<language_target>.twb``.

    The file is written UTF-8 encoded and pretty-printed, and the destination
    path is printed afterwards.
    """
    file_name = '{view}_{languageTarget}.twb'.format(view=view_name, languageTarget=language_target)
    destination = os.path.join(output_dir, file_name)
    tree_trad.write(destination, encoding='utf-8', pretty_print=True)
    print("Writed to {filepath}".format(filepath=destination))

python

python-3.x

design-patterns

lxml

回答 1

Code Review用户

发布于 2019-11-05 15:26:09

根据我对Python文档的看法/经验和研究，我建议改进以下方法：

首先，namedtuple必须与大写名称一起使用，就像类一样。

Textual_Node = namedtuple('Textual_Node', ['id', 'label', 'type'])

这有助于确定创建新数据类对象的操作。namedtuple() Python文档

第二，尝试将与一个数据函数相关的所有函数组合到类中。

例如，所有以 root 作为参数的函数都可以合并到一个类中，改造之后它们通过 self 访问属性 root。它可以像这样：

class ClassName():
    """Example of grouping the functions that share ``root`` into one class."""

    def __init__(self, root):
        # Store the root so that the methods can reach it through self.
        self.root = root

    def get_caption(self):
        # Placeholder body: stands for whatever the function did with root.
        do_somethink_with_root(self.root)
        ...

第三，用空行把各个相关的代码块分隔开。

第四，使用logging模块保存有关脚本执行的信息。

请尝试在代码中使用代码检查工具（linter）。例如，可以用 pylint 或 flake8 让代码风格更符合 Python 习惯，用 pydocstyle 让代码和文档字符串更易于理解。按照 Google 的 Python 代码风格指南的要求，文档字符串中只使用英语。

票数 6

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/231839

复制

相似问题

问解析xml文件，设计我的代码的想法
EN

回答 1

Code Review用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问解析xml文件，设计我的代码的想法EN

回答 1

Code Review用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问解析xml文件，设计我的代码的想法
EN