首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >解析xml文件,设计我的代码的想法

解析xml文件,设计我的代码的想法
EN

Code Review用户
提问于 2019-11-04 15:14:25
回答 1查看 134关注 0票数 1

我开始为一家没有足够时间进行代码评审的公司工作。作为一个初级程序员,我想通过实际工作案例提高自己的技能。我被要求把法文文本替换为英文,以便翻译“Tableau 工作簿”文件(这些只是 XML 文件)。

我分4个步骤工作(因此我编写了4个可以独立运行的文件):

  1. 从“.twb”文件加载文本字段,将其转储到数据库中
  2. 从数据库中加载文本字段,并将它们写入excel工作表中,由“语言技术团队”翻译。
  3. 在数据库中转储已翻译的字段
  4. 根据已翻译的字段编写一个新的“.twb”文件。

我的大部分工作都是从.twb文件(XML )中检索有趣的字段。因此,我编写了一个助手文件,该文件具有检索/获取感兴趣的节点的功能,以及其他更新XML文件中节点的函数( XML文件可以看作是一棵有节点的树)。下面是我的助手文件的摘录,让您了解我想要完成的任务。

我想我的设计很糟糕,如果你能给我一些建议(OO设计等),我会很高兴的。或任何建议(书籍.)帮助我提高我的设计技能。

代码语言:javascript
复制
from lxml import etree
import os
from copy import deepcopy
from collections import namedtuple
from ast import literal_eval
import re

####### Global variables ######
# Record describing one translatable text: `id` locates the node in the
# workbook, `label` holds its text and `type` names the kind of node.
textual_node = namedtuple('textual_node', ['id', 'label', 'type'])
# A run whose stripped text equals one of these entries is skipped by
# get_tooltip_label and get_zone.
stop_words = ['<', '>', 'Æ', '', '\n', ':', '()']


####### Parsing xml #######
def get_root_tree(file_source):
    """Parse the XML document at ``file_source``.

    Returns:
        A ``(tree, root)`` pair: the parsed element tree and its root element.
    """
    parsed = etree.parse(file_source)
    return parsed, parsed.getroot()

####### Utilities #########
def get_parent(node,depth):
    """Return the ancestor reached by calling ``getparent()`` ``depth`` times.

    A ``depth`` of 0 returns ``node`` itself.
    """
    current = node
    for _ in range(depth):
        current = current.getparent()
    return current

def get_brute_path(root,tree,xpath):
    """List the distinct index-free paths of the nodes selected by ``xpath``.

    Returns:
        ``(brute_paths, paths)``: the distinct paths with positional indexes
        removed (via ``clean_path``), in order of first appearance, and for
        each of them the full path of the first node that produced it.
    """
    distinct_brute = []
    first_full = []
    for match in root.xpath(xpath):
        full_path = tree.getpath(match)
        stripped = clean_path(full_path)
        if stripped in distinct_brute:
            continue
        first_full.append(full_path)
        distinct_brute.append(stripped)
    return distinct_brute, first_full

def clean_path(path):
    """Strip everything from the first ``[`` onward in each '/'-separated segment.

    Example: ``/workbook/worksheets/worksheet[2]`` becomes
    ``/workbook/worksheets/worksheet``.
    """
    return '/'.join(segment.partition('[')[0] for segment in path.split('/'))

def get_ws_name(root, tree):
    """Build both mappings between worksheet path segments and worksheet names.

    The id of a worksheet is the last segment of its tree path
    (for example ``worksheet[2]``).

    Returns:
        ``(id_to_name, name_to_id)`` dictionaries.
    """
    id_to_name = {}
    name_to_id = {}
    for sheet in root.xpath('worksheets/worksheet[@name]'):
        sheet_id = tree.getpath(sheet).rsplit('/', 1)[-1]
        sheet_name = sheet.get('name')
        id_to_name[sheet_id] = sheet_name
        name_to_id[sheet_name] = sheet_id
    return id_to_name, name_to_id

####### Get nodes of interest #########
def get_alias(root):
    """Collect one ``textual_node`` per datasource column alias.

    The node id is ``(datasource name, column name, alias key)``, the label is
    the alias ``value`` attribute and the type is ``'alias'``.
    """
    found = set()
    for alias in root.xpath("/workbook/datasources/datasource/column/aliases/alias"):
        identifier = (
            get_parent(alias, 3).attrib['name'],  # enclosing datasource
            get_parent(alias, 2).attrib['name'],  # enclosing column
            alias.attrib['key'],
        )
        found.add(textual_node(identifier, alias.attrib['value'], 'alias'))
    return found

def get_member(root):
    """Collect one ``textual_node`` per column member that carries an alias.

    The node id is ``(datasource name, column name, member value)``, the label
    is the member ``alias`` attribute and the type is ``'member'``.
    """
    found = set()
    for member in root.xpath("/workbook/datasources/datasource/column/members/member[@alias]"):
        identifier = (
            get_parent(member, 3).attrib['name'],  # enclosing datasource
            get_parent(member, 2).attrib['name'],  # enclosing column
            member.attrib['value'],
        )
        found.add(textual_node(identifier, member.attrib['alias'], 'member'))
    return found

def get_title_format(tree,root):
    """Collect one ``textual_node`` per ``format`` node holding a title value.

    The node id is ``(name of the 4th ancestor, field attribute)``, the label
    is the ``value`` attribute and the type is ``'title_'`` plus the node tag.
    ``tree`` is accepted but not used.
    """
    titles = set()
    for fmt in root.xpath('//format[@attr="title"][@value]'):
        identifier = (get_parent(fmt, 4).attrib['name'], fmt.attrib['field'])
        titles.add(textual_node(identifier, fmt.get('value'), 'title_' + fmt.tag))
    return titles





def get_formatted_format(tree,root):
    """Collect the quoted prefix/suffix texts found in ``text-format`` values.

    Each ``format`` node whose ``value`` matches the number-format regex yields
    up to two ``textual_node`` entries (type ``<tag>_prefix`` / ``<tag>_suffix``)
    with id ``(name of the 4th ancestor, field attribute)``. Empty affixes and
    affixes that are just the euro sign are ignored. ``tree`` is not used.
    """
    regex = re.compile(r"(?P<type>^.)(?:\"(?P<prefix>[^\"]*)\"){0,1}(?P<format>[^\"]*)(?:\"(?P<suffix>[^\"]*)\"){0,1};(?P<negformat>.)(?:\"(?P=prefix)\"){0,1}(?P=format)(?:\"(?P=suffix)\"){0,1}")
    collected = set()
    for fmt in root.xpath('//format[@attr="text-format"]'):
        found = regex.search(fmt.attrib['value'])
        if not found:
            continue
        identifier = (get_parent(fmt, 4).attrib['name'], fmt.attrib['field'])
        groups = found.groupdict()
        for part in ('prefix', 'suffix'):
            affix = groups[part]
            if affix and affix.strip() != '€':
                collected.add(textual_node(identifier, affix, "{}_{}".format(fmt.tag, part)))
    return collected

def get_caption(root):
    """Collect one ``textual_node`` per datasource column that has a caption.

    The node id is the column ``name``, the label its ``caption`` and the type
    is the node tag.
    """
    return {
        textual_node(column.get('name'), column.get('caption'), column.tag)
        for column in root.xpath('/workbook/datasources/datasource/column[@caption]')
    }


def get_tooltip_label(root, tree, d_wsname):
    """Collect the translatable runs of customized tooltips and labels.

    Args:
        root: root element of the workbook; must support ``xpath``.
        tree: element tree providing ``getpath(node)``.
        d_wsname: mapping from a worksheet path segment (the 4th item of the
            '/'-split node path, e.g. ``worksheet[1]``) to the worksheet name.

    Returns:
        A set of ``textual_node``. The id is ``(worksheet name, pane id,
        run index)`` where the pane id is ``''`` when the pane has no ``id``
        attribute and the run index is the positional index at the end of the
        run's path, or ``'u'`` when the path carries no index. The label is
        the run text and the type is ``customized-tooltip`` or
        ``customized-label``.

    Raises:
        KeyError: if a worksheet path segment is missing from ``d_wsname``.
    """
    s_tooltip_label = set()
    for node_type in ('customized-tooltip', 'customized-label'):
        ancestor_path = '//worksheets/*/table/panes/pane/{node_type}/formatted-text/run/ancestor::pane'.format(node_type=node_type)
        # Computed once per type; the loop variable is never rebound below, so
        # every pane is queried with the same relative path.
        run_path = "{node_type}/formatted-text/run".format(node_type=node_type)
        for pane in root.xpath(ancestor_path):
            pane_id = pane.attrib.get("id", '')
            for node in pane.xpath(run_path):
                node_text = node.text
                # Runs without text have nothing to translate.
                if node_text is None or node_text.strip() in stop_words:
                    continue
                path = tree.getpath(node)
                ws_name = d_wsname[path.split('/')[3]]
                # Keep every digit of the positional index ("run[12]" -> "12").
                index_match = re.search(r'\[(\d+)\]$', path)
                run_index = index_match.group(1) if index_match else 'u'
                node_id = (ws_name, pane_id, run_index)
                s_tooltip_label.add(textual_node(node_id, node_text, node_type))
    return s_tooltip_label

def get_zone(root,tree):
    """Collect the translatable text runs of dashboard zones.

    Args:
        root: root element of the workbook; must support ``xpath``.
        tree: element tree providing ``getpath(node)``.

    Returns:
        A set of ``textual_node``. The id is ``(zone id, run index)`` where the
        zone id is the ``id`` attribute of the run's grandparent and the run
        index is the positional index at the end of the run's path, or ``'u'``
        when the path carries no index. The label is the run text and the type
        is the tag found three segments from the end of the path, without its
        positional index.
    """
    s_zone = set()
    xpath_run = "/workbook/dashboards/dashboard/zones//zone/formatted-text/run"
    for run_node in root.xpath(xpath_run):
        node_text = run_node.text
        # Runs without text have nothing to translate.
        if node_text is None or node_text.strip() in stop_words:
            continue
        zone_id = get_parent(run_node, 2).attrib['id']
        path = tree.getpath(run_node)
        # Keep every digit of the positional index ("run[12]" -> "12").
        index_match = re.search(r'\[(\d+)\]$', path)
        run_index = index_match.group(1) if index_match else 'u'
        node_type = path.split('/')[-3].split('[')[0]
        s_zone.add(textual_node((zone_id, run_index), node_text, node_type))
    return s_zone

def get_node_info(tree,node):
    """Print the tree path, the attribute items and the text of ``node``."""
    template = " tree.getpath(node): {path} \n node.items(): {attr} \n node.text: {text}"
    print(template.format(path=tree.getpath(node), attr=node.items(), text=node.text))

def get_ancestor_by_name(root,tree,node,namefield):
    """Return the topmost element on ``node``'s path whose tag is ``namefield``.

    The path of ``node`` is stripped of positional indexes and searched from
    the root downwards; ``False`` is returned when no segment matches.
    ``root`` is accepted but not used.
    """
    tags = clean_path(tree.getpath(node)).split('/')[1:]
    if namefield not in tags:
        return False
    levels_up = len(tags) - tags.index(namefield) - 1
    return get_parent(node, levels_up)

############### Update Nodes #############################
def update_wblocal(root,lang):
    """Set the ``locale`` attribute of ``root`` from a language code.

    ``lang`` must be ``'FR'``, ``'EN'`` or ``'ES'``; any other value raises
    ``KeyError``. Returns ``root``.
    """
    locales = {'FR':'fr_FR','EN':'en_GB','ES':'es_ES'}
    locale_code = locales[lang]
    root.attrib['locale'] = locale_code
    return root

def update_nodes(df, root):
    """Apply every row of ``df`` to a deep copy of ``root``.

    Each row is dispatched on its ``node_type`` value to the matching
    ``update_*`` function. An unknown ``node_type`` raises ``KeyError``.

    Returns:
        ``(root_trad, tree_trad)``: the updated copy and an element tree
        wrapping it. ``root`` itself is not modified.
    """
    root_trad = deepcopy(root)
    tree_trad = etree.ElementTree(root_trad)
    _forward, d_inv_wsname = get_ws_name(root_trad, tree_trad)
    customized_args = (d_inv_wsname, root_trad, tree_trad)
    default_args = (root_trad, tree_trad)
    # node_type -> (updater, extra positional arguments passed after the row)
    handlers = {
        "customized-tooltip": (update_customized_node, customized_args),
        "customized-label": (update_customized_node, customized_args),
        "column": (update_caption_node, default_args),
        "zone": (update_zone_node, default_args),
        "alias": (update_alias_node, default_args),
        "member": (update_member_node, default_args),
        "format_prefix": (update_format_node, default_args),
        "format_suffix": (update_format_node, default_args),
        "title_format": (update_title_format_node, default_args),
    }
    for _index, row in df.iterrows():
        updater, extra_args = handlers[row['node_type']]
        updater(row, *extra_args)
    return root_trad, tree_trad

def update_title_format_node(row,root,tree):
    """Write ``row.to_replace`` into the ``value`` of the matching title formats.

    ``row['node_id']`` is a literal ``(worksheet name, field)`` tuple; only
    ``format`` nodes under that worksheet and with that field are updated.
    Returns ``(root, tree)``.
    """
    brute_paths, _ = get_brute_path(root, tree, '//format[@attr="title"][@value]')
    ws_id, fm_id = literal_eval(row['node_id'])
    criteria = {'worksheet': ('name', ws_id), 'format': ('field', fm_id)}
    for brute_path in brute_paths:
        for fmt in iterpath(root, tree, brute_path, criteria):
            print("Avant title_format: ", fmt.attrib['value'])
            fmt.attrib['value'] = row.to_replace
            print("Après title_format:", fmt.attrib['value'])
    return root, tree

def update_format_node(row,root,tree):
    """Replace the quoted prefix or suffix of the matching ``text-format`` values.

    Args:
        row: record with ``node_id`` (literal ``(worksheet name, field)``
            tuple), ``node_type`` (``'format_prefix'`` or ``'format_suffix'``)
            and ``to_replace`` (the new text).
        root: root element to update in place.
        tree: element tree providing ``getpath``.

    Returns:
        ``(root, tree)``.

    Nodes whose value does not match the number-format regex, or that have no
    (or an empty) affix of the requested kind, are left unchanged.
    """
    node_id = literal_eval(row['node_id'])
    raw = r"(?P<type>^.)(?:\"(?P<prefix>[^\"]*)\"){0,1}(?P<format>[^\"]*)(?:\"(?P<suffix>[^\"]*)\"){0,1};(?P<negformat>.)(?:\"(?P=prefix)\"){0,1}(?P=format)(?:\"(?P=suffix)\"){0,1}"
    pattern = re.compile(raw)
    xpath = '//format[@attr="text-format"][@field]' # xpath = "/workbook/worksheets//table/style/style-rule/format[@field][@attr='text-format']"
    brute_paths,_ = get_brute_path(root,tree,xpath)
    ws_id,fm_id = node_id
    dic = {'worksheet':('name',ws_id),'format':('field',fm_id)}
    affix_groups = {'format_prefix': 'prefix', 'format_suffix': 'suffix'}
    for brute_path in brute_paths:
        nodes = iterpath(root,tree,brute_path,dic)
        for node in nodes:
            node_text = node.attrib['value']
            print("Avant format: ", node_text)
            # Separate names keep the compiled regex and the filter dict intact
            # for the following nodes and paths.
            match = pattern.search(node_text)
            if match is None:
                continue
            old_affix = match.group(affix_groups[row.node_type])
            if not old_affix:
                # Nothing to replace; str.replace('') would insert the new
                # text between every character.
                continue
            node.attrib['value'] = node_text.replace(old_affix, row.to_replace)
            print("Après format:", node.attrib['value'])
    return root,tree

def update_alias_node(row,root,tree):
    """Write ``row.to_replace`` into the ``value`` of the matching aliases.

    ``row['node_id']`` is a literal ``(datasource name, column name, alias
    key)`` tuple used to filter the ``alias`` nodes. Returns ``(root, tree)``.
    """
    ds_id, col_id, alias_id = literal_eval(row['node_id'])
    brute_paths, _ = get_brute_path(root, tree, "//alias")
    criteria = {
        'datasource': ('name', ds_id),
        'datasource-dependencies': ('datasource', ds_id),
        'column': ('name', col_id),
        'alias': ('key', alias_id),
    }
    for brute_path in brute_paths:
        for alias in iterpath(root, tree, brute_path, criteria):
            print("Avant alias: ", alias.attrib['value'])
            alias.attrib['value'] = row.to_replace
            print("Après Alias:", alias.attrib['value'])
    return root,tree

def iterpath(node,tree,rightpath,dic,nodes=None):
    """Walk down from ``node`` along ``rightpath`` and collect matching end nodes.

    Args:
        node: element to start from; its tag is expected to occur in
            ``rightpath``.
        tree: object providing ``getpath(node)``, used to build the absolute
            XPath of each step.
        rightpath: '/'-separated tag path that remains to be traversed. The
            callers in this file pass a path produced by ``get_brute_path``.
        dic: maps a tag name to an ``(attribute name, expected value)`` pair.
            Intermediate nodes whose tag is in ``dic`` must match; the tag of
            the last path segment must be present.
        nodes: accumulator shared by the recursive calls; a new list is
            created when omitted.

    Returns:
        The ``nodes`` list, or ``False`` when ``node`` is an intermediate step
        that fails its ``dic`` check.
    """
    if nodes == None:
        # Fresh accumulator for a top-level call.
        nodes =[]
    node_tag = node.tag
    if len(rightpath.split('/'))>1:
        # More than one segment left: ``node`` is an intermediate step.
        # Keep only what follows "<tag>/". NOTE(review): the two-value
        # unpacking raises ValueError unless "<tag>/" occurs exactly once.
        _, rightpath = rightpath.split("{}/".format(node_tag))
        if rightpath:
            next_tag,*_  = rightpath.split('/')
            # Absolute XPath selecting the children of ``node`` with the next tag.
            xpath = "{}/{}".format(tree.getpath(node),next_tag)
            if node_tag in dic.keys():
                key, value = dic[node_tag]
                # Prune this branch when the intermediate node does not match.
                if node.attrib[key] != value:
                    return False
            for child_node in node.xpath(xpath):
                    # The return value is ignored; matches accumulate in ``nodes``.
                    iterpath(child_node, tree, rightpath, dic,nodes)
    else: # last segment of the path: ``node`` is a candidate result
        # Raises KeyError when the last tag has no entry in ``dic``.
        key, key_id = dic[node_tag]
        try:
            if node.attrib[key] == key_id:
                nodes.append(node)
        except KeyError:
            # The candidate lacks the attribute: report it and skip it.
            print("KeyError:",node.items())
    return nodes


def update_caption_node(row, root_trad, tree_trad):
    """Write the translated caption on every captioned node named like the row.

    Args:
        row: pandas Series; ``row['node_id']`` is the ``name`` attribute to
            look for and the last value of the row is the replacement text.
        root_trad: root element to update in place.
        tree_trad: element tree, returned unchanged.

    Returns:
        ``(root_trad, tree_trad)``.
    """
    node_id = row['node_id']
    # Explicit positional access: integer keys on a label-indexed Series are
    # deprecated in pandas.
    new_caption = row.iloc[-1]
    # The name is passed as an XPath variable so quotes inside it cannot break
    # the expression.
    for node in root_trad.xpath('//*[@name=$name][@caption]', name=str(node_id)):
        try:
            print("Avant caption:", node.get('caption'))
            node.attrib["caption"] = new_caption
            print("Après caption:", node.get('caption'))
        except Exception as e:
            # Best effort: report the failure and continue with the next node.
            print("Exception",e)
    return root_trad, tree_trad

def update_customized_node(row, d_inv_wsname, root_trad, tree_trad):
    """Write the translated text into the runs of a customized tooltip or label.

    Args:
        row: pandas Series; ``row['node_type']`` is the element tag,
            ``row['node_id']`` a literal ``(worksheet name, pane id, run
            index)`` tuple and the last value of the row is the replacement
            text (using the last column is a known shortcut to improve).
        d_inv_wsname: mapping from worksheet name to its path segment.
        root_trad: root element to update in place.
        tree_trad: element tree, returned unchanged.

    Returns:
        ``(root_trad, tree_trad)``.
    """
    node_type = row['node_type']
    node_id = literal_eval(row['node_id'])
    wsname = node_id[0]
    pane_id = node_id[1]
    run_index = node_id[2]
    ws = d_inv_wsname[wsname]
    if pane_id:
        xpath_pane = '//worksheets/{worksheet}/table/panes/pane[@id={pane_id}]'.format(worksheet=ws,pane_id=pane_id)
    else:
        # No pane id recorded: address every pane of the worksheet.
        xpath_pane = '//worksheets/{worksheet}/table/panes/pane'.format(worksheet=ws)
    if run_index != 'u':
        xpath_run = '//{node_type}/formatted-text/run[{run_index}]'.format(node_type=node_type,run_index=run_index)
    else:
        # 'u' marks a run whose path had no positional index.
        xpath_run = '//{node_type}/formatted-text/run'.format(node_type=node_type)
    # Explicit positional access: integer keys on a label-indexed Series are
    # deprecated in pandas.
    new_text = row.iloc[-1]
    for run_node in root_trad.xpath(xpath_pane+xpath_run):
        try:
            print('Avant run:', run_node.text)
            run_node.text = new_text
            print('Après run:', run_node.text)
        except Exception as e:
            # Best effort: report the failure and continue with the next run.
            print(">",repr(e))
    return root_trad, tree_trad

def update_zone_node(row, root_trad, tree_trad):
    """Write the translated text into the runs of a dashboard zone.

    Args:
        row: pandas Series; ``row['node_id']`` is a literal ``(zone id, run
            index)`` tuple and the last value of the row is the replacement
            text.
        root_trad: root element to update in place.
        tree_trad: element tree, returned unchanged.

    Returns:
        ``(root_trad, tree_trad)``.
    """
    node_id = literal_eval(row['node_id'])
    zone_id = node_id[0]
    run_index = node_id[1]
    if run_index != 'u':
        xpath_run = '/workbook/dashboards/dashboard/zones//zone[@id={zone_id}]/formatted-text/run[{run_index}]'.format(zone_id=zone_id,run_index=run_index)
    else:
        # 'u' marks a run whose path had no positional index.
        xpath_run = '/workbook/dashboards/dashboard/zones//zone[@id={zone_id}]/formatted-text/run'.format(zone_id=zone_id)
    # Explicit positional access: integer keys on a label-indexed Series are
    # deprecated in pandas.
    new_text = row.iloc[-1]
    for run_node in root_trad.xpath(xpath_run):
        try:
            print('Avant run:', run_node.text)
            run_node.text = new_text
            print('Après run:', run_node.text)
        except Exception as e:
            # Best effort: report the failure and continue with the next run.
            print(">", repr(e))
    return root_trad, tree_trad

def update_member_node(row, root, tree):
    """Write ``row.to_replace`` into the ``alias`` of the matching members.

    ``row['node_id']`` is a literal ``(datasource name, column name, member
    value)`` tuple used to filter the ``member`` nodes. Returns ``(root, tree)``.
    """
    ds_id, col_id, member_id = literal_eval(row['node_id'])
    brute_paths, _ = get_brute_path(root, tree, "//member")
    criteria = {
        'datasource': ('name', ds_id),
        'column': ('name', col_id),
        'member': ('value', member_id),
    }
    for brute_path in brute_paths:
        for member in iterpath(root, tree, brute_path, criteria):
            print("Avant member: ", member.attrib['alias'])
            member.attrib['alias'] = row.to_replace
            print("Après member:", member.attrib['alias'])
    return root,tree



# Write to file

def replace_apostroph(filepath):
    """Rewrite a ``.twb`` file with single-quoted attributes.

    Reads ``filepath`` line by line and writes the result next to it, with
    ``2.twb`` appended to the part of the path that precedes the first
    ``.twb``. In every line apostrophes become ``&apos;``, double quotes
    become apostrophes and ``/>`` becomes `` />``.

    NOTE(review): the apostrophe replacement was an invalid ``'''`` token in
    the source; ``&apos;`` is presumably the entity that was lost — confirm.
    """
    filepath2 = filepath.split('.twb')[0]+'2.twb'
    # Explicit UTF-8 matches the encoding rootToXml writes with.
    with open(filepath, encoding='utf-8') as fp_in, \
            open(filepath2, 'w', encoding='utf-8') as fp_out:
        for line in fp_in:
            fp_out.write(line.replace("'", '&apos;').replace('"', "'").replace('/>',' />'))

def rootToXml(view_name,output_dir, language_target, tree_trad):
    """Write ``tree_trad`` to ``<output_dir>/<view_name>_<language_target>.twb``.

    The tree is written pretty-printed with UTF-8 encoding and the target
    path is printed afterwards. ``replace_apostroph`` is not applied here.
    """
    file_name = '{view}_{languageTarget}.twb'.format(view=view_name, languageTarget=language_target)
    target = os.path.join(output_dir, file_name)
    tree_trad.write(target, encoding='utf-8', pretty_print=True)
    print("Writed to {filepath}".format(filepath=target))
EN

回答 1

Code Review用户

发布于 2019-11-05 15:26:09

根据我对Python文档的看法/经验和研究,我建议改进以下方法:

首先,namedtuple必须与大写名称一起使用,就像类一样。

代码语言:javascript
复制
Textual_Node = namedtuple('Textual_Node', ['id', 'label', 'type'])

这有助于确定创建新数据类对象的操作。namedtuple() Python文档

第二,尝试将与一个数据函数相关的所有函数组合到类中。

例如,所有以 root 作为参数的函数都可以组合到一个类中,改写后通过 self 访问 root 属性。它可以看起来像:

代码语言:javascript
复制
class ClassName():
    """Example of grouping the functions that work on one root into a class."""

    def __init__(self, root):
        # Keep the root so the methods can reach it through ``self``.
        self.root = root

    def get_caption(self):
        """Placeholder method showing access to the stored root."""
        do_somethink_with_root(self.root)
        ...

第三,在各个相关代码块之间留一个空行。

第四,使用logging模块保存有关脚本执行的信息。

请试着在代码中使用 linter(代码检查工具)。例如,可以使用 pylint 或 flake8 来写出更 Pythonic 的代码风格,而 pydocstyle 可以帮助写出更易理解的文档字符串。按照 Google 的 Python 代码风格指南的要求,文档字符串中只使用英语。

票数 6
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/231839

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档