首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >用于转换基因组数据文件的Python命令行程序

用于转换基因组数据文件的Python命令行程序
EN

Code Review用户
提问于 2017-04-05 13:15:54
回答 1查看 88关注 0票数 3

背景:

我编写了这段代码,用来转换从一款名为Geneious的软件导出的、包含SNP的.csv文件,并将这些SNP拼接成DNA序列。

因此,基本上是从.csv文件中获取字段来创建字符串。

代码本身只是一组各自完成小任务的函数,其中一些函数会调用其他函数,最后将结果写入文件。我之所以使用argparse,是因为这将是一个命令行工具:既能声明必需的参数,又能为其余参数提供默认值,这很有用。

我在编码方面没有经验,也没有人来审查我的代码。我觉得每个函数都要把所有参数重新传一遍,这非常别扭。

我的问题:

这是最好的结构吗?像这样创建函数“链”是最佳实践吗?

代码语言:python
复制
import argparse
import collections
import csv


def cleaning(file_as_list, snp, names):
    """Read the CSV export and return only the rows flagged as SNPs.

    Args:
        file_as_list: Path of the CSV file to read.
        snp: Zero-based index of the column that holds '1' for a SNP row.
        names: Zero-based index of the column that holds the genome names.

    Returns:
        A list of rows (lists of strings) whose ``snp`` column equals '1'.
        In every returned row the ``names`` cell is replaced by a list of
        names (see the comment in the loop for the exact splitting rule).

    Raises:
        IndexError: If a non-blank row has no ``snp`` column, or a SNP row
            has no ``names`` column.
    """
    # The csv module asks for newline='' so that the reader itself deals
    # with line endings, including newlines embedded in quoted fields.
    with open(file_as_list, 'r', newline='') as input_file:
        # A blank line is parsed as an empty row; skip it rather than
        # failing with IndexError on row[snp].
        snp_rows = [row for row in csv.reader(input_file)
                    if row and row[snp] == '1']
    for row in snp_rows:
        # ':' is treated like ',', the space after a comma is dropped, the
        # cell is split on ',' and any piece still containing a space is
        # discarded.
        raw_names = row[names].replace(':', ',').replace(', ', ',')
        row[names] = [name for name in raw_names.split(',')
                      if ' ' not in name]
    return snp_rows


def reference_dic(file_as_list, snp, names, col_ref, pos):
    """Map each SNP position to the value found in the reference column.

    The position cell is converted to ``int`` after removing any ','
    characters from it. When the same position occurs in several rows,
    the last row wins.
    """
    return {
        int(row[pos].replace(',', '')): row[col_ref]
        for row in cleaning(file_as_list, snp, names)
    }


def pos_list(file_as_list, snp, names, col_ref, pos):
    """Return every position known to the reference, in ascending order."""
    positions = reference_dic(file_as_list, snp, names, col_ref, pos)
    # Iterating a dict yields its keys, so sorting it sorts the positions.
    return sorted(positions)


def genomes_list(file_as_list, snp, names, col_ref, pos):
    """Return the distinct genome names that appear in the SNP rows.

    ``col_ref`` and ``pos`` are accepted but not used by this function.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    # A dict is used as a set of keys: each name is kept once.
    unique_names = dict.fromkeys(
        name for row in snp_rows for name in row[names]
    )
    return list(unique_names)


def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Collect, for every genome, the SNP rows that mention it.

    Returns:
        A pair ``(as_lists, as_dicts)``. Both are lists of
        ``(genome_name, data)`` tuples in the order given by
        ``genomes_list``. In ``as_lists`` the data is a list of
        ``[position, value]`` pairs (repeated positions are all kept); in
        ``as_dicts`` it is a ``{position: value}`` dict (for a repeated
        position the last row wins). ``value`` is the ``col_genome`` cell.
    """
    snp_rows = cleaning(file_as_list, snp, names)
    genome_names = genomes_list(file_as_list, snp, names, col_ref, pos)
    as_lists = []
    as_dicts = []
    for genome in genome_names:
        pairs = []
        lookup = {}
        for row in snp_rows:
            if genome not in row[names]:
                continue
            position = int(row[pos].replace(',', ''))
            nucleotide = row[col_genome]
            pairs.append([position, nucleotide])
            lookup[position] = nucleotide
        as_lists.append((genome, pairs))
        as_dicts.append((genome, lookup))
    return as_lists, as_dicts


def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
    """Return the sorted positions, minus those repeated within a genome.

    A position is dropped for every genome as soon as any single genome
    lists it more than once.
    """
    per_genome = identify_genomes(file_as_list, snp, names, col_ref,
                                  pos, col_genome)[0]
    repeated = set()
    for _genome, pairs in per_genome:
        counts = collections.Counter(position for position, _value in pairs)
        repeated.update(position for position, seen in counts.items()
                        if seen > 1)
    every_position = pos_list(file_as_list, snp, names, col_ref, pos)
    return [position for position in every_position
            if position not in repeated]


def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the reference sequence as one string.

    The reference values of all positions returned by ``remove_dupli_pos``
    are concatenated in that order.
    """
    reference = reference_dic(file_as_list, snp, names, col_ref, pos)
    kept_positions = remove_dupli_pos(file_as_list, snp, names, col_ref,
                                      pos, col_genome)
    return "".join(str(reference[position]) for position in kept_positions)


def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build one sequence per genome, padding missing positions with 'N'.

    Returns:
        A flat list that alternates a ``">name"`` header line with the
        sequence string of that genome.
    """
    reference = reference_dic(file_as_list, snp, names, col_ref, pos)
    kept_positions = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
                                      col_genome)
    per_genome = identify_genomes(file_as_list, snp, names, col_ref, pos,
                                  col_genome)[1]
    fasta_lines = []
    for genome_name, calls in per_genome:
        pieces = []
        for position in kept_positions:
            if position in calls:
                pieces.append(str(calls[position]))
            elif position in reference:
                # The genome has no SNP here, so the position is masked.
                pieces.append('N')
            else:
                # NOTE(review): kept_positions is derived from the same
                # reference mapping, so this branch looks unreachable -
                # confirm. If it ever ran, the truncated sequence would
                # still be appended below.
                print("ERROR!!!!")
                break
        fasta_lines.append(">{0}".format(genome_name))
        fasta_lines.append("".join(pieces))
    return fasta_lines


def main(file_as_list, snp, names, col_ref, pos, col_genome,
         output="files.fasta"):
    """Write the reference and every genome sequence in FASTA format.

    Args:
        file_as_list: Path of the input CSV file.
        snp: Zero-based index of the column flagging SNP rows.
        names: Zero-based index of the column with the genome names.
        col_ref: Zero-based index of the reference nucleotide column.
        pos: Zero-based index of the position column.
        col_genome: Zero-based index of the genome nucleotide column.
        output: Path of the file to write. Defaults to "files.fasta",
            the name that used to be hard-coded.
    """
    ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
    genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
    # The reference goes first, followed by the alternating header and
    # sequence lines returned by get_genomes.
    fasta_lines = [">reference_sequence", ref_genome]
    fasta_lines.extend(genomes)
    with open(output, "w") as out_file:
        out_file.writelines("{0}\n".format(line) for line in fasta_lines)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="name of the input file")
    parser.add_argument("-r", "--col_ref_genome_nuc", default=2,
                        help="""number of the column with the reference genome
                        nucleotides""")
    parser.add_argument("-g", "--col_genomes_nuc", default=8,
                        help="""number of the column with the genomes
                        nucleotides""")
    parser.add_argument("-p", "--position", default=3,
                        help="""number of the column with the position in the
                        genome""")
    parser.add_argument("-n", "--genome_names", default=10,
                        help="number of the column with the genomes names")
    parser.add_argument("-s", "--is_snp", default=7,
                        help="number of the column with lenght")
    args = parser.parse_args()
    print("""Columns:\n[Reference genome:{0}]\n[Genomes:{1}]
[Position of the SNP:{2}]\n[Genomes name:{3}]
[Is SNP:{4}]""" .format(args.col_ref_genome_nuc, args.col_genomes_nuc,
                        args.position, args.genome_names, args.is_snp))
    col_ref = int(args.col_ref_genome_nuc) - 1
    col_genome = int(args.col_genomes_nuc) - 1
    pos = int(args.position) - 1
    names = int(args.genome_names) - 1
    snp = int(args.is_snp) - 1
    file_as_list = str(args.input)

    print("\nProcessing...")
    main(file_as_list, snp, names, col_ref, pos, col_genome)
    print("\nJob Done. Output written as <files.fasta>")
EN

回答 1

Code Review用户

回答已采纳

发布于 2017-04-05 13:54:53

这听起来是一个很好的用例,可以使用一个类,在这个类中,当前通过函数链传递的参数将是类属性,例如,如果我们有一个Converter类,我们可能会以这样的方式初始化它:

代码语言:python
复制
class Converter:
    """Hold the input file name and the column settings in one place.

    The values are stored unchanged as instance attributes so that
    methods can read them from ``self`` instead of taking them as
    arguments.
    """

    def __init__(self, filename, snp, names, col_ref, pos, col_genome):
        # Tuple assignment binds left to right, so the attributes are
        # created in the same order as the parameters.
        self.filename, self.snp, self.names = filename, snp, names
        self.col_ref, self.pos, self.col_genome = col_ref, pos, col_genome

然后,您的函数将成为实例方法,您将通过self.<attribute>访问实例属性,而不是使用参数。

将类看作是对相关事物进行逻辑分组的一种方式,提供对公共变量和方法的共享访问。

还有其他一些需要改进的地方:

  • 您可以在定义参数时使用type=int,而不必事后手动把参数转换为int
  • 您可以在多个地方使用字典推导式和列表推导式
  • 您可以使用str.join() --例如,在定义reference_snps_list时:reference_snps_list = "".join(str(ref_dic[i]) for i in pos_no_dup)
  • 您可以对输入文件参数使用特殊的argparse.FileType

FYI,因为这是一个普遍存在争议的话题:

票数 2
EN
页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://codereview.stackexchange.com/questions/159911

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档