文章/答案/技术大牛

发布

社区首页 >问答首页 >用于转换基因组数据文件的Python命令行程序

问用于转换基因组数据文件的Python命令行程序
EN

Code Review用户

提问于 2017-04-05 13:15:54

回答 1查看 88关注 0票数 3

背景：

我编写了这段代码，用于转换从名为Geneious的软件导出的、包含SNP的.csv文件，并将这些SNP连接成DNA序列。

因此，基本上是从.csv文件中获取字段来创建字符串。

代码本身只是一堆执行小任务的函数，有些函数会调用其他函数，最后将结果写入文件。我之所以使用argparse，是因为这将是一个命令行工具；与其他命令行工具一样，提供必需参数和默认值是很有用的。

我在编码方面没有经验，也没有人来审查我的代码。我觉得每个函数都要重复传入所有参数，这种写法非常别扭。

我的问题：

这是最好的结构吗？像这样把函数串成“链”来调用是最佳实践吗？

代码

import argparse
import collections
import csv


def cleaning(file_as_list, snp, names):
    """Read the exported CSV and return only the rows flagged as SNPs.

    Args:
        file_as_list: Path of the CSV file to read (despite its name, this
            is a file name, not a list).
        snp: Zero-based index of the column whose value must equal ``'1'``
            for a row to be kept.
        names: Zero-based index of the column holding the genome names.

    Returns:
        The kept rows (lists of strings) in file order.  In every kept row
        the ``names`` cell is replaced by a list of names: ``':'`` is
        treated like ``','``, the cell is split on ``','`` and any piece
        that still contains a space is dropped.

    Rows too short to contain both columns -- for example the empty list
    that ``csv.reader`` yields for a blank line -- are skipped instead of
    raising ``IndexError``.
    """
    last_needed = max(snp, names)
    # newline='' leaves newline handling to the csv module, so quoted
    # fields containing line breaks are parsed correctly.
    with open(file_as_list, 'r', newline='') as input_file:
        have_SNP = [
            row for row in csv.reader(input_file)
            if len(row) > last_needed and row[snp] == '1'
        ]
    for row in have_SNP:
        mult_names = row[names].replace(':', ',').replace(', ', ',')
        row[names] = [
            name for name in mult_names.split(',') if ' ' not in name
        ]
    return have_SNP


def reference_dic(file_as_list, snp, names, col_ref, pos):
    """Map every SNP position to the reference nucleotide at that position.

    The position cell may contain thousands separators (``','``); they are
    stripped before the value is converted to ``int``.  If a position occurs
    on several rows, the last row wins.
    """
    return {
        int(row[pos].replace(',', '')): row[col_ref]
        for row in cleaning(file_as_list, snp, names)
    }


def pos_list(file_as_list, snp, names, col_ref, pos):
    """Return all SNP positions found in the reference, in ascending order."""
    # Iterating the dict yields its keys, i.e. the integer positions.
    return sorted(reference_dic(file_as_list, snp, names, col_ref, pos))


def genomes_list(file_as_list, snp, names, col_ref, pos):
    """Return each genome name found in the input once, in first-seen order.

    ``col_ref`` and ``pos`` are accepted but not used by this function.
    """
    rows = cleaning(file_as_list, snp, names)
    # A dict keeps insertion order, so it de-duplicates the names while
    # preserving the order in which they first appear.
    unique_names = dict.fromkeys(
        genome for row in rows for genome in row[names]
    )
    return list(unique_names)


def identify_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Collect, for every genome, the SNP rows in which it is named.

    Returns:
        A pair ``(as_lists, as_dicts)``.  Both are lists with one
        ``(genome_name, data)`` tuple per genome, in the order given by
        ``genomes_list``:

        * in ``as_lists`` the data is a list of ``[position, nucleotide]``
          pairs, one per matching row (repeated positions are kept);
        * in ``as_dicts`` the data is a ``{position: nucleotide}`` dict, so
          a repeated position keeps only its last nucleotide.
    """
    rows = cleaning(file_as_list, snp, names)
    as_lists = []
    as_dicts = []
    for genome in genomes_list(file_as_list, snp, names, col_ref, pos):
        hits = [
            [int(row[pos].replace(',', '')), row[col_genome]]
            for row in rows
            if genome in row[names]
        ]
        as_lists.append((genome, hits))
        # dict() over the [position, nucleotide] pairs: later duplicates
        # overwrite earlier ones.
        as_dicts.append((genome, dict(hits)))
    return as_lists, as_dicts


def remove_dupli_pos(file_as_list, snp, names, col_ref, pos, col_genome):
    """Return the sorted SNP positions that no genome reports more than once.

    A position is dropped for *all* genomes as soon as a single genome has
    two or more entries at that position.
    """
    per_genome = identify_genomes(file_as_list, snp, names, col_ref,
                                  pos, col_genome)[0]
    repeated = set()
    for _genome, entries in per_genome:
        occurrences = collections.Counter(entry[0] for entry in entries)
        repeated.update(
            position for position, count in occurrences.items() if count > 1
        )
    every_position = pos_list(file_as_list, snp, names, col_ref, pos)
    return [
        position for position in every_position if position not in repeated
    ]


def get_ref(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build the reference sequence from the non-duplicated SNP positions.

    The reference nucleotides are concatenated in the order returned by
    ``remove_dupli_pos``.
    """
    ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
    pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref,
                                  pos, col_genome)
    return "".join(str(ref_dic[position]) for position in pos_no_dup)


def get_genomes(file_as_list, snp, names, col_ref, pos, col_genome):
    """Build a FASTA header line and a sequence line for every genome.

    For each non-duplicated position the genome's own nucleotide is used
    when it has one; otherwise ``'N'`` is written.

    Returns:
        A flat list alternating ``'>name'`` header strings and sequence
        strings, in the genome order given by ``identify_genomes``.
    """
    ref_dic = reference_dic(file_as_list, snp, names, col_ref, pos)
    pos_no_dup = remove_dupli_pos(file_as_list, snp, names, col_ref, pos,
                                  col_genome)
    genomes_pos = identify_genomes(file_as_list, snp, names, col_ref, pos,
                                   col_genome)[1]
    fasta_lines = []
    for genome_name, calls in genomes_pos:
        bases = []
        for position in pos_no_dup:
            if position in calls:
                bases.append(str(calls[position]))
            elif position in ref_dic:
                bases.append('N')
            else:
                # Position unknown to both the genome and the reference:
                # report it and stop extending this genome's sequence.
                print("ERROR!!!!")
                break
        fasta_lines.append(">{0}".format(genome_name))
        fasta_lines.append("".join(bases))
    return fasta_lines


def main(file_as_list, snp, names, col_ref, pos, col_genome):
    """Write the reference and every genome to 'files.fasta' in FASTA format.

    The reference record (header ``>reference_sequence``) comes first,
    followed by the header/sequence lines produced by ``get_genomes``.
    """
    ref_genome = get_ref(file_as_list, snp, names, col_ref, pos, col_genome)
    genomes = get_genomes(file_as_list, snp, names, col_ref, pos, col_genome)
    records = [">reference_sequence", ref_genome, *genomes]
    with open("files.fasta", "w") as out_file:
        out_file.writelines("{0}\n".format(line) for line in records)


if __name__ == '__main__':
    # Command-line entry point.  Column options are 1-based on the command
    # line and converted to 0-based list indices before calling main().
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        help="name of the input file")
    # type=int makes argparse reject non-numeric values with a usage
    # message instead of failing later with a ValueError traceback.
    parser.add_argument("-r", "--col_ref_genome_nuc", type=int, default=2,
                        help="""number of the column with the reference genome
                        nucleotides""")
    parser.add_argument("-g", "--col_genomes_nuc", type=int, default=8,
                        help="""number of the column with the genomes
                        nucleotides""")
    parser.add_argument("-p", "--position", type=int, default=3,
                        help="""number of the column with the position in the
                        genome""")
    parser.add_argument("-n", "--genome_names", type=int, default=10,
                        help="number of the column with the genomes names")
    parser.add_argument("-s", "--is_snp", type=int, default=7,
                        help="number of the column with length")
    args = parser.parse_args()

    # A column number below 1 would turn into a negative index and silently
    # address columns from the end of each row.
    column_numbers = (args.col_ref_genome_nuc, args.col_genomes_nuc,
                      args.position, args.genome_names, args.is_snp)
    if min(column_numbers) < 1:
        parser.error("column numbers are 1-based and must be >= 1")

    print("""Columns:\n[Reference genome:{0}]\n[Genomes:{1}]
[Position of the SNP:{2}]\n[Genomes name:{3}]
[Is SNP:{4}]""" .format(args.col_ref_genome_nuc, args.col_genomes_nuc,
                        args.position, args.genome_names, args.is_snp))
    col_ref = args.col_ref_genome_nuc - 1
    col_genome = args.col_genomes_nuc - 1
    pos = args.position - 1
    names = args.genome_names - 1
    snp = args.is_snp - 1
    file_as_list = str(args.input)

    print("\nProcessing...")
    main(file_as_list, snp, names, col_ref, pos, col_genome)
    print("\nJob Done. Output written as <files.fasta>")

beginner

python-3.x

csv

bioinformatics

python

回答 1

Code Review用户

回答已采纳

发布于 2017-04-05 13:54:53

这听起来是一个很好的用例，可以使用一个类，在这个类中，当前通过函数链传递的参数将是类属性，例如，如果我们有一个Converter类，我们可能会以这样的方式初始化它：

class Converter:
    """Hold the input file name and the column indices used for conversion.

    Keeping these values as instance attributes lets methods read them from
    ``self`` instead of receiving all of them as arguments.
    """

    def __init__(self, filename: str, snp: int, names: int, col_ref: int,
                 pos: int, col_genome: int) -> None:
        # Input file.
        self.filename = filename

        # Columns used to select rows and identify genomes.
        self.snp = snp
        self.names = names

        # Columns holding the position and the nucleotides.
        self.pos = pos
        self.col_ref = col_ref
        self.col_genome = col_genome

然后，您的函数将成为实例方法，您将通过self.<attribute>访问实例属性，而不是使用参数。

将类看作是对相关事物进行逻辑分组的一种方式，提供对公共变量和方法的共享访问。

还有其他一些需要改进的地方：

您可以在定义参数时使用type=int，而不必事后再把参数转换为int
您可以在多个地方使用字典推导式和列表推导式
您可以使用str.join() --例如，在定义reference_snps_list时：reference_snps_list = "".join(str(ref_dic[i]) for i in pos_no_dup)
您可以对输入文件参数使用特殊的argparse.FileType。

FYI，因为这是一个普遍存在争议的话题：

开始编写更多的类（Start Writing More Classes）
停止编写类（Stop Writing Classes）

票数 2

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/159911

复制

相似问题

问用于转换基因组数据文件的Python命令行程序
EN

背景：

我的问题：

码

回答 1

Code Review用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问用于转换基因组数据文件的Python命令行程序EN

背景：

我的问题：

码

回答 1

Code Review用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问用于转换基因组数据文件的Python命令行程序
EN