文章/答案/技术大牛

发布

社区首页 >问答首页 >函数计算GC含量在序列中的变化。

问函数计算GC含量在序列中的变化。
EN

Code Review用户

提问于 2021-11-06 17:20:31

回答 1查看 451关注 0票数 4

我偶然看到一篇BMC基因组学论文：原核生物基因组内GC含量同源性分析

我实现了一些Python函数，将其作为个人项目的一部分。其职能如下：

import numpy as np
from collections import defaultdict

def get_sequence_chunks(sequence, window, step, as_overlap=False):
    """GC Content in a DNA/RNA sub-sequence length k. In
    overlap windows of length k, other wise non overlapping.
    
    Inputs:
        sequence - a string representing a DNA sequence.    
        as_overlap - boolean that represents if overlap is needed.
        k - a integer representing the lengths of overlapping bases.
            Default is 20.
    
    Outputs:
        subseq - a string representing a slice of the sequence with or without
                 overlapping characters.
    """
    # make sequence upper case and getting the length of it
    sequence, seq_len = sequence.upper(), len(sequence)
    # non overlap sequence length
    non_overlap = range(0, seq_len - window + 1, step)
    # overlap sequence length
    overlap = range(0, seq_len - window + 1)
    # overlap is needed
    if as_overlap:
        # iterates to the overlap region
        for i in overlap:
            # creates the substring 
            subseq = sequence[i:i + window]
            yield subseq
    # if non overlap is choosed
    else:
        # iterates to the non overlap region
        for j in non_overlap:
            # creates the sub-string to count the gc_content
            subseq = sequence[j:j + window]
            yield subseq

def gc_content(seq, percent=True):
    """
    Calculate G+C content, return percentage (as float between 0 and 100).
    Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
    when counting the G and C content.  The percentage is calculated against
    the full length.
    
    Inputs:
        sequence - a string representing a sequence (DNA/RNA/Protein)
    
    Outputs:
        gc - a float number or a percent value representing the gc content of the sequence.     
    
    """
    seq_len = len(seq) 
    gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
    if percent:
        try:
            return gc * 100.0 / seq_len
        except ZeroDivisionError:
            return 0.0 
    else:
        return gc / seq_len      


def get_gc_content_slide_window(sequence, window, step):
    """
    Calculate teh GC (G+C) content along a sequence. Returns a dictionary -like
    object mapping the sequence window to the gc value (floats).
    Returns 0 for windows without any G/C by handling zero division errors, and 
    does NOT look at any ambiguous nucleotides.
    
    Inputs:
        sequence - a string representing a sequence (DNA/RNA/Protein)
        window_size - a integer representing the length of sequence to be analyzed
                   at each step in the full sequence.
        step - a integer representing the length of oerlapping sequence allowed.
    
    Outputs:
        gc- a dictionary-like object mapping the window size sequence to it gc values 
            as floating numbers
    """
    gc = defaultdict(float)
    for s in get_sequence_chunks(sequence, window, step, as_overlap=False):
        gc[s] = gc.get(s, 0.0) + gc_content(s)
    return gc   

def difference_gc(mean_gc, gc_dict):
    """
    Calculates the difference between the mean GC content of window i, and the
    mean chromosomal GC content, as Di = GC i − GC
    """
    # get the keys and default values in the dictionary
    d_i = defaultdict(float, [(k, 0.0) for k in gc_dict.keys()])
    # iterate through all keys, gc calculated values for the
    # genome chunk
    for chunk, gc_cnt in gc_dict.items():
        # get the difference between the chromosomal mean and the chunks
        dif = gc_cnt - mean_gc
        # add the difference to the appropriate chunk
        d_i[chunk] = dif
    return d_i

def get_chromosomal_gc_variantion(difference_dic):
    """
    Calculates the chromosomal GC variation defined as the log-transformed average 
    of the absolute value of the difference between the mean GC content of each 
    non-overlapping sliding window i and mean chromosomal GC content.    
    chromosomal_gc_variantion = log(1/N*sum(|Di|), where N is the maximum number of 
    non-overlapping window size in bp in a sliding windows.
    """
    # get the number of chunks
    n = len(difference_dic.keys())
    arr = np.array(list(difference_dic.values()))
    var = np.log((1/n) * sum(abs(arr)))
    return var

我研究大肠杆菌基因组：大肠杆菌体13322染色体全基因组

gc = gc_content(ec, percent=True) = 50.76819424506625

与此相似：基因组组装与注释报告(26286) ~ 50.73。

幻灯片窗口给出了不重叠的结果，其中ec =大肠杆菌基因组，100大小窗口，步骤= 100：

get_gc_content_slide_window(ec, 100, 100)

defaultdict(float,
            {'GTTGGCATGCCATGGTGATGTTTGTTAGGAAAGCAAAGATGGCAAAACTGCTGGGGGTTTTGTGGTTGAGTATGCCAATATAATTAATAGATTAAAGAGT': 39.0,
             'TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC': 39.0,
             'AAACGGTTTCTTTTGCAGGAAAGGAATATGAGTTAAAGGTCATTGATGAAAAAACGCCTATTCTTTTTCAGTGGTTTGAACCTAATCCTGAACGATATAA': 34.0,
             'GAAAGATGAGGTTCCAATAGTTAATACTAAGCAGCATCCCTATTTAGATAATGTCACAAATGCGGCAAGGATAGAGAGTGATCGTATGATAGGTATTTTT': 36.0,
             'GTTGATGGCGATTTTTCAGTCAACCAAAAGACTGCTTTTTCAAAATTGGAACGAGATTTTGAAAATGTAATGATAATCTATCGGGAAGATGTTGACTTCA': 34.0,
             'GTATGTATGACAGAAAACTATCAGATATTTATCATGATATTATATGTGAACAAAGGTTACGAACTGAAGACAAAAGAGATGAATACTTGTTGAATCTGTT': 29.0,...}

差分函数(滑动窗口gc内容- gc内容全序列)，其中ec_gc = e.coli基因组的gc含量和大小为100的子序列中的ec_non = gc含量不重叠：

difference_gc(ec_gc, ec_non)

defaultdict(float,
            {'GTTGGCATGCCATGGTGATGTTTGTTAGGAAAGCAAAGATGGCAAAACTGCTGGGGGTTTTGTGGTTGAGTATGCCAATATAATTAATAGATTAAAGAGT': -11.768194245066248,
             'TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC': -11.768194245066248,
             'AAACGGTTTCTTTTGCAGGAAAGGAATATGAGTTAAAGGTCATTGATGAAAAAACGCCTATTCTTTTTCAGTGGTTTGAACCTAATCCTGAACGATATAA': -16.768194245066248,
             'GAAAGATGAGGTTCCAATAGTTAATACTAAGCAGCATCCCTATTTAGATAATGTCACAAATGCGGCAAGGATAGAGAGTGATCGTATGATAGGTATTTTT': -14.768194245066248,
             'GTTGATGGCGATTTTTCAGTCAACCAAAAGACTGCTTTTTCAAAATTGGAACGAGATTTTGAAAATGTAATGATAATCTATCGGGAAGATGTTGACTTCA': -16.768194245066248,...}

以及(ec_dif = gc含量与总序列和子序列的差异)：

get_chromosomal_gc_variantion(ec_dif)
1.7809591767493207

我主要关心的是，是否像上面的文章那样实现了这个函数get_chromosomal_gc_variantion(ec_dif)：

正如所建议的，一个玩具例子：

s="TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC"

seq_gc_non_overllap = get_gc_content_slide_window(s, 20, 20)
seq_gc_non_overllap
defaultdict(float,
            {'TAGTTGTGAAGAAAATATGG': 30.0,
             'ATAAACAGGACGACGAATGC': 45.0,
             'TTTCACCGATAAGGACAACT': 40.0,
             'TTCCATAACTCAGTAAATAT': 25.0,
             'AGTGCAGAGTTCACCCTGTC': 55.0})

seq_gc_total = gc_content(s, percent=True)
seq_gc_total
39.0

seq_dif_gctot_gc_slidewindow = difference_gc(seq_gc_total, seq_gc_non_overllap)
defaultdict(float,
            {'TAGTTGTGAAGAAAATATGG': -9.0,
             'ATAAACAGGACGACGAATGC': 6.0,
             'TTTCACCGATAAGGACAACT': 1.0,
             'TTCCATAACTCAGTAAATAT': -14.0,
             'AGTGCAGAGTTCACCCTGTC': 16.0})

chromosome_gc_variation = get_chromosomal_gc_variantion(seq_dif_gctot, gc_slidewindow)
chromosome_gc_variation
2.2192034840549946

任何改进代码的技巧都将是非常受欢迎的，一旦我只使用python工作了几年，而且我不是很熟练，更不用说使用numpy了。

statistics

bioinformatics

python

python-3.x

numpy

回答 1

Code Review用户

回答已采纳

发布于 2021-11-07 03:39:48

挺有意思的！

您将从介绍PEP484类型提示中获益。

get_sequence_chunks的docstring不太正常；您需要重新查看参数名。

对于您的sequence和seq_len来说，元组组合分配并没有太多的好处，所以我建议单独执行它们。

重叠逻辑可以大大简化:注意，这两个代码路径之间唯一的区别是步骤大小。

次要的拼写问题，如other wise (应该是一个单词)、overllap -> overlap和variantion -> variation。

评论，如

# make sequence upper case and getting the length of it

比根本不发表评论更没有帮助。请注意两者之间的区别

# get the difference between the chromosomal mean and the chunks

它所包含的内容不一定仅从代码中推断出来(尽管即使这样也是多余的，并且具有足够的描述性变量名)。

这种理解：

gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])

将遍历每个潜在字符的整个seq字符串。将其重构为使用set成员资格检查和通过字符串进行一次迭代可能会更有效。

此代码可防止零除法：

seq_len = len(seq) 
gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
if percent:
    try:
        return gc * 100.0 / seq_len
    except ZeroDivisionError:
        return 0.0 
else:
    return gc / seq_len

首先，可以通过检查序列长度是否为零来避免逻辑异常；还应该将该检查扩展到返回非百分比数字的情况。

在difference_gc中，不需要初始的defaultdict初始化，因为您覆盖了每个键。你可以用一个字典理解来代替整个功能。

尽量避免将count这样的缩写词缩写为cnt。

dict的键的len等于dict本身的len，所以您可以将.keys()放在len(difference_dic.keys())中。

您可以避免在这里构造一个列表：

arr = np.array(list(difference_dic.values()))

通过使用fromiter代替。

你的例子有点失败。get_chromosomal_gc_variation只接受一个参数。

考虑将您的示例重新处理为基本单元测试。

建议

import math
from typing import Iterator, DefaultDict, Dict

import numpy as np
from collections import defaultdict


def get_sequence_chunks(
    sequence: str,
    window: int,
    step: int,
    as_overlap: bool = False,
) -> Iterator[str]:
    """GC Content in a DNA/RNA sub-sequence length k. In
    overlap windows of length k, other wise non overlapping.

    Inputs:
        sequence - a string representing a DNA sequence.
        as_overlap - boolean that represents if overlap is needed.
        k - a integer representing the lengths of overlapping bases.
            Default is 20.

    Outputs:
        subseq - a string representing a slice of the sequence with or without
                 overlapping characters.
    """
    sequence = sequence.upper()

    if as_overlap:
        # overlap sequence length
        step = 1

    # iterates to the overlap region
    for i in range(0, len(sequence) - window + 1, step):
        # creates the substring
        yield sequence[i:i + window]


def gc_content(seq: str, percent: bool = True) -> float:
    """
    Calculate G+C content, return percentage (as float between 0 and 100).
    Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
    when counting the G and C content.  The percentage is calculated against
    the full length.

    Inputs:
        sequence - a string representing a sequence (DNA/RNA/Protein)

    Outputs:
        gc - a float number or a percent value representing the gc content of the sequence.

    """
    seq_len = len(seq)
    gc = sum(1 for x in seq.upper() if x in {"G", "C", "S"})
    if seq_len == 0:
        return 0
    gc /= seq_len
    if percent:
        return gc * 100
    return gc


def get_gc_content_slide_window(sequence: str, window: int, step: int) -> DefaultDict[str, float]:
    """
    Calculate the GC (G+C) content along a sequence. Returns a dictionary-like
    object mapping the sequence window to the gc value (floats).
    Returns 0 for windows without any G/C by handling zero division errors, and
    does NOT look at any ambiguous nucleotides.

    Inputs:
        sequence - a string representing a sequence (DNA/RNA/Protein)
        window_size - a integer representing the length of sequence to be analyzed
                   at each step in the full sequence.
        step - a integer representing the length of oerlapping sequence allowed.

    Outputs:
        gc- a dictionary-like object mapping the window size sequence to it gc values
            as floating numbers
    """
    gc = defaultdict(float)
    for s in get_sequence_chunks(sequence, window, step, as_overlap=False):
        gc[s] += gc_content(s)
    return gc


def difference_gc(mean_gc: float, gc_dict: Dict[str, float]) -> Dict[str, float]:
    """
    Calculates the difference between the mean GC content of window i, and the
    mean chromosomal GC content, as Di = GC i − GC
    """
    # iterate through all keys, gc calculated values for the
    # genome chunk
    # get the difference between the chromosomal mean and the chunks
    # add the difference to the appropriate chunk
    d_i = {
        chunk: gc_cnt - mean_gc
        for chunk, gc_cnt in gc_dict.items()
    }
    return d_i


def get_chromosomal_gc_variation(difference_dic: Dict[str, float]) -> float:
    """
    Calculates the chromosomal GC variation defined as the log-transformed average
    of the absolute value of the difference between the mean GC content of each
    non-overlapping sliding window i and mean chromosomal GC content.
    chromosomal_gc_variantion = log(1/N*sum(|Di|), where N is the maximum number of
    non-overlapping window size in bp in a sliding windows.
    """
    # get the number of chunks
    n = len(difference_dic)
    arr = np.fromiter(
        difference_dic.values(),
        dtype=np.float32,
        count=len(difference_dic),
    )
    var = np.log(np.sum(np.abs(arr)) / n)
    return var


def test() -> None:
    s = (
        "TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGAT"
        "AAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC"
    )

    seq_gc_non_overlap = get_gc_content_slide_window(s, 20, 20)
    expected = {
        'TAGTTGTGAAGAAAATATGG': 30,
        'ATAAACAGGACGACGAATGC': 45,
        'TTTCACCGATAAGGACAACT': 40,
        'TTCCATAACTCAGTAAATAT': 25,
        'AGTGCAGAGTTCACCCTGTC': 55,
    }
    assert expected.keys() == seq_gc_non_overlap.keys()
    for k, v in expected.items():
        assert math.isclose(seq_gc_non_overlap[k], v)

    seq_gc_total = gc_content(s, percent=True)
    assert math.isclose(seq_gc_total, 39)

    seq_dif_gctot_gc_slidewindow = difference_gc(seq_gc_total, seq_gc_non_overlap)
    expected = {
        'TAGTTGTGAAGAAAATATGG':  -9,
        'ATAAACAGGACGACGAATGC':   6,
        'TTTCACCGATAAGGACAACT':   1,
        'TTCCATAACTCAGTAAATAT': -14,
        'AGTGCAGAGTTCACCCTGTC':  16,
    }
    assert expected.keys() == seq_dif_gctot_gc_slidewindow.keys()
    for k, v in expected.items():
        assert math.isclose(seq_dif_gctot_gc_slidewindow[k], v)

    chromosome_gc_variation = get_chromosomal_gc_variation(seq_dif_gctot_gc_slidewindow)
    assert math.isclose(chromosome_gc_variation, 2.2192034840549946)


if __name__ == '__main__':
    test()

票数 2

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/269828

复制

相似问题

问函数计算GC含量在序列中的变化。
EN

回答 1

Code Review用户

建议

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问函数计算GC含量在序列中的变化。EN

回答 1

Code Review用户

建议

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问函数计算GC含量在序列中的变化。
EN