我偶然看到一篇BMC基因组学论文:原核生物基因组内GC含量同源性分析
我实现了一些Python函数,将其作为个人项目的一部分。其职能如下:
import numpy as np
from collections import defaultdict
def get_sequence_chunks(sequence, window, step, as_overlap=False):
"""GC Content in a DNA/RNA sub-sequence length k. In
overlap windows of length k, other wise non overlapping.
Inputs:
sequence - a string representing a DNA sequence.
as_overlap - boolean that represents if overlap is needed.
k - a integer representing the lengths of overlapping bases.
Default is 20.
Outputs:
subseq - a string representing a slice of the sequence with or without
overlapping characters.
"""
# make sequence upper case and getting the length of it
sequence, seq_len = sequence.upper(), len(sequence)
# non overlap sequence length
non_overlap = range(0, seq_len - window + 1, step)
# overlap sequence length
overlap = range(0, seq_len - window + 1)
# overlap is needed
if as_overlap:
# iterates to the overlap region
for i in overlap:
# creates the substring
subseq = sequence[i:i + window]
yield subseq
# if non overlap is choosed
else:
# iterates to the non overlap region
for j in non_overlap:
# creates the sub-string to count the gc_content
subseq = sequence[j:j + window]
yield subseq
def gc_content(seq, percent=True):
"""
Calculate G+C content, return percentage (as float between 0 and 100).
Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
when counting the G and C content. The percentage is calculated against
the full length.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
Outputs:
gc - a float number or a percent value representing the gc content of the sequence.
"""
seq_len = len(seq)
gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
if percent:
try:
return gc * 100.0 / seq_len
except ZeroDivisionError:
return 0.0
else:
return gc / seq_len
def get_gc_content_slide_window(sequence, window, step):
"""
Calculate teh GC (G+C) content along a sequence. Returns a dictionary -like
object mapping the sequence window to the gc value (floats).
Returns 0 for windows without any G/C by handling zero division errors, and
does NOT look at any ambiguous nucleotides.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
window_size - a integer representing the length of sequence to be analyzed
at each step in the full sequence.
step - a integer representing the length of oerlapping sequence allowed.
Outputs:
gc- a dictionary-like object mapping the window size sequence to it gc values
as floating numbers
"""
gc = defaultdict(float)
for s in get_sequence_chunks(sequence, window, step, as_overlap=False):
gc[s] = gc.get(s, 0.0) + gc_content(s)
return gc
def difference_gc(mean_gc, gc_dict):
"""
Calculates the difference between the mean GC content of window i, and the
mean chromosomal GC content, as Di = GC i − GC
"""
# get the keys and default values in the dictionary
d_i = defaultdict(float, [(k, 0.0) for k in gc_dict.keys()])
# iterate through all keys, gc calculated values for the
# genome chunk
for chunk, gc_cnt in gc_dict.items():
# get the difference between the chromosomal mean and the chunks
dif = gc_cnt - mean_gc
# add the difference to the appropriate chunk
d_i[chunk] = dif
return d_i
def get_chromosomal_gc_variantion(difference_dic):
"""
Calculates the chromosomal GC variation defined as the log-transformed average
of the absolute value of the difference between the mean GC content of each
non-overlapping sliding window i and mean chromosomal GC content.
chromosomal_gc_variantion = log(1/N*sum(|Di|), where N is the maximum number of
non-overlapping window size in bp in a sliding windows.
"""
# get the number of chunks
n = len(difference_dic.keys())
arr = np.array(list(difference_dic.values()))
var = np.log((1/n) * sum(abs(arr)))
return var我研究大肠杆菌基因组:大肠杆菌体13322染色体全基因组
gc = gc_content(ec, percent=True) = 50.76819424506625与此相似:基因组组装与注释报告(26286) ~ 50.73。
幻灯片窗口给出了不重叠的结果,其中ec =大肠杆菌基因组,100大小窗口,步骤= 100:
get_gc_content_slide_window(ec, 100, 100)
defaultdict(float,
{'GTTGGCATGCCATGGTGATGTTTGTTAGGAAAGCAAAGATGGCAAAACTGCTGGGGGTTTTGTGGTTGAGTATGCCAATATAATTAATAGATTAAAGAGT': 39.0,
'TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC': 39.0,
'AAACGGTTTCTTTTGCAGGAAAGGAATATGAGTTAAAGGTCATTGATGAAAAAACGCCTATTCTTTTTCAGTGGTTTGAACCTAATCCTGAACGATATAA': 34.0,
'GAAAGATGAGGTTCCAATAGTTAATACTAAGCAGCATCCCTATTTAGATAATGTCACAAATGCGGCAAGGATAGAGAGTGATCGTATGATAGGTATTTTT': 36.0,
'GTTGATGGCGATTTTTCAGTCAACCAAAAGACTGCTTTTTCAAAATTGGAACGAGATTTTGAAAATGTAATGATAATCTATCGGGAAGATGTTGACTTCA': 34.0,
'GTATGTATGACAGAAAACTATCAGATATTTATCATGATATTATATGTGAACAAAGGTTACGAACTGAAGACAAAAGAGATGAATACTTGTTGAATCTGTT': 29.0,...}差分函数(滑动窗口gc内容- gc内容全序列),其中ec_gc = e.coli基因组的gc含量和大小为100的子序列中的ec_non = gc含量不重叠:
difference_gc(ec_gc, ec_non)
defaultdict(float,
{'GTTGGCATGCCATGGTGATGTTTGTTAGGAAAGCAAAGATGGCAAAACTGCTGGGGGTTTTGTGGTTGAGTATGCCAATATAATTAATAGATTAAAGAGT': -11.768194245066248,
'TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC': -11.768194245066248,
'AAACGGTTTCTTTTGCAGGAAAGGAATATGAGTTAAAGGTCATTGATGAAAAAACGCCTATTCTTTTTCAGTGGTTTGAACCTAATCCTGAACGATATAA': -16.768194245066248,
'GAAAGATGAGGTTCCAATAGTTAATACTAAGCAGCATCCCTATTTAGATAATGTCACAAATGCGGCAAGGATAGAGAGTGATCGTATGATAGGTATTTTT': -14.768194245066248,
'GTTGATGGCGATTTTTCAGTCAACCAAAAGACTGCTTTTTCAAAATTGGAACGAGATTTTGAAAATGTAATGATAATCTATCGGGAAGATGTTGACTTCA': -16.768194245066248,...}以及(ec_dif = gc含量与总序列和子序列的差异):
get_chromosomal_gc_variantion(ec_dif)
1.7809591767493207我主要关心的是,是否像上面的文章那样实现了这个函数get_chromosomal_gc_variantion(ec_dif):

正如所建议的,一个玩具例子:
s="TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGATAAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC"
seq_gc_non_overllap = get_gc_content_slide_window(s, 20, 20)
seq_gc_non_overllap
defaultdict(float,
{'TAGTTGTGAAGAAAATATGG': 30.0,
'ATAAACAGGACGACGAATGC': 45.0,
'TTTCACCGATAAGGACAACT': 40.0,
'TTCCATAACTCAGTAAATAT': 25.0,
'AGTGCAGAGTTCACCCTGTC': 55.0})
seq_gc_total = gc_content(s, percent=True)
seq_gc_total
39.0
seq_dif_gctot_gc_slidewindow = difference_gc(seq_gc_total, seq_gc_non_overllap)
defaultdict(float,
{'TAGTTGTGAAGAAAATATGG': -9.0,
'ATAAACAGGACGACGAATGC': 6.0,
'TTTCACCGATAAGGACAACT': 1.0,
'TTCCATAACTCAGTAAATAT': -14.0,
'AGTGCAGAGTTCACCCTGTC': 16.0})
chromosome_gc_variation = get_chromosomal_gc_variantion(seq_dif_gctot, gc_slidewindow)
chromosome_gc_variation
2.2192034840549946任何改进代码的技巧都将是非常受欢迎的,一旦我只使用python工作了几年,而且我不是很熟练,更不用说使用numpy了。
发布于 2021-11-07 03:39:48
挺有意思的!
您将从介绍PEP484类型提示中获益。
get_sequence_chunks的docstring不太正常;您需要重新查看参数名。
对于您的sequence和seq_len来说,元组组合分配并没有太多的好处,所以我建议单独执行它们。
重叠逻辑可以大大简化:注意,这两个代码路径之间唯一的区别是步骤大小。
次要的拼写问题,如other wise (应该是一个单词)、overllap -> overlap和variantion -> variation。
评论,如
# make sequence upper case and getting the length of it比根本不发表评论更没有帮助。请注意两者之间的区别
# get the difference between the chromosomal mean and the chunks它所包含的内容不一定仅从代码中推断出来(尽管即使这样也是多余的,并且具有足够的描述性变量名)。
这种理解:
gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])将遍历每个潜在字符的整个seq字符串。将其重构为使用set成员资格检查和通过字符串进行一次迭代可能会更有效。
此代码可防止零除法:
seq_len = len(seq)
gc = sum(seq.count(x) for x in ["G", "C", "g", "c", "S", "s"])
if percent:
try:
return gc * 100.0 / seq_len
except ZeroDivisionError:
return 0.0
else:
return gc / seq_len 首先,可以通过检查序列长度是否为零来避免逻辑异常;还应该将该检查扩展到返回非百分比数字的情况。
在difference_gc中,不需要初始的defaultdict初始化,因为您覆盖了每个键。你可以用一个字典理解来代替整个功能。
尽量避免将count这样的缩写词缩写为cnt。
dict的键的len等于dict本身的len,所以您可以将.keys()放在len(difference_dic.keys())中。
您可以避免在这里构造一个列表:
arr = np.array(list(difference_dic.values()))通过使用fromiter代替。
你的例子有点失败。get_chromosomal_gc_variation只接受一个参数。
考虑将您的示例重新处理为基本单元测试。
import math
from typing import Iterator, DefaultDict, Dict
import numpy as np
from collections import defaultdict
def get_sequence_chunks(
sequence: str,
window: int,
step: int,
as_overlap: bool = False,
) -> Iterator[str]:
"""GC Content in a DNA/RNA sub-sequence length k. In
overlap windows of length k, other wise non overlapping.
Inputs:
sequence - a string representing a DNA sequence.
as_overlap - boolean that represents if overlap is needed.
k - a integer representing the lengths of overlapping bases.
Default is 20.
Outputs:
subseq - a string representing a slice of the sequence with or without
overlapping characters.
"""
sequence = sequence.upper()
if as_overlap:
# overlap sequence length
step = 1
# iterates to the overlap region
for i in range(0, len(sequence) - window + 1, step):
# creates the substring
yield sequence[i:i + window]
def gc_content(seq: str, percent: bool = True) -> float:
"""
Calculate G+C content, return percentage (as float between 0 and 100).
Copes mixed case sequences, and with the ambiguous nucleotide S (G or C)
when counting the G and C content. The percentage is calculated against
the full length.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
Outputs:
gc - a float number or a percent value representing the gc content of the sequence.
"""
seq_len = len(seq)
gc = sum(1 for x in seq.upper() if x in {"G", "C", "S"})
if seq_len == 0:
return 0
gc /= seq_len
if percent:
return gc * 100
return gc
def get_gc_content_slide_window(sequence: str, window: int, step: int) -> DefaultDict[str, float]:
"""
Calculate the GC (G+C) content along a sequence. Returns a dictionary-like
object mapping the sequence window to the gc value (floats).
Returns 0 for windows without any G/C by handling zero division errors, and
does NOT look at any ambiguous nucleotides.
Inputs:
sequence - a string representing a sequence (DNA/RNA/Protein)
window_size - a integer representing the length of sequence to be analyzed
at each step in the full sequence.
step - a integer representing the length of oerlapping sequence allowed.
Outputs:
gc- a dictionary-like object mapping the window size sequence to it gc values
as floating numbers
"""
gc = defaultdict(float)
for s in get_sequence_chunks(sequence, window, step, as_overlap=False):
gc[s] += gc_content(s)
return gc
def difference_gc(mean_gc: float, gc_dict: Dict[str, float]) -> Dict[str, float]:
"""
Calculates the difference between the mean GC content of window i, and the
mean chromosomal GC content, as Di = GC i − GC
"""
# iterate through all keys, gc calculated values for the
# genome chunk
# get the difference between the chromosomal mean and the chunks
# add the difference to the appropriate chunk
d_i = {
chunk: gc_cnt - mean_gc
for chunk, gc_cnt in gc_dict.items()
}
return d_i
def get_chromosomal_gc_variation(difference_dic: Dict[str, float]) -> float:
"""
Calculates the chromosomal GC variation defined as the log-transformed average
of the absolute value of the difference between the mean GC content of each
non-overlapping sliding window i and mean chromosomal GC content.
chromosomal_gc_variantion = log(1/N*sum(|Di|), where N is the maximum number of
non-overlapping window size in bp in a sliding windows.
"""
# get the number of chunks
n = len(difference_dic)
arr = np.fromiter(
difference_dic.values(),
dtype=np.float32,
count=len(difference_dic),
)
var = np.log(np.sum(np.abs(arr)) / n)
return var
def test() -> None:
s = (
"TAGTTGTGAAGAAAATATGGATAAACAGGACGACGAATGCTTTCACCGAT"
"AAGGACAACTTTCCATAACTCAGTAAATATAGTGCAGAGTTCACCCTGTC"
)
seq_gc_non_overlap = get_gc_content_slide_window(s, 20, 20)
expected = {
'TAGTTGTGAAGAAAATATGG': 30,
'ATAAACAGGACGACGAATGC': 45,
'TTTCACCGATAAGGACAACT': 40,
'TTCCATAACTCAGTAAATAT': 25,
'AGTGCAGAGTTCACCCTGTC': 55,
}
assert expected.keys() == seq_gc_non_overlap.keys()
for k, v in expected.items():
assert math.isclose(seq_gc_non_overlap[k], v)
seq_gc_total = gc_content(s, percent=True)
assert math.isclose(seq_gc_total, 39)
seq_dif_gctot_gc_slidewindow = difference_gc(seq_gc_total, seq_gc_non_overlap)
expected = {
'TAGTTGTGAAGAAAATATGG': -9,
'ATAAACAGGACGACGAATGC': 6,
'TTTCACCGATAAGGACAACT': 1,
'TTCCATAACTCAGTAAATAT': -14,
'AGTGCAGAGTTCACCCTGTC': 16,
}
assert expected.keys() == seq_dif_gctot_gc_slidewindow.keys()
for k, v in expected.items():
assert math.isclose(seq_dif_gctot_gc_slidewindow[k], v)
chromosome_gc_variation = get_chromosomal_gc_variation(seq_dif_gctot_gc_slidewindow)
assert math.isclose(chromosome_gc_variation, 2.2192034840549946)
if __name__ == '__main__':
test()https://codereview.stackexchange.com/questions/269828
复制相似问题