我正试图为这种类型的文件编写一个解析器,但事实证明它比我想象的要复杂一些。如果这是几年前的事情,我会多次阅读这个文件,但现在我想到的是效率,这是不可能的,因为肯定有更好的方法。
如何在逐行读取文件时解析每个部分?
我的尝试是设置一个变量,该变量将根据该节的不同进行更改。这开始起作用了,但正如你所看到的,它在第二个分组时会变得混乱(例如,反应)。我觉得while循环可能是答案,但我不知道如何在这种情况下实现它。
from io import StringIO
f = StringIO(
"""
ENTRY M00001 Pathway Module
NAME Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
DEFINITION (K00844,K12407,K00845,K00886,K08074,K00918) (K01810,K06859,K13810,K15916) (K00850,K16370,K21071,K00918) (K01623,K01624,K11645,K16305,K16306) K01803 ((K00134,K00150) K00927,K11389) (K01834,K15633,K15634,K15635) K01689 (K00873,K12406)
ORTHOLOGY K00844,K12407,K00845 hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2] [RN:R01786]
K00886 polyphosphate glucokinase [EC:2.7.1.63] [RN:R02189]
K08074,K00918 ADP-dependent glucokinase [EC:2.7.1.147] [RN:R09085]
K01810,K06859,K13810,K15916 glucose-6-phosphate isomerase [EC:5.3.1.9] [RN:R02740]
K00850,K16370,K21071 6-phosphofructokinase [EC:2.7.1.11] [RN:R04779]
K00918 ADP-dependent phosphofructokinase [EC:2.7.1.146] [RN:R09084]
K01623,K01624,K11645,K16305,K16306 fructose-bisphosphate aldolase [EC:4.1.2.13] [RN:R01070]
K01803 triosephosphate isomerase [EC:5.3.1.1] [RN:R01015]
K00134,K00150 glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12 1.2.1.59] [RN:R01061 R01063]
K00927 phosphoglycerate kinase [EC:2.7.2.3] [RN:R01512]
K11389 glyceraldehyde-3-phosphate dehydrogenase (ferredoxin) [EC:1.2.7.6] [RN:R07159]
K01834,K15633,K15634,K15635 phosphoglycerate mutase [EC:5.4.2.11 5.4.2.12] [RN:R01518]
K01689 enolase [EC:4.2.1.11] [RN:R00658]
K00873,K12406 pyruvate kinase [EC:2.7.1.40] [RN:R00200]
CLASS Pathway modules; Carbohydrate metabolism; Central carbohydrate metabolism
PATHWAY map00010 Glycolysis / Gluconeogenesis
map01200 Carbon metabolism
map01100 Metabolic pathways
REACTION R01786,R02189,R09085 C00267 -> C00668
R02740 C00668 -> C05345
R04779,R09084 C05345 -> C05378
R01070 C05378 -> C00111 + C00118
R01015 C00111 -> C00118
R01061,R01063 C00118 -> C00236
R01512 C00236 -> C00197
R07159 C00118 -> C00197
R01518 C00197 -> C00631
R00658 C00631 -> C00074
R00200 C00074 -> C00022
COMPOUND C00267 alpha-D-Glucose
C00668 alpha-D-Glucose 6-phosphate
C05345 beta-D-Fructose 6-phosphate
C05378 beta-D-Fructose 1,6-bisphosphate
C00111 Glycerone phosphate
C00118 D-Glyceraldehyde 3-phosphate
C00236 3-Phospho-D-glyceroyl phosphate
C00197 3-Phospho-D-glycerate
C00631 2-Phospho-D-glycerate
C00074 Phosphoenolpyruvate
C00022 Pyruvate
///
"""
)
kegg_definition = None
kegg_orthology = list()
kegg_ortholog_set = set()
kegg_class = None
kegg_pathways = list()
kegg_pathway_set = set()
kegg_reactions = list()
kegg_reaction_set = set()
kegg_compounds = list()
kegg_compound_set = set()
# Read KEGG module text
parsing = None
for line_level_1 in f:
line_level_1 = line_level_1.strip()
if not line_level_1.startswith("/"):
# Get orthologs
if line_level_1.startswith("DEFINITION"):
kegg_definition = line_level_1.replace("DEFINITION","").strip()
kegg_ortholog_set = str(kegg_definition)
for character in list("(+ -)"):
kegg_ortholog_set.replace(character, ",")
kegg_ortholog_set = set(filter(bool, kegg_ortholog_set.split(",")))
parsing = "ORTHOLOGY"
if parsing == "ORTHOLOGY":
ko_annot = line_level_1.replace("DEFINITION","").strip()
kegg_orthology.append(ko_annot)
# Get class
if line_level_1.startswith("CLASS"):
# parsing = None
kegg_class = line_level_1.replace("CLASS","").strip().split("; ")
# Get pathways
if line_level_1.startswith("PATHWAY"):
parsing = "PATHWAY"
if parsing == "PATHWAY":
kegg_pathway = line_level_1.replace("PATHWAY","").strip().split("; ")
kegg_pathways.append(kegg_pathway)
# Get reactions
if line_level_1.startswith("REACTION"):
parsing = "REACTION"
if parsing == "REACTION":
kegg_reaction = line_level_1.replace("REACTION","").strip().split("; ")
kegg_reactions.append(kegg_reaction)
# Get compounds
if line_level_1.startswith("COMPOUND"):
parsing = "COMPOUND"
if parsing == "COMPOUND":
kegg_compound = line_level_1.replace("COMPOUND","").strip().split("; ")
kegg_compounds.append(kegg_compound)
kegg_pathways
# [['map00010 Glycolysis / Gluconeogenesis'],
# ['map01200 Carbon metabolism'],
# ['map01100 Metabolic pathways'],
# ['REACTION R01786,R02189,R09085 C00267 -> C00668']]发布于 2021-03-19 19:55:56
以防万一这对未来的任何人都有帮助。这是我做的解析器。但是,如果您希望从一组同源词(例如,元组组装的基因组或bin)中计算模块完成率,那么您需要考虑到KEGG定义中的逻辑准则以及某些模块是bifurcated...in这一事实,您可能应该在MicrobeAnnotator中使用其中的一些代码。
import datetime
from Bio.KEGG.REST import kegg_list, kegg_get
# Parse KEGG Module
def parse_kegg_module(module_file):
"""
Example of a KEGG REST module file:
ENTRY M00001 Pathway Module
NAME Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
DEFINITION (K00844,K12407,K00845,K00886,K08074,K00918) (K01810,K06859,K13810,K15916) (K00850,K16370,K21071,K00918) (K01623,K01624,K11645,K16305,K16306) K01803 ((K00134,K00150) K00927,K11389) (K01834,K15633,K15634,K15635) K01689 (K00873,K12406)
ORTHOLOGY K00844,K12407,K00845 hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2] [RN:R01786]
K00886 polyphosphate glucokinase [EC:2.7.1.63] [RN:R02189]
K08074,K00918 ADP-dependent glucokinase [EC:2.7.1.147] [RN:R09085]
K01810,K06859,K13810,K15916 glucose-6-phosphate isomerase [EC:5.3.1.9] [RN:R02740]
K00850,K16370,K21071 6-phosphofructokinase [EC:2.7.1.11] [RN:R04779]
K00918 ADP-dependent phosphofructokinase [EC:2.7.1.146] [RN:R09084]
K01623,K01624,K11645,K16305,K16306 fructose-bisphosphate aldolase [EC:4.1.2.13] [RN:R01070]
K01803 triosephosphate isomerase [EC:5.3.1.1] [RN:R01015]
K00134,K00150 glyceraldehyde 3-phosphate dehydrogenase [EC:1.2.1.12 1.2.1.59] [RN:R01061 R01063]
K00927 phosphoglycerate kinase [EC:2.7.2.3] [RN:R01512]
K11389 glyceraldehyde-3-phosphate dehydrogenase (ferredoxin) [EC:1.2.7.6] [RN:R07159]
K01834,K15633,K15634,K15635 phosphoglycerate mutase [EC:5.4.2.11 5.4.2.12] [RN:R01518]
K01689 enolase [EC:4.2.1.11] [RN:R00658]
K00873,K12406 pyruvate kinase [EC:2.7.1.40] [RN:R00200]
CLASS Pathway modules; Carbohydrate metabolism; Central carbohydrate metabolism
PATHWAY map00010 Glycolysis / Gluconeogenesis
map01200 Carbon metabolism
map01100 Metabolic pathways
REACTION R01786,R02189,R09085 C00267 -> C00668
R02740 C00668 -> C05345
R04779,R09084 C05345 -> C05378
R01070 C05378 -> C00111 + C00118
R01015 C00111 -> C00118
R01061,R01063 C00118 -> C00236
R01512 C00236 -> C00197
R07159 C00118 -> C00197
R01518 C00197 -> C00631
R00658 C00631 -> C00074
R00200 C00074 -> C00022
COMPOUND C00267 alpha-D-Glucose
C00668 alpha-D-Glucose 6-phosphate
C05345 beta-D-Fructose 6-phosphate
C05378 beta-D-Fructose 1,6-bisphosphate
C00111 Glycerone phosphate
C00118 D-Glyceraldehyde 3-phosphate
C00236 3-Phospho-D-glyceroyl phosphate
C00197 3-Phospho-D-glycerate
C00631 2-Phospho-D-glycerate
C00074 Phosphoenolpyruvate
C00022 Pyruvate
///
"""
kegg_module = None
kegg_name = None
kegg_definition = None
kegg_orthology = list()
kegg_ortholog_set = set()
kegg_classes = list()
kegg_pathways = list()
kegg_pathway_set = set()
kegg_reactions = list()
kegg_reaction_set = set()
kegg_compounds = list()
kegg_compound_set = set()
# Read KEGG module text
parsing = None
for line in module_file:
line = line.strip()
if not line.startswith("/"):
if not line.startswith(" "):
first_word = line.split(" ")[0]
if first_word.isupper() and first_word.isalpha():
parsing = first_word
if parsing == "ENTRY":
kegg_module = list(filter(bool, line.split(" ")))[1]
if parsing == "NAME":
kegg_name = line.replace(parsing, "").strip()
parsing = None
if parsing == "DEFINITION":
kegg_definition = line.replace(parsing,"").strip()
kegg_ortholog_set = str(kegg_definition)
for character in list("(+ -)"):
kegg_ortholog_set = kegg_ortholog_set.replace(character, ",")
kegg_ortholog_set = set(filter(bool, kegg_ortholog_set.split(",")))
if parsing == "ORTHOLOGY":
kegg_orthology.append(line.replace(parsing,"").strip())
if parsing == "CLASS":
kegg_classes = line.replace(parsing,"").strip().split("; ")
if parsing == "PATHWAY":
kegg_pathway = line.replace(parsing,"").strip()
kegg_pathways.append(kegg_pathway)
id_pathway = kegg_pathway.split(" ")[0]
kegg_pathway_set.add(id_pathway)
if parsing == "REACTION":
kegg_reaction = line.replace(parsing,"").strip()
kegg_reactions.append(kegg_reaction)
for id_reaction in kegg_reaction.split(" ")[0].split(","):
kegg_reaction_set.add(id_reaction)
if parsing == "COMPOUND":
kegg_compound = line.replace(parsing,"").strip()
id_compound = kegg_compound.split(" ")[0]
kegg_compounds.append(kegg_compound)
kegg_compound_set.add(id_compound)
module_info = pd.Series(
data = OrderedDict([
("NAME",kegg_name),
("DEFINITION",kegg_definition),
("ORTHOLOGY",kegg_orthology),
("ORTHOLOGY_SET",kegg_ortholog_set),
("CLASS",kegg_classes),
("PATHWAY",kegg_pathways),
("PATHWAY_SET",kegg_pathway_set),
("REACTION",kegg_reactions),
("REACTION_SET",kegg_reaction_set),
("COMPOUND",kegg_compounds),
("COMPOUND_SET",kegg_compound_set),
]),
name=kegg_module,
)
return module_info
def get_kegg_modules(expand_nested_modules=True):
results = list()
for line in list(kegg_list("module")):
line = line.strip()
module, name = line.split("\t")
prefix, id_module = module.split(":")
module_file = kegg_get(module)
module_info = parse_kegg_module(module_file)
results.append(module_info)
df = pd.DataFrame(results)
df.index.name = datetime.datetime.now().strftime("Accessed: %Y-%m-%d @ %H:%M")
# Expand nested modules
if expand_nested_modules:
for id_module, row in df.iterrows():
kegg_orthology_set = row["ORTHOLOGY_SET"]
expanded = set()
for x in kegg_orthology_set:
if x.startswith("K"):
expanded.add(x)
if x.startswith("M"):
for id_ko in df.loc[x,"ORTHOLOGY_SET"]:
expanded.add(id_ko)
df.loc[id_module, "ORTHOLOGY_SET"] = expanded
return df发布于 2021-03-18 20:46:48
这是因为你不检查是否有反应,直到你已经收集了路线进入通道。你所有的“下一节”吗?“在开始使用“解析”值之前进行检查:
# Read KEGG module text
parsing = None
for line_level_1 in f:
line_level_1 = line_level_1.strip()
if not line_level_1.startswith("/"):
if line_level_1.startswith("DEFINITION"):
kegg_definition = line_level_1.replace("DEFINITION","").strip()
kegg_ortholog_set = str(kegg_definition)
for character in list("(+ -)"):
kegg_ortholog_set.replace(character, ",")
kegg_ortholog_set = set(filter(bool, kegg_ortholog_set.split(",")))
elif line_level_1.startswith("CLASS"):
kegg_class = line_level_1.replace("CLASS","").strip().split("; ")
elif line_level_1.startswith("ORTHOLOGY"):
parsing = "ORTHOLOGY"
elif line_level_1.startswith("PATHWAY"):
parsing = "PATHWAY"
elif line_level_1.startswith("REACTION"):
parsing = "REACTION"
elif line_level_1.startswith("COMPOUND"):
parsing = "COMPOUND"
if parsing == "ORTHOLOGY":
ko_annot = line_level_1.replace("DEFINITION","").strip()
kegg_orthology.append(ko_annot)
elif parsing == "PATHWAY":
# Get pathways
kegg_pathway = line_level_1.replace("PATHWAY","").strip().split("; ")
kegg_pathways.append(kegg_pathway)
# Get reactions
elif parsing == "REACTION":
kegg_reaction = line_level_1.replace("REACTION","").strip().split("; ")
kegg_reactions.append(kegg_reaction)
# Get compounds
elif parsing == "COMPOUND":
kegg_compound = line_level_1.replace("COMPOUND","").strip().split("; ")
kegg_compounds.append(kegg_compound)
print(kegg_pathways)https://stackoverflow.com/questions/66698744
复制相似问题