我编写了下面的程序:它以分子式(例如 CH3COOH)作为输入,并返回该化合物的摩尔质量:
#! /usr/bin/env python3
# Lookup table mapping an element symbol to its weight; convertToAMU
# multiplies each entry by the number of atoms of that element.
# NOTE(review): values are presumably standard atomic weights in g/mol (u),
# and only a handful of elements are listed here -- this looks like an
# abbreviated excerpt of a full periodic table; confirm before relying on it.
element_weights = {
'H': 1.00794,
'He': 4.002602,
'C': 12.011,
'O': 15.999,
'Ts': 294.0,
'Og': 294.0,
}
def tokenize(string):
    """Split a chemical formula into a flat list of tokens.

    A token is one of:

    * a parenthesis, ``"("`` or ``")"``;
    * a run of decimal digits (an atom or group count), e.g. ``"12"``;
    * an element symbol: one uppercase letter followed by any number of
      lowercase letters, e.g. ``"C"`` or ``"Ca"``.

    The sentinel string ``"EOF"`` is always appended as the last token.

    Args:
        string: The formula text, e.g. ``"CH3COOH"`` or ``"Ca(OH)2"``.

    Returns:
        The list of token strings, terminated by ``"EOF"``.  An empty
        formula yields ``["EOF"]``.

    Raises:
        ValueError: If a character cannot start any token (lowercase letter
            without a preceding uppercase letter, whitespace, punctuation...).
    """
    tokens = []
    length = len(string)
    index = 0
    while index < length:
        start = index
        character = string[index]
        index += 1
        if character in "()":
            # Parentheses are always single-character tokens.
            pass
        elif character.isdecimal():
            # isdecimal() (unlike isnumeric()) only accepts characters that
            # int() can convert, so every count token is a valid integer.
            while index < length and string[index].isdecimal():
                index += 1
        elif character.isupper():
            while index < length and string[index].islower():
                index += 1
        else:
            raise ValueError("Can't parse")
        tokens.append(string[start:index])
    tokens.append("EOF")
    return tokens
def get_composition(tokens_list):
    """Count the atoms of each element described by a token list.

    Parenthesised groups are handled recursively: the tokens between a
    matching pair of parentheses are parsed on their own, the result is
    multiplied by the count that follows the closing parenthesis (1 when
    there is none), and then merged into the running totals.  Parsing
    continues with whatever follows the group.

    Args:
        tokens_list: Token strings such as those produced by ``tokenize``:
            element symbols, digit strings, ``"("``, ``")"`` and a final
            ``"EOF"``.  Running out of tokens is treated like ``"EOF"``.

    Returns:
        A dict mapping element symbol to its total atom count.

    Raises:
        ValueError: On unbalanced parentheses or on a token that is not a
            key of ``element_weights`` where an element is expected.
    """
    composition = {}
    tokens = iter(tokens_list)

    def advance():
        # A missing terminator must not leak StopIteration to the caller.
        return next(tokens, "EOF")

    token = advance()
    while token != "EOF":
        if token == "(":
            # Collect everything up to the matching ")" (exclusive).
            depth = 1
            group_tokens = []
            while True:
                token = advance()
                if token == "EOF":
                    raise ValueError(f"Unbalanced Parens, tokens: {tokens_list}")
                if token == "(":
                    depth += 1
                elif token == ")":
                    depth -= 1
                    if depth == 0:
                        break
                group_tokens.append(token)
            group_tokens.append("EOF")
            group_composition = get_composition(group_tokens)

            # Optional multiplier right after the closing parenthesis.
            token = advance()
            multiplier = 1
            if token.isnumeric():
                multiplier = int(token)
                token = advance()

            for symbol, count in group_composition.items():
                composition[symbol] = composition.get(symbol, 0) + multiplier * count
            # Keep parsing: `token` already holds the token after the group.
            continue

        if token == ")":
            raise ValueError(f"Unbalanced Parens, tokens: {tokens_list}")
        if token not in element_weights:
            raise ValueError(f"Can't find element {token}, tokens {tokens_list}")

        element = token
        element_count = 1
        token = advance()
        if token.isnumeric():
            element_count = int(token)
            token = advance()
        composition[element] = composition.get(element, 0) + element_count
    return composition
def convertToAMU(element_count):
    """Return the combined weight of a composition.

    ``element_count`` maps element symbols to atom counts.  Each count is
    multiplied by the matching ``element_weights`` entry and the products
    are summed.  An empty mapping gives ``0``; a symbol missing from
    ``element_weights`` raises ``KeyError``.
    """
    contributions = [
        element_weights[symbol] * atoms
        for symbol, atoms in element_count.items()
    ]
    # Keep the built-in sum(): its float handling is part of the result.
    return sum(contributions)
if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument is the formula; without
    # one, print a usage line instead of computing anything.
    if len(sys.argv) <= 1:
        print(f"Usage: {sys.argv[0]} [chemical_formula]")
    else:
        print(convertToAMU(get_composition(tokenize(sys.argv[1]))))

# Posted on 2020-10-14 07:36:58
对于这样一个简单的解析任务,我会使用正则表达式和 re.findall() 将字符串拆分为一个标记列表。这样只需一行代码,而不是 20 行。标记可以是元素(一个大写字母,后面可能跟一个小写字母)、重复计数(一位或多位数字)或括号。字符类 [](){}[] 匹配任意一种括号。
tokens = re.findall(r"[A-Z][a-z]|[0-9]+|[](){}[]", string)
collections.Counter() 很适合用来统计元素个数。把列表当作简单的堆栈使用,有助于跟踪括号内的子化合物。
composition = Counter()
match = []
stack = []
对于大型解析任务来说,使用标记列表可能不是最优的,但对于这个命令行计算器来说已经足够了。此外,它还允许您“预览”下一个标记是否为数字,从而大大简化了对公式中数字的处理。(我在最终代码中使用了 if 条件表达式。)
if tokens[0].isalpha():
element = tokens.pop(0)
if tokens[0].isdigit(): # <-- peek at next token to see if
repeat = int(tokens.pop(0)) # there is a count
else:
repeat = 1
compound[element] += repeat
遇到左括号时,将当前状态(计数器)保存到堆栈上,并开始一个新的状态。同时把与之匹配的右括号也保存到一个堆栈中。
elif tokens[0] in ('(', '[', '{'):
match.append(MATCH[tokens.pop(0)])
stack.append(composition)
composition = Counter()
遇到右括号时,检查它是否与括号堆栈顶部的括号匹配。然后从堆栈中弹出之前保存的状态,并将其与括号内子化合物的计数合并。
elif tokens[0] == match[-1]:
tokens.pop(0)
match.pop()
repeat = int(tokens.pop(0)) if tokens and tokens[0].isdigit() else 1
for element in composition.keys():
composition[element] *= repeat
composition.update(stack.pop())
任何其他标记都是错误。
else:
if token in (')', ']', '}'):
raise ValueError(f"Error, mismatched bracket: "
f"expected {match[-1]} got {tokens[0]}.")
else:
raise ValueError(f"Error, unrecognized token "
f"in formula: '{tokens[0]}'.")
如果解析结束后括号堆栈中仍有剩余的括号,则说明存在未配对的括号。
if match:
brackets = ', '.join(f"'{b}'" for b in match[::-1])
raise ValueError(f"Error, missing bracket(s): {brackets}.")
完整的解析器:
import re
from collections import Counter
# Maps each opening bracket to the closing bracket that must match it.
MATCH = {'(': ')', '[': ']', '{': '}'}

# One alternative per token kind.  The final catch-all turns any other
# non-whitespace character into an "unknown" token so that it is reported
# instead of being silently skipped.
_TOKEN_PATTERN = re.compile(
    r"(?P<element>[A-Z][a-z]?)"
    r"|(?P<count>[0-9]+)"
    r"|(?P<bracket>[\[\](){}])"
    r"|(?P<unknown>\S)"
)


def parse(molecule):
    """Return the elemental composition of a chemical formula.

    The formula may nest sub-compounds in ``()``, ``[]`` or ``{}``; an
    element or a closing bracket may be followed by a repeat count.
    Whitespace between tokens is ignored.

    Args:
        molecule: The formula text, e.g. ``"K4[Fe(CN)6]"``.

    Returns:
        A plain dict mapping element symbol to its total atom count.  An
        empty formula gives ``{}``.

    Raises:
        ValueError: On a mismatched, unexpected or missing bracket, or on
            any token that is not valid at its position (for example a
            leading count or an unrecognised character).
    """
    tokens = [(m.lastgroup, m.group()) for m in _TOKEN_PATTERN.finditer(molecule)]
    composition = Counter()
    match = []   # closing brackets still expected, innermost last
    stack = []   # compositions of the enclosing (outer) compounds
    index = 0    # next token to read; avoids O(n) list.pop(0)

    def optional_count():
        """Consume and return a count token if one is next, else return 1."""
        nonlocal index
        if index < len(tokens) and tokens[index][0] == "count":
            index += 1
            return int(tokens[index - 1][1])
        return 1

    while index < len(tokens):
        kind, text = tokens[index]
        index += 1

        # element with optional count
        if kind == "element":
            composition[text] += optional_count()

        # start a sub-compound
        elif text in MATCH:
            match.append(MATCH[text])
            stack.append(composition)
            composition = Counter()

        # matching close bracket ends a sub-compound with an optional count
        elif match and text == match[-1]:
            match.pop()
            repeat = optional_count()
            for element in composition:
                composition[element] *= repeat
            # Counter.update adds counts, merging the outer compound back in.
            composition.update(stack.pop())

        # "syntax" error in the formula
        elif kind == "bracket":
            if match:
                raise ValueError(f"Error, mismatched bracket: "
                                 f"expected '{match[-1]}' got '{text}'.")
            raise ValueError(f"Error, unexpected closing bracket: '{text}'.")
        else:
            raise ValueError(f"Error, unrecognized token in "
                             f"formula: '{text}'.")

    # left over, unmatched brackets
    if match:
        brackets = ', '.join(f"'{b}'" for b in match[::-1])
        raise ValueError(f"Error, missing bracket(s): {brackets}.")

    return dict(composition)

# Source: https://codereview.stackexchange.com/questions/249782
复制相似问题