我正在从文本中提取一些具有自己的函数的变量名。下面你可以看到代码和文本。
import re
text = '''
def function_cal_new1 (revenues_stories_new1, surplus_margin_new1, meadian_profit):
median_profit= revenues_stories_new1* surplus_margin_new1
return median_profit
def cal_tti_c_a(e_BusinessPropertyRightsSuccessor_c,e_Premium_c,e_Interests_c,e_OtherIncome_c,e_PassivelIncome_c,e_SubSubLease_c,estimation_base_BusinessPropertyRights_c,estimation_base_SubLease_c,estimation_base_SubLeaseBusiness_c,estimation_base_SolidWaste_c,estimation_base_Premium_c,estimation_base_PassiveGainsSaleSharePassive_c,estimation_base_PassiveGainsRealEstateThreeYear_c,estimation_base_PassiveGainssaleOtherMovableAssets_c,estimation_base_PassiveGainsSellsRealEstateFiveYear_c,tti_b_a):
tti_b_a=e_BusinessPropertyRightsSuccessor_c+e_Premium_c+e_Interests_c+e_OtherIncome_c+e_PassivelIncome_c+e_SubSubLease_c+estimation_base_BusinessPropertyRights_c+estimation_base_SubLease_c+estimation_base_SubLeaseBusiness_c+estimation_base_SolidWaste_c+estimation_base_Premium_c+estimation_base_PassiveGainsSaleSharePassive_c+estimation_base_PassiveGainsRealEstateThreeYear_c+estimation_base_PassiveGainssaleOtherMovableAssets_c+estimation_base_PassiveGainsSellsRealEstateFiveYear_c
return(tti_b_a)
'''
# Extracting text
def extraction_variables(text):
splited_table1, splited_table2 = dict(), dict()
lines = text.split('\n')
for line in lines:
x = re.search(r"^def.*:$", line)
if x is not None:
values = x[0].split('def ')[1].split('(')
splited_table1 = values[0]
splited_table2 = values[1][:-2].split(', ') # <--- Probably error is here
yield splited_table1, splited_table2
# Merging extracted text
splited_table1, splited_table2 = zip(*extraction_variables(text))
table = []
for elem in splited_table1:
table.append(elem)
for sub_array in splited_table2:
for elem in sub_array:
table.append(elem)
# Converting in list
final_table = list(table)
final_table在执行这些代码行之后,您可以看到下面的结果
['function_cal_new1 ',
'cal_tti_c_a',
'revenues_stories_new1',
'surplus_margin_new1',
'meadian_profit',
'e_BusinessPropertyRightsSuccessor_c,e_Premium_c,e_Interests_c,e_OtherIncome_c,e_PassivelIncome_c,e_SubSubLease_c,estimation_base_BusinessPropertyRights_c,estimation_base_SubLease_c,estimation_base_SubLeaseBusiness_c,estimation_base_SolidWaste_c,estimation_base_Premium_c,estimation_base_PassiveGainsSaleSharePassive_c,estimation_base_PassiveGainsRealEstateThreeYear_c,estimation_base_PassiveGainssaleOtherMovableAssets_c,estimation_base_PassiveGainsSellsRealEstateFiveYear_c,tti_b_a']也就是说,从function_cal_new1提取文本中的meadian_profit单词是正确的,但是在这一行之后,单词不会在单独的行中提取。
那么,谁能帮我解决这个问题,并在不同的行中提取这些单词呢?最后,我需要有这个输出
'function_cal_new1 ',
'cal_tti_c_a',
'revenues_stories_new1',
'surplus_margin_new1',
'meadian_profit',
'e_BusinessPropertyRightsSuccessor_c,
'e_Premium_c',
'e_Interests_c',
'e_OtherIncome_c',
'e_PassivelIncome_c',
'e_SubSubLease_c',
'estimation_base_BusinessPropertyRights_c'.
'estimation_base_SubLease_c'.
'estimation_base_SubLeaseBusiness_c',
'estimation_base_SolidWaste_c',
'estimation_base_Premium_c',
'estimation_base_PassiveGainsSaleSharePassive_c',
'estimation_base_PassiveGainsRealEstateThreeYear_c',
'estimation_base_PassiveGainssaleOtherMovableAssets_c',
'estimation_base_PassiveGainsSellsRealEstateFiveYear_c',
'tti_b_a'发布于 2022-10-19 18:44:58
您可以使用ast模块遍历代码并提取您想要的部分。这可能比使用正则表达式更健壮。代码必须是有效的,包括缩进。字符串中的代码将需要删除额外的缩进。在这种情况下,这里有一些代码可以遍历树并产生不同的位。
text = '''
def function_cal_new1 (revenues_stories_new1, surplus_margin_new1, meadian_profit):
median_profit= revenues_stories_new1* surplus_margin_new1
return median_profit
def cal_tti_c_a(e_BusinessPropertyRightsSuccessor_c,e_Premium_c,e_Interests_c,e_OtherIncome_c,e_PassivelIncome_c,e_SubSubLease_c,estimation_base_BusinessPropertyRights_c,estimation_base_SubLease_c,estimation_base_SubLeaseBusiness_c,estimation_base_SolidWaste_c,estimation_base_Premium_c,estimation_base_PassiveGainsSaleSharePassive_c,estimation_base_PassiveGainsRealEstateThreeYear_c,estimation_base_PassiveGainssaleOtherMovableAssets_c,estimation_base_PassiveGainsSellsRealEstateFiveYear_c,tti_b_a):
tti_b_a=e_BusinessPropertyRightsSuccessor_c+e_Premium_c+e_Interests_c+e_OtherIncome_c+e_PassivelIncome_c+e_SubSubLease_c+estimation_base_BusinessPropertyRights_c+estimation_base_SubLease_c+estimation_base_SubLeaseBusiness_c+estimation_base_SolidWaste_c+estimation_base_Premium_c+estimation_base_PassiveGainsSaleSharePassive_c+estimation_base_PassiveGainsRealEstateThreeYear_c+estimation_base_PassiveGainssaleOtherMovableAssets_c+estimation_base_PassiveGainsSellsRealEstateFiveYear_c
return(tti_b_a)
'''
import ast
def get_names_and_functions(text):
root = ast.parse(text)
for node in ast.walk(root):
if isinstance(node, ast.FunctionDef):
yield node.name
for arg in node.args.args:
yield arg.arg
elif isinstance(node, ast.Name):
yield node.id
found = set(get_names_and_functions(text))这将给你:
{'cal_tti_c_a',
'e_BusinessPropertyRightsSuccessor_c',
'e_Interests_c',
'e_OtherIncome_c',
'e_PassivelIncome_c',
'e_Premium_c',
'e_SubSubLease_c',
'estimation_base_BusinessPropertyRights_c',
'estimation_base_PassiveGainsRealEstateThreeYear_c',
'estimation_base_PassiveGainsSaleSharePassive_c',
'estimation_base_PassiveGainsSellsRealEstateFiveYear_c',
'estimation_base_PassiveGainssaleOtherMovableAssets_c',
'estimation_base_Premium_c',
'estimation_base_SolidWaste_c',
'estimation_base_SubLeaseBusiness_c',
'estimation_base_SubLease_c',
'function_cal_new1',
'meadian_profit',
'median_profit',
'revenues_stories_new1',
'surplus_margin_new1',
'tti_b_a'}当考虑函数体中的参数和变量时,它使用一个集合来消除dupes。当然,如果您不想考虑参数,可以删除带有args的elif。
https://stackoverflow.com/questions/74130082
复制相似问题