我编写了以下脚本,用于基于NCTID从美国国家医学图书馆网站ClinicalTrials.Gov中抓取数据。
def clinicalTrialsGov(nctid):
    """Fetch selected fields of one ClinicalTrials.gov study as a flat dict.

    Downloads the XML record for ``nctid``, collects the text of the tags
    named in ``subset``, adds the comma-joined intervention/arm fields via
    ``multipleFields`` and finally passes the result through
    ``removeEmptyKeys``.

    Args:
        nctid: Study identifier string that is spliced into the request URL.

    Returns:
        dict mapping ``'ct' + <Capitalized tag name>`` to the tag text, plus
        ``'ctID'`` holding ``nctid``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # A timeout keeps the call from blocking forever on a stalled connection.
    # NOTE(review): confirm the ct2 "displayxml" endpoint is still served.
    response = requests.get(
        "https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true",
        timeout=30,
    )
    data = BeautifulSoup(response.text, "xml")
    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'primary_outcome', 'secondary_outcome',
              'number_of_arms']
    # When a tag occurs more than once, the last occurrence wins.
    tag_dict = {
        'ct' + tag.name.capitalize(): tag.text
        for tag in data.find_all(subset)
    }
    # These tags may repeat, so all of their values are joined into one field.
    for field_name in ('intervention_name', 'intervention_type',
                       'arm_group_type'):
        tag_dict = multipleFields(data, [field_name], tag_dict)
    tag_dict['ctID'] = nctid
    return removeEmptyKeys(tag_dict)
def multipleFields(data, subset, tagDict):
    """Store the comma-joined text of every tag matching ``subset``.

    Args:
        data: Parsed document exposing ``find_all``.
        subset: Non-empty list of tag names; its first entry builds the key
            ``'ct' + <Capitalized name>``.
        tagDict: Dict that is updated in place.

    Returns:
        The same ``tagDict`` object. It is returned unchanged when the texts
        cannot be collected or joined.
    """
    fields = data.find_all(subset)
    try:
        tagDict['ct' + subset[0].capitalize()] = ", ".join(
            [each.text for each in fields]
        )
    except (AttributeError, LookupError, TypeError):
        # Best effort: malformed matches or an empty ``subset`` leave the
        # dict untouched. Unlike a bare ``except``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        pass
    return tagDict
def removeEmptyKeys(dict1):
    """Return a copy of ``dict1`` without entries that stringify to ``''``.

    Args:
        dict1: Mapping to filter; it is not modified.

    Returns:
        A new dict with every entry whose ``str(value)`` is non-empty.
    """
    # Compare by value: ``is not ''`` tested object identity, which only
    # works by accident of string interning and warns on Python 3.8+.
    return {key: value for key, value in dict1.items() if str(value) != ''}


# Question from the post: what can I do to make this process more efficient?
发布于 2018-12-29 19:11:26
希望我不会太晚。
有几件事你可以做:
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch selected fields of one ClinicalTrials.gov study as a flat dict.

    Downloads the XML record for ``nctid``, collects the text of the tags
    named in ``subset``, adds the comma-joined intervention/arm fields via
    ``multipleFields`` and passes the result through ``removeEmptyKeys``.

    Args:
        nctid: Study identifier string that is spliced into the request URL.

    Returns:
        dict mapping ``'ct' + <Capitalized tag name>`` to the tag text, plus
        ``'ctID'`` holding ``nctid``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # A timeout keeps the call from blocking forever on a stalled connection.
    # NOTE(review): confirm the ct2 "displayxml" endpoint is still served.
    response = requests.get(
        "https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true",
        timeout=30,
    )
    data = BeautifulSoup(response.text, "xml")
    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'primary_outcome', 'secondary_outcome',
              'number_of_arms']
    tag_matches = data.find_all(subset)
    # When a tag occurs more than once, the last occurrence wins.
    tag_dict = {
        'ct' + current_tag.name.capitalize(): current_tag.text
        for current_tag in tag_matches
    }
    # These tags may repeat, so all of their values are joined into one field.
    for field_name in ('intervention_name', 'intervention_type',
                       'arm_group_type'):
        tag_dict = multipleFields(data, field_name, tag_dict)
    tag_dict['ctID'] = nctid
    return removeEmptyKeys(tag_dict)
def multipleFields(data, subset, tagDict):
    """Write the comma-joined text of all ``subset`` tags into ``tagDict``.

    Args:
        data: Parsed document exposing ``find_all``.
        subset: Tag name (string); also used to build the key
            ``'ct' + <Capitalized name>``.
        tagDict: Dict that is updated in place.

    Returns:
        The same ``tagDict`` object.
    """
    texts = []
    for match in data.find_all(subset):
        texts.append(match.text)
    joined_text = ", ".join(texts)
    tagDict['ct' + subset.capitalize()] = joined_text
    return tagDict
def removeEmptyKeys(dict1):
    """Return a new dict holding only the entries of ``dict1`` with truthy values.

    Args:
        dict1: Mapping to filter; it is not modified.

    Returns:
        A new dict without the entries whose value is falsy.
    """
    kept = {}
    for key, value in dict1.items():
        if not value:
            continue
        kept[key] = value
    return kept
pprint.pprint(clinicalTrialsGov("NCT01220960"))

发布于 2018-12-29 19:14:24
但我们可以更进一步:
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch selected fields of one ClinicalTrials.gov study as a flat dict.

    For every tag name in ``subset`` the non-blank texts of all matching
    tags are joined with ``", "``. Tag names without any non-blank text are
    left out of the result.

    Args:
        nctid: Study identifier string that is spliced into the request URL.

    Returns:
        dict mapping ``'ct' + <Capitalized tag name>`` to the joined text,
        plus ``'ctID'`` holding ``nctid``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # A timeout keeps the call from blocking forever on a stalled connection.
    # NOTE(review): confirm the ct2 "displayxml" endpoint is still served.
    response = requests.get(
        "https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true",
        timeout=30,
    )
    data = BeautifulSoup(response.text, "xml")
    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'primary_outcome', 'secondary_outcome',
              'number_of_arms', 'intervention_name',
              'intervention_type', 'arm_group_type']
    # One list of texts per tag name; whitespace-only tags are skipped.
    tag_dict = {
        f'ct{subset_detail.capitalize()}': [
            current_tag.text
            for current_tag in data.find_all(subset_detail)
            if current_tag.text.strip()
        ]
        for subset_detail in subset
    }
    # Join repeated tags and drop the names that produced no text at all.
    result_data = {k: ", ".join(v) for k, v in tag_dict.items() if v}
    result_data['ctID'] = nctid
    return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))

这里先用字典推导式构建 tag_dict,再用第二个字典推导式(result_data)合并答案(如果有多个答案),并筛选掉没有关联文本的标记。

发布于 2018-12-29 19:30:48
我查看了返回的 XML 数据,注意到例如 primary_outcome 和 secondary_outcome 还包含其他子标记(measure、time_frame 和 description)。也许您需要这些标记中的全部信息,但如果只需要提取其中的 measure,可以这样做:
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch selected fields of one ClinicalTrials.gov study as a flat dict.

    Tags named in ``subset`` contribute their full text. For the tags in
    ``subset_has_measure`` only the text of their ``measure`` descendants is
    used. Repeated values are joined with ``", "`` and tag names without any
    match are left out.

    Args:
        nctid: Study identifier string that is spliced into the request URL.

    Returns:
        dict mapping ``'ct' + <Capitalized tag name>`` to the joined text,
        plus ``'ctID'`` holding ``nctid``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # A timeout keeps the call from blocking forever on a stalled connection.
    # NOTE(review): confirm the ct2 "displayxml" endpoint is still served.
    response = requests.get(
        "https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true",
        timeout=30,
    )
    data = BeautifulSoup(response.text, "xml")
    subset = ['study_type', 'allocation', 'intervention_model',
              'primary_purpose', 'masking', 'enrollment',
              'official_title', 'condition', 'minimum_age',
              'maximum_age', 'gender', 'healthy_volunteers',
              'phase', 'number_of_arms', 'intervention_name',
              'intervention_type', 'arm_group_type']
    subset_has_measure = ['primary_outcome', 'secondary_outcome']
    tag_dict = {
        f'ct{subset_detail.capitalize()}': [
            current_tag.text for current_tag in data.find_all(subset_detail)
        ]
        for subset_detail in subset
    }
    # CSS descendant selector: only the <measure> tags inside each outcome.
    tag_dict_with_measure = {
        f'ct{subset_detail.capitalize()}': [
            current_tag.text
            for current_tag in data.select(f'{subset_detail} measure')
        ]
        for subset_detail in subset_has_measure
    }
    # Join repeated tags and drop the names that matched nothing.
    result_data = {k: ", ".join(v) for k, v in tag_dict.items() if v}
    result_data.update(
        (k, ", ".join(v)) for k, v in tag_dict_with_measure.items() if v
    )
    result_data['ctID'] = nctid
    return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))

请注意,这里用的不是 .find_all(),而是 .select(),它使我们能够用 CSS 选择器表达式来定位想要的标记。实际上,您可以把这种做法推广到 measure 以外的其他子标记:
import requests
from bs4 import BeautifulSoup
import pprint
def clinicalTrialsGov(nctid):
    """Fetch selected fields of one ClinicalTrials.gov study as a flat dict.

    ``subset`` maps a descendant tag name to the list of tags it applies to.
    The empty key ``''`` means "use the tag itself". Each lookup is a CSS
    selector of the form ``'<tag> <descendant>'``. Repeated values are
    joined with ``", "`` and tag names without any match are left out.

    Args:
        nctid: Study identifier string that is spliced into the request URL.

    Returns:
        dict mapping ``'ct' + <Capitalized tag name>`` to the joined text,
        plus ``'ctID'`` holding ``nctid``.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # A timeout keeps the call from blocking forever on a stalled connection.
    # NOTE(review): confirm the ct2 "displayxml" endpoint is still served.
    response = requests.get(
        "https://clinicaltrials.gov/ct2/show/" + nctid + "?displayxml=true",
        timeout=30,
    )
    data = BeautifulSoup(response.text, "xml")
    subset = {
        '': ['study_type', 'allocation', 'intervention_model',
             'primary_purpose', 'masking', 'enrollment',
             'official_title', 'condition', 'minimum_age',
             'maximum_age', 'gender', 'healthy_volunteers', 'phase',
             'number_of_arms', 'intervention_name', 'intervention_type',
             'arm_group_type'],
        'measure': ['primary_outcome', 'secondary_outcome'],
    }
    tag_dict = {
        f'ct{subset_detail.capitalize()}': [
            current_tag.text
            for current_tag
            in data.select(f'{subset_detail} {subset_category}')
        ]
        for subset_category, subset_types in subset.items()
        for subset_detail in subset_types
    }
    # Join repeated tags and drop the names that matched nothing.
    result_data = {k: ", ".join(v) for k, v in tag_dict.items() if v}
    result_data['ctID'] = nctid
    return result_data
pprint.pprint(clinicalTrialsGov("NCT01220960"))

https://codereview.stackexchange.com/questions/194465
复制相似问题