我写了从加拿大统计局抓取数据的解析器。
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_number_of_sources() -> int:
'''
Retrieves Number of STATCAN Sources
Returns
-------
int
Number of STATCAN Sources.
'''
URL = 'https://www150.statcan.gc.ca/n1/en/type/data'
page = requests.get(URL)
soup = BeautifulSoup(page.text, 'lxml')
result = re.search(r'\((.*?)\)', soup.summary.get_text()).group(1)
return int(result.replace(',', ''))
def main():
'''
Builds Resulting DataFrame and Dumps It To Excel File
Returns
-------
None.
'''
FILE_NAME = 'stat_can_all.xlsx'
number_of_sources = get_number_of_sources()
data_list = []
for _ in range(1 + number_of_sources // 100):
GENERIC_URL = 'https://www150.statcan.gc.ca/n1/en/type/data?count=100&p={}-All#all'
page = requests.get(GENERIC_URL.format(_))
print(f'Parsing Page {1+_:3} Out of {1+number_of_sources // 100}')
soup = BeautifulSoup(page.text, 'lxml')
details_soup = soup.find('details', id='all')
items = details_soup.find_all('li', {'class': 'ndm-item'})
for item in items:
tag_description = item.find('div', class_='ndm-result-description')
tag_former_id = item.find('div', class_='ndm-result-formerid')
tag_frequency = item.find('div', class_='ndm-result-freq')
tag_geo = item.find('div', class_='ndm-result-geo')
data_list.append(
{
'title': item.find('div', class_='ndm-result-title').get_text(),
'product_id': item.find('div', class_='ndm-result-productid').get_text(),
'former_id': None if tag_former_id is None else tag_former_id.get_text(),
'geo': None if tag_geo is None else tag_geo.get_text(),
'frequency': None if tag_frequency is None else tag_frequency.get_text(),
'description': None if tag_description is None else tag_description.get_text(),
'release_date': item.find('span', class_='ndm-result-date').get_text(),
'type': item.find(
'div',
class_='ndm-result-productid'
).get_text().split(':')[0],
'ref': item.a.get('href'),
}
)
data = pd.DataFrame.from_dict(data_list)
data[['id', 'title_only']] = data.iloc[:, 0].str.split(
pat='. ',
n=1,
expand=True
)
data['id'] = pd.to_numeric(data['id'].str.replace(',', ''))
data.fillna('None').to_excel(FILE_NAME, index=False)
if __name__ == '__main__':
main()我想知道是否有一种方法可以对以下代码行和表示三元操作符的类似代码行进行重新表述:
'former_id': None if tag_former_id is None else tag_former_id.get_text()让它更优雅、更简洁。
您可以看到,如果tag_former_id是class bs4.element.Tag的一个实例,则可以使用.get_text()方法检索str。
否则,tag_former_id可能是None,不需要进一步的操作。
请您回顾这篇文章,并指出偏离最佳实践的地方吗?
任何其他改进建议也很受欢迎,例如在代码中引入更多的功能方法等等。
发布于 2022-10-05 20:32:43
https://codereview.stackexchange.com/questions/280222
复制相似问题