下面是链接:https://www.sec.gov/cgi-bin/browse-edgar?CIK=20&owner=exclude&action=getcompany&Find=Search
我想提取 SIC 代码(即 3823)和财年结束日期(Fiscal Year End,即 0102)。我的代码如下。它可以正常工作,但我觉得写得很繁琐。有没有更好的方法?谢谢。
# `soup` is a BeautifulSoup object built from the company page.
# Both results start as None so that code further down can tell
# "not found" apart from a real value instead of hitting a NameError.
sic = None
y_end = None

# The SIC code is the text of the anchor whose href contains "SIC=".
# Anchors without an href make .get() return None, so default to "" to keep
# the substring test from raising TypeError.
for link in soup.find_all("a"):
    if "SIC=" in link.get("href", ""):
        # get_text() also copes with anchors that wrap nested tags, where
        # .string would be None.
        sic = link.get_text(strip=True)

# The fiscal year end is the four-digit group after "Fiscal Year End:".
re_yend = re.compile(r"Fiscal Year End: *(\d{4})")
match = re_yend.search(str(soup))
if match:
    y_end = match.group(1)
# Page text that followed the code on the same line: "发布于 2019-02-03 01:24:58"
以下是从网站获取数据的另一种方法:
import re
import requests
from bs4 import BeautifulSoup as bs
def get_data(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on an
            unresponsive server.

    Returns:
        The response body decoded as text.

    Raises:
        ValueError: If the server answers with any status other than 200.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError('Cannot read the data')
    return response.text
def get_sic_fiscal(data):
    """Extract the SIC code and the fiscal year end from a company page.

    Args:
        data: HTML text of the page.

    Returns:
        A ``(sic, fiscal_year_end)`` tuple of strings. Either element is
        ``None`` when the corresponding piece cannot be found in the page.
    """
    soup = bs(data, 'html.parser')

    # Both values are looked up relative to the company info block; without
    # it there is nothing to extract.
    company_info = soup.find('div', {'class': 'companyInfo'})
    if company_info is None:
        return None, None

    # The SIC code is the text of the first link that follows the
    # "Standard Industrial Code" acronym tag.
    sic = None
    acronym = company_info.find('acronym', {'title': 'Standard Industrial Code'})
    if acronym is not None:
        sic_link = acronym.find_next('a')
        if sic_link is not None:
            sic = sic_link.text

    # Restrict the fiscal year end search to the company info block's text.
    fiscal_year_end = re.search(r'Fiscal Year End:\s+(\d+)', company_info.text)
    return sic, (fiscal_year_end.group(1) if fiscal_year_end else None)
# Fetch the company page, pull out both values, and report them.
url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK=20&owner=exclude&action=getcompany&Find=Search'
data = get_data(url)
sic, fiscal = get_sic_fiscal(data)
report = 'SIC: {sic} and Fiscal year end: {fiscal}'.format(
    sic=sic,
    fiscal=fiscal,
)
print(report)
# Page text that followed the code on the same line: "输出:"
SIC: 3823 and Fiscal year end: 0102
https://stackoverflow.com/questions/54499016
复制相似问题