这是我在这里上的最后一个问题的后续。
在上一篇文章中,我在@Reinderien的建议之后,按照以下方式提供了我的web刮刀代码:
from dataclasses import dataclass, asdict
from itertools import count
from typing import Dict, Iterable, Tuple, List
from bs4 import BeautifulSoup
from requests import Session, get
from datetime import date, datetime
import json
import os
import re
@dataclass
class Link:
caption: str
url: str
clicks: int
replies: int
added: date
@classmethod
def from_row(cls, props: Dict[str, str], url: str) -> 'Link':
clicks, replies = props['点击/回复'].split('/')
# Skip number=int(props['编号']) - this only has meaning within one page
return cls(
caption=props['资源标题'],
url=url,
clicks=int(clicks),
replies=int(replies),
added=datetime.strptime(props['添加时间'], '%Y/%m/%d').date().isoformat(),
)
def __str__(self):
return f'{self.added} {self.url} {self.caption}'
# @dataclass
# class Result:
# author: str
# title: str
# date: date
# download: str
# publication: str
# url: str
# @classmethod
# def from_metadata(cls, metadata: Dict) -> 'Result':
# author = metadata['author']
# title = metadata['title']
# date = metadata['date']
# download = metadata['download']
# publication = "復旦大學出土文獻與古文字研究中心學者文庫"
# url = metadata['url']
# def __str__(self) -> str:
# return(
# f"\n作者 {self.author}"
# f"\n標題 {self.title}"
# f"\n發佈日期 {self.date}"
# f"\n下載連結 {self.download}"
# f"\n發表平台 {self.publication}"
# f"\n訪問網頁 {self.url}"
# )
def get_primary_result():
path = os.path.join(os.getcwd(),
'primary_search_result.json')
with open(path, "r") as f:
data = f.read()
data = data.replace('\n][\n',',')
primary_rslt = json.loads(data)
return primary_rslt
def get_article():
primary_rslt = get_primary_result()
captions_list = [item['caption'] for item in primary_rslt]
base_url = 'http://www.gwz.fudan.edu.cn'
url_list = [base_url + item['url'] for item in primary_rslt]
date_list = [item['added'] for item in primary_rslt]
for i, url in enumerate(url_list):
with get(url) as resp:
resp.raise_for_status()
doc = BeautifulSoup(resp.text, 'html.parser')
content = doc.select_one('span.ny_font_content')
category = doc.select('#_top td a')[1].text
if category == '学者文库':
try:
author, title = captions_list[i].split(":")
except:
author = None
title = captions_list[i]
if author == "網摘":
author = None
title = captions_list[i]
date = date_list[i]
dl_tag = content.find_all('a', {"href" :
re.compile("/?(lunwen/|articles/up/).+")})[0]
download = dl_tag['href']
download = download.replace("\r","").replace("\n", "").strip()
if download == "#_edn1":
download = None
elif download[0] != "/":
download = "/" + download
yield {
"author": author,
"title": title,
"date": date,
"url": url,
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": download}
def get_page(session: Session, query: str, page: int) -> Tuple[List[Link], int]:
with session.get(
'http://www.gwz.fudan.edu.cn/Web/Search',
params={
's': query,
'page': page,
},
) as resp:
resp.raise_for_status()
doc = BeautifulSoup(resp.text, 'html.parser')
table = doc.select_one('#tab table')
heads = [h.text for h in table.select('tr.cap td')]
links = []
for row in table.find_all('tr', class_=''):
cells = [td.text for td in row.find_all('td')]
links.append(Link.from_row(
props=dict(zip(heads, cells)),
url=row.find('a')['href'],
))
page_td = doc.select_one('#tab table:nth-child(2) td')
n_pages = int(page_td.text.rsplit('/', 1)[1])
return links, n_pages
def remove_json_if_exist(filename):
json_file = filename + ".json"
filePath = os.path.join(os.getcwd(), json_file)
if os.path.exists(filePath):
os.remove(filePath)
def get_all_links(session: Session, query: str) -> Iterable[Link]:
for page in count(1):
links, n_pages = get_page(session, query, page)
print(f'{page}/{n_pages}')
yield from links
with open('primary_search_result.json', 'a') as file:
json.dump([asdict(link) for link in links], file, ensure_ascii=False, indent=4)
if page >= n_pages:
break
def search(keyword):
remove_json_if_exist('primary_search_result')
with Session() as session:
for link in get_all_links(session, keyword):
print(link)
print()
def compile_search_result():
print("Articles Retrieved:")
remove_json_if_exist('fudan_search_result')
rslt = get_article()
for item in rslt:
with open('fudan_search_result.json', 'a') as file:
json.dump(item, file, ensure_ascii=False, indent=4)
print(item)
def main():
search('尹至')
compile_search_result()
if __name__ == '__main__':
main()上面的代码增加了循环浏览主要搜索结果中的urls列表的功能,从而生成在网站上发布的单个文章的元数据。
{
"author": "許文獻",
"title": "重讀清華〈厚父〉簡釋字懸想一則",
"date": "2018-08-30",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/4286",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1936重讀清華〈厚父〉簡釋字懸想一則.docx"
}{
"author": "雷燮仁",
"title": "談《尚書》中表勉義的幾組字",
"date": "2017-10-31",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/3152",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1861雷燮仁:談《尚書》中表勉義的幾組字.doc"
}{
"author": "雷燮仁",
"title": "誤“埶”為“執”及相關問題考辨",
"date": "2017-10-31",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/3146",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1855雷燮仁:誤“埶”為“執”及相關問題考辨.doc"
}{
"author": "蘇建洲",
"title": "楚系文字“祟”字構形補說兼論相關問題",
"date": "2017-01-15",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2969",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1731蘇建洲:楚系文字“祟”字構形補說兼論相關問題.doc"
}{
"author": "王寧",
"title": "《周易》“童蒙”解",
"date": "2016-03-30",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2767",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1623王寧:《周易》“童蒙”解.doc"
}{
"author": "王寧",
"title": "北大漢簡《蒼頡篇》讀札(下)",
"date": "2016-03-07",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2747",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1618王寧:北大漢簡《蒼頡篇》讀札(下).doc"
}{
"author": "王寧",
"title": "讀《殷高宗問於三壽》散札",
"date": "2015-05-17",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2525",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1479王寧:讀《殷高宗問於三壽》散札.doc"
}{
"author": "高月",
"title": "《漢書·藝文志·諸子略》之道家補",
"date": "2015-05-09",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2516",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1474高月:《漢書·藝文志·諸子略》之道家補.doc"
}{
"author": "陳劍",
"title": "《清華簡(伍)》與舊說互證兩則",
"date": "2015-04-14",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2494",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1454陳劍:《清華簡(伍)》與舊說互證兩則.doc"
}{
"author": "王寧",
"title": "上博二《容成氏》湯伐桀記載辨析",
"date": "2015-03-11",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2464",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1433王寧:上博二《容成氏》湯伐桀記載辨析.doc"
}{
"author": "王寧",
"title": "上博二《容成氏》“南藻氏”相關問題考論",
"date": "2015-03-01",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2455",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1425王寧:上博二《容成氏》“南藻氏”相關問題考論.doc"
}{
"author": "張崇禮",
"title": "清華簡《尹誥》考釋",
"date": "2014-12-17",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2400",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1387張崇禮:清華簡《尹誥》考釋.doc"
}{
"author": "王寧",
"title": "《清華簡〈尹誥〉獻疑》之疑",
"date": "2014-06-23",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2298",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1337王寧:《清華簡〈尹誥〉獻疑》之疑.doc"
}{
"author": "孫合肥",
"title": "清華簡《筮法》札記一則",
"date": "2014-01-25",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2222",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1300孫合肥:清華簡《筮法》札記一則.doc"
}{
"author": "陸離",
"title": "清華簡《別卦》讀“解”之字試說",
"date": "2014-01-08",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2208",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1292陸離:清華簡《別卦》讀“解”之字試說.doc"
}{
"author": "王寧",
"title": "清華簡《尹至》《赤鳩之集湯之屋》對讀一則",
"date": "2013-11-28",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2183",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1276王寧:清華簡《尹至》《赤鳩之集湯之屋》對讀一則.doc"
}{
"author": "呂廟軍",
"title": "“出土文獻與中國古代文明”國際學術研討會綜述",
"date": "2013-10-22",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2145",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1254呂廟軍:“出土文獻與中國古代文明”國際學術研討會綜述.doc"
}{
"author": "王挺斌",
"title": "清華簡《尹誥》“遠邦歸志”考",
"date": "2013-06-30",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2082",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1218王挺斌:清华简《尹诰》“远邦归志”考.doc"
}{
"author": "高中華",
"title": "《清華簡》(壹)校讀四則",
"date": "2013-06-08",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2069",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1208高中华:《清华简》(壹)校读四则.doc"
}{
"author": "陳民鎮",
"title": "清華簡《說命上》首句試解",
"date": "2013-01-21",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2003",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1169陳民鎮:清華簡《說命上》首句試解.doc"
}{
"author": "劉剛",
"title": "清華叁《良臣》為具有晉系文字風格的抄本補證",
"date": "2013-01-17",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/2002",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1168劉剛:清華叁《良臣》為具有晉系文字風格的抄本補證.doc"
}{
"author": "陳劍",
"title": "簡談《繫年》的“ ”和楚簡部分“ ”字當釋讀爲“捷””",
"date": "2013-01-16",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1996",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1167陳劍:簡談《繫年》的“ ”和楚簡部分“ ”字當釋讀爲“捷”.doc"
}{
"author": "韓祖倫",
"title": "利用楚簡文字釋讀古璽文字四例",
"date": "2012-06-05",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1884",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1086韩祖伦:利用楚简文字释读古玺文字四例.doc"
}{
"author": "蘇建洲",
"title": "楚竹書的“罝”字",
"date": "2012-04-13",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1844",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1057蘇建洲:楚竹書的“罝”字.doc"
}{
"author": "苗豐",
"title": "卜辭“中录”補證",
"date": "2012-03-25",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1809",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1049苗豐:卜辭“中录”補證.doc"
}{
"author": "張世超",
"title": "佔畢脞說(八)",
"date": "2012-03-09",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1800",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1040張世超:佔畢脞說(八).doc"
}{
"author": "張世超",
"title": "佔畢脞說(三、四)",
"date": "2012-02-23",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1787",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1029張世超:佔畢脞說(三、四).doc"
}{
"author": "張崇禮",
"title": "釋清華簡《尹至》的“瓚”字",
"date": "2011-12-23",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1748",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/1001張崇禮:釋清華簡《尹至》的“瓚”字.doc"
}{
"author": "陳民鎮",
"title": "清華簡《楚居》集釋",
"date": "2011-09-23",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1663",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0951陳民鎮:清華簡《楚居》集釋.doc"
}{
"author": "胡凱",
"title": "清華簡《祭公之顧命》集釋",
"date": "2011-09-23",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1662",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0950胡凱:清華簡《祭公之顧命》集釋.doc"
}{
"author": "汪亞洲",
"title": "清華簡《皇門》集釋",
"date": "2011-09-23",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1660",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0949汪亞洲:清華簡《皇門》集釋.doc"
}{
"author": "陳民鎮、胡凱",
"title": "清華簡《金縢》集釋",
"date": "2011-09-20",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1658",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0947陳民鎮、胡凱:清華簡《金縢》集釋.doc"
}{
"author": "顏偉明、陳民鎮",
"title": "清華簡《耆夜》集釋",
"date": "2011-09-20",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1657",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0946顏偉明、陳民鎮:清華簡《耆夜》集釋.doc"
}{
"author": "胡凱、陳民鎮",
"title": "清華簡《保訓》集釋",
"date": "2011-09-19",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1654",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0943陳民鎮、胡凱:清華簡《保訓》集釋.doc"
}{
"author": "禚孝文",
"title": "清華簡《程寤》集釋",
"date": "2011-09-17",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1653",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0942禚孝文:清華簡《程寤》集释.doc"
}{
"author": "陳民鎮",
"title": "清華簡《尹誥》集釋",
"date": "2011-09-12",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1648",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0938陳民鎮:清華簡《尹誥》集釋.doc"
}{
"author": "陳民鎮",
"title": "清華簡《尹至》集釋",
"date": "2011-09-12",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1647",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0937陳民鎮:清華簡《尹至》集釋.doc"
}{
"author": "汪亞洲、陳民鎮",
"title": "清華簡研究論著目錄簡編",
"date": "2011-09-12",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1646",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0936汪亞洲、陳民鎮:清華簡研究論著目錄簡編.doc"
}{
"author": "劉信芳",
"title": "清華藏簡(壹)試讀",
"date": "2011-09-09",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1643",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0934劉信芳:清華藏簡(壹)試讀.doc"
}{
"author": "鄧少平",
"title": "清華簡研究論著目錄(2008.12—2011.8)",
"date": "2011-08-30",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1631",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0928鄧少平:清華簡研究論著目錄.doc"
}{
"author": "汪亞洲",
"title": "清華簡《尹至》“亡典”說",
"date": "2011-06-17",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1556",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/lunwen/0882汪亞洲:清華簡《尹至》“亡典”說.doc"
}{
"author": null,
"title": "網摘:2011年3月",
"date": "2011-05-02",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1485",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0855網摘:2011年3月.doc"
}{
"author": "劉光勝",
"title": "清華簡《耆夜》考論",
"date": "2011-04-30",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1484",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0854刘光胜:清华简耆夜新探.doc"
}{
"author": "劉洪濤",
"title": "清華簡補釋四則",
"date": "2011-04-27",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1479",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0851刘洪涛:清华简补释四则.doc"
}{
"author": "蘇建洲",
"title": "論楚竹書“厇”字構形",
"date": "2011-04-10",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1459",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0838蘇建洲:論楚竹書“厇”字構形.doc"
}{
"author": null,
"title": "網摘:2011年2月",
"date": "2011-04-02",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1450",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0830網摘:2011年2月.doc"
}{
"author": "劉波",
"title": "清華簡《尹至》“僮亡典”補說",
"date": "2011-03-04",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1421",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0812清華簡《尹至》“僮亡典”補說.doc"
}{
"author": null,
"title": "網摘:2011年1月",
"date": "2011-03-01",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1417",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0809網摘:2011年1月.doc"
}{
"author": "陳劍",
"title": "清華簡《皇門》“賏爾”字補說",
"date": "2011-02-04",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1397",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0803清華簡《皇門》“賏爾”字補說.doc"
}{
"author": "王寧",
"title": "清華簡《尹至》《尹誥》中的“衆”和“民”",
"date": "2011-02-04",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1396",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0802清華簡《尹至》《尹誥》中的“衆”與“民”.doc"
}{
"author": null,
"title": "網摘:《清華一》專輯",
"date": "2011-02-02",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1393",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0799網摘《清華一》專輯.doc"
}{
"author": "王寧",
"title": "讀清華簡《程寤》偶記一則",
"date": "2011-01-28",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1389",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0797讀清華簡《程寤》偶記一則.doc"
}{
"author": "蕭旭",
"title": "清華竹簡《程寤》校補",
"date": "2011-01-13",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1379",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0793清華竹簡《程寤》校補.doc"
}{
"author": "袁瑩",
"title": "清華簡《程寤》校讀",
"date": "2011-01-11",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1376",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0790清華簡《程寤》校讀.doc"
}{
"author": "孫飛燕",
"title": "試論《尹至》的“至在湯”與《尹誥》的“及湯”",
"date": "2011-01-10",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1373",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0788試論《尹至》的“至在湯”與《尹誥》的“及湯”.doc"
}{
"author": "蘇建洲",
"title": "《清華簡》考釋四則",
"date": "2011-01-09",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1368",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0784《清華簡》考釋四則.doc"
}{
"author": "沈培",
"title": "清華簡字詞考釋二則",
"date": "2011-01-09",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1367",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0783清華簡字詞考釋二則.doc"
}{
"author": "讀書會",
"title": "清華簡《尹至》、《尹誥》研讀札記",
"date": "2011-01-05",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1352",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0774清華簡《尹至》、《尹誥》研讀札記.doc"
}{
"author": "讀書會",
"title": "清華簡《耆夜》研讀札記",
"date": "2011-01-05",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1347",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0773清華簡《耆夜》研讀札記.doc"
}{
"author": "朱曉海",
"title": "〈尹至〉可能是百篇《尚書》中前所未見的一篇",
"date": "2010-06-17",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1187",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0692〈尹至〉可能是百篇《尚書》中前所未見的一篇.doc"
}{
"author": null,
"title": "清華九簡研讀札記",
"date": "2010-05-30",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1166",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0676清華九簡研讀札記.doc"
}{
"author": "蘇建洲",
"title": "《清華簡九篇綜述》封二所刊《皇門》簡簡釋",
"date": "2010-05-30",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/1165",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0675《清華簡九篇綜述》封二所刊《皇門》簡簡釋.doc"
}{
"author": "淺野裕一",
"title": "上博楚簡《柬大王泊旱》之災異思想",
"date": "2009-09-13",
"url": "http://www.gwz.fudan.edu.cn/Web/Show/904",
"publication": "復旦大學出土文獻與古文字研究中心學者文庫",
"download": "/articles/up/0494上博楚簡《柬大王泊旱》之災異思想.doc"
}我最初打算使用另一个dataclass (注释掉了)来扩展@Reinderien的答案,但最终却用函数和字典完成了整个过程。
所以我的主要问题是:
dataclass对于一个任务是否是必需的?get_page函数在fudan.py中:with session.get(
'http://www.gwz.fudan.edu.cn/Web/Search',
params={
's': query,
'page': page,
},
) as resp:
resp.raise_for_status()
doc = BeautifulSoup(resp.text, 'html.parser')为什么我们不能(或者不应该)用这个代替?
from requests import get
with get('http://www.gwz.fudan.edu.cn/Web/Search?s=' + query) as resp:
... (关注点:1 get和session.get 2之间的区别是使用params而不是直接将查询作为url字符串的一部分发布)
发布于 2021-06-23 15:56:58
try/except。在本例中,由于您正在寻找分隔符,所以只需测试分隔符是否存在,而不是使用逻辑逐个异常。date-typed列,将其存储为字符串。在到达程序的边缘之前,不要将其格式化为字符串。get_all_links不是保存到JSON的合适位置。#中查找href。你所关注的具体问题:
get与session.get的区别
您已经有了一个会话,您使用了一半的时间--另一半是您正在执行的直接get,在这种情况下,您应该避免这样做。就用这个会话吧。它更好地表达了您的意图:(1)携带浏览器可能使用的任何cookie,(2)在必要时应用公共标头,(3)共享连接池。
使用params而不是将查询直接作为url字符串的一部分提交。
params dict是传递参数的一种更Pythonic的方式:键值对操作更容易,您可以传递非字符串参数,请求将为您强制字符串,请求将完成所有您不应该关心的必要编码和转义。
我们如何决定一个数据块是否是一个任务所必需的?
没有什么是“必要的”,但数据经常是“非常适合”。考虑是否:
__init__将是对成员的简单的参数分配。@classmethods返回一个或多个类实例。那么数据集是非常适合的。
from dataclasses import dataclass
from itertools import count
from pathlib import Path
from typing import Dict, Iterable, Tuple, List, Optional
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from requests import Session
from datetime import date, datetime
import json
import re
BASE_URL = 'http://www.gwz.fudan.edu.cn'
@dataclass
class Link:
caption: str
url: str
clicks: int
replies: int
added: date
@classmethod
def from_row(cls, props: Dict[str, str], path: str) -> 'Link':
clicks, replies = props['点击/回复'].split('/')
# Skip number=int(props['编号']) - this only has meaning within one page
return cls(
caption=props['资源标题'],
url=urljoin(BASE_URL, path),
clicks=int(clicks),
replies=int(replies),
added=datetime.strptime(props['添加时间'], '%Y/%m/%d').date(),
)
def __str__(self):
return f'{self.added} {self.url} {self.caption}'
def author_title(self) -> Tuple[Optional[str], str]:
sep = ':' # full-width colon, U+FF1A
if sep not in self.caption:
return None, self.caption
author, title = self.caption.split(sep, 1)
author, title = author.strip(), title.strip()
net_digest = '網摘'
if author == net_digest:
return None, title
return author, title
@dataclass
class Article:
author: Optional[str]
title: str
date: date
download: Optional[str]
url: str
@classmethod
def from_link(cls, link: Link, download: str) -> 'Article':
author, title = link.author_title()
download = download.replace("\r", "").replace("\n", "").strip()
if download == '#_edn1':
download = None
elif download[0] != '/':
download = '/' + download
return cls(
author=author,
title=title,
date=link.added,
download=download,
url=link.url,
)
def __str__(self) -> str:
return(
f"\n作者 {self.author}"
f"\n標題 {self.title}"
f"\n發佈日期 {self.date}"
f"\n下載連結 {self.download}"
f"\n訪問網頁 {self.url}"
)
def as_dict(self) -> Dict[str, str]:
return {
'author': self.author,
'title': self.title,
'date': self.date.isoformat(),
'download': self.download,
'url': self.url,
}
def compile_search_results(session: Session, links: Iterable[Link], category_filter: str) -> Iterable[Article]:
for link in links:
with session.get(link.url) as resp:
resp.raise_for_status()
doc = BeautifulSoup(resp.text, 'html.parser')
category = doc.select_one('#_top td a[href="#"]').text
if category != category_filter:
continue
content = doc.select_one('span.ny_font_content')
dl_tag = content.find(
'a', {
'href': re.compile("/?(lunwen/|articles/up/).+")
}
)
yield Article.from_link(link, download=dl_tag['href'])
def get_page(session: Session, query: str, page: int) -> Tuple[List[Link], int]:
with session.get(
urljoin(BASE_URL, '/Web/Search'),
params={
's': query,
'page': page,
},
) as resp:
resp.raise_for_status()
doc = BeautifulSoup(resp.text, 'html.parser')
table = doc.select_one('#tab table')
heads = [h.text for h in table.select('tr.cap td')]
links = []
for row in table.find_all('tr', class_=''):
cells = [td.text for td in row.find_all('td')]
links.append(Link.from_row(
props=dict(zip(heads, cells)),
path=row.find('a')['href'],
))
page_td = doc.select_one('#tab table:nth-child(2) td')
n_pages = int(page_td.text.rsplit('/', 1)[1])
return links, n_pages
def get_all_links(session: Session, query: str) -> Iterable[Link]:
for page in count(1):
links, n_pages = get_page(session, query, page)
print(f'{page}/{n_pages}')
yield from links
if page >= n_pages:
break
def save_articles(articles: Iterable[Article], file_prefix: str) -> None:
file_path = Path(file_prefix).with_suffix('.json')
with file_path.open('w') as file:
file.write('[\n')
first = True
for article in articles:
if first:
first = False
else:
file.write(',\n')
json.dump(article.as_dict(), file, ensure_ascii=False, indent=4)
file.write('\n]\n')
def main():
with Session() as session:
links = get_all_links(session, query='尹至')
academic_library = '学者文库'
articles = compile_search_results(session, links, category_filter=academic_library)
save_articles(articles, 'fudan_search_result')
if __name__ == '__main__':
main()https://codereview.stackexchange.com/questions/263357
复制相似问题