文章/答案/技术大牛

发布

社区首页 >问答首页 >使用请求刮取数据集

问使用 requests 抓取数据集
EN

Code Review用户

提问于 2021-06-23 08:56:05

回答 1查看 147关注 0票数 4

这是我此前在这里提出的上一个问题的后续。

在上一个问题中，我按照 @Reinderien 的建议，将我的网页抓取（web scraper）代码整理如下：

fudan.py

from dataclasses import dataclass, asdict
from itertools import count
from typing import Dict, Iterable, Tuple, List

from bs4 import BeautifulSoup
from requests import Session, get
from datetime import date, datetime

import json
import os
import re

@dataclass
class Link:
    """One row of the site's search-result table.

    NOTE(review): ``added`` is annotated as ``date``, but ``from_row`` stores
    the ISO-format *string* of the parsed date.
    """
    caption: str
    url: str
    clicks: int
    replies: int
    added: date

    @classmethod
    def from_row(cls, props: Dict[str, str], url: str) -> 'Link':
        """Build a ``Link`` from one table row.

        Args:
            props: Mapping of column header text to cell text for the row.
            url: The ``href`` of the row's anchor, stored unchanged.

        Raises:
            KeyError: If an expected column header is missing from ``props``.
            ValueError: If the click/reply cell, the counters or the date
                cannot be parsed.
        """
        click_text, reply_text = props['点击/回复'].split('/')
        # The '编号' column is deliberately ignored: the number only has
        # meaning within one result page.

        title_text = props['资源标题']
        click_count = int(click_text)
        reply_count = int(reply_text)
        added_on = datetime.strptime(props['添加时间'], '%Y/%m/%d').date()

        return cls(title_text, url, click_count, reply_count, added_on.isoformat())

    def __str__(self):
        """Return ``"<added> <url> <caption>"``."""
        return '{} {} {}'.format(self.added, self.url, self.caption)


# @dataclass
# class Result:
#     author: str
#     title: str
#     date: date
#     download: str
#     publication: str
#     url: str
    

#     @classmethod
#     def from_metadata(cls, metadata: Dict) -> 'Result':
#         author = metadata['author']
#         title = metadata['title']
#         date = metadata['date']
#         download = metadata['download']
#         publication = "復旦大學出土文獻與古文字研究中心學者文庫"
#         url = metadata['url']


#     def __str__(self) -> str:
#         return(
#             f"\n作者 {self.author}"
#             f"\n標題 {self.title}"
#             f"\n發佈日期 {self.date}"
#             f"\n下載連結 {self.download}"
#             f"\n發表平台 {self.publication}"
#             f"\n訪問網頁 {self.url}"
#         )


def get_primary_result():
    """Load every record stored in ``primary_search_result.json``.

    The file is read from the current working directory. It may hold several
    JSON documents written back to back with no separator (``[...][...]``),
    so it is not necessarily one valid JSON document. Each document is
    decoded in turn; arrays are concatenated, including empty ones, which the
    previous ``'\\n][\\n'`` string replacement could not handle.

    Returns:
        A flat list with the elements of every array found in the file, in
        file order. An empty (or whitespace-only) file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If any part of the file is not valid JSON.
    """
    path = os.path.join(os.getcwd(), 'primary_search_result.json')

    with open(path, "r") as f:
        data = f.read()

    decoder = json.JSONDecoder()
    primary_rslt = []
    pos, end = 0, len(data)

    while True:
        # raw_decode() does not skip leading whitespace itself.
        while pos < end and data[pos].isspace():
            pos += 1
        if pos >= end:
            break

        document, pos = decoder.raw_decode(data, pos)
        if isinstance(document, list):
            primary_rslt.extend(document)
        else:
            primary_rslt.append(document)

    return primary_rslt


def get_article():
    """Yield metadata for every saved search hit filed under 学者文库.

    Each record returned by ``get_primary_result()`` is turned into an
    absolute URL and fetched. Pages whose second ``#_top td a`` anchor does
    not read ``学者文库`` are skipped.

    Yields:
        dict with the keys ``author``, ``title``, ``date``, ``url``,
        ``publication`` and ``download``. ``author`` is ``None`` when the
        caption does not split into exactly two parts on the full-width
        colon, or when the first part is ``網摘``; in both cases ``title`` is
        the whole caption. ``download`` is ``None`` when the page has no
        matching download link.

    Raises:
        requests.HTTPError: If an article page returns an error status.
        KeyError: If a saved record lacks ``caption``, ``url`` or ``added``.
    """
    base_url = 'http://www.gwz.fudan.edu.cn'
    # Compiled once instead of on every loop iteration.
    download_href = re.compile("/?(lunwen/|articles/up/).+")

    # Built up front so malformed records fail before any request is made.
    records = [
        (item['caption'], base_url + item['url'], item['added'])
        for item in get_primary_result()
    ]

    for caption, url, date in records:
        # Parse inside the ``with`` and leave it before yielding, so the
        # response is not held open while the generator is suspended.
        with get(url) as resp:
            resp.raise_for_status()
            doc = BeautifulSoup(resp.text, 'html.parser')

        category = doc.select('#_top td a')[1].text
        if category != '学者文库':
            continue

        # Test for the separator explicitly rather than relying on a bare
        # ``except`` around tuple unpacking.
        parts = caption.split("：")
        if len(parts) == 2 and parts[0] != "網摘":
            author, title = parts
        else:
            author, title = None, caption

        content = doc.select_one('span.ny_font_content')
        dl_tag = None
        if content is not None:
            dl_tag = content.find('a', {"href": download_href})

        download = None
        if dl_tag is not None:
            download = dl_tag['href'].replace("\r", "").replace("\n", "").strip()
            if not download or download == "#_edn1":
                download = None
            elif not download.startswith("/"):
                download = "/" + download

        yield {
            "author": author,
            "title": title,
            "date": date,
            "url": url,
            "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
            "download": download,
        }


def get_page(session: Session, query: str, page: int) -> Tuple[List[Link], int]:
    """Fetch one page of search results.

    Args:
        session: Session used for the GET request.
        query: Search text, sent as the ``s`` parameter.
        page: Page number, sent as the ``page`` parameter.

    Returns:
        ``(links, n_pages)``: the links parsed from this page, and the page
        count taken from the text after the last ``/`` in the first ``td`` of
        ``#tab table:nth-child(2)``.

    Raises:
        requests.HTTPError: If the server returns an error status.
    """
    search_params = {'s': query, 'page': page}
    with session.get('http://www.gwz.fudan.edu.cn/Web/Search', params=search_params) as resp:
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'html.parser')

    table = doc.select_one('#tab table')
    # Header cell texts become the keys of each row's property mapping.
    heads = [header.text for header in table.select('tr.cap td')]

    links = [
        Link.from_row(
            props=dict(zip(heads, [cell.text for cell in row.find_all('td')])),
            url=row.find('a')['href'],
        )
        for row in table.find_all('tr', class_='')
    ]

    pager_text = doc.select_one('#tab table:nth-child(2) td').text
    total_text = pager_text.rsplit('/', 1)[1]

    return links, int(total_text)


def remove_json_if_exist(filename):
    """Delete ``<filename>.json`` from the current working directory.

    Does nothing when no such path exists.
    """
    target = os.path.join(os.getcwd(), filename + ".json")

    if not os.path.exists(target):
        return

    os.remove(target)


def get_all_links(session: Session, query: str) -> Iterable[Link]:
    """Yield every link for ``query``, one result page at a time.

    For each page this prints ``<page>/<n_pages>``, yields the page's links,
    and then appends them as a JSON array to ``primary_search_result.json``.
    Iteration stops once the current page number reaches the reported page
    count.
    """
    page = 1

    while True:
        links, n_pages = get_page(session, query, page)
        print(f'{page}/{n_pages}')
        yield from links

        # Append mode: the file accumulates one array per page.
        with open('primary_search_result.json', 'a') as file:
            page_records = [asdict(link) for link in links]
            json.dump(page_records, file, ensure_ascii=False, indent=4)

        if page >= n_pages:
            return
        page += 1


def search(keyword):
    """Run a search for ``keyword`` and print each link found.

    Any existing ``primary_search_result.json`` is removed first. A blank
    line is printed after the last link.
    """
    remove_json_if_exist('primary_search_result')

    session = Session()
    with session:
        found_links = get_all_links(session, keyword)
        for found in found_links:
            print(found)

    print()


def compile_search_result():
    """Write every article from ``get_article()`` to ``fudan_search_result.json``.

    The file is opened once in write mode, which truncates any previous
    content, and is always written as UTF-8. The output is a single JSON
    array; the previous version appended bare objects back to back
    (``}{``), which is not valid JSON. Each article is also printed as it is
    written.
    """
    print("Articles Retrieved:")

    with open('fudan_search_result.json', 'w', encoding='utf-8') as file:
        file.write('[')

        for index, item in enumerate(get_article()):
            # A comma separates every element except the first.
            file.write(',\n' if index else '\n')
            json.dump(item, file, ensure_ascii=False, indent=4)
            print(item)

        file.write('\n]\n')


def main():
    """Search for the hard-coded keyword, then compile the article results."""
    keyword = '尹至'

    search(keyword)
    compile_search_result()


if __name__ == '__main__':
    main()

上面的代码增加了循环浏览主要搜索结果中的urls列表的功能，从而生成在网站上发布的单个文章的元数据。

输出：

{
    "author": "許文獻",
    "title": "重讀清華〈厚父〉簡釋字懸想一則",
    "date": "2018-08-30",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/4286",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1936重讀清華〈厚父〉簡釋字懸想一則.docx"
}{
    "author": "雷燮仁",
    "title": "談《尚書》中表勉義的幾組字",
    "date": "2017-10-31",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/3152",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1861雷燮仁：談《尚書》中表勉義的幾組字.doc"
}{
    "author": "雷燮仁",
    "title": "誤“埶”為“執”及相關問題考辨",
    "date": "2017-10-31",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/3146",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1855雷燮仁：誤“埶”為“執”及相關問題考辨.doc"
}{
    "author": "蘇建洲",
    "title": "楚系文字“祟”字構形補說兼論相關問題",
    "date": "2017-01-15",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2969",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1731蘇建洲：楚系文字“祟”字構形補說兼論相關問題.doc"
}{
    "author": "王寧",
    "title": "《周易》“童蒙”解",
    "date": "2016-03-30",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2767",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1623王寧：《周易》“童蒙”解.doc"
}{
    "author": "王寧",
    "title": "北大漢簡《蒼頡篇》讀札（下）",
    "date": "2016-03-07",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2747",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1618王寧：北大漢簡《蒼頡篇》讀札（下）.doc"
}{
    "author": "王寧",
    "title": "讀《殷高宗問於三壽》散札",
    "date": "2015-05-17",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2525",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1479王寧：讀《殷高宗問於三壽》散札.doc"
}{
    "author": "高月",
    "title": "《漢書·藝文志·諸子略》之道家補",
    "date": "2015-05-09",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2516",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1474高月：《漢書·藝文志·諸子略》之道家補.doc"
}{
    "author": "陳劍",
    "title": "《清華簡（伍）》與舊說互證兩則",
    "date": "2015-04-14",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2494",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1454陳劍：《清華簡（伍）》與舊說互證兩則.doc"
}{
    "author": "王寧",
    "title": "上博二《容成氏》湯伐桀記載辨析",
    "date": "2015-03-11",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2464",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1433王寧：上博二《容成氏》湯伐桀記載辨析.doc"
}{
    "author": "王寧",
    "title": "上博二《容成氏》“南藻氏”相關問題考論",
    "date": "2015-03-01",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2455",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1425王寧：上博二《容成氏》“南藻氏”相關問題考論.doc"
}{
    "author": "張崇禮",
    "title": "清華簡《尹誥》考釋",
    "date": "2014-12-17",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2400",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1387張崇禮：清華簡《尹誥》考釋.doc"
}{
    "author": "王寧",
    "title": "《清華簡〈尹誥〉獻疑》之疑",
    "date": "2014-06-23",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2298",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1337王寧：《清華簡〈尹誥〉獻疑》之疑.doc"
}{
    "author": "孫合肥",
    "title": "清華簡《筮法》札記一則",
    "date": "2014-01-25",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2222",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1300孫合肥：清華簡《筮法》札記一則.doc"
}{
    "author": "陸離",
    "title": "清華簡《別卦》讀“解”之字試說",
    "date": "2014-01-08",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2208",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1292陸離：清華簡《別卦》讀“解”之字試說.doc"
}{
    "author": "王寧",
    "title": "清華簡《尹至》《赤鳩之集湯之屋》對讀一則",
    "date": "2013-11-28",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2183",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1276王寧：清華簡《尹至》《赤鳩之集湯之屋》對讀一則.doc"
}{
    "author": "呂廟軍",
    "title": "“出土文獻與中國古代文明”國際學術研討會綜述",
    "date": "2013-10-22",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2145",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1254呂廟軍：“出土文獻與中國古代文明”國際學術研討會綜述.doc"
}{
    "author": "王挺斌",
    "title": "清華簡《尹誥》“遠邦歸志”考",
    "date": "2013-06-30",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2082",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1218王挺斌：清华简《尹诰》“远邦归志”考.doc"
}{
    "author": "高中華",
    "title": "《清華簡》（壹）校讀四則",
    "date": "2013-06-08",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2069",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1208高中华：《清华简》（壹）校读四则.doc"
}{
    "author": "陳民鎮",
    "title": "清華簡《說命上》首句試解",
    "date": "2013-01-21",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2003",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1169陳民鎮：清華簡《說命上》首句試解.doc"
}{
    "author": "劉剛",
    "title": "清華叁《良臣》為具有晉系文字風格的抄本補證",
    "date": "2013-01-17",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/2002",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1168劉剛：清華叁《良臣》為具有晉系文字風格的抄本補證.doc"
}{
    "author": "陳劍",
    "title": "簡談《繫年》的“ ”和楚簡部分“ ”字當釋讀爲“捷””",
    "date": "2013-01-16",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1996",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1167陳劍：簡談《繫年》的“ ”和楚簡部分“ ”字當釋讀爲“捷”.doc"
}{
    "author": "韓祖倫",
    "title": "利用楚簡文字釋讀古璽文字四例",
    "date": "2012-06-05",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1884",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1086韩祖伦：利用楚简文字释读古玺文字四例.doc"
}{
    "author": "蘇建洲",
    "title": "楚竹書的“罝”字",
    "date": "2012-04-13",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1844",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1057蘇建洲：楚竹書的“罝”字.doc"
}{
    "author": "苗豐",
    "title": "卜辭“中录”補證",
    "date": "2012-03-25",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1809",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1049苗豐：卜辭“中录”補證.doc"
}{
    "author": "張世超",
    "title": "佔畢脞說（八）",
    "date": "2012-03-09",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1800",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1040張世超：佔畢脞說（八）.doc"
}{
    "author": "張世超",
    "title": "佔畢脞說（三、四）",
    "date": "2012-02-23",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1787",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1029張世超：佔畢脞說（三、四）.doc"
}{
    "author": "張崇禮",
    "title": "釋清華簡《尹至》的“瓚”字",
    "date": "2011-12-23",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1748",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/1001張崇禮：釋清華簡《尹至》的“瓚”字.doc"
}{
    "author": "陳民鎮",
    "title": "清華簡《楚居》集釋",
    "date": "2011-09-23",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1663",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0951陳民鎮：清華簡《楚居》集釋.doc"
}{
    "author": "胡凱",
    "title": "清華簡《祭公之顧命》集釋",
    "date": "2011-09-23",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1662",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0950胡凱：清華簡《祭公之顧命》集釋.doc"
}{
    "author": "汪亞洲",
    "title": "清華簡《皇門》集釋",
    "date": "2011-09-23",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1660",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0949汪亞洲：清華簡《皇門》集釋.doc"
}{
    "author": "陳民鎮、胡凱",
    "title": "清華簡《金縢》集釋",
    "date": "2011-09-20",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1658",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0947陳民鎮、胡凱：清華簡《金縢》集釋.doc"
}{
    "author": "顏偉明、陳民鎮",
    "title": "清華簡《耆夜》集釋",
    "date": "2011-09-20",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1657",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0946顏偉明、陳民鎮：清華簡《耆夜》集釋.doc"
}{
    "author": "胡凱、陳民鎮",
    "title": "清華簡《保訓》集釋",
    "date": "2011-09-19",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1654",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0943陳民鎮、胡凱：清華簡《保訓》集釋.doc"
}{
    "author": "禚孝文",
    "title": "清華簡《程寤》集釋",
    "date": "2011-09-17",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1653",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0942禚孝文：清華簡《程寤》集释.doc"
}{
    "author": "陳民鎮",
    "title": "清華簡《尹誥》集釋",
    "date": "2011-09-12",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1648",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0938陳民鎮：清華簡《尹誥》集釋.doc"
}{
    "author": "陳民鎮",
    "title": "清華簡《尹至》集釋",
    "date": "2011-09-12",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1647",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0937陳民鎮：清華簡《尹至》集釋.doc"
}{
    "author": "汪亞洲、陳民鎮",
    "title": "清華簡研究論著目錄簡編",
    "date": "2011-09-12",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1646",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0936汪亞洲、陳民鎮：清華簡研究論著目錄簡編.doc"
}{
    "author": "劉信芳",
    "title": "清華藏簡（壹）試讀",
    "date": "2011-09-09",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1643",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0934劉信芳：清華藏簡（壹）試讀.doc"
}{
    "author": "鄧少平",
    "title": "清華簡研究論著目錄（2008.12—2011.8）",
    "date": "2011-08-30",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1631",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0928鄧少平：清華簡研究論著目錄.doc"
}{
    "author": "汪亞洲",
    "title": "清華簡《尹至》“亡典”說",
    "date": "2011-06-17",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1556",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/lunwen/0882汪亞洲：清華簡《尹至》“亡典”說.doc"
}{
    "author": null,
    "title": "網摘：2011年3月",
    "date": "2011-05-02",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1485",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0855網摘：2011年3月.doc"
}{
    "author": "劉光勝",
    "title": "清華簡《耆夜》考論",
    "date": "2011-04-30",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1484",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0854刘光胜：清华简耆夜新探.doc"
}{
    "author": "劉洪濤",
    "title": "清華簡補釋四則",
    "date": "2011-04-27",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1479",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0851刘洪涛：清华简补释四则.doc"
}{
    "author": "蘇建洲",
    "title": "論楚竹書“厇”字構形",
    "date": "2011-04-10",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1459",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0838蘇建洲：論楚竹書“厇”字構形.doc"
}{
    "author": null,
    "title": "網摘：2011年2月",
    "date": "2011-04-02",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1450",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0830網摘：2011年2月.doc"
}{
    "author": "劉波",
    "title": "清華簡《尹至》“僮亡典”補說",
    "date": "2011-03-04",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1421",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0812清華簡《尹至》“僮亡典”補說.doc"
}{
    "author": null,
    "title": "網摘：2011年1月",
    "date": "2011-03-01",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1417",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0809網摘：2011年1月.doc"
}{
    "author": "陳劍",
    "title": "清華簡《皇門》“賏爾”字補說",
    "date": "2011-02-04",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1397",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0803清華簡《皇門》“賏爾”字補說.doc"
}{
    "author": "王寧",
    "title": "清華簡《尹至》《尹誥》中的“衆”和“民”",
    "date": "2011-02-04",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1396",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0802清華簡《尹至》《尹誥》中的“衆”與“民”.doc"
}{
    "author": null,
    "title": "網摘：《清華一》專輯",
    "date": "2011-02-02",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1393",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0799網摘《清華一》專輯.doc"
}{
    "author": "王寧",
    "title": "讀清華簡《程寤》偶記一則",
    "date": "2011-01-28",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1389",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0797讀清華簡《程寤》偶記一則.doc"
}{
    "author": "蕭旭",
    "title": "清華竹簡《程寤》校補",
    "date": "2011-01-13",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1379",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0793清華竹簡《程寤》校補.doc"
}{
    "author": "袁瑩",
    "title": "清華簡《程寤》校讀",
    "date": "2011-01-11",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1376",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0790清華簡《程寤》校讀.doc"
}{
    "author": "孫飛燕",
    "title": "試論《尹至》的“至在湯”與《尹誥》的“及湯”",
    "date": "2011-01-10",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1373",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0788試論《尹至》的“至在湯”與《尹誥》的“及湯”.doc"
}{
    "author": "蘇建洲",
    "title": "《清華簡》考釋四則",
    "date": "2011-01-09",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1368",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0784《清華簡》考釋四則.doc"
}{
    "author": "沈培",
    "title": "清華簡字詞考釋二則",
    "date": "2011-01-09",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1367",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0783清華簡字詞考釋二則.doc"
}{
    "author": "讀書會",
    "title": "清華簡《尹至》、《尹誥》研讀札記",
    "date": "2011-01-05",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1352",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0774清華簡《尹至》、《尹誥》研讀札記.doc"
}{
    "author": "讀書會",
    "title": "清華簡《耆夜》研讀札記",
    "date": "2011-01-05",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1347",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0773清華簡《耆夜》研讀札記.doc"
}{
    "author": "朱曉海",
    "title": "〈尹至〉可能是百篇《尚書》中前所未見的一篇",
    "date": "2010-06-17",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1187",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0692〈尹至〉可能是百篇《尚書》中前所未見的一篇.doc"
}{
    "author": null,
    "title": "清華九簡研讀札記",
    "date": "2010-05-30",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1166",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0676清華九簡研讀札記.doc"
}{
    "author": "蘇建洲",
    "title": "《清華簡九篇綜述》封二所刊《皇門》簡簡釋",
    "date": "2010-05-30",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/1165",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0675《清華簡九篇綜述》封二所刊《皇門》簡簡釋.doc"
}{
    "author": "淺野裕一",
    "title": "上博楚簡《柬大王泊旱》之災異思想",
    "date": "2009-09-13",
    "url": "http://www.gwz.fudan.edu.cn/Web/Show/904",
    "publication": "復旦大學出土文獻與古文字研究中心學者文庫",
    "download": "/articles/up/0494上博楚簡《柬大王泊旱》之災異思想.doc"
}

问题：

我最初打算使用另一个dataclass (注释掉了)来扩展@Reinderien的答案，但最终却用函数和字典完成了整个过程。

所以我的主要问题是：

我们如何决定一个dataclass对于一个任务是否是必需的？
具体到这一部分——上面 fudan.py 中的 get_page 函数：

with session.get(
        'http://www.gwz.fudan.edu.cn/Web/Search',
        params={
            's': query,
            'page': page,
        },
    ) as resp:
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'html.parser')

为什么我们不能(或者不应该)用这个代替？

from requests import get
with get('http://www.gwz.fudan.edu.cn/Web/Search?s=' + query) as resp:
...

(关注点：1. get 和 session.get 之间的区别；2. 使用 params 而不是直接将查询作为 URL 字符串的一部分提交)

任何其他改进我的代码的建议也是欢迎的！

python

web-scraping

回答 1

Code Review用户

回答已采纳

发布于 2021-06-23 15:56:58

区分链接类和结果类；我将后者重命名为 Article。
出版物对每一行都是一样的，所以我省略了它。如果您真的想要的话，可以重新引入它，但是不得不对文件中的每个记录都相同的值进行硬编码是很奇怪的。
保存一个中间 JSON 文件只为了再次加载并不是一个好主意，直接对内存中的值进行操作即可。传统的做法可能是把全部中间数据加载到一个列表中再进行迭代，但这种做法在内存占用上扩展性不好。我所展示的方法完全是迭代式的，每次只处理一个页面。
删除并在附加模式下打开文件是没有意义的。只要在普通的写入模式下打开它，它就会截断和覆盖任何现有的内容。
永远不要使用裸的 try/except。在本例中，由于您要找的是分隔符，只需测试分隔符是否存在即可，而不是依靠异常来实现逻辑（logic-by-exception）。
您打破了我建议的date-typed列，将其存储为字符串。在到达程序的边缘之前，不要将其格式化为字符串。
get_all_links不是保存到JSON的合适位置。
类别筛选不需要硬编码某个特定锚点的索引；相反，查找 href 为 # 的锚点即可。

你所关注的具体问题：

get与session.get的区别

您已经有了一个会话，您使用了一半的时间--另一半是您正在执行的直接get，在这种情况下，您应该避免这样做。就用这个会话吧。它更好地表达了您的意图：(1)携带浏览器可能使用的任何cookie，(2)在必要时应用公共标头，(3)共享连接池。

使用params而不是将查询直接作为url字符串的一部分提交。

params dict是传递参数的一种更Pythonic的方式:键值对操作更容易，您可以传递非字符串参数，请求将为您强制字符串，请求将完成所有您不应该关心的必要编码和转义。

我们如何决定一个 dataclass 对于一个任务是否是必需的？

没有什么是“必需的”，但 dataclass 往往“非常适合”。考虑是否：

你使用的是Python 3.7+，无论如何你都应该这么做
您知道类的属性及其类型，无论如何您应该知道这些属性和类型。
通用构造函数__init__将是对成员的简单的参数分配。
任何特定用途的伪构造函数都可以表示为@classmethods返回一个或多个类实例。

那么 dataclass 就非常适合。

建议

from dataclasses import dataclass
from itertools import count
from pathlib import Path
from typing import Dict, Iterable, Tuple, List, Optional
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from requests import Session
from datetime import date, datetime

import json
import re

BASE_URL = 'http://www.gwz.fudan.edu.cn'


@dataclass
class Link:
    """One row of the site's search-result table, with an absolute URL."""
    caption: str
    url: str
    clicks: int
    replies: int
    added: date

    @classmethod
    def from_row(cls, props: Dict[str, str], path: str) -> 'Link':
        """Build a ``Link`` from one table row.

        Args:
            props: Mapping of column header text to cell text for the row.
            path: The ``href`` of the row's anchor; joined onto ``BASE_URL``.

        Raises:
            KeyError: If an expected column header is missing from ``props``.
            ValueError: If the click/reply cell, the counters or the date
                cannot be parsed.
        """
        click_text, reply_text = props['点击/回复'].split('/')
        # The '编号' column is deliberately ignored: the number only has
        # meaning within one result page.

        title_text = props['资源标题']
        absolute_url = urljoin(BASE_URL, path)
        click_count = int(click_text)
        reply_count = int(reply_text)
        added_on = datetime.strptime(props['添加时间'], '%Y/%m/%d').date()

        return cls(title_text, absolute_url, click_count, reply_count, added_on)

    def __str__(self):
        """Return ``"<added> <url> <caption>"``."""
        return '{} {} {}'.format(self.added, self.url, self.caption)

    def author_title(self) -> Tuple[Optional[str], str]:
        """Split the caption into ``(author, title)``.

        The caption is split at the first full-width colon (U+FF1A) and both
        halves are stripped. Without a colon the whole caption is the title
        and the author is ``None``. The author is also ``None`` when the part
        before the colon is ``網摘``.
        """
        raw_author, colon, raw_title = self.caption.partition('：')

        if not colon:
            return None, self.caption

        author = raw_author.strip()
        title = raw_title.strip()

        return (None if author == '網摘' else author), title


@dataclass
class Article:
    """Metadata of one published article.

    ``author`` and ``download`` may be ``None`` when the source page does not
    provide them.
    """
    author: Optional[str]
    title: str
    date: date
    download: Optional[str]
    url: str

    @classmethod
    def from_link(cls, link: 'Link', download: str) -> 'Article':
        """Build an ``Article`` from a search-result link and a raw href.

        Args:
            link: Supplies ``author_title()``, ``added`` and ``url``.
            download: Raw ``href`` of the download anchor. Carriage returns
                and newlines are removed and the result is stripped.

        Returns:
            The article. ``download`` is ``None`` when the cleaned href is
            empty or equals ``#_edn1``; otherwise it is the cleaned href with
            a leading ``/`` added if it had none.
        """
        author, title = link.author_title()

        cleaned = download.replace("\r", "").replace("\n", "").strip()
        path: Optional[str]
        # The emptiness check avoids the IndexError that indexing ``[0]`` on
        # a blank href used to raise.
        if not cleaned or cleaned == '#_edn1':
            path = None
        elif not cleaned.startswith('/'):
            path = '/' + cleaned
        else:
            path = cleaned

        return cls(
            author=author,
            title=title,
            date=link.added,
            download=path,
            url=link.url,
        )

    def __str__(self) -> str:
        """Return a multi-line, human-readable summary of the article."""
        return(
            f"\n作者   {self.author}"
            f"\n標題   {self.title}"
            f"\n發佈日期 {self.date}"
            f"\n下載連結 {self.download}"
            f"\n訪問網頁 {self.url}"
        )

    def as_dict(self) -> Dict[str, Optional[str]]:
        """Return a JSON-serialisable mapping; the date is ISO-formatted."""
        return {
            'author': self.author,
            'title': self.title,
            'date': self.date.isoformat(),
            'download': self.download,
            'url': self.url,
        }


def compile_search_results(session: Session, links: Iterable[Link], category_filter: str) -> Iterable[Article]:
    """Yield an ``Article`` for every link whose page is in the wanted category.

    Each link's page is fetched through ``session``. The category is the text
    of the ``#_top td a[href="#"]`` anchor; pages whose category differs from
    ``category_filter`` are skipped. The download href is taken from the
    first matching anchor inside ``span.ny_font_content``.

    Raises:
        requests.HTTPError: If an article page returns an error status.
    """
    download_href = re.compile("/?(lunwen/|articles/up/).+")

    for link in links:
        with session.get(link.url) as resp:
            resp.raise_for_status()
            doc = BeautifulSoup(resp.text, 'html.parser')

        category = doc.select_one('#_top td a[href="#"]').text
        if category == category_filter:
            body = doc.select_one('span.ny_font_content')
            anchor = body.find('a', {'href': download_href})

            yield Article.from_link(link, download=anchor['href'])


def get_page(session: Session, query: str, page: int) -> Tuple[List[Link], int]:
    """Fetch one page of search results.

    Args:
        session: Session used for the GET request.
        query: Search text, sent as the ``s`` parameter.
        page: Page number, sent as the ``page`` parameter.

    Returns:
        ``(links, n_pages)``: the links parsed from this page, and the page
        count taken from the text after the last ``/`` in the first ``td`` of
        ``#tab table:nth-child(2)``.

    Raises:
        requests.HTTPError: If the server returns an error status.
    """
    search_url = urljoin(BASE_URL, '/Web/Search')
    search_params = {'s': query, 'page': page}
    with session.get(search_url, params=search_params) as resp:
        resp.raise_for_status()
        doc = BeautifulSoup(resp.text, 'html.parser')

    table = doc.select_one('#tab table')
    # Header cell texts become the keys of each row's property mapping.
    heads = [header.text for header in table.select('tr.cap td')]

    links = [
        Link.from_row(
            props=dict(zip(heads, [cell.text for cell in row.find_all('td')])),
            path=row.find('a')['href'],
        )
        for row in table.find_all('tr', class_='')
    ]

    pager_text = doc.select_one('#tab table:nth-child(2) td').text
    total_text = pager_text.rsplit('/', 1)[1]

    return links, int(total_text)


def get_all_links(session: Session, query: str) -> Iterable[Link]:
    """Yield every link for ``query``, one result page at a time.

    Prints ``<page>/<n_pages>`` for each page fetched and stops once the
    current page number reaches the reported page count.
    """
    page = 1

    while True:
        links, n_pages = get_page(session, query, page)
        print(f'{page}/{n_pages}')
        yield from links

        if page >= n_pages:
            return
        page += 1


def save_articles(articles: Iterable['Article'], file_prefix: str) -> None:
    """Stream ``articles`` into ``<file_prefix>.json`` as one JSON array.

    Articles are written one at a time, so the iterable is never held in
    memory as a whole. The file is opened in write mode, which replaces any
    existing content, and is always encoded as UTF-8: the JSON is written
    with ``ensure_ascii=False``, so the platform-default encoding could fail
    on, or mis-encode, non-ASCII text.

    Args:
        articles: Objects exposing ``as_dict()``.
        file_prefix: Output path; its suffix is set to ``.json`` with
            ``Path.with_suffix``.
    """
    file_path = Path(file_prefix).with_suffix('.json')

    with file_path.open('w', encoding='utf-8') as file:
        file.write('[\n')

        for index, article in enumerate(articles):
            # A comma separates every element except the first.
            if index:
                file.write(',\n')
            json.dump(article.as_dict(), file, ensure_ascii=False, indent=4)

        file.write('\n]\n')


def main():
    """Search for 尹至 and save the 学者文库 articles to ``fudan_search_result.json``."""
    academic_library = '学者文库'

    with Session() as session:
        found_links = get_all_links(session, query='尹至')
        articles = compile_search_results(
            session, found_links, category_filter=academic_library,
        )
        save_articles(articles, 'fudan_search_result')


if __name__ == '__main__':
    main()

票数 2

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/263357

复制

相似问题

问使用请求刮取数据集
EN

fudan.py

输出：

问题：

回答 1

Code Review用户

建议

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问使用请求刮取数据集EN

fudan.py

输出：

问题：

回答 1

Code Review用户

建议

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问使用请求刮取数据集
EN