文章/答案/技术大牛

发布

社区首页 >问答首页 >狐狸是黑色的--一个简单的图像刮刀。

问狐狸是黑色的--一个简单的图像刮刀。
EN

Code Review用户

提问于 2019-07-10 11:32:14

回答 1查看 83关注 0票数 3

我写了一个简单的图像刮刀桌面墙纸项目由福克斯是黑色。该网站是一个奇妙的(但，显然已经死了)博客充满了精彩的图片。然而，没有办法下载一整束图像，因此这个简单的刮板。

我要把代码放上去接受检查。代码样式和/或实现改进的领域有哪些？我很感谢你的反馈。

#!/usr/bin/python

import os
import requests


from bs4 import BeautifulSoup as bs


MAX_PAGES = 1  # Max. number of pages is 41
SAVE_DIRECTORY = 'fox_backgrounds'
BASE_URL = "http://www.thefoxisblack.com/category/the-desktop-wallpaper-project/page"
RESOLUTIONS = [
            "1280x800", "1440x900", "1680x1050", "1920x1200", "2560x1440",
            "iphone", "iphone-5", "iphone6", "iphone-6-plus", "iphone6plus",
            "ipad",
            ]


def fetch_url(url):
    return requests.get(url).text


def clip_url(href):
    return href[href.rfind('/'):]    


def save_image(href):
    print(f"Downloading: {clip_url(href)}")
    request = requests.get(href)
    with open(f"fox_backgrounds{clip_url(href)}", 'wb') as output:
        output.write(request.content)


def get_images_from_page(url):
    html = fetch_url(url)
    soup = bs(html, "html.parser")
    for link in soup.find_all("a", class_="btn_download"):
        href = link["href"]
        for resolution in RESOLUTIONS:
            if resolution in href:
                save_image(href)


def make_dir():
    os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def get_backgrounds():
    make_dir()
    for page in range(0, MAX_PAGES):
        get_images_from_page(BASE_URL + str(page + 1))


def main():
    get_backgrounds()


if __name__ == '__main__':
    main()

python

python-3.x

web-scraping

beautifulsoup

回答 1

Code Review用户

回答已采纳

发布于 2019-07-10 14:27:51

建议：

#!/usr/bin/python

import os
import requests


from bs4 import BeautifulSoup as bs
from pathlib import Path
from shutil import copyfileobj


MAX_PAGES = 1  # Max. number of pages is 41
SAVE_DIRECTORY = Path('fox_backgrounds')
BASE_URL = 'http://www.thefoxisblack.com/category/the-desktop-wallpaper-project/page'
RESOLUTIONS = {
    '1280x800', '1440x900', '1680x1050', '1920x1200', '2560x1440',
    'iphone', 'iphone-5', 'iphone6', 'iphone-6-plus', 'iphone6plus',
    'ipad'
}


def fetch_url(url):
    return requests.get(url).text


def clip_part(href):
    return href.rpartition('/')[-1]


def save_image(href):
    part = clip_part(href)
    print(f'   Downloading: {part}')
    fn = SAVE_DIRECTORY / part
    with requests.get(href, stream=True) as response, \
         open(fn, 'wb') as output:
        copyfileobj(response.raw, output)


def get_images_from_page(url):
    html = fetch_url(url)
    soup = bs(html, 'html.parser')
    for link in soup.find_all('a', class_='btn_download'):
        href = link['href']
        if any(href.endswith(f'-{res}.jpg') for res in RESOLUTIONS):
            save_image(href)
        else:
            print(f'Unknown resolution {href}')


def make_dir():
    os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def get_backgrounds():
    make_dir()
    for page in range(1, MAX_PAGES+1):
        print(f'Fetching page {page}...')
        get_images_from_page(f'{BASE_URL}{page}')


def main():
    get_backgrounds()


if __name__ == '__main__':
    main()

评论：

您初始化了SAVE_DIRECTORY，但随后将其用于创建而不是文件写入。
RESOLUTIONS应该是一个集合，或者是元组。
clip_url有点误导；它返回一个URL部件，而不是一个完整的URL。
最好在下载索引页之前有提示符，否则它会挂起，用户不知道发生了什么
流你的内容，这样你就不会因为大文件而耗尽内存
您的RESOLUTIONS检查有点令人费解。也许这是个验证步骤？但是，如果这是一个验证步骤，您将无声地失败，而不是打印警告。而且，即使在找到正确的分辨率之后，也要继续迭代。我重写了这个文件，只检查了当前的分辨率，并且对文件名中显示的位置也要小心一点。
range(0, ...)是多余的，但是对于您的用例，您最好还是使用range(1。
rpartition与您编写的基本相同，但不需要任何花哨的数组切片。
不要给clip_path打两次电话

编辑

下面的版本合理地使用了生成器，这样迭代函数只需要知道它们的迭代，而不需要知道内部业务逻辑。

此外，对于站点上的许多文件，您的分辨率检查需要不区分大小写；并且站点有gif和png图像以及jpg。你错过了一些决议和一些替代的iPhone拼写。我不认为这是值得做一个决议检查，特别是在这些边缘的情况下，但我留下了它。

#!/usr/bin/python

import os
import requests


from bs4 import BeautifulSoup as bs
from pathlib import Path
from shutil import copyfileobj


MAX_PAGES = 1  # Max. number of pages is 41
SAVE_DIRECTORY = Path('fox_backgrounds')
BASE_URL = 'http://www.thefoxisblack.com/category/the-desktop-wallpaper-project/page'
RESOLUTIONS = {
    '1280x800', '1440x900', '1680x1050', '1920x1200', '2560x1440', '3840x2400',
    'iphone', 'iphone5', 'iphone-5', 'iphone6', 'iphone-6-plus', 'iphone6plus', 'iphone6-plus',
    'ipad'
}


def clip_part(href):
    return href.rpartition('/')[-1]


def save_image(href):
    part = clip_part(href)
    print(f'   Downloading: {part}')
    fn = SAVE_DIRECTORY / part
    with requests.get(href, stream=True) as response, \
         open(fn, 'wb') as output:
        copyfileobj(response.raw, output)


def urls_from_page(url):
    soup = bs(requests.get(url).text, 'html.parser')
    for link in soup.find_all('a', class_='btn_download'):
        href = link['href']
        if any(href.lower().contains(f'-{res}.') for res in RESOLUTIONS):
            yield href
        else:
            print(f'Unknown resolution {href}')


def make_dir():
    os.makedirs(SAVE_DIRECTORY, exist_ok=True)


def all_urls():
    for page in range(1, MAX_PAGES+1):
        print(f'Fetching page {page}...')
        yield from urls_from_page(f'{BASE_URL}{page}')


def main():
    make_dir()
    for url in all_urls():
        save_image(url)


if __name__ == '__main__':
    main()

票数 2

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/223858

复制

相似问题

问狐狸是黑色的--一个简单的图像刮刀。
EN

回答 1

Code Review用户

编辑

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问狐狸是黑色的--一个简单的图像刮刀。EN

回答 1

Code Review用户

编辑

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问狐狸是黑色的--一个简单的图像刮刀。
EN