首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >抓取以下载特定类型文件

抓取以下载特定类型文件
EN

Stack Overflow用户
提问于 2018-11-16 11:32:12
回答 2查看 908关注 0票数 0

我对 scrapy 和 Python 都很陌生。我目前可以下载所有文件,但我只想下载特定类型("EX-10")的文件,也就是只下载形如 EX-10.1、EX-10.2 直至 EX-10.99 的文件。

我的代码

代码语言:javascript
复制
import scrapy, os

class legco(scrapy.Spider):
    """Crawl SEC EDGAR company listings and download filing documents.

    Starts from a company search-results page, follows each company link,
    then each filing's "Documents" button, and saves every .htm/.txt
    document found into a fixed local directory.
    """
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        # Each row of the results table links to one company's filing index.
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url=absoluteLink, callback=self.parse_links)

    def parse_links(self, response):
        # The "Documents" buttons lead to each filing's document list.
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url=targetLink, callback=self.collecting_file_links)

    def collecting_file_links(self, response):
        # Collect only .htm/.txt links from the filing's document table.
        # NOTE(review): this does NOT yet restrict to type "EX-10" — that is
        # exactly what the question asks for; see the accepted answer.
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            # endswith() accepts a tuple — one call instead of an `or` chain.
            if links.endswith((".htm", ".txt")):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url=baseLink, callback=self.download_files)

    def download_files(self, response):
        # Save the body under the URL's final path segment.
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        os.makedirs(dirf, exist_ok=True)  # race-free vs. exists()+makedirs()
        # Write with an absolute path instead of os.chdir(): mutating the
        # process-wide cwd is unsafe with Scrapy's concurrent downloads.
        with open(os.path.join(dirf, path), 'wb') as f:
            f.write(response.body)

另外,我还想让 Scrapy 继续抓取下一页(直到最后一页),但下面的代码运行得不太好。

代码语言:javascript
复制
# NOTE(review): Rule/LinkExtractor only takes effect on a CrawlSpider
# subclass (the spider above extends scrapy.Spider), and on EDGAR the
# "Next 40" control is an <input> button, not an <a> element, so this
# extractor matches nothing — presumably why pagination "does not work well".
Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@value="Next 40"]',)), callback="parse", follow= True),)

# follow next page links
# NOTE(review): this fragment is indented as if pasted out of a method body;
# it also looks for an <a> with @value, which the page does not have.
    next_page = response.xpath('.//a[@value="Next 40"]/@href').extract()
    if next_page:
        next_href = next_page[0]
        next_page_url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany' + next_href
        request = scrapy.Request(url=next_page_url)
        yield request
EN

回答 2

Stack Overflow用户

回答已采纳

发布于 2018-11-17 11:29:26

你的问题似乎已经解决了。下面的脚本应该根据每个分页链接从该站点获取所需的文件,并按照您想要的方式下载这些文件。

代码语言:javascript
复制
import scrapy, os

class legco(scrapy.Spider):
    """Crawl SEC EDGAR with pagination and download EX-* exhibit files.

    parse() walks the company results table and follows the "Next 40"
    button; parse_links() follows each filing's "Documents" button;
    collecting_file_links() keeps only rows whose type cell starts with
    "EX-" and whose link is .htm/.txt; download_files() saves each file.
    """
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

        # "Next 40" is an <input> whose onclick holds the target URL,
        # e.g. onclick="parent.location='...'" — strip the JS wrapper.
        nextpage = response.css("input[value='Next 40']::attr(onclick)")
        if nextpage:
            tpage = nextpage.extract_first().split("parent.location=")[1].replace("'","")
            nlink = response.urljoin(tpage)
            yield scrapy.Request(url=nlink, callback = self.parse)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        # Keep rows whose first matching cell starts with "EX-" (this covers
        # EX-10.1 ... EX-10.99, but also any other EX-* type) and whose link
        # is an .htm or .txt document.
        for links in response.xpath('//table[contains(@summary,"Document")]//tr[td[starts-with(., "EX-")]]/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract():
            baseLink = response.urljoin(links)
            yield scrapy.Request(url = baseLink, callback = self.download_files)

    def download_files(self, response):
        # Save under the URL's last path segment inside a fixed directory.
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        os.makedirs(dirf, exist_ok=True)  # race-free vs. exists()+makedirs()
        # Use an absolute target path; os.chdir() mutates process-global
        # state and breaks with concurrent downloads.
        with open(os.path.join(dirf, path), 'wb') as f:
            f.write(response.body)
票数 0
EN

Stack Overflow用户

发布于 2018-11-16 17:50:22

您需要使用 FilesPipeline,但 scrapy 默认生成的文件名是根据 URL 的散列值计算出来的。

如果您想要自定义文件名,就必须像下面这样编写自己的 FilesPipeline 子类:

代码语言:javascript
复制
import scrapy, os
from scrapy.pipelines.files import FilesPipeline

class legco(scrapy.Spider):
    """Crawl SEC EDGAR and emit 'file_urls' items for a custom FilesPipeline.

    Instead of writing files itself, the spider yields items whose
    'file_urls' field is consumed by the FilesPipeline configured in
    custom_settings, which stores each file under its URL basename.
    """
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    custom_settings = {
        'ITEM_PIPELINES': {'myspider.MyFilesPipeline': 1},
        'FILES_STORE': '/my/valid/path/',
    }

    def parse(self, response):
        # Follow each company row in the search-results table.
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

    def parse_links(self, response):
        # Follow each filing's "Documents" button.
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        # Hand .htm/.txt document URLs to the files pipeline via 'file_urls'.
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            # endswith() accepts a tuple — one call instead of an `or` chain.
            if links.endswith((".htm", ".txt")):
                yield {
                    'file_urls': [response.urljoin(links)]
                }

class MyFilesPipeline(FilesPipeline):
    """FilesPipeline that stores each file under its URL basename instead
    of the default hash-based filename."""

    def file_path(self, request, response=None, info=None):
        # Everything after the final '/' of the request URL.
        return request.url.rsplit('/', 1)[-1]
票数 0
EN
页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://stackoverflow.com/questions/53337018

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档