我正在尝试查找电子邮件地址。我有一个搜索查询列表,想把其中的查询一个接一个地传入,但是当我尝试使用列表时,程序提示缩进错误(IndentationError)。有人能帮我解决这个问题吗?
我的代码如下:
import scrapy
from scrapy.spiders import CrawlSpider, Request
from googlesearch import search
import re
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import openpyxl
# NOTE(review): the workbook path is an empty string here -- presumably a
# placeholder; supply the real .xlsx path before running.
wb = openpyxl.load_workbook("")
# `Workbook.active` is a property that returns the active worksheet; calling
# it (`wb.active()`) raises TypeError because a worksheet is not callable.
sh = wb.active
class email_extractor(CrawlSpider):
    """Search Google for one or more queries and scrape e-mail addresses.

    Every search hit is requested through Selenium; each distinct
    e-mail-looking string found in the response text is yielded as
    ``{"emails": <address>}``.

    NOTE(review): Scrapy's documentation advises against using ``parse`` as a
    rule callback on a ``CrawlSpider``. No ``rules`` are defined here and the
    callback is passed explicitly, so the name is kept for compatibility.
    """

    name = 'email_ex'

    # Compiled once for the whole spider instead of once per response.
    # Every dot in the domain must be followed by at least one label
    # character, so a sentence-ending period ("mail me at a@b.com.") is not
    # captured as part of the address.
    EMAIL_REGEX = re.compile(
        r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+')

    def __init__(self, query=None, *args, queries=None, **kwargs):
        """Store the search queries this spider will run.

        Args:
            query: A single search string (used verbatim, as before), or a
                list/tuple of search strings.
            queries: Optional extra searches, either an iterable of strings
                or one comma-separated string, which is how a
                ``-a queries="info,contact,sales"`` command-line argument
                arrives.

        Raises:
            ValueError: If no non-empty query was supplied.
        """
        super(email_extractor, self).__init__(*args, **kwargs)
        # Kept so existing code that reads these attributes still works;
        # parse() no longer accumulates matches in `email_list`.
        self.email_list = []
        self.query = query

        terms = []
        if isinstance(query, str):
            # A plain string stays one query, even if it contains commas.
            terms.append(query)
        elif query is not None:
            terms.extend(query)
        if isinstance(queries, str):
            queries = queries.split(",")
        if queries:
            terms.extend(term.strip() for term in queries)

        self.queries = [term for term in terms if term]
        if not self.queries:
            raise ValueError(
                "email_extractor needs at least one search query")

    # sending requests
    def start_requests(self):
        """Yield one SeleniumRequest per search result of every query."""
        for query in self.queries:
            # NOTE(review): stop=None is passed through unchanged; per the
            # googlesearch docs this presumably means "no upper bound on
            # results" -- confirm, and consider a finite `stop`.
            for result_url in search(query, num=10, stop=None, pause=2):
                yield SeleniumRequest(
                    url=result_url,
                    callback=self.parse,
                    # Wait until the <html> element is present.
                    wait_until=EC.presence_of_element_located(
                        (By.TAG_NAME, "html")),
                    dont_filter=True,
                )

    # extracting emails
    def parse(self, response):
        """Yield each distinct e-mail address found in ``response.text``."""
        # A local set de-duplicates per response and avoids sharing mutable
        # state (the old `self.email_list`) between responses.
        found = {
            match.group()
            for match in self.EMAIL_REGEX.finditer(response.text)
        }
        for email in found:
            yield {
                "emails": email
            }

# Question text that followed the code in the original post (translated):
# the author wants to pass a list of search queries such as 'info',
# 'contact', 'sales', 'marketing', 'marketplace' covering multiple domains,
# and asks how this can be done.
发布于 2022-10-24 00:31:53
仍然不能100%确定我是否理解这个问题,但是下面是一个例子,说明如何处理一个查询列表,而不仅仅是一个查询。
def __init__(self, query, *args, **kwargs):
    """Store the list of search queries the spider will iterate over.

    Args:
        query: Either an iterable of search strings, or a single string in
            which several searches are separated by commas (the form a
            ``-a query="info,contact,sales"`` command-line argument takes).
    """
    super(email_extractor, self).__init__(*args, **kwargs)
    self.email_list = []
    # The original example referenced undefined names (query1..query4) and
    # ignored `query`; build the list from the argument instead.
    if isinstance(query, str):
        query = query.split(",")
    # list of queries, with surrounding whitespace and empty entries dropped
    self.queries = [term.strip() for term in query if term.strip()]
# sending requests
def start_requests(self):
    """Yield a SeleniumRequest for every search hit of every stored query."""
    for term in self.queries:  # iterate through queries
        hits = search(term, num=10, stop=None, pause=2)
        for link in hits:
            # Condition handed to SeleniumRequest: <html> element is present.
            html_present = EC.presence_of_element_located(
                (By.TAG_NAME, "html"))
            yield SeleniumRequest(
                url=link,
                callback=self.parse,
                wait_until=html_present,
                dont_filter=True,
            )
# Link that followed the snippet on the original page:
# https://stackoverflow.com/questions/74150773
复制相似问题