我正在使用我在github上找到的一个程序,从我用csv下载的excel电子表格中的网站列表中抓取电子邮件。运行它时会出现以下错误:
Cannot retrive URL:
Traceback (most recent call last):
File "/usr/lib/python3.5/urllib/request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "/usr/lib/python3.5/http/client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python3.5/http/client.py", line 1152, in _send_request
self.endheaders(body)
File "/usr/lib/python3.5/http/client.py", line 1103, in endheaders
self._send_output(message_body)
File "/usr/lib/python3.5/http/client.py", line 934, in _send_output
self.send(msg)
File "/usr/lib/python3.5/http/client.py", line 877, in send
self.connect()
File "/usr/lib/python3.5/http/client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "/usr/lib/python3.5/socket.py", line 694, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
File "/usr/lib/python3.5/socket.py", line 733, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -5] No address associated with hostname
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 39, in parseAddress
website = urllib2.urlopen(getAddress(url))
File "/usr/lib/python3.5/urllib/request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python3.5/urllib/request.py", line 466, in open
response = self._open(req, data)
File "/usr/lib/python3.5/urllib/request.py", line 484, in _open
'_open', req)
File "/usr/lib/python3.5/urllib/request.py", line 444, in _call_chain
result = func(*args)
File "/usr/lib/python3.5/urllib/request.py", line 1282, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "/usr/lib/python3.5/urllib/request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno -5] No address associated with hostname>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "pagescanner.py", line 85, in <module>
main()
File "pagescanner.py", line 71, in main
execute()
File "pagescanner.py", line 60, in execute
parseAddress(s)
File "pagescanner.py", line 51, in parseAddress
print ("Cannot retrive URL: ") + err.reason[1]
TypeError: 'gaierror' object does not support indexing
下面是我使用的代码:
import sys
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import re
import csv
# Module-level accumulators shared by the functions below:
#   list1 - one entry per successfully fetched page; each entry is the list
#           of e-mail addresses found on that page
#   list2 - the cells read from file.csv (the sites to visit)
#   list3 - the sites that could not be retrieved
list1, list2, list3 = [], [], []
def addList():
    """Load the sites to scan from ``file.csv`` into the global ``list2``.

    Every non-blank cell of every row is appended, in reading order, with
    surrounding whitespace removed. Blank cells (common in spreadsheets
    exported to CSV) are skipped because they cannot name a site and would
    only produce a failed request later.
    """
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation requires for file objects on Python 3.
    with open('file.csv', 'rt', newline='') as f:
        for row in csv.reader(f):
            for s in row:
                s = s.strip()
                if s:
                    list2.append(s)
def getAddress(url):
    """Return *url* with an explicit scheme, defaulting to ``http://``.

    Surrounding whitespace is stripped. A URL that already starts with
    ``http://`` or ``https://`` (in any letter case) is returned as is;
    anything else gets ``http://`` prepended.

    The scheme is checked as a prefix, not as a substring, so a bare host
    whose path or query merely contains ``http://`` (for example
    ``example.com/?next=http://other.org``) still receives a scheme.
    """
    url = url.strip()
    if url.lower().startswith(("http://", "https://")):
        return url
    return "http://" + url
# E-mail address pattern, compiled once instead of on every page.
# Raw string so that the "\." escapes reach the regex engine untouched.
_EMAIL_RE = re.compile(
    r'''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''',
    flags=re.IGNORECASE)


def parseAddress(url):
    """Download *url* and record the e-mail addresses found in the page.

    On success the list of matches (possibly empty) is appended to the
    global ``list1``. If the page cannot be retrieved, a message is printed
    and *url* is appended to the global ``list3`` instead.

    Only ``HTTPError`` and ``URLError`` are handled; any other exception
    propagates to the caller.
    """
    try:
        website = urllib2.urlopen(getAddress(url))
        try:
            raw = website.read()
            # Python 3 header objects can report the declared charset;
            # older header objects lack the method, hence the getattr.
            get_charset = getattr(website.headers, "get_content_charset", None)
            charset = get_charset() if get_charset is not None else None
        finally:
            # Always release the connection, even if read() fails.
            website.close()
    except urllib2.HTTPError as err:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print("Cannot retrieve URL: HTTP Error Code: " + str(err.code))
        list3.append(url)
        return
    except urllib2.URLError as err:
        # err.reason may be a string or an exception such as socket.gaierror,
        # which cannot be indexed; str() handles both.
        print("Cannot retrive URL: " + str(err.reason))
        list3.append(url)
        return

    # read() returns bytes on Python 3, but the pattern is a text pattern,
    # so decode first. Undecodable bytes are replaced rather than fatal.
    try:
        html = raw.decode(charset or "utf-8", "replace")
    except LookupError:
        # The server declared a charset Python does not know.
        html = raw.decode("utf-8", "replace")

    list1.append(_EMAIL_RE.findall(html))
def execute():
    """Load the site list and scan every site, printing progress.

    Calls ``addList()`` to fill the global ``list2``, then calls
    ``parseAddress()`` on each entry in order. After each site a
    "Processing i out of N" line is printed, followed by a final summary
    with the total number of e-mail addresses collected in ``list1``.
    """
    addList()
    totalNum = len(list2)
    for atNum, s in enumerate(list2, 1):
        parseAddress(s)
        # Build the whole message before printing: on Python 3 print()
        # returns None, so "print(...) + text" raises TypeError.
        print("Processing " + str(atNum) + " out of " + str(totalNum))
    # list1 holds one list of addresses per page, so sum the inner lengths
    # to report e-mails rather than pages.
    emailCount = sum(len(found) for found in list1)
    print("Completed. Emails parsed: " + str(emailCount) + ".")
### MAIN
def main():
    """Run the scan and write the results to two CSV files.

    ``finishedFile.csv`` receives one row per successfully fetched page,
    containing the e-mail addresses found on it (an empty row if none).
    ``failedSites.csv`` receives one row per site that could not be
    retrieved. All fields are quoted.
    """
    execute()

    # "with" guarantees the files are flushed and closed; newline='' is what
    # the csv module expects for files it writes to on Python 3.
    with open("finishedFile.csv", "w", newline="") as myFile:
        wr = csv.writer(myFile, quoting=csv.QUOTE_ALL)
        wr.writerows(list1)

    with open("failedSites.csv", "w", newline="") as failFile:
        write = csv.writer(failFile, quoting=csv.QUOTE_ALL)
        for j in list3:
            # Wrap the URL in a list: writerow() on a bare string would
            # write every character as a separate column.
            write.writerow([j])
main()
我假设这与我试图将代码从python2转换为python3有关,但我被难住了。
发布于 2020-01-11 11:16:11
如果是URL错误,则可能是您正在尝试解析Excel电子表格中的无效地址,因此显示这些URL会很有帮助。如果是套接字错误,您可以尝试通过使用socket类解析URL并获取主机名来解决此问题:
import socket
ipAdd = socket.gethostbyname(url) # Resolve the host name to its IP address string (example input: 'www.google.com')
website = urllib2.urlopen(getAddress(ipAdd)) # getAddress() prepends "http://" when no scheme is present; then open the address
html = website.read() # Gets HTML code
如果这不起作用,您可能需要安装Flask,这是一个Web应用程序框架,旨在充当套接字(用于服务器和应用程序之间的通信)。Python3.5及更高版本、Python2.7和PyPi都支持它:
$ pip install Flask
https://stackoverflow.com/questions/59691191
复制相似问题