我在爬取维基百科并把结果转储为 json 文件时遇到了 UnicodeEncodeError。下面是我的代码片段和错误消息。问题似乎是由标题中的非 ASCII 字符(例如 "Claude Lévi-Strauss" 中的 é)引起的。不过,我不知道该如何解决这个问题。
import urllib2
import json
# List of philosopher's name: mergel list
# print mergel
i = 0
for name in mergel:
# Use the API to get the page content in a format that we like.
# https://en.wikipedia.org/w/api.php?action=query&titles=Spider-Man&prop=revisions&rvprop=content&format=json
# set the parameters (https://www.mediawiki.org/wiki/API:Tutorial)
i = i+1
baseurl = "https://en.wikipedia.org/w/api.php?"
action = "action=query"
titlename = name.replace(" ", "_")
print titlename
title = "titles="+titlename
content = "prop=revisions&rvprop=content"
dataformat = "format=json"
# construct the query
query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
print query
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
# print wikisource
wikijson = json.loads(wikisource)
jsonfilename = './json/'+titlename+'.json'
with open(jsonfilename, 'w') as outfile:
json.dump(wikijson, outfile)

错误消息:
Tenzin_Gyatso
https://en.wikipedia.org/w/api.php?action=query&titles=Tenzin_Gyatso&prop=revisions&rvprop=content&format=json
Claude_Lévi-Strauss
https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-203-8430fc805550> in <module>()
21 query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)
22 print query
---> 23 wikiresponse = urllib2.urlopen(query)
24 wikisource = wikiresponse.read()
25 # print wikisource
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in urlopen(url, data, timeout, cafile, capath, cadefault, context)
152 else:
153 opener = _opener
--> 154 return opener.open(url, data, timeout)
155
156 def install_opener(opener):
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in open(self, fullurl, data, timeout)
429 req = meth(req)
430
--> 431 response = self._open(req, data)
432
433 # post-process response
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _open(self, req, data)
447 protocol = req.get_type()
448 result = self._call_chain(self.handle_open, protocol, protocol +
--> 449 '_open', req)
450 if result:
451 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in _call_chain(self, chain, kind, meth_name, *args)
407 func = getattr(handler, meth_name)
408
--> 409 result = func(*args)
410 if result is not None:
411 return result
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in https_open(self, req)
1238 def https_open(self, req):
1239 return self.do_open(httplib.HTTPSConnection, req,
-> 1240 context=self._context)
1241
1242 https_request = AbstractHTTPHandler.do_request_
/Users/sundong/anaconda/lib/python2.7/urllib2.pyc in do_open(self, http_class, req, **http_conn_args)
1192
1193 try:
-> 1194 h.request(req.get_method(), req.get_selector(), req.data, headers)
1195 except socket.error, err: # XXX what error?
1196 h.close()
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in request(self, method, url, body, headers)
1051 def request(self, method, url, body=None, headers={}):
1052 """Send a complete request to the server."""
-> 1053 self._send_request(method, url, body, headers)
1054
1055 def _set_content_length(self, body, method):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_request(self, method, url, body, headers)
1091 for hdr, value in headers.iteritems():
1092 self.putheader(hdr, value)
-> 1093 self.endheaders(body)
1094
1095 def getresponse(self, buffering=False):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in endheaders(self, message_body)
1047 else:
1048 raise CannotSendHeader()
-> 1049 self._send_output(message_body)
1050
1051 def request(self, method, url, body=None, headers={}):
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in _send_output(self, message_body)
891 msg += message_body
892 message_body = None
--> 893 self.send(msg)
894 if message_body is not None:
895 #message_body was not a string (i.e. it is a file) and
/Users/sundong/anaconda/lib/python2.7/httplib.pyc in send(self, data)
867 datablock = data.read(blocksize)
868 else:
--> 869 self.sock.sendall(data)
870
871 def _output(self, s):
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in sendall(self, data, flags)
719 count = 0
720 while (count < amount):
--> 721 v = self.send(data[count:])
722 count += v
723 return amount
/Users/sundong/anaconda/lib/python2.7/ssl.pyc in send(self, data, flags)
685 self.__class__)
686 try:
--> 687 v = self._sslobj.write(data)
688 except SSLError as x:
689 if x.args[0] == SSL_ERROR_WANT_READ:
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 43: ordinal not in range(128)

然而,下面这段简单直接的代码——不从列表中取标题,而是直接硬编码同一个标题——却可以正常工作,没有任何问题。
import urllib2
import json
query = 'https://en.wikipedia.org/w/api.php?action=query&titles=Claude_Lévi-Strauss&prop=revisions&rvprop=content&format=json'
wikiresponse = urllib2.urlopen(query)
wikisource = wikiresponse.read()
wikijson = json.loads(wikisource)
jsonfilename = './json/'+'Claude_Lévi-Strauss'+'.json'
with open(jsonfilename, 'w') as outfile:
json.dump(wikijson, outfile)

发布于 2015-09-28 00:46:58
不要混合Unicode和字节字符串:使用Unicode字符串处理Python中的文本。
不要手工创建urls,使用urllib函数,如quote()、urlencode()。另外,考虑来自urlparse模块的函数,如urljoin()、urlunsplit()。
您已经请求了json格式,不需要解析它,只需要使用相同的格式立即转储它;您可以使用shutil.copyfileobj()复制类似文件的对象。您可以稍后检查结果文件,以确保它已正确下载。
综上所述,下面是如何将具有给定标题的wiki页面保存到JSON格式的文件中:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from contextlib import closing
from urllib import quote
from urllib2 import urlopen
from shutil import copyfileobj
def urlretrieve(url, filename, chunksize=8096):
    """Stream the resource at *url* into *filename* in binary chunks.

    The response is copied chunksize bytes at a time so arbitrarily
    large pages never have to fit in memory; the connection is closed
    even if the copy fails.
    """
    response = urlopen(url)
    try:
        with open(filename, 'wb') as out:
            copyfileobj(response, out, chunksize)
    finally:
        response.close()
#XXX for name in mergel:
name = u"Claude Lévi-Strauss" #NOTE: Unicode string
# Percent-encode the UTF-8 bytes of the title so the final URL is pure
# ASCII ('é' -> '%C3%A9'), then save the raw API response under
# json/<name>.json.
urlretrieve("https://en.wikipedia.org/w/api.php?"
"action=query&prop=revisions&rvprop=content&format=json&"
"titles=" + quote(name.encode('utf-8')),
os.path.join('json', name + '.json'))

注意:
- 不再需要 .replace(' ', '_'):quote() 会正确处理标题。
- os.path.join('json', name + '.json') 这一行混合了字节串('json'、'.json')和 Unicode 字符串(type(name) == unicode)。在这里没有问题,因为 'json' 和 '.json' 在源代码中都是仅含 ASCII 的字面量。
- # -*- coding: utf-8 -*- 编码声明只影响字面出现在 Python 源代码中的字符;在这个特殊情况下,查询字符串恰好也使用相同的编码只是巧合。源代码的编码与文件名可能使用的字符编码、通过 http 传输数据使用的编码、把 Unicode 文本写到终端使用的编码等都无关(所有这些编码可能彼此不同)。
- 本可以用 urllib.urlretrieve(url, filename) 代替 urlopen + copyfileobj,但 urllib.urlretrieve() 在 Python 2 上的行为与 urllib2.urlopen() 不同。

下面是使用 requests 的相同代码:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
import os
from urllib import quote
import requests # $ pip install requests
def urlretrieve(url, filename, chunksize=8096):
    """Download *url* to *filename*, streaming the body chunk by chunk."""
    resp = requests.get(url, stream=True)
    resp.raise_for_status()  # turn any http error status into an exception
    with open(filename, 'wb') as out:
        for block in resp.iter_content(chunksize):
            out.write(block)
#XXX for name in mergel:
name = u"Claude Lévi-Strauss" #NOTE: Unicode string
# Same as before: percent-encode the UTF-8 bytes of the title so the URL
# is pure ASCII, then stream the response to json/<name>.json.
urlretrieve("https://en.wikipedia.org/w/api.php?"
"action=query&prop=revisions&rvprop=content&format=json&"
"titles=" + quote(name.encode('utf-8')),
os.path.join('json', name + '.json'))

> 然而,下面这段简单直接的代码——不从列表中取标题——却可以正常工作,没有任何问题。
您的代码使用了非 ASCII 的字节串字面量(这在 Python 3 中是非法的)。之所以没有编码错误,是因为所有数据都已经是字节了。使用字节串的问题在于:如果不同的环境使用不同的字符编码,字节串就会出错(您不能指望所有环境都使用 utf-8,尽管这可能是可取的)。另外,é 这样的字符在 URL 中应当以 '%C3%A9' 的形式发送。
无关:要同时下载多个网页,可以使用线程池:
from multiprocessing.dummy import Pool # use threads
def download(name):
    """Fetch the wiki page for *name* and save it under json/<name>.json."""
    encoded_title = quote(name.encode('utf-8'))
    url = ("https://en.wikipedia.org/w/api.php?"
           "action=query&prop=revisions&rvprop=content&format=json&"
           "titles=" + encoded_title)
    urlretrieve(url, os.path.join('json', name + '.json'))
# Thread pool (multiprocessing.dummy uses threads, not processes): good
# for I/O-bound work like these downloads. imap_unordered yields as soon
# as each download finishes, in no particular order.
pool = Pool(4) # download 4 titles concurrently
for _ in pool.imap_unordered(download, mergel, chunksize=100):
pass

提供 User-Agent http header 是一种礼貌。另外,维基百科 API 有几个现成的包装器可以帮你处理这些细节。
https://stackoverflow.com/questions/32809287
复制相似问题