我正在尝试从Fandango网站上抓取影评。即使我请求的是某部电影影评第二页的URL,得到的仍然是第一页的内容。我是否需要随请求一起发送cookie?
下面是我的代码片段:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
# Scrape fan reviews from Fandango: collect the movie links from the landing
# page, then fetch the first few review pages of each movie and print every
# non-empty review paragraph with a running counter.
baseUrl = 'https://www.fandango.com/movie-reviews'
requestHeaders = {'User-Agent': 'Mozilla/5.0'}
# Seconds to wait on the network; without an explicit timeout a stalled
# connection can block the script.
requestTimeout = 30

req = Request(baseUrl, headers=requestHeaders)
# The context manager closes the HTTP response once the body has been read.
with urlopen(req, timeout=requestTimeout) as response:
    webpage = response.read()
soup = BeautifulSoup(webpage, 'html.parser')

# Getting all the movie links from the first page
movieLinks = soup.find_all("a", class_='dark')

# Get reviews for the first two movies only (use movieLinks for all of them).
# Slicing cannot raise IndexError when fewer than two links were found.
for movieLink in movieLinks[:2]:
    # Resolved before the try block so the error handler can always name
    # the movie it failed on.
    movieName = movieLink.text.replace(' Review', '')
    count = 1
    print('\n\n****** ' + movieName + ' ********\n\n')
    try:
        # Getting movie reviews from the first 3 pages (pn=1..3)
        for pageNum in range(1, 4):
            movieReviewUrl = movieLink['href'] + '?pn=' + str(pageNum)
            print('Hitting URL: ' + movieReviewUrl)
            revReq = Request(movieReviewUrl, headers=requestHeaders)
            with urlopen(revReq, timeout=requestTimeout) as revResponse:
                revWebpage = revResponse.read()
            revSoup = BeautifulSoup(revWebpage, 'html.parser')
            revArr = revSoup.find_all("p", class_="fan-reviews__item-content")
            for review in revArr:
                # Skip review paragraphs whose tag is empty (len() == 0).
                if len(review) > 0:
                    print(str(count) + ' : ' + review.text)
                    count = count + 1
    except Exception as err:
        # Best effort per movie: report the failure and its cause, then move
        # on to the next movie. Unlike a bare "except:", this does not
        # swallow KeyboardInterrupt / SystemExit.
        print('Error for movie: ' + movieName + ' (' + repr(err) + ')')
# Posted 2018-03-17 04:42:35
我建议使用Requests库,用它来处理这类HTTP请求要容易得多。
from bs4 import BeautifulSoup
import requests
# Same scraper as the urllib version, rewritten on top of the requests
# library: collect the movie links from the landing page, then fetch the first
# few review pages of each movie and print every non-empty review paragraph.
baseUrl = 'https://www.fandango.com/movie-reviews'
# Seconds to wait on the network; requests does not time out unless told to.
requestTimeout = 30

# NOTE(review): unlike the urllib version, no browser-like User-Agent header
# is sent here; pass headers={'User-Agent': 'Mozilla/5.0'} to requests.get()
# if the site rejects the library's default one.
baseResponse = requests.get(baseUrl, timeout=requestTimeout)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
baseResponse.raise_for_status()
webpage = baseResponse.text
soup = BeautifulSoup(webpage, 'html.parser')

# Getting all the movie links from the first page
movieLinks = soup.find_all("a", class_='dark')

# Get reviews for the first two movies only (use movieLinks for all of them).
# Slicing cannot raise IndexError when fewer than two links were found.
for movieLink in movieLinks[:2]:
    # Resolved before the try block so the error handler can always name
    # the movie it failed on.
    movieName = movieLink.text.replace(' Review', '')
    count = 1
    print('\n\n****** ' + movieName + ' ********\n\n')
    try:
        # Getting movie reviews from the first 3 pages (pn=1..3)
        for pageNum in range(1, 4):
            movieReviewUrl = movieLink['href'] + '?pn=' + str(pageNum)
            print('Hitting URL: ' + movieReviewUrl)
            revResponse = requests.get(movieReviewUrl, timeout=requestTimeout)
            revResponse.raise_for_status()
            revWebpage = revResponse.text
            revSoup = BeautifulSoup(revWebpage, 'html.parser')
            revArr = revSoup.find_all("p", class_="fan-reviews__item-content")
            # Diagnostic: the answer's author reports that this prints 0 when
            # the script is run, i.e. no <p> with the class
            # "fan-reviews__item-content" is found in the fetched HTML, so
            # that class name is the thing to check.
            print(len(revArr))
            for review in revArr:
                # Skip review paragraphs whose tag is empty (len() == 0).
                if len(review) > 0:
                    print(str(count) + ' : ' + review.text)
                    count = count + 1
    except Exception as err:
        # Best effort per movie: report the failure and its cause, then move
        # on to the next movie. Unlike a bare "except:", this does not
        # swallow KeyboardInterrupt / SystemExit.
        print('Error for movie: ' + movieName + ' (' + repr(err) + ')')
https://stackoverflow.com/questions/49310922
复制相似问题