我有一个抓取网页的脚本,但它抛出了如下错误:
return self.attrs[key]  KeyError: 'data-index'
也许这是因为 'data-index' 属性不存在。但我想收集所有可用的数据,并且
期望输出:
pandas DataFrame(以下为示例数据),包含如下各列:
Title | Price | Sponsored | url | asin | index_asin
import requests
from bs4 import BeautifulSoup
import pandas as pd

urls = ['https://www.amazon.com/s?k=shaver+for+men&i=beauty&ref=nb_sb_noss_2',
        "https://www.amazon.com/s?k=electric+shaver&ref=nb_sb_noss_2"]
headers = {'User-Agent': 'Mozilla/5.0'}

df = []  # one dict per scraped product
for search_url in urls:
    # One request per URL, always sending the User-Agent header
    # (the original fetched each page twice, the first time without headers),
    # and a loop variable that does not shadow the outer `url`.
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    for div in soup.select('div[data-asin]'):
        title = div.select_one('span.a-text-normal').text
        price_tag = div.select_one('.a-offscreen')
        price = price_tag.text if price_tag else '-'
        sponsored = 'Yes' if div.select_one('span:contains("Sponsored")') else 'No'
        asin = div['data-asin']
        # Tag.get() returns a fallback instead of raising KeyError when the
        # 'data-index' attribute is missing (the error in the question).
        index_asin = div.get('data-index', 'NAN')
        print('title', title)
        print('price', price)
        print('sponsored', sponsored)
        print('url', response.url)
        print('asin', asin)
        print('index_asin', index_asin)
        # I want to store everything in a data frame
        df.append({'Title': title, 'Price': price, 'Sponsored': sponsored,
                   'url': response.url, 'asin': asin, 'index_asin': index_asin})
# df.append(title, price, sponsored, url, asin, index_asin)
发布于 2019-07-15 13:52:16
如果该索引不存在,可以使用 try..except 块,属性缺失时会进入 except 分支:
import requests
from bs4 import BeautifulSoup
import pandas as pd

urls = ['https://www.amazon.com/s?k=shaver+for+men&i=beauty&ref=nb_sb_noss_2',
        "https://www.amazon.com/s?k=electric+shaver&ref=nb_sb_noss_2"]
headers = {'User-Agent': 'Mozilla/5.0'}

# Accumulate one dict per product.  The original appended a SET literal
# `{title, price, ...}`, which drops field names, ordering and any
# duplicate values — a dict keeps every field addressable by name.
df = []
for search_url in urls:
    # Single request per URL, always with the User-Agent header
    # (the original fetched each page twice, the first time without headers).
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    for div in soup.select('div[data-asin]'):
        title = div.select_one('span.a-text-normal').text
        price_tag = div.select_one('.a-offscreen')
        price = price_tag.text if price_tag else '-'
        sponsored = 'Yes' if div.select_one('span:contains("Sponsored")') else 'No'
        asin = div['data-asin']
        # Tag.get() supplies the fallback when 'data-index' is absent —
        # narrower and clearer than the original bare `except:`.
        index_asin = div.get('data-index', 'NAN')
        print('title', title)
        print('price', price)
        print('sponsored', sponsored)
        print('url', response.url)
        print('asin', asin)
        print('index_asin', index_asin)
        # Store everything in a list of records (one dict per row).
        df.append({'Title': title, 'Price': price, 'Sponsored': sponsored,
                   'url': response.url, 'asin': asin, 'index_asin': index_asin})
print(df)
编辑:改为把每一行结果追加到 pandas DataFrame(df)中:
import requests
from bs4 import BeautifulSoup
import pandas as pd

urls = ['https://www.amazon.com/s?k=shaver+for+men&i=beauty&ref=nb_sb_noss_2',
        "https://www.amazon.com/s?k=electric+shaver&ref=nb_sb_noss_2"]
headers = {'User-Agent': 'Mozilla/5.0'}

# Collect plain dicts and build the DataFrame once at the end:
# DataFrame.append copied the whole frame on every row (quadratic) and
# was removed entirely in pandas 2.0.
rows = []
for search_url in urls:
    # Single request per URL, always with the User-Agent header
    # (the original fetched each page twice, the first time without headers).
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    for div in soup.select('div[data-asin]'):
        title = div.select_one('span.a-text-normal').text
        price_tag = div.select_one('.a-offscreen')
        price = price_tag.text if price_tag else '-'
        sponsored = 'Yes' if div.select_one('span:contains("Sponsored")') else 'No'
        asin = div['data-asin']
        # Tag.get() avoids the KeyError when 'data-index' is missing —
        # no bare `except:` needed.
        index_asin = div.get('data-index', 'NAN')
        print('title', title)
        print('price', price)
        print('sponsored', sponsored)
        print('url', response.url)
        print('asin', asin)
        print('index_asin', index_asin)
        rows.append({'Title': title, 'Price': price, 'Sponsored': sponsored,
                     'url': response.url, 'asin': asin, 'index_asin': index_asin})

df = pd.DataFrame(rows, columns=['Title', 'Price', 'Sponsored', 'url', 'asin', 'index_asin'])
print(df)
来源:https://stackoverflow.com/questions/57040852
复制相似问题