我试图通过JSON接口调用从Shopee网站抓取产品信息。我知道我的问题与《How can I crawl the product items from shopee website?》这个问题及其答案类似。
然而,我的问题涉及一个两步流程:我需要先获得产品的URL,才能继续第二步;我也愿意接受解决这个问题的其他思路。
两个步骤的过程解释如下:
步骤1涉及在搜索栏中输入关键字(对于我要搜索的产品),然后提取所有顶级产品的产品信息。最初,我使用Selenium和BeautifulSoup,但意识到使用URL从服务器获取JSON数据是更有效的选择。
import requests
from urllib.parse import quote

# Base site URL (no trailing slash) and the product keyword to search for.
Shopee_url = 'https://shopee.sg'
keyword_search = 'Lipton'

# Percent-encode the keyword once so that spaces and non-ASCII characters are
# safe both in the query string and in the Referer header value.
encoded_keyword = quote(keyword_search)

headers = {
    'User-Agent': 'Chrome',
    # The separating "/" is required: Shopee_url has no trailing slash, so
    # without it the Referer would read "https://shopee.sgsearch?keyword=...".
    'Referer': '{}/search?keyword={}'.format(Shopee_url, encoded_keyword),
}

# Search endpoint for the keyword.
# Change "relevancy" to "latest" to sort by latest products instead.
url = (
    '{}/api/v2/search_items/?by=relevancy&keyword={}&limit=100&newest=0'
    '&order=desc&page_type=search'
).format(Shopee_url, encoded_keyword)
# Shopee API request
r = requests.get(url, headers = headers).json()
下面是其中1个产品返回的全部JSON数据示例:
{
"itemid": 7997227782,
"price_max_before_discount": 990000,
"item_status": "normal",
"can_use_wholesale": false,
"show_free_shipping": false,
"display_name": null,
"upcoming_flash_sale": null,
"add_on_deal_info": null,
"is_non_cc_installment_payment_eligible": false,
"ctime": 1628604243,
"name": "Lipton Infusions Fruit Tea, 20 Pyramid Teabags",
"show_shopee_verified_label": true,
"campaignid": 2768568,
"size_chart": null,
"service_by_shopee_flag": null,
"historical_sold": 11,
"campaign_stock": null,
"reference_item_id": "",
"recommendation_info": null,
"bundle_deal_info": null,
"has_lowest_price_guarantee": false,
"shipping_icon_type": null,
"overall_purchase_limit": null,
"images": [
"06770c2d7077dc5430a44f030f1efe48"
],
"price_before_discount": 990000,
"catid": 100629,
"is_official_shop": false,
"coin_earn_label": null,
"sold": 11,
"item_rating": {
"rating_star": 5.0,
"rating_count": [
3,
0,
0,
0,
0,
3
],
"rcount_with_image": 1,
"rcount_with_context": 1
},
"show_official_shop_label_in_title": false,
"discount": "41%",
"label_ids": [
1000055,
22,
1007623
],
"has_group_buy_stock": null,
"algo_image": null,
"tracking_info": {
"multi_search_tracking": null,
"viral_spu_tracking": null,
"ruleid": null,
"groupid": 0,
"business_tracking": null
},
"pack_size": null,
"badge_icon_type": 0,
"liked": false,
"is_on_flash_sale": false,
"cmt_count": 3,
"image": "06770c2d7077dc5430a44f030f1efe48",
"recommendation_algorithm": null,
"is_cc_installment_payment_eligible": false,
"shopid": 323430152,
"video_info_list": [],
"ads_keyword": "lipton tea",
"json_data": "aP///////////wFxO6BCgr9Dsj95+HOgMCiwqz+IAQI=",
"view_count": 356,
"voucher_info": null,
"liked_count": 8,
"show_official_shop_label": false,
"price_min_before_discount": 990000,
"show_discount": 41,
"preview_info": null,
"flag": 0,
"exclusive_price_info": null,
"distance": null,
"wholesale_tier_list": [],
"fe_flags": null,
"group_buy_info": null,
"shopee_verified": true,
"hidden_price_display": null,
"transparent_background_image": "",
"welcome_package_info": null,
"match_type": 1,
"is_adult": false,
"currency": "SGD",
"raw_discount": 41,
"is_preferred_plus_seller": false,
"is_category_failed": false,
"price_min": 585000,
"can_use_bundle_deal": false,
"cb_option": 0,
"brand": null,
"deduction_info": "AAAAAAAAAAAAAAAAAAAAAFz5yPUIFAImyA4NRO+XPGonKqZTmkZlKacrYO7uw8Qx5HzU6gGT9zYYfnvHkpQbfctiPF/QqIikCqPGJnPHD8FOZqHDiw9HVccOiJwd5i0A5rN2o44A33/RihzLZeoP8w==",
"stock": 125,
"status": 1,
"price_max": 585000,
"is_group_buy_item": null,
"flash_sale": null,
"is_mart": null,
"price": 585000,
"shop_location": "Singapore",
"tier_variations": [
{
"images": [
"b392790c3b5749fbe9c08698d3ddb7ae",
"c93868109beabc85fa4f47d21f7b0cf3",
"b371c4379fd4f2a518cf4b5467bbe65b",
"0dfc442c280866b596af8e25bb26f5f4",
"e39ded7159a8d2595e3de664d289002e",
"523d6df729a190003fabdc08f70e36ef",
"409df69f3010e1cc11420ea5b88b1d48"
],
"properties": [],
"type": 0,
"name": "Options",
"options": [
"Blue Fruit",
"Calming Chamomile",
"Citrus",
"Forest Fruit",
"Grape Raspberry",
"Mandarin Orange",
"Strawberry Mint"
]
}
],
"min_purchase_limit": null,
"can_use_cod": false,
"collection_id": null,
"welcome_package_type": 0,
"show_official_shop_label_in_normal_position": null,
"adsid": 4974802,
"item_type": 0,
"spl_installment_tenure": null
}
目前,我能够提取产品的名称(name)、价格(price_min)、销量(historical_sold)和评分(rating_star);方法是将相应的JSON数据追加到各个列表中,如下所示。
titles_list = []
prices_list = []
sold_list = []
ratings_list = []
for item in r['items']:
titles_list.append(item['name'])
prices_list.append(item['price_min'])
sold_list.append(item['historical_sold'])
ratings_list.append(item['item_rating']['rating_star'])
但是,为了继续执行步骤2,我还需要产品的URL,而在返回的JSON数据中找不到它。
步骤2需要获取产品的URL,以便访问每个产品的网页,进而抓取每个产品的所有评论。
类似地,抓取产品网页中所有评论的方法也是通过JSON调用完成的,如下所示:
import re
import json
import requests
import pandas as pd
# The product URL is hard-coded for now; it has to be supplied manually.
url = "https://shopee.sg/%E3%80%90Japan-limited%E3%80%91%E3%80%90Made-in-Japan%E3%80%91-Lipton-Flavor-Tea-Assortment-Pack-Tea-Bag-10-Bags-Japan-food-instant-Tea-%E3%80%90Direct-from-Japan%E3%80%91-i.219170680.4523492238?ads_keyword=wkdaelpmissisiht&adsid=238017&campaignid=160393&position=1"

# The two numeric groups after "i." are used as shop id and item id below.
r = re.search(r"i\.(\d+)\.(\d+)", url)
if r is None:
    # Fail with a clear message instead of "'NoneType' is not subscriptable".
    raise ValueError("no shop/item id found in URL: {}".format(url))
shop_id, item_id = r[1], r[2]

# Number of ratings requested per page; also drives the offset increment.
page_size = 20
ratings_url = "https://shopee.sg/api/v2/item/get_ratings?filter=0&flag=1&itemid={item_id}&limit={limit}&offset={offset}&shopid={shop_id}&type=0"

offset = 0
d = {"username": [], "rating": [], "comment": []}

while True:
    response = requests.get(
        ratings_url.format(
            shop_id=shop_id, item_id=item_id, limit=page_size, offset=offset
        ),
        timeout=30,  # without a timeout a stalled connection blocks forever
    )
    response.raise_for_status()
    data = response.json()

    # uncomment this to print all data:
    # print(json.dumps(data, indent=4))

    # Guard against a null "ratings" field so the loop below never iterates
    # over None.
    ratings = data["data"]["ratings"] or []

    for rating in ratings:
        d["username"].append(rating["author_username"])
        d["rating"].append(rating["rating_star"])
        d["comment"].append(rating["comment"])

        print(rating["author_username"])
        print(rating["rating_star"])
        print(rating["comment"])
        print("-" * 100)

    # A page shorter than page_size (including an empty one) is the last page.
    if len(ratings) < page_size:
        break

    offset += page_size

df = pd.DataFrame(d)
print(df)
df.to_csv("data.csv", index=False)
样本输出:
l*****1
5
Reliable seller, repeat purchases, items are always well packaged & received in excellent condition.
----------------------------------------------------------------------------------------------------
spyuc
5
Takes a while deliver as it’s shipped from Japan. Priced a bit on the high side. But taste good both hot and cold brewed.
----------------------------------------------------------------------------------------------------
i*****r
5
Delivery took a week. Considered okay. Yet to try the teas but looks good.
----------------------------------------------------------------------------------------------------从代码中可以看出,我需要来自步骤1的产品URL才能继续到步骤2,因为步骤2需要URL作为输入。
因此,我想知道如何修改步骤1中的代码以检索产品的URL,以便我可以使用每个URL进一步抓取每个产品的所有评论。或者,为了达到我的目标,是否还有其他可以尝试的解决办法?
抱歉帖子写得这么长,谢谢。
发布于 2021-09-02 18:15:59
要将这两个步骤组合在一起,您只需要每个商品(item)中的"shopid"和"itemid":
import re
import json
import requests
import pandas as pd
def get_ratings(shop_id, item_id):
    """Collect all ratings of one Shopee item via the ratings JSON endpoint.

    Requests the ``get_ratings`` endpoint page by page (20 entries per page)
    and stops at the first page that holds fewer entries than a full page.

    Args:
        shop_id: Value substituted for ``shopid`` in the request URL.
        item_id: Value substituted for ``itemid`` in the request URL.

    Returns:
        A dict with the keys ``"username"``, ``"rating"`` and ``"comment"``,
        each mapping to a list with one entry per rating, in the order the
        endpoint returned them.

    Raises:
        requests.HTTPError: If the endpoint answers with an HTTP error status.
    """
    # Number of ratings requested per page; also drives the offset increment.
    page_size = 20
    ratings_url = "https://shopee.sg/api/v2/item/get_ratings?filter=0&flag=1&itemid={item_id}&limit={limit}&offset={offset}&shopid={shop_id}&type=0"

    offset = 0
    d = {"username": [], "rating": [], "comment": []}

    while True:
        response = requests.get(
            ratings_url.format(
                shop_id=shop_id, item_id=item_id, limit=page_size, offset=offset
            ),
            timeout=30,  # without a timeout a stalled connection blocks forever
        )
        response.raise_for_status()
        data = response.json()

        # uncomment this to print all data:
        # print(json.dumps(data, indent=4))

        # Guard against a null "ratings" field so the loop below never
        # iterates over None.
        ratings = data["data"]["ratings"] or []

        for rating in ratings:
            d["username"].append(rating["author_username"])
            d["rating"].append(rating["rating_star"])
            d["comment"].append(rating["comment"])

        # A page shorter than page_size (including an empty one) is the last.
        if len(ratings) < page_size:
            break

        offset += page_size

    return d
from urllib.parse import quote

# Base site URL (no trailing slash) and the product keyword to search for.
Shopee_url = "https://shopee.sg"
keyword_search = "Lipton"

# Percent-encode the keyword once so that spaces and non-ASCII characters are
# safe both in the query string and in the Referer header value.
encoded_keyword = quote(keyword_search)

headers = {
    "User-Agent": "Chrome",
    # The separating "/" is required: Shopee_url has no trailing slash, so
    # without it the Referer would read "https://shopee.sgsearch?keyword=...".
    "Referer": "{}/search?keyword={}".format(Shopee_url, encoded_keyword),
}

# Search endpoint for the keyword.
# can change "relevancy" to "latest": to sort by latest products instead
url = (
    "{}/api/v2/search_items/?by=relevancy&keyword={}&limit=100&newest=0"
    "&order=desc&page_type=search"
).format(Shopee_url, encoded_keyword)
# Shopee API request
r = requests.get(url, headers=headers).json()
for item in r["items"]:
print(item["name"])
df = pd.DataFrame(get_ratings(item["shopid"], item["itemid"]))
print(df.head()) # print only the head for brevity
print("-" * 80)
输出:
Lipton Infusions Fruit Tea, 20 Pyramid Teabags
username rating comment
0 blehhhbowling 5 Prompt delivery, been looking for these flavours everywhere in big name supermarkets but to no avail
1 d*****h 5
2 sitisujanah13 5
--------------------------------------------------------------------------------
【Japan limited】【Made in Japan】 Lipton Flavor Tea Assortment Pack Tea Bag 10 Bags Japan food instant Tea 【Direct from Japan】
username rating comment
0 s*****e 5 约10天到货\n日本直运 买了5种不同的(花茶/调味茶来取代泡泡茶)\n还有特别款的全麦kitkat\n外包装是完整的\n日系商品的独特风味\n值得品尝
1 l*****1 5 Reliable seller, repeat purchases, items are always well packaged & received in excellent condition.
2 spyuc 5 Takes a while deliver as it’s shipped from Japan. Priced a bit on the high side. But taste good both hot and cold brewed.
3 i*****r 5 Delivery took a week. Considered okay. Yet to try the teas but looks good.
4 m*****y 4 Delivery took abt 1 week. Have yet to try hope it's good. Long expiry
--------------------------------------------------------------------------------
...and so on.https://stackoverflow.com/questions/69035175
复制相似问题