下载地址:https://www.pan38.com/dow/share.php?code=JCnzE 提取密码:1125
这个工具包含三个主要模块:主爬虫程序、工具函数模块和执行脚本。主程序实现了笔记详情获取、评论采集和用户信息提取功能,工具模块提供了数据处理方法,执行脚本展示了完整采集流程。使用时需要配置合适的请求头和Cookie,并注意控制采集频率。
import requests
import json
import re
import time
import random
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import pandas as pd
import os
from fake_useragent import UserAgent
class XHSCrawler:
    """Scraper for xiaohongshu.com note, comment and user-profile pages.

    NOTE(review): most endpoints require a valid Cookie in ``self.headers``
    — confirm before real use, and throttle via ``random_sleep``.
    """

    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent()
        self.headers = {
            'User-Agent': self.ua.random,
            'Referer': 'https://www.xiaohongshu.com/',
            'Cookie': ''
        }
        self.base_url = "https://www.xiaohongshu.com"
        # All CSV output lands under this directory.
        self.data_dir = "xhs_data"
        os.makedirs(self.data_dir, exist_ok=True)

    def _extract_initial_state(self, html):
        """Decode the window.__INITIAL_STATE__ JSON blob embedded in *html*.

        Returns the parsed dict, or None when the script tag is absent
        (anti-bot page, login wall, or site layout change).
        """
        soup = BeautifulSoup(html, 'html.parser')
        # Dots escaped: the original pattern 'window.__INITIAL_STATE__'
        # let '.' match any character. 'string=' replaces the deprecated
        # 'text=' keyword of BeautifulSoup's find().
        script = soup.find('script', string=re.compile(r'window\.__INITIAL_STATE__'))
        if script is None:
            # Previously this was an AttributeError swallowed by the caller.
            return None
        # Split only on the first '=' so '=' characters inside the JSON survive.
        json_str = script.text.split('=', 1)[1].strip().rstrip(';')
        return json.loads(json_str)

    def get_note_detail(self, note_id):
        """Fetch a note page and return its embedded state dict, or None."""
        url = f"{self.base_url}/explore/{note_id}"
        try:
            # timeout added: without it a stalled server hangs the crawl forever.
            resp = self.session.get(url, headers=self.headers, timeout=10)
            if resp.status_code == 200:
                return self._extract_initial_state(resp.text)
            print(f"请求失败,状态码:{resp.status_code}")
            return None
        except Exception as e:
            print(f"获取笔记详情出错:{str(e)}")
            return None

    def get_user_info(self, user_id):
        """Fetch a profile page and return its 'userInfo' sub-dict, or None."""
        url = f"{self.base_url}/user/profile/{user_id}"
        try:
            resp = self.session.get(url, headers=self.headers, timeout=10)
            if resp.status_code == 200:
                data = self._extract_initial_state(resp.text)
                if data is None:
                    return None
                return data.get('user', {}).get('userInfo', {})
            print(f"请求失败,状态码:{resp.status_code}")
            return None
        except Exception as e:
            print(f"获取用户信息出错:{str(e)}")
            return None

    def get_note_comments(self, note_id, page=1, per_page=20):
        """Fetch one page of a note's comments from the burdock JSON API.

        Returns the decoded JSON payload, or None on any failure.
        """
        params = {
            'note_id': note_id,
            'page': page,
            'page_size': per_page,
            'sort': 'hot'
        }
        url = f"{self.base_url}/fe_api/burdock/weixin/v2/note/{note_id}/comments?" + urlencode(params)
        try:
            resp = self.session.get(url, headers=self.headers, timeout=10)
            if resp.status_code == 200:
                return resp.json()
            print(f"请求失败,状态码:{resp.status_code}")
            return None
        except Exception as e:
            print(f"获取评论出错:{str(e)}")
            return None

    def save_to_csv(self, data, filename):
        """Write a list of row dicts to CSV under self.data_dir.

        utf_8_sig adds a BOM so Excel opens the Chinese text correctly.
        """
        filepath = os.path.join(self.data_dir, filename)
        df = pd.DataFrame(data)
        df.to_csv(filepath, index=False, encoding='utf_8_sig')
        print(f"数据已保存到:{filepath}")

    def random_sleep(self):
        """Sleep 1-3 seconds between requests to throttle the crawl rate."""
        time.sleep(random.uniform(1, 3))
# Utility-module imports. The extracted source had a bare `re` expression
# statement here; restored to the intended `import re`.
import hashlib
import json
import re
from datetime import datetime
def extract_user_ids(text):
    """Extract unique 24-character Xiaohongshu user IDs from *text*.

    IDs are taken from '.../profile/<id>' URL fragments. Duplicates are
    removed while preserving first-occurrence order — the original
    ``list(set(...))`` returned a nondeterministic ordering.
    """
    pattern = r'(?<=profile/)[a-zA-Z0-9]{24}'
    return list(dict.fromkeys(re.findall(pattern, text)))
def parse_note_data(raw_data):
    """Flatten a raw note payload into a single-level row dict.

    Expects ``raw_data['note']['note']`` with a nested ``user`` dict
    (shape inferred from this code's own access pattern — confirm
    against the live API). Returns None after logging on malformed input.
    """
    try:
        note = raw_data['note']['note']
        author = note['user']
        # API timestamps are in milliseconds; convert to seconds first.
        created = datetime.fromtimestamp(note.get('time', 0) / 1000)
        return {
            'note_id': note.get('id', ''),
            'title': note.get('title', ''),
            'desc': note.get('desc', ''),
            'likes': note.get('likes', 0),
            'collected': note.get('collected', 0),
            'comments': note.get('comments', 0),
            'create_time': created.strftime('%Y-%m-%d %H:%M:%S'),
            'user_id': author.get('userid', ''),
            'user_name': author.get('nickname', ''),
            'user_fans': author.get('fans', 0),
            'user_follows': author.get('follows', 0),
            'user_liked': author.get('liked', 0),
        }
    except Exception as e:
        print(f"解析笔记数据出错:{str(e)}")
        return None
def parse_comment_data(comment_json):
    """Convert a comments-API response into a list of flat row dicts.

    A missing key in the payload or in any single comment aborts the
    whole batch (logs and returns None), matching the all-or-nothing
    behavior of the original implementation.
    """
    rows = []
    try:
        for item in comment_json['data']['comments']:
            # create_time arrives in milliseconds.
            created = datetime.fromtimestamp(item['create_time'] / 1000)
            rows.append({
                'comment_id': item['id'],
                'content': item['content'],
                'likes': item['like_count'],
                'time': created.strftime('%Y-%m-%d %H:%M:%S'),
                'user_id': item['user']['userid'],
                'user_name': item['user']['nickname'],
                'user_level': item['user']['level']['level'],
            })
        return rows
    except Exception as e:
        print(f"解析评论数据出错:{str(e)}")
        return None
def generate_md5(text):
    """Return the hex MD5 digest of *text* (UTF-8 encoded)."""
    digest = hashlib.md5()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()
from xhs_crawler import XHSCrawler
from xhs_utils import parse_note_data, parse_comment_data, extract_user_ids
import time
import random
def main():
    """End-to-end demo: note detail -> comments -> commenter profiles.

    Each stage saves its rows to CSV via the crawler; network failures
    simply skip the dependent stages.
    """
    crawler = XHSCrawler()
    note_id = "63f9c3a9000000001a01b2b1"  # sample note ID

    # Stage 1: note detail.
    print(f"开始采集笔记 {note_id} 的数据...")
    note_data = crawler.get_note_detail(note_id)
    if note_data:
        parsed_note = parse_note_data(note_data)
        if parsed_note:
            crawler.save_to_csv([parsed_note], f"note_{note_id}.csv")
            print(f"成功采集笔记数据: {parsed_note['title']}")

    # Stage 2: first five pages of comments.
    print("开始采集评论数据...")
    all_comments = []
    for page in range(1, 6):
        raw_page = crawler.get_note_comments(note_id, page=page)
        if raw_page:
            parsed_page = parse_comment_data(raw_page)
            if parsed_page:
                all_comments.extend(parsed_page)
        crawler.random_sleep()
    if all_comments:
        crawler.save_to_csv(all_comments, f"comments_{note_id}.csv")
        print(f"成功采集 {len(all_comments)} 条评论")

    # Stage 3: profiles for up to 20 distinct commenters.
    print("开始提取用户ID并采集用户信息...")
    commenter_ids = {c['user_id'] for c in all_comments}
    user_rows = []
    for uid in list(commenter_ids)[:20]:
        info = crawler.get_user_info(uid)
        if info:
            user_rows.append({
                'user_id': info.get('userid', ''),
                'nickname': info.get('nickname', ''),
                'fans': info.get('fans', 0),
                'follows': info.get('follows', 0),
                'liked': info.get('liked', 0),
                'level': info.get('level', {}).get('level', 0),
                'location': info.get('location', ''),
                'gender': info.get('gender', ''),
                'birthday': info.get('birthday', '')
            })
        crawler.random_sleep()
    if user_rows:
        crawler.save_to_csv(user_rows, f"users_{note_id}.csv")
        print(f"成功采集 {len(user_rows)} 个用户信息")


if __name__ == "__main__":
    main()
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。