背景:有时遇到一篇深刻的回答,评论区的讨论有些也同样深刻。若想保存这些评论或者查看指定用户的评论确有些困难。
import requests
import time
import sys
import re
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor # 多线程加速
# ==== Zhihu API endpoints ====
# Root comments of an answer, paged: format(answer_id, offset).
BASE_URL = "https://www.zhihu.com/api/v4/answers/{}/root_comments?order=normal&limit=20&offset={}&status=open"
# Child ("floor-in-floor") comments of a root comment: format(comment_id).
CHILD_COMMENT_URL = "https://www.zhihu.com/api/v4/comment_v5/comment/{}/child_comment"
# Question feed, used only to read the question title: format(question_id).
GET_NAME_URL = 'https://www.zhihu.com/api/v4/questions/{}/feeds'
# ==== Request headers ====
cookies = {
# Fill in your own Zhihu cookies here.
}
headers = {
'accept': '*/*',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
}
# ==== Module-level configuration ====
MAX_RETRIES = 3  # maximum retry attempts per request
TIMEOUT = 10  # per-request timeout in seconds
DELAY = 0.3  # pause between page requests, to avoid being blocked
def getname(url):
    """Derive the output Markdown filename from the answer's question title.

    Fetches the question feed, sanitizes the title for use as a filename,
    and returns it with a ``.md`` extension (``"default.md"`` if the
    sanitized title is empty).
    """
    INVALID_CHARS = r'[<>:"/\\|?*\x00-\x1F]'
    # URL shape assumed: .../question/<qid>/answer/<aid> — TODO confirm for
    # all accepted URL forms.
    question_id = url.split("/")[-3]
    resp = requests.get(GET_NAME_URL.format(question_id),
                        cookies=cookies, headers=headers, timeout=TIMEOUT)
    title = resp.json()['data'][0]['target']['question']['title']
    # Replace characters that are illegal in filenames.
    title = re.sub(INVALID_CHARS, "_", title).strip()
    # Check for emptiness BEFORE appending the extension: the original
    # appended '.md' first, so the fallback branch could never trigger
    # ('.md' alone is truthy).
    return (title + '.md') if title else "default.md"
def trans_date(v_timestamp):
    """Format a Unix timestamp (seconds, int/float/str) as a local-time string."""
    return time.strftime("%Y-%m-%d %H:%M:%S",
                         time.localtime(float(v_timestamp)))
def html_to_md(html_text):
    """Flatten an HTML comment body into Markdown-ish plain text."""
    soup = BeautifulSoup(html_text, "html.parser")
    # One pass over both tag kinds: paragraphs get surrounding newlines,
    # <br> tags become literal newlines.
    for tag in soup.find_all(["p", "br"]):
        if tag.name == "br":
            tag.replace_with("\n")
        else:
            tag.insert_before("\n")
            tag.insert_after("\n")
    return soup.get_text().strip()
def fetch_json(url):
    """GET *url* and return the parsed JSON body, retrying on failure.

    Makes up to MAX_RETRIES attempts with a 2-second pause between them;
    returns None once all attempts have failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(
                url, cookies=cookies, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()  # raise on HTTP 4xx/5xx
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"⚠️ 请求失败: {e},重试中...")
            # Only back off if another attempt remains; the original
            # slept 2 s even after the final failure, for no benefit.
            if attempt < MAX_RETRIES:
                time.sleep(2)
    print("❌ 多次请求失败,跳过该请求")
    return None
def get_comment_page(answer_id):
    """Return the number of 20-comment pages for *answer_id*.

    Returns 0 when the first request fails (callers treat 0 as failure),
    and at least 1 otherwise, mirroring the original behavior for a
    zero-comment answer.
    """
    data = fetch_json(BASE_URL.format(answer_id, 0))
    if not data:
        return 0
    common_counts = data.get('common_counts', 0)
    # Ceiling division. The original `(count // 20) + 1` always requested
    # one extra, empty page whenever the count was a multiple of 20.
    pages = max(1, -(-common_counts // 20))
    print(f"📌 发现 {common_counts} 条评论")
    return pages
def fetch_comments(answer_id, page):
    """Fetch one page (20 entries per page) of root comments for an answer."""
    page_url = BASE_URL.format(answer_id, page * 20)
    payload = fetch_json(page_url)
    if payload and 'data' in payload:
        return payload['data']
    print(f"🚫 第 {page} 页无数据,跳过")
    return []
def fetch_child_comments(comment_id):
    """Fetch the nested (floor-in-floor) replies under a root comment."""
    payload = fetch_json(CHILD_COMMENT_URL.format(comment_id))
    return payload['data'] if payload and 'data' in payload else []
def save_comment(f, i, comment):
    """Write root comment *i* and its child replies to *f* as Markdown.

    Parameters:
        f: open writable text file
        i: floor number used in the root comment's heading
        comment: one root-comment dict from the Zhihu API
    """
    content = html_to_md(comment['content'])
    date = trans_date(comment['created_time'])
    author_name = comment['author']['member']['name']
    f.write(f"### {i}楼 {author_name} {date}\n{content}\n")
    # Deliberately NOT gated on comment["child_comments"]: some answers
    # mark root comments as children, and that check dropped replies.
    number = comment['child_comment_count']
    if number != 0:
        f.write(f"\n=={number}条回复==\n")
        child_comments = fetch_child_comments(comment['id'])
        for reply_no, child in enumerate(child_comments, start=1):
            child_date = trans_date(child['created_time'])
            # Blockquote-continue across paragraph breaks.
            child_content = html_to_md(
                child['content']).replace("\n\n", "\n> \n> ")
            # Rewrite "1." as "1)" so Markdown does not render an ordered
            # list. The original replacement was a bare r')', which threw
            # the digits away and mangled the text.
            child_content = re.sub(r'(\d+)\.', r'\1)', child_content)
            child_author = child['author']['name']
            f.write(
                f"> {reply_no} {child_author} {child_date}\n> {child_content}\n\n")
    f.write("\n")
def main():
    """Entry point: scrape an answer's comments into a Markdown file.

    Prompts for an answer URL, pages through the root comments, and
    writes each floor (plus its child replies) via save_comment.
    """
    url = input("请输入知乎回答 URL:")
    # Tolerate a trailing slash when extracting the answer id.
    answer_id = url.rstrip("/").split("/")[-1]
    pages = get_comment_page(answer_id)
    if pages == 0:
        print("❌ 获取评论失败,程序退出")
        return
    total_comments = 0  # running total, including child replies
    with open(getname(url), "w", encoding="utf-8") as f:
        floor = 1  # floor counter for root comments
        for page in range(pages):
            print(f"📃 正在爬取第 {page + 1} 页评论...")
            time.sleep(DELAY)  # throttle to avoid getting blocked
            # NOTE: the original wrapped this call in a ThreadPoolExecutor
            # but immediately blocked on .result(), so it ran strictly
            # sequentially anyway; the executor added no concurrency and
            # has been removed.
            comments = fetch_comments(answer_id, page)
            if not comments:
                break  # stop at the first empty page
            for comment in comments:
                save_comment(f, floor, comment)
                floor += 1
                total_comments += 1
                # Child replies counted from the API field, since only
                # the first 20 are actually fetched per floor.
                total_comments += comment['child_comment_count']
    print(f"✅ 爬取完成,共获取 {total_comments} 条评论")
if __name__ == "__main__":
    main()

已知不足:
实测旧版api的offset最大只能为400,记直接回复作者的评论为主楼,一页二十层楼,offset400则最多21*20等于420条主评论,包括楼中楼的话通常最多只能获取大约900~1000条评论(经验数字,试了几个几千评论的回答,基本都在这个数字附近)。使用默认排序。
而楼中楼的评论最多显示前20条。
极少情况下会把部分主楼视为楼中楼,导致一些回复丢失。例如“如何看待数学大神韦东奕的讲课方式难以理解学生纷纷退课,课堂教学应该采取什么样的互动方式?”这个回答,原因未知。如果想获得全部回复,可以把 save_comment 函数中的 if comment["child_comments"] 判断注释掉,即不论是否有子评论都当作有子评论处理,但会降低一定的运行速度。注意处理缩进。
评论区还有一个api,
https://www.zhihu.com/api/v4/comment_v5/answers/{answer_id}/root_comment
参数offset是类似这样的:1651303267_10137984404_0,另外每次翻页都需要获取x-zse-96的加密参数,应该可以获得全部评论。但较为复杂。
服务器搭建了一个简易版的网页,如需尝试请访问:http://154.64.253.59:5000