Press "Enter" to skip to content

导出知乎回答评论为markdown

背景:有时遇到一篇深刻的回答,评论区的讨论有些也同样深刻。若想保存这些评论或者查看指定用户的评论确有些困难。

import requests
import time
import sys
import re
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor  # 多线程加速

# ==== Zhihu API endpoints ====
# Root comments of an answer; format with (answer_id, offset). 20 per page.
BASE_URL = "https://www.zhihu.com/api/v4/answers/{}/root_comments?order=normal&limit=20&offset={}&status=open"
# Child ("floor-in-floor") comments; format with the parent comment id.
CHILD_COMMENT_URL = "https://www.zhihu.com/api/v4/comment_v5/comment/{}/child_comment"
# Question feed; used only to read the question title for the output filename.
GET_NAME_URL = 'https://www.zhihu.com/api/v4/questions/{}/feeds'

# ==== Request headers ====
cookies = {
 # Fill in your own Zhihu cookies here
}

headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36',
}

# ==== Global settings ====
MAX_RETRIES = 3  # maximum attempts per HTTP request
TIMEOUT = 10  # per-request timeout in seconds
DELAY = 0.3  # pause between page fetches, to avoid being rate-limited/banned


def getname(url):
    """Derive the output markdown filename from the answer's question title.

    Args:
        url: Zhihu answer URL of the form .../question/<qid>/answer/<aid>,
             so the question id is the third path segment from the end.

    Returns:
        The question title sanitized for filesystem use, ending in ".md";
        "default.md" when the title is empty after sanitizing.
    """
    invalid_chars = r'[<>:"/\\|?*\x00-\x1F]'
    # .../question/<qid>/answer/<aid> -> question id is 3rd from the end.
    question_id = url.split("/")[-3]
    resp = requests.get(GET_NAME_URL.format(question_id),
                        cookies=cookies, headers=headers, timeout=TIMEOUT)
    title = resp.json()['data'][0]['target']['question']['title']

    # Replace characters that are illegal in filenames (Windows/POSIX).
    title = re.sub(invalid_chars, "_", title).strip()

    # BUGFIX: checking after appending '.md' could never detect an empty
    # title (".md" is truthy); test the sanitized title itself instead.
    return (title + '.md') if title else "default.md"


def trans_date(v_timestamp):
    """Convert a Unix timestamp to a local-time 'YYYY-MM-DD HH:MM:SS' string."""
    seconds = float(v_timestamp)
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))


def html_to_md(html_text):
    """Flatten comment HTML to Markdown-ish plain text.

    <br> tags become newlines and each <p> is wrapped with newlines so
    paragraphs stay separated; every other tag is stripped.
    """
    doc = BeautifulSoup(html_text, "html.parser")
    for linebreak in doc.find_all("br"):
        linebreak.replace_with("\n")
    for paragraph in doc.find_all("p"):
        paragraph.insert_before("\n")
        paragraph.insert_after("\n")
    return doc.get_text().strip()


def fetch_json(url):
    """GET *url* and return the decoded JSON body.

    Retries up to MAX_RETRIES times with a 2-second pause between
    attempts; returns None once every attempt has failed.
    """
    attempts_left = MAX_RETRIES
    while attempts_left > 0:
        attempts_left -= 1
        try:
            resp = requests.get(url, cookies=cookies,
                                headers=headers, timeout=TIMEOUT)
            resp.raise_for_status()  # raise on HTTP error status
        except requests.exceptions.RequestException as err:
            print(f"⚠️ 请求失败: {err},重试中...")
            time.sleep(2)  # back off before the next attempt
        else:
            return resp.json()
    print("❌ 多次请求失败,跳过该请求")
    return None


def get_comment_page(answer_id):
    """Return the number of 20-comment pages available for *answer_id*.

    Returns 0 only when the comment listing could not be fetched at all
    (the caller treats 0 as a hard failure), otherwise at least 1.
    """
    url = BASE_URL.format(answer_id, 0)
    data = fetch_json(url)
    if not data:
        return 0
    common_counts = data.get('common_counts', 0)
    # BUGFIX: the old `count // 20 + 1` requested a spurious empty page
    # whenever the count was an exact multiple of 20. Use ceiling division,
    # but keep a minimum of 1 page so a successfully-fetched answer with
    # zero comments is not mistaken for a network failure.
    pages = max(1, -(-common_counts // 20))
    print(f"📌 发现 {common_counts} 条评论")
    return pages


def fetch_comments(answer_id, page):
    """Return the list of root comments on *page* (20 per page), or []."""
    url = BASE_URL.format(answer_id, 20 * page)
    payload = fetch_json(url)
    if payload and 'data' in payload:
        return payload['data']
    print(f"🚫 第 {page} 页无数据,跳过")
    return []


def fetch_child_comments(comment_id):
    """Return the child ("floor-in-floor") comments of *comment_id*, or []."""
    payload = fetch_json(CHILD_COMMENT_URL.format(comment_id))
    return payload['data'] if payload and 'data' in payload else []


def save_comment(f, i, comment):
    """Write one root comment and its child comments to *f* as Markdown.

    Args:
        f: open text file to append to.
        i: 1-based floor number of this root comment.
        comment: root-comment dict from the root_comments API.
    """
    content = html_to_md(comment['content'])

    date = trans_date(comment['created_time'])
    author_name = comment['author']['member']['name']
    f.write(f"### {i}楼  {author_name}  {date}\n{content}\n")

    # Some answers mis-classify root comments as children, so we always
    # query child comments even when the advertised count is 0 — gating on
    # comment["child_comments"] was observed to drop replies.
    number = comment['child_comment_count']
    if number != 0:
        f.write(f"\n=={number}条回复==\n")

    # Separate counter for children; the old code shadowed parameter `i`.
    for child_no, child in enumerate(fetch_child_comments(comment['id']), start=1):
        child_date = trans_date(child['created_time'])
        # Keep blockquote formatting across blank lines.
        child_content = html_to_md(
            child['content']).replace("\n\n", "\n> \n> ")
        # Rewrite "N." as "N)" so Markdown does not treat it as an ordered
        # list. BUGFIX: the old replacement r')' dropped the captured
        # number entirely; keep it with a backreference.
        child_content = re.sub(r'(\d+)\.', r'\1)', child_content)
        # NOTE: the child-comment (comment_v5) API nests the name directly
        # under 'author', unlike root comments.
        child_author = child['author']['name']
        f.write(
            f"> {child_no} {child_author}  {child_date}\n> {child_content}\n\n")

    f.write("\n")


def main():
    """Prompt for an answer URL, scrape its comments, and write a .md file."""
    url = input("请输入知乎回答 URL:")
    # BUGFIX: a trailing slash or query string used to corrupt the id;
    # strip both before taking the last path segment.
    answer_id = url.rstrip("/").split("/")[-1].split("?")[0]

    pages = get_comment_page(answer_id)
    if pages == 0:
        print("❌ 获取评论失败,程序退出")
        return

    total_comments = 0  # running total: root comments + advertised children

    with open(getname(url), "w", encoding="utf-8") as f:
        floor = 1  # floor number assigned to each root comment

        # NOTE(review): submit().result() blocks immediately, so pages are
        # still fetched one at a time; the pool only bounds concurrency if
        # this is later changed to submit pages ahead of consumption.
        with ThreadPoolExecutor(max_workers=5) as executor:
            for page in range(pages):
                print(f"📃 正在爬取第 {page + 1} 页评论...")
                time.sleep(DELAY)  # throttle to avoid being blocked

                comments = executor.submit(
                    fetch_comments, answer_id, page).result()
                if not comments:
                    break  # empty page: past the last available comment

                for comment in comments:
                    save_comment(f, floor, comment)
                    floor += 1
                    total_comments += 1  # count the root comment

                    # Children are counted from API metadata (the child
                    # fetch inside save_comment is capped at ~20 entries).
                    total_comments += comment['child_comment_count']

    print(f"✅ 爬取完成,共获取 {total_comments} 条评论")


if __name__ == "__main__":
    main()
输出内容样式参考

已知不足:

实测旧版api的offset最大只能为400,记直接回复作者的评论为主楼,一页二十层楼,offset400则最多21*20等于420条主评论,包括楼中楼的话通常最多只能获取大约900~1000条评论(经验数字,试了几个几千评论的回答,基本都在这个数字附近)。使用默认排序。
而楼中楼的评论最多显示前20条。
极少情况下会把部分主楼视为楼中楼,导致一些回复丢失。例如"如何看待数学大神韦东奕的讲课方式难以理解学生纷纷退课,课堂教学应该采取什么样的互动方式?"这个回答,原因未知。如果想获取全部评论,可以把 save_comment 函数中的 if comment["child_comments"] 判断注释掉,即不论是否有子评论都当作有子评论处理,但会降低一定的运行速度。注意处理缩进。

评论区还有一个api,

https://www.zhihu.com/api/v4/comment_v5/answers/{answer_id}/root_comment

参数offset是类似这样的1651303267_10137984404_0,另外每次翻页都需要获取x-zse-96的加密参数,应该可以获得全部评论。但较为复杂。

服务器搭建了一个简易版的网页,如需尝试请访问:http://154.64.253.59:5000

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注