# Please fill in your own request headers and cookies (see main()) before running.
import requests
import re
import html2text
import time
# 把问题名作为文件名,替换其中的非法字符
def rename(filename):
illegal_characters = ['\\', '/', ':', '*', '?', '"', '<', '>', '|']
for char in illegal_characters:
filename = filename.replace(char, '')
return filename
# 获取收藏夹回答数量
def get_answer_count(url, cookies, headers, params):
resp = requests.get(url, params=params, cookies=cookies, headers=headers)
pattern = r'"answerCount":\s*(\d+)'
resp.close()
return int(re.search(pattern, resp.text).group(1))
# 获取每个回答的json
def get_page_json(url, cookies, headers, params):
response = requests.get(
f'https://www.zhihu.com/api/v4/collections/{url.split('/')[-1]}/items',
params=params,
cookies=cookies,
headers=headers,
)
response.close()
return response.json()['data']
def get_answer_content(answer):
try:
title = rename(answer['content']['question']['title'])
# 普通回答
except:
try:
title = rename(answer['content']['title'])
# 文章或视频
except:
title = '想法:' + rename(answer['content']['content'][0]['title'])
# 想法
# -------------------------------------------------以上获取标题,以下获取内容
# 先获取html,再转换为markdown
try:
# 想法
html_content = answer['content']['content'][0]['content']
except:
try:
# 文章和回答
html_content = answer['content']['content']
except:
# 视频
html_content = '请点击作者观看视频...'
converter = html2text.HTML2Text()
try:
md_text = converter.handle(html_content)
# 回答
except:
md_text = converter.handle(html_content[0]['content'])
# 文章
return {'title': title, 'md_text': md_text, 'url': answer['content']['url'], 'autor': answer['content']['author']['name']}
def re_connect(start, end, url, params, cookies, headers):
try:
for offset in range(start, end, 20):
# if offset == 60:
# break
page = int(offset / 20) + 1
offset = int(offset / 20) * 20
params['offset'] = offset
print(f'\n第{page}页')
time.sleep(1)
print('\n保护服务器,休眠1秒...\n')
i = 1
# 计数器,每页的第i个回答
for answer in get_page_json(url, cookies, headers, params):
number = i + (page - 1) * 20
i += 1
ord = get_answer_content(answer)
#print(ord['autor'])
print(str(number) + '. ' + ord['title'])
print(ord['url'])
head = f'> Autor: [{ord['autor']}]({ord['url']})\n\n'
with open(str(number) + '. ' + ord['title'] + '.md', "w", encoding="utf-8") as file:
file.write(head)
file.write(ord['md_text'])
except:
print("已断开连接,正在重连...")
time.sleep(5)
return re_connect(offset, end, url, params, cookies, headers)
def main():
cookies = {
}
headers = {
}
url = input('请输入url:')
params = {
'offset': '0',
'limit': '20',
}
print(f'收藏夹总回答数量:{get_answer_count(url, cookies, headers, params)}')
re_connect(0, get_answer_count(url, cookies, headers, params),
url, params, cookies, headers)
main()