headers.py:内含一个名为 headers 的字典,因为涉及隐私,不共享。可以把自己的请求头贴进去。
get_danmaku.py:获取指定 oid(cid)和 date 的弹幕。
get_all_history_danmaku.py:获取所有历史弹幕,会生成一个 json。

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Load bilibili history danmaku, and return json.
import requests
import logging as log
import json
import time
from lxml import etree
from .headers import headers
def get_history_danmaku(oid, date):
    """Fetch the history danmaku of one cid for one day.

    Sends a GET request to ``https://api.bilibili.com/x/v2/dm/history`` with
    the module-level ``headers`` and converts every ``<d>`` element of the
    response into a dict.

    Args:
        oid: The oid (cid) to query; must be convertible with ``int()``.
        date: The day to query, sent unchanged as the ``date`` query
            parameter (the demo below uses ``'2019-12-20'``).

    Returns:
        A list with one dict per ``<d>`` element, holding the keys ``cid``,
        ``time``, ``position``, ``fontsize``, ``color``, ``ctime``,
        ``unknown``, ``author``, ``dmid``, ``content``, ``date`` and
        ``updateTime``. The list is empty when the response contains no
        ``<d>`` elements.

    Raises:
        IndexError: If a ``<d>`` element has no ``p`` attribute or the
            attribute has fewer than eight comma-separated fields.
        ValueError: If ``oid`` or a numeric field cannot be converted.
    """
    # get data
    url = 'https://api.bilibili.com/x/v2/dm/history'
    params = {'type': 1, 'oid': oid, 'date': date}
    r = requests.get(url, params=params, headers=headers)
    content = r.content
    log.debug(content.decode('utf-8'))

    cid = int(oid)
    dm_list = []

    # read xml
    xml = etree.HTML(content)
    for node in xml.xpath('//d'):
        attrs = node.xpath('./@p')[0].split(',')
        # An element with an empty body yields no text node; treat it as an
        # empty danmaku instead of crashing on [0].
        texts = node.xpath('./text()')
        text = texts[0] if texts else ''
        log.debug(f'{attrs}, {text}')

        # format data
        dm_list.append({
            'cid': cid,
            'time': int(float(attrs[0])),  # send time (playback position in the video)
            'position': int(attrs[1]),  # danmaku position
            'fontsize': int(attrs[2]),  # font size
            # danmaku color: hex digits, zero-padded / truncated to 6 chars
            'color': f'{int(attrs[3]):06x}'[-6:],
            'ctime': int(attrs[4]),  # danmaku creation time
            'unknown': attrs[5],
            'author': attrs[6],  # sender identifier (not the same as uid)
            'dmid': int(attrs[7]),  # danmaku id
            'content': text,  # danmaku text
            'date': date,
            'updateTime': int(time.time()),
        })
    return dm_list
if __name__ == '__main__':
    # Demo run: fetch one day of danmaku for a sample cid and pretty-print it.
    log.basicConfig(level=log.DEBUG)
    sample = get_history_danmaku(136870419, '2019-12-20')
    rendered = json.dumps(sample, ensure_ascii=False, indent=4)
    print(rendered)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import logging as log
import os
import re
import json
from datetime import datetime, timedelta
from .get_danmaku import get_history_danmaku
from .headers import headers
def get_all_history_danmaku(aid):
    """Collect the history danmaku of a video and write them to a JSON file.

    Downloads the video page ``https://www.bilibili.com/video/av{aid}``,
    extracts the cids listed under ``"pages"`` and the publish date from the
    page body, then calls ``get_history_danmaku`` for every cid and every day
    from the publish date up to the current date. Danmaku are de-duplicated
    by ``dmid`` (the first occurrence wins) and written to
    ``av{aid}_dm.json`` in the directory of this file.

    Args:
        aid: The av number of the video, interpolated into the page URL and
            the output file name.

    Returns:
        None. The result is only written to disk.

    Raises:
        IndexError: If the ``"pages"`` list or the publish date cannot be
            found in the page body.
    """
    url = f'https://www.bilibili.com/video/av{aid}'
    # NOTE(review): this request is sent without the imported ``headers``,
    # unlike the one in get_history_danmaku -- confirm whether that is
    # intended.
    body = requests.get(url).text
    log.debug(body)

    # get cid / oid
    pages = re.findall(r'(?<="pages":)\[.*?\]', body)[0]
    # \d+ rather than \d*: a '"cid":' not followed by digits would otherwise
    # yield an empty string that fails in int() further down.
    cids = re.findall(r'(?<="cid":)\d+', pages)
    log.info(f'{cids=}')

    # get post date
    publish = re.findall(r'(?<=Published" content=")\d{4}-\d{2}-\d{2}', body)[0]
    start_date = datetime.strptime(publish, '%Y-%m-%d')
    log.info(f'{publish=}')

    result = {}
    one_day = timedelta(1)
    while True:
        date = start_date.strftime('%Y-%m-%d')
        log.info(f'get danmaku of {date}')
        for cid in cids:
            for dm in get_history_danmaku(cid, date):
                # setdefault keeps the first copy, so a danmaku returned for
                # several days is stored only once.
                result.setdefault(dm['dmid'], dm)

        # go next day or exit
        start_date += one_day
        if start_date > datetime.now():
            break

    here = os.path.abspath(os.path.dirname(__file__))
    output = os.path.join(here, f'av{aid}_dm.json')
    with open(output, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
if __name__ == '__main__':
    # Script entry point: dump all history danmaku of one sample video.
    log.basicConfig(level=log.INFO)
    get_all_history_danmaku(79974337)
1
NSAgold 2020-01-17 19:48:16 +08:00
没记错是弹幕池,区分是否是高级弹幕用的
|
2
JCZ2MkKb5S8ZX9pq OP @NSAgold 原来如此,我等下找个视频验证一下,非常感谢。
这个字段在我这儿躺了至少两年了…… |
3
JCZ2MkKb5S8ZX9pq OP |
4
Va1n3R 2020-01-19 01:47:09 +08:00
弹幕 aid 被 hash 过,能彩虹表枚举出来的。
|
5
JCZ2MkKb5S8ZX9pq OP @Va1n3R 那可以拿评论最多的前十和弹幕最多的前十撞一下试试,请问用的是哪种 hash ?
|