当前位置 博文首页 > shelgi的博客:想见你的弹幕爬取和对弹幕信息简单可视化
以后真的要改掉拖延症,其实两天前我就应该写这个博客的,拖了两天。。。结果现在去翻朋友圈,三天可见,无朋友圈截图
由于不是VIP,第一件事就是打开一集然后等着45s的广告。。。
然后找到了弹幕的链接网址
https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910014051953985803944_1579443369825&target_id=4576819405%26vid%3Di0033qa01p1&session_key=23873%2C84%2C1579443370&timestamp=75&_=1579443369830
然后多看几集多找几个弹幕链接,发现主要改变的就是target_id和timestamp,然后简化链接,最终实验出了只要target_id和timestamp的url。
# 'https://mfm.video.qq.com/danmu?otype=json&timestamp=2385&target_id=4576819404%26vid%3Du0033tu6jy5&count=80'#无数据
# #最后一页1995=133*15
# '2385'
#最后就是
#https://mfm.video.qq.com/danmu?otype=json&timestamp={}&target_id={}%26vid%3D{}&count=400&second_count=5
下面这是我找的一段记录
# #第一集
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087666001&target_id=4576819405%26vid%3Di0033qa01p1&session_key=19564%2C70%2C1579087668&timestamp=135&_=1579087666006'
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087666001&target_id=4576819405%26vid%3Di0033qa01p1&session_key=19564%2C70%2C1579087668&timestamp=165&_=1579087666007'
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087666001&target_id=4576819405%26vid%3Di0033qa01p1&session_key=19564%2C70%2C1579087668&timestamp=195&_=1579087666008'
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087666001&target_id=4576819405%26vid%3Di0033qa01p1&session_key=19564%2C70%2C1579087668&timestamp=225&_=1579087666009'
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087666001&target_id=4576819405%26vid%3Di0033qa01p1&session_key=19564%2C70%2C1579087668&timestamp=255&_=1579087666010'
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087666001&target_id=4576819405%26vid%3Di0033qa01p1&session_key=19572%2C70%2C1579088455&timestamp=2115&_=1579087666018'
#
# #第二集
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087665999&target_id=4576819403%26vid%3Dw0033mb4upm&session_key=17967%2C94%2C1579088496&timestamp=105&_=1579087666034'
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery1910975216511090891_1579087665999&target_id=4576819403%26vid%3Dw0033mb4upm&session_key=17967%2C94%2C1579088496&timestamp=135&_=1579087666035'
# '''callback: jQuery1910975216511090891_1579087665999
# target_id: 4576819403&vid=w0033mb4upm
# session_key: 17967,94,1579088496'''
#
#
# '''Request URL: https://tunnel.video.qq.com/fcgi/danmu_read_count?ddwTargetId=4576819404%26vid%3Du0033tu6jy5&ddwUin=0&dwGetTotal=1&wOnlyTotalCount=0&strSessionKey=&dwGetPubCount=1&raw=1&vappid=29188582&vsecret=37ae5f4003c9a2332e566d8c53bf32b0d4ddfa4ac6717cd1'''
# '''ddwTargetId: 4576819404&vid=u0033tu6jy5
# ddwUin: 0
# dwGetTotal: 1
# wOnlyTotalCount: 0
# strSessionKey:
# dwGetPubCount: 1
# raw: 1
# vappid: 29188582
# vsecret: 37ae5f4003c9a2332e566d8c53bf32b0d4ddfa4ac6717cd1'''
#
#
# '''ddwTargetId: 4576819404
# ddwUin: 0
# dwUpCount: 1
# ddwUpUin: 0
# dwTotalCount: 16924
# stLastComment: {ddwTargetId: 0, ddwUin: 0, dwIsFriend: 0, dwIsOp: 0, dwIsSelf: 0, dwTimePoint: 0, dwUpCount: 0,…}
# ddwTargetId: 0
# ddwUin: 0
# dwIsFriend: 0
# dwIsOp: 0
# dwIsSelf: 0
# dwTimePoint: 0
# dwUpCount: 0
# ddwPostTime: 0
# dwHlwLevel: 0
# dwRichType: 0
# dwDanmuContentType: 0
# dwTimeInterval: 59
# strSessionKey: "59,16924,1579088736"
# dwMaxUpNum: 133
# dwPubCount: 13190'''
#
# 'https://mfm.video.qq.com/danmu?otype=json&timestamp=45&target_id=4576819404%26vid%3Du0033tu6jy5&count=80'
#
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19109959296720329007_1579088733741&target_id=4576819404%26vid%3Du0033tu6jy5&session_key=0%2C0%2C0&timestamp=15&_=1579088733743'
# 'https://mfm.video.qq.com/danmu?otype=json&callback=jQuery19109959296720329007_1579088733741&target_id=4576819404%26vid%3Du0033tu6jy5&_=1579088733743'
#
# 'https://mfm.video.qq.com/danmu?otype=json&timestamp=2385&target_id=4576819404%26vid%3Du0033tu6jy5&count=80'#无数据
# #最后一页1995=133*15
# '2385'
#
#
# 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
# 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
# '''{wRegistType: 2, vecIdList: ["h00336e2bmu"], wSpeSource: 0, bIsGetUserCfg: 1,…}
# wRegistType: 2
# vecIdList: ["h00336e2bmu"]
# 0: "h00336e2bmu"
# wSpeSource: 0
# bIsGetUserCfg: 1
# mapExtData: {h00336e2bmu: {strCid: "mzc00200umueb9v", strLid: ""}}
# h00336e2bmu: {strCid: "mzc00200umueb9v", strLid: ""}
# strCid: "mzc00200umueb9v"
# strLid: ""'''
然后就是要找到每一集对应的target_id和v_id才能得到每一集的弹幕,所以再去找找哪里有那些target_id和v_id。
这里就得到了v_id,然后再去通过v_id,post得到target_id
大概思路就是这样,到处找我们想要的最后得到就行。
之前也看到了别人写的爬取腾讯视频弹幕的代码,不过链接忘了,我在这个基础上改成了我需要的,上爬虫代码:
import requests
import json
import pandas as pd
import time
import random
# Parse the episode-list response: collect the vid suffix, title, view count
# and episode number that are needed later to build the danmu URLs.
def parse_base_info(url, headers):
    """Fetch an episode listing and return one row per numbered episode.

    Args:
        url: Address of the episode-list endpoint. The response may be plain
            JSON or JSON wrapped in a JSONP callback.
        headers: HTTP headers forwarded to ``requests.get``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``title``, ``播放量``
        and ``第几集``, one row per entry of ``results`` whose episode number
        is not 0. Rows are labelled 0..n-1. When no entry qualifies the frame
        is completely empty (no columns).

    Raises:
        json.JSONDecodeError: If no JSON object can be extracted from the body.
        KeyError: If an entry lacks one of the expected fields.
    """
    response = requests.get(url, headers=headers)
    text = response.text
    # Keep everything from the first '{' to the last '}' so that both plain
    # JSON and any JSONP wrapper ("cb({...})", "cb({...});", trailing newline)
    # are accepted, instead of assuming exactly one trailing character.
    payload = json.loads(text[text.find('{'):text.rfind('}') + 1])

    rows = []
    for item in payload['results']:
        fields = item['fields']
        row = {
            'id': item['id'],
            'title': fields['title'],
            '播放量': fields['view_all_count'],
            '第几集': int(fields['episode']),
        }
        # Episode number 0 marks entries that are not regular episodes.
        if row['第几集'] == 0:
            continue
        rows.append(row)

    # Build the frame once; concatenating inside the loop copies all previous
    # rows on every iteration.
    return pd.DataFrame(rows)
# Given an episode's vid suffix, look up its target_id and return both.
def get_episode_danmu(v_id, headers):
    """Register a vid with the danmu service and extract its target_id.

    Args:
        v_id: The vid suffix identifying one episode.
        headers: HTTP headers forwarded to ``requests.post``.

    Returns:
        A two-element list ``[v_id, target_id]`` where ``target_id`` is the
        substring cut out of the ``strDanMuKey`` field of the response.
    """
    regist_url = 'https://access.video.qq.com/danmu_manage/regist?vappid=97767206&vsecret=c0bdcbae120669fff425d0ef853674614aa659c605a613a4&raw=1'
    ext_data = {v_id: {"strCid": "mzc00200umueb9v", "strLid": ""}}
    request_body = {
        "wRegistType": 2,
        "vecIdList": [v_id],
        "wSpeSource": 0,
        "bIsGetUserCfg": 1,
        "mapExtData": ext_data,
    }
    response = requests.post(regist_url, data=json.dumps(request_body), headers=headers)
    reply = json.loads(response.text)
    key = reply['data']['stMap'][v_id]['strDanMuKey']

    # Take the text between "targetid" plus one separator character and the
    # character just before "vid".
    # NOTE(review): presumably the key looks like "...targetid=<id>&vid=..." - confirm.
    start = key.find('targetid') + 9
    stop = key.find('vid') - 1
    return [v_id, key[start:stop]]
# Parse one danmu page. target_id and v_id are accepted for call-site symmetry,
# period (the episode number) is attached to every row for later matching.
def parse_danmu(url, target_id, v_id, headers, period):
    """Download one danmu page and return its comments as a DataFrame.

    Args:
        url: Fully formatted danmu page URL.
        target_id: Not used by this function; kept for interface compatibility.
        v_id: Not used by this function; kept for interface compatibility.
        headers: HTTP headers forwarded to ``requests.get``.
        period: Episode number stored in the ``集数`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with the columns ``用户名``, ``内容``,
        ``会员等级``, ``弹幕时间点``, ``弹幕点赞``, ``弹幕id`` and ``集数``,
        rows labelled 0..n-1. If the payload has no usable ``comments`` the
        frame is completely empty; if a comment is malformed, the comments
        parsed before it are still returned.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    response = requests.get(url, headers=headers)
    # strict=False tolerates raw control characters inside comment strings.
    payload = json.loads(response.text, strict=False)

    rows = []
    try:
        for comment in payload['comments']:
            rows.append({
                '用户名': comment['opername'],
                '内容': comment['content'],
                '会员等级': comment['uservip_degree'],
                '弹幕时间点': comment['timepoint'],
                '弹幕点赞': comment['upcount'],
                '弹幕id': comment['commentid'],
                '集数': period,
            })
    except (KeyError, TypeError, IndexError):
        # Best effort: a missing field or an unexpected payload shape ends
        # parsing of this page, but whatever was collected is kept. Only
        # data-shape errors are tolerated so that KeyboardInterrupt and
        # genuine bugs are no longer hidden.
        pass

    # One construction instead of a concat per comment.
    return pd.DataFrame(rows)
# Build the list of danmu page URLs for one episode; paging is done by
# stepping the timestamp query parameter.
def format_url(target_id, v_id, page=85):
    """Return the danmu page URLs of one episode.

    Args:
        target_id: The episode's danmu target id.
        v_id: The episode's vid suffix.
        page: Number of pages to generate. Page k (0-based) uses
            ``timestamp = 15 + 30 * k``.

    Returns:
        A list of ``page`` URL strings; empty when ``page`` is 0 or negative.
    """
    # The separator before "timestamp" must be a literal '&'. The previous
    # template contained the mangled text "×tamp", which produced URLs
    # without any timestamp parameter.
    base_url = ('https://mfm.video.qq.com/danmu?otype=json&timestamp={}'
                '&target_id={}%26vid%3D{}&count=400&second_count=5')
    return [
        base_url.format(timestamp, target_id, v_id)
        for timestamp in range(15, page * 30 + 15, 30)
    ]
def get_all_ids(part1_url,part2_url, headers):
part_1 = parse_base_info(part1_url, headers)
part_2 = parse_base_info(part2_url, headers)
df = pd.concat([part_1, part_2])
df.sort_values('第几集', ascending=True, inplace=True)
count = 1
# 创建一个列表存储target_id
info_lst = []
for i in df['id']:
info = get_episode_danmu(i, headers)
info_lst.append(info)
print('正在努力爬取第 %d 集的target_id' % count)
count += 1
time.sleep(2 + random.random())
print('是不是发现多了一集?别担心,会去重的'