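"""
Asynchronous video crawler (complete version).

Again targeting 91kanju, this time using the "云播" (cloud-play) source, which is somewhat more involved to scrape.
Approach:
1. Fetch the main page's source and search it for the m3u8 using the "video" keyword; nothing is found. Inspecting the elements shows the player lives in an iframe.
2. Get the m3u8 file's address from the iframe's page source. That file turns out to be only a redirect to another m3u8 file, and that second file is the real m3u8.
3. Download the first-level m3u8 file -> download the second-level m3u8 file (the one that lists the video segments).
4. Download the video segments.
5. Download the key and decrypt the segments.
6. Merge all the ts files into a single mp4 file.
"""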
import re
import requests
from bs4 import BeautifulSoup
import asyncio
import aiohttp
import aiofiles
from Crypto.Cipher import AES
import os
def get_iframe_src(ur):
    """Return the ``src`` attribute of the first ``<iframe>`` on a page.

    The page at *ur* is fetched with ``requests`` and parsed with
    BeautifulSoup using the ``lxml`` parser. The value found is also
    printed before it is returned.

    If the page contains no ``<iframe>``, ``find`` returns ``None`` and the
    subscript raises ``TypeError``.
    """
    html = requests.get(ur).text
    soup = BeautifulSoup(html, "lxml")
    iframe = soup.find("iframe")
    iframe_src = iframe["src"]
    print(iframe_src)
    return iframe_src
def get_first_m3u8_url(url):
    """Fetch *url* and return the value assigned to ``var main`` in its source.

    The response text is searched for ``var main = "..."`` and the quoted
    value is returned exactly as it appears in the page.

    If the pattern is absent the search yields ``None`` and calling
    ``.group`` on it raises ``AttributeError``.
    """
    page_source = requests.get(url).text
    # re.S lets "." span newlines, matching the original compiled pattern.
    found = re.search(r'var main = "(?P<m3u8_url>.*?)"', page_source, re.S)
    return found.group("m3u8_url")
def download_m3u8_file(url, name):
    """Download *url* and store the raw response bytes in the file *name*.

    An existing file called *name* is overwritten. A completion message is
    printed once the bytes have been written.
    """
    payload = requests.get(url).content
    with open(name, mode="wb") as out_file:
        out_file.write(payload)
    print("m3u8下载完毕")
async def download_ts(url, name, session):
    """Download one segment through *session* and save it as ``video2/<name>``.

    *session* is used via ``session.get(url)`` as an async context manager;
    the full response body is read and written to disk in binary mode, then
    a completion message is printed.
    """
    target_path = f"video2/{name}"
    async with session.get(url) as response:
        # The output file is opened before the body is read, as in the
        # original, so a failed read still leaves the (empty) file behind.
        async with aiofiles.open(target_path, mode="wb") as out_file:
            body = await response.content.read()
            await out_file.write(body)
    print(f"{name}下载完毕")
async def aio_download(up_url):
    """Concurrently download every segment listed in the second-level playlist.

    Each non-blank, non-comment line of ``越狱第一季第一集_second_m3u8.txt`` is
    treated as a segment name: it is appended to *up_url* to form the
    download URL and is also used as the file name handed to
    ``download_ts``. All downloads share a single ``aiohttp.ClientSession``.

    Args:
        up_url: URL prefix that each playlist entry is appended to.

    Raises:
        Whatever the first failing download raises; ``asyncio.gather``
        propagates it instead of leaving it unretrieved on the task.
    """
    # download_ts writes into video2/, so make sure the directory exists.
    os.makedirs("video2", exist_ok=True)
    tasks = []
    # ClientSession has to be instantiated: the bare class is not an async
    # context manager, so "async with aiohttp.ClientSession as ..." fails.
    async with aiohttp.ClientSession() as session:
        async with aiofiles.open("越狱第一季第一集_second_m3u8.txt", mode="r", encoding="utf-8") as f:
            async for line in f:
                line = line.strip()
                # Skip playlist tags/comments and blank lines; a blank line
                # would otherwise become a request for up_url itself.
                if not line or line.startswith("#"):
                    continue
                ts_url = up_url + line
                tasks.append(asyncio.create_task(download_ts(ts_url, line, session)))
        # Wait while the session is still open. The guard avoids waiting on
        # an empty collection when the playlist lists no segments.
        if tasks:
            await asyncio.gather(*tasks)
def get_key(url):
    """Fetch *url* and return the response body decoded as text (``str``)."""
    return requests.get(url).text
async def dec_ts(name, key):
    """Decrypt ``video2/<name>`` with AES-CBC into ``video2/temp_<name>``.

    Args:
        name: Segment file name inside ``video2/``.
        key: AES key as ``bytes`` or ``str``. A ``str`` is encoded as UTF-8
            first, because the cipher constructor only accepts bytes-like
            keys.

    The whole segment is read into memory, decrypted in a single call and
    written out as-is (no padding is stripped). A completion message is
    printed afterwards.
    """
    # get_key() returns text; AES.new() rejects str keys, so normalise here.
    if isinstance(key, str):
        key = key.encode("utf-8")
    # NOTE(review): the IV is sixteen ASCII "0" characters, not sixteen zero
    # bytes -- confirm this matches what the playlist's key tag specifies.
    aes = AES.new(key=key, IV=b"0000000000000000", mode=AES.MODE_CBC)
    async with aiofiles.open(f"video2/{name}", mode="rb") as f1, \
            aiofiles.open(f"video2/temp_{name}", mode="wb") as f2:
        bs = await f1.read()
        await f2.write(aes.decrypt(bs))
    print(f"{name}处理完毕")
async def aio_dec(key):
    """Decrypt every segment listed in the second-level playlist concurrently.

    Each non-blank, non-comment line of ``越狱第一季第一集_second_m3u8.txt`` is
    passed to ``dec_ts`` as a segment name together with *key*.

    Args:
        key: AES key forwarded unchanged to ``dec_ts``.

    Raises:
        Whatever the first failing ``dec_ts`` call raises; ``asyncio.gather``
        propagates it instead of leaving it unretrieved on the task.
    """
    tasks = []
    async with aiofiles.open("越狱第一季第一集_second_m3u8.txt", mode="r", encoding="utf-8") as f:
        async for line in f:
            line = line.strip()
            # Skip playlist tags/comments and blank lines; a blank line would
            # otherwise be treated as a segment with an empty name.
            if not line or line.startswith("#"):
                continue
            tasks.append(asyncio.create_task(dec_ts(line, key)))
    # The guard avoids waiting on an empty collection when the playlist
    # lists no segments.
    if tasks:
        await asyncio.gather(*tasks)
def merge_ts():
    """Concatenate the decrypted segments, in playlist order, into ``xx.mp4``.

    Segment names are read from ``越狱第一季第一集_second_m3u8.txt`` (blank lines
    and lines starting with ``#`` are skipped); for each name the file
    ``video2/temp_<name>`` is appended to ``xx.mp4`` in the current
    directory. If the playlist lists no segments, nothing is written.

    The merge is done with plain binary file I/O instead of a shell
    ``copy /b`` command: playlist text comes from the network and must not
    be interpolated into a shell command, and this form has no
    command-length limit and is not tied to one operating system.

    Raises:
        FileNotFoundError: if the playlist or a listed segment is missing.
    """
    segment_paths = []
    with open("越狱第一季第一集_second_m3u8.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            segment_paths.append(f"video2/temp_{line}")
    if not segment_paths:
        return
    with open("xx.mp4", mode="wb") as merged:
        for path in segment_paths:
            with open(path, mode="rb") as segment:
                merged.write(segment.read())
def main(url):
    """Run the whole pipeline for the play page at *url*.

    Steps, in order:
      1. Find the iframe URL on the play page.
      2. Read the first-level m3u8 path from the iframe page and prefix it
         with the part of the iframe URL that precedes ``/share``.
      3. Download the first-level m3u8, then download every entry it lists
         as the second-level m3u8 (each download overwrites the same file,
         so the last entry wins).
      4. Download all segments, fetch the key, decrypt, and merge.

    Raises:
        ValueError: if the first-level m3u8 lists no playlist entry.
    """
    iframe_src = get_iframe_src(url)
    first_m3u8_url = get_first_m3u8_url(iframe_src)
    iframe_domain = iframe_src.split("/share")[0]
    first_m3u8_url = iframe_domain + first_m3u8_url
    download_m3u8_file(first_m3u8_url, "越狱第一季第一集_first_m3u8.txt")

    second_m3u8_url = None
    with open("越狱第一季第一集_first_m3u8.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            # str.strip() returns a new string; the result must be kept or
            # the trailing newline ends up inside every URL built from it.
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            second_m3u8_url = first_m3u8_url.split("index.m3u8")[0] + line
            download_m3u8_file(second_m3u8_url, "越狱第一季第一集_second_m3u8.txt")
    if second_m3u8_url is None:
        raise ValueError("first-level m3u8 contains no playlist entry")

    # Prefix that the segment names in the second-level playlist are
    # appended to.
    second_m3u8_url_up = second_m3u8_url.replace("index.m3u8", "")
    asyncio.run(aio_download(second_m3u8_url_up))
    # NOTE(review): assumes the key is served as "key.key" next to the
    # playlist rather than reading its location from the playlist -- confirm.
    key_url = second_m3u8_url_up + "key.key"
    key = get_key(key_url)
    asyncio.run(aio_dec(key))
    merge_ts()
if __name__ == "__main__":
    # Script entry point: run the pipeline for one hard-coded play page.
    url = "https://www.91kanju.com/vod-play/541-2-1.html"
    main(url)