其他
Python3 网络爬虫:视频下载,那些事儿!
The following article is from Jack Cui Author Jack Cui
1
2
you-get https://www.bilibili.com/video/BV1NZ4y1j7nw
3
因为我们是搜索嘛,得告诉服务器,咱搜索啥,给服务器传递数据,就是通过 POST。
import requests
from bs4 import BeautifulSoup

# Search the video index site for a keyword and print each result's
# title and detail-page URL.
search_keyword = '越狱第一季'
search_url = 'http://www.jisudhw.com/index.php'
# Query-string parameter selecting the site's search module.
search_params = {
    'm': 'vod-search'
}
search_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'Referer': 'http://www.jisudhw.com/',
    'Origin': 'http://www.jisudhw.com',
    'Host': 'www.jisudhw.com'
}
# The search form POSTs the keyword in the request body.
search_datas = {
    'wd': search_keyword,
    'submit': 'search'
}
# timeout keeps the script from hanging forever on an unresponsive server.
r = requests.post(url=search_url, params=search_params,
                  headers=search_headers, data=search_datas, timeout=30)
r.encoding = 'utf-8'
server = 'http://www.jisudhw.com'
search_html = BeautifulSoup(r.text, 'lxml')
search_spans = search_html.find_all('span', class_='xing_vb4')
for span in search_spans:
    # A result span without an <a> child would crash `span.a.get(...)`.
    if span.a is None:
        continue
    url = server + span.a.get('href')
    name = span.a.string
    print(name)
    print(url)
import requests
from bs4 import BeautifulSoup

# Fetch one show's detail page and collect the m3u8 stream URL of every
# episode from the page's <input value="..."> elements.
detail_url = 'http://www.jisudhw.com/?m=vod-detail-id-15409.html'
# timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url=detail_url, timeout=30)
r.encoding = 'utf-8'
detail_bf = BeautifulSoup(r.text, 'lxml')
num = 1          # next episode number to assign
search_res = {}  # m3u8 URL -> episode number (dedup on URL)
for each_input in detail_bf.find_all('input'):
    value = each_input.get('value')
    # get() returns None for inputs with no value attribute; the original
    # `'m3u8' in None` would raise TypeError on such inputs.
    if value and 'm3u8' in value:
        if value not in search_res:
            search_res[value] = num
            print('第%03d集:' % num)
            print(value)
            num += 1
pip install ffmpy3
ffmpeg -i "http://youku.com-youku.net/20180614/11920_4c9e1cc1/index.m3u8" "第001集.mp4"
import ffmpy3
# Hand the m3u8 playlist to FFmpeg, which fetches every segment and
# remuxes them into a single local MP4 file.
ffmpy3.FFmpeg(inputs={'http://youku.com-youku.net/20180614/11920_4c9e1cc1/index.m3u8': None}, outputs={'第001集.mp4':None}).run()
4
代码整合在一起,开始下载电视剧。
import os
import ffmpy3
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

# Full pipeline: search for the show, walk each result's detail page,
# collect the m3u8 URL of every episode, then download them all with
# FFmpeg on a small thread pool.
search_keyword = '越狱第一季'
search_url = 'http://www.jisudhw.com/index.php'
# Query-string parameter selecting the site's search module.
search_params = {
    'm': 'vod-search'
}
search_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'Referer': 'http://www.jisudhw.com/',
    'Origin': 'http://www.jisudhw.com',
    'Host': 'www.jisudhw.com'
}
# The search form POSTs the keyword in the request body.
search_datas = {
    'wd': search_keyword,
    'submit': 'search'
}
video_dir = ''  # directory (named after the show) the episodes go into
# timeout keeps the script from hanging forever on an unresponsive server.
r = requests.post(url=search_url, params=search_params,
                  headers=search_headers, data=search_datas, timeout=30)
r.encoding = 'utf-8'
server = 'http://www.jisudhw.com'
search_html = BeautifulSoup(r.text, 'lxml')
search_spans = search_html.find_all('span', class_='xing_vb4')
for span in search_spans:
    # A result span without an <a> child would crash `span.a.get(...)`.
    if span.a is None:
        continue
    url = server + span.a.get('href')
    name = span.a.string
    print(name)
    print(url)
    video_dir = name
    # exist_ok avoids the check-then-mkdir race of the original
    # `if name not in os.listdir('./')` test.
    os.makedirs(name, exist_ok=True)
    detail_url = url
    r = requests.get(url=detail_url, timeout=30)
    r.encoding = 'utf-8'
    detail_bf = BeautifulSoup(r.text, 'lxml')
    num = 1          # episode counter, restarted for each search result
    search_res = {}  # m3u8 URL -> episode number for this result
    for each_input in detail_bf.find_all('input'):
        value = each_input.get('value')
        # get() returns None for inputs with no value attribute; the
        # original `'m3u8' in None` would raise TypeError on such inputs.
        if value and 'm3u8' in value:
            if value not in search_res:
                search_res[value] = num
                print('第%03d集:' % num)
                print(value)
                num += 1

def downVideo(url):
    """Download one m3u8 stream into <video_dir>/第NNN集.mp4 via FFmpeg."""
    num = search_res[url]
    name = os.path.join(video_dir, '第%03d集.mp4' % num)
    ffmpy3.FFmpeg(inputs={url: None}, outputs={name: None}).run()

# 8 worker threads: the work is I/O-bound (FFmpeg subprocesses), so
# threads are sufficient — no need for real processes.
pool = ThreadPool(8)
pool.map(downVideo, search_res.keys())
pool.close()
pool.join()
由于视频太大了,要是一个一个地下载,有些慢,适当开个线程,8 线程下载。
来吧,一起看「越狱」吧!
5
本文仅是探讨一些关于视频下载的技术,切勿滥用。
支持正版,从我做起。
后台回复「进群」,加入读者交流群~
点击红字「积分」,可了解积分规则~