import json
import os
import requests
import bs4
from lxml import etree
# Request headers sent with page/metadata requests so the crawler presents
# itself as a regular desktop browser.
_user_agent = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
)
header = {'User-Agent': _user_agent}
# Walk the pages of an album and download the tracks listed on each page.
def get_album(url):
    """Download every page of the album located at *url*.

    The album's first page is fetched and its pagination links (anchors
    whose class starts with ``pagingBar_page``) are collected. Each page is
    then handed to ``get_url``, which downloads the tracks on that page.

    Args:
        url: Address of the album's first page.
    """
    # A timeout keeps a stalled connection from hanging the crawler forever.
    res = requests.get(url, headers=header, timeout=30)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # "pagingBar_page" is the class prefix of the pagination links, as seen
    # in the browser's developer tools.
    links = soup.select('a[class^="pagingBar_page"]')
    # The "next page" link is navigation only, not a page of its own.
    pages = [link for link in links if link.text != "下一页"]

    if not pages:
        # No pagination links at all (presumably a single-page album): the
        # given URL is the only page, so download it instead of silently
        # doing nothing and reporting "-1" pages.
        print("本频道共有{}个页面".format(1))
        print("正在下载第{}/{}个页面".format(1, 1))
        get_url(url)
        return

    total = len(pages)
    print("本频道共有{}个页面".format(total))
    for link in pages:
        print("正在下载第{}/{}个页面".format(link.text, total))
        if link.text == "1":
            # Page 1 is the page we were given; its link need not be followed.
            page_url = url
        else:
            page_url = "http://www.ximalaya.com" + link.attrs["href"]
        get_url(page_url)
def get_url(url):
    """Download every track listed on a single album page.

    Track entries are the ``<li>`` elements carrying a ``sound_id``
    attribute. For each one, the track's JSON metadata is fetched and its
    ``play_path`` is passed to ``get_m4a`` for download. A failure on one
    track is reported and the loop moves on to the next track.

    Args:
        url: Address of the album page to scan.
    """
    # A timeout keeps a stalled connection from hanging the crawler forever.
    res = requests.get(url, headers=header, timeout=30)
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    # "sound_id" is the attribute that identifies a track entry, as seen in
    # the browser's developer tools.
    tracks = soup.select('li[sound_id]')
    total = len(tracks)
    for index, track in enumerate(tracks, start=1):
        sound_id = track.attrs["sound_id"]
        # Stays None if the failure happens before the title is known, so
        # the failure message below can never raise a second error.
        title = None
        try:
            murl = 'http://www.ximalaya.com/tracks/{}.json'.format(sound_id)
            html = requests.get(murl, headers=header, timeout=30).text
            dic = json.loads(html)
            title = dic["title"]
            print("正在下载第{}/{}文件,文件名{}:{}。".format(index, total,
                                                    sound_id, title))
            get_m4a(dic["play_path"], sound_id)
        except (requests.RequestException, OSError, ValueError,
                KeyError, TypeError) as err:
            # Network errors, bad JSON, missing keys and file-system errors
            # are per-track problems; KeyboardInterrupt/SystemExit are
            # deliberately NOT caught so the user can still abort the run.
            print("下载{}/{}文件,文件名{}:{}时失败。".format(index, total,
                                                 sound_id, title))
            print(repr(err))
def get_m4a(url, id):
    """Download the audio file at *url* into the local output folder.

    The file is stored as ``<folder>/<basename of id>``; the folder is
    created on first use.

    Args:
        url: Direct address of the audio file.
        id: Track identifier, used as the local file name.

    Raises:
        requests.RequestException: On network failure or an HTTP error
            status (so an error page is never saved as audio).
        OSError: If the folder or file cannot be created or written.
    """
    folder = "郭德纲相声"  # custom output folder name
    # Without this, open() fails for every track unless the folder was
    # created by hand beforehand.
    os.makedirs(folder, exist_ok=True)
    target = os.path.join(folder, os.path.basename(id))
    # stream=True lets iter_content pull the body chunk by chunk instead of
    # buffering the whole file in memory first; the context managers make
    # sure the connection and the file are released even on error.
    with requests.get(url, stream=True, timeout=30) as res:
        res.raise_for_status()
        with open(target, 'wb') as file:
            for chunk in res.iter_content(100000):
                file.write(chunk)
if __name__ == '__main__':
    # Script entry point: crawl the album at this address.
    get_album("http://www.ximalaya.com/1000202/album/2667276/")