Python 爬虫 4 mins

从零开始写一个音乐爬虫-3:酷狗音乐

18-02-20 / 1797 Words

本文将详细讲述从零开始制作酷狗音乐爬虫的过程,这个爬虫可以下载酷狗音乐的付费歌曲,使用Python3开发。

随便打开一个歌单,右键检查元素,会发现每首歌对应的<a>标签中含有一个data属性,其中"|"前面的部分就是歌曲的hash值。

点进去播放,会发现:

看来,音乐外链和这个hash哈希值是脱不了关系的了。

事实上,酷狗音乐歌曲数据外链如下:

http://www.kugou.com/yy/index.php?r=play/getdata&hash=<刚才看到的哈希值>

填入刚刚看到的值(URL:https://www.kugou.com/yy/index.php?r=play/getdata&hash=FE2A4AC53981BB3FAF077CCA2461401A),会看到如下结果:

{
	"status":1,
	"err_code":0,
	"data":{
		"hash":"FE2A4AC53981BB3FAF077CCA2461401A",
		"timelength":76000,
		"filesize":1220066,
		"audio_name":"Toby Fox - Snowdin Town",
		"have_album":1,
		"album_name":"UNDERTALE Soundtrack",
		"album_id":"1983482",
		"img":"http:\/\/imge.kugou.com\/stdmusic\/20170320\/20170320173006740587.jpg",
		"have_mv":0,
		"video_id":0,
		"author_name":"Toby Fox",
		"song_name":"Snowdin Town",
		"lyrics":"\ufeff[id:$00000000]\r\n[ar:arkady sevidov]\r\n[ti:June]\r\n[by:]\r\n[hash:4399c9872c7235b60b58ce88dc487897]\r\n[al:]\r\n[sign:]\r\n[qq:]\r\n[total:320317]\r\n[offset:0]\r\n[00:01.58]\u7eaf\u97f3\u4e50\uff0c\u8bf7\u6b23\u8d4f\r\n",
		"author_id":"579236",
		"privilege":8,
		"privilege2":"1000",
		"play_url":"https:\/\/webfs.yun.kugou.com\/202002251957\/d2bff4d2209ca251954d38990445b8d2\/G057\/M08\/1A\/17\/GZQEAFaVAR6AYuQ_ABKd4lSFeUo118.mp3",
		"authors":[
			{
			"author_id":"579236",
			"sizable_avatar":"http:\/\/singerimg.kugou.com\/uploadpic\/softhead\/{size}\/20171223\/20171223024709440.jpg",
			"is_publish":"1",
			"author_name":"Toby Fox",
			"avatar":"http:\/\/singerimg.kugou.com\/uploadpic\/softhead\/400\/20171223\/20171223024709440.jpg"
			}
		],
		"is_free_part":0,
		"bitrate":128,
		"audio_id":"20519359",
		"play_backup_url":"https:\/\/webfs.cloud.kugou.com\/202002251957\/5d639c7a92c6750b759cab7e718ede04\/G057\/M08\/1A\/17\/GZQEAFaVAR6AYuQ_ABKd4lSFeUo118.mp3"
	}
}

我们打开这个play_url(https://webfs.yun.kugou.com/202002251957/d2bff4d2209ca251954d38990445b8d2/G057/M08/1A/17/GZQEAFaVAR6AYuQ_ABKd4lSFeUo118.mp3),就会打开真实的音乐地址。

所以,从酷狗音乐获取真实下载链接的流程如下:

graph TD; 初始化程序,引入模块-->批量获取哈希值; 批量获取哈希值-->打开json文件; 打开json文件-->得到play_url; 得到play_url-->下载音乐;

先尝试性获取hash:

#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
@Author  :   Ray
@Contact :   ray@raycoder.me
@Software:   PyCharm
@File    :   KugouSpider.py
@Time    :   2019/11/4 21:19
'''

import requests, re
from fake_useragent import *
from random import randint

class KugouSpider():
    """First attempt at collecting song URLs from a Kugou playlist page.

    This version sends no cookies at all; the class defines none.
    """

    HEADER = {  # Spoofed request headers, to avoid getting the IP banned
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    def getArtUrl(self, artlist_url):
        """Return a dict mapping each song's ``audio_name`` to its ``play_url``.

        The playlist page at ``artlist_url`` is downloaded, every song hash
        found in the ``data`` attribute of the song links is extracted, and
        the ``play/getdata`` endpoint is queried once per hash.

        Songs whose JSON answer does not contain ``data.audio_name`` and
        ``data.play_url`` are skipped, so the result may be empty.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER)
        songs = {}
        for song_hash in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            # No cookies are passed here: this class has no ``cookies``
            # attribute, so referring to ``self.cookies`` would raise
            # AttributeError before any request is made.
            jsonfile = requests.get(url=jsonUrl.format(song_hash), headers=self.HEADER)
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # The answer carries no song data (e.g. an error payload);
                # skip this hash and keep going.
                continue
        return songs

哎呀!最后返回了一个空字典!{}为什么呢?我观察后发现,酷狗需要添加一个cookie才可以访问到数据。

我们加一下:

    def getUrl(self, artlist_url):
        """Return a dict mapping each song's ``audio_name`` to its ``play_url``.

        Same flow as before, but every ``play/getdata`` request now carries a
        placeholder ``kg_mid`` cookie. Songs whose JSON answer lacks
        ``data.audio_name`` or ``data.play_url`` are skipped.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER)
        songs = {}
        for song_hash in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            jsonfile = requests.get(url=jsonUrl.format(song_hash), headers=self.HEADER, cookies={'kg_mid': '1'})
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Only the missing-field case is tolerated; anything else
                # should surface instead of being silently swallowed.
                continue
        return songs

嗯?还是没有数据!我觉得可能是HEADER中的host的问题。

把它去掉!

class KugouSpider():
    """Collect song URLs from a Kugou playlist page using a placeholder cookie."""

    # Request headers sent with every call.
    HEADER = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    def getUrl(self, artlist_url):
        """Return a dict mapping each song's ``audio_name`` to its ``play_url``.

        The playlist page at ``artlist_url`` is scanned for song hashes and
        the ``play/getdata`` endpoint is queried once per hash with a
        ``kg_mid`` cookie. Songs whose JSON answer lacks ``data.audio_name``
        or ``data.play_url`` are left out of the result.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER)
        songs = {}
        for song_hash in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            jsonfile = requests.get(url=jsonUrl.format(song_hash), headers=self.HEADER, cookies={'kg_mid': '1'})
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Answer without song data: skip it, but do not hide
                # unrelated errors behind a bare ``except``.
                continue
        return songs

这样,我们就获得了真实的数据:

{'Toby Fox - Snowy': 'https://webfs.yun.kugou.com/202002280800/8b338c7c966f73984a16d332fe0a1905/G053/M04/0C/10/dQ0DAFaVAPCAdhQcABl6n5DvBc0140.mp3', 'Toby Fox - Undertale': 'https://webfs.yun.kugou.com/202002280800/91772c1108ab0627888c84f9fb6380f1/G057/M04/17/05/eQ0DAFaiNEWAXZVoAF0rmY34-tU840.mp3', 'Toby Fox - Tem Shop': 'https://webfs.yun.kugou.com/202002280800/02fb916f700f9df912d93daa77c9d420/G055/M0A/06/07/14YBAFaVAdyIb_RdAAsQ0yCL9vEAAAorgG2O-IACxDr193.mp3', 'Toby Fox - sans.': 'https://webfs.yun.kugou.com/202002280800/3580fcc155cd84b346acc9465504b01a/G052/M02/0C/07/1IYBAFaVAOWINLXsAAxhJsPWw2UAAAy4gIAOs0ADGE-807.mp3', 'Toby Fox - Dogsong': 'https://webfs.yun.kugou.com/202002280800/399c52d789e7f659001ab84e2bcfa823/G055/M00/06/11/F5QEAFaVARWIZFBMAAkgfyVHyJ4AAAoqAMZ4XYACSCX641.mp3'}

不对呀!为什么只找到了几首音乐?我们打印一下jsondata

def getUrl(self, artlist_url):
    """Debugging variant: print the raw JSON answer for every song hash.

    Extraction of ``audio_name`` / ``play_url`` is switched off here, so the
    returned dict is always empty; the printed payloads are the point.
    """
    detail_template = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
    hash_pattern = r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">'

    playlist_page = requests.get(url=artlist_url, headers=self.HEADER)
    for song_hash in re.findall(hash_pattern, playlist_page.text):
        reply = requests.get(
            url=detail_template.format(song_hash),
            headers=self.HEADER,
            cookies={'kg_mid': '1'},
        )
        # Show exactly what the server sent back for this hash.
        print(reply.json())

    # Nothing is collected during this debugging pass.
    return {}

Woc原来这个东西返回的是这种数据:

{'status': 0, 'err_code': 30020, 'data': {'SSA-CODE': 'bj_event_ee07e35318276c48c519523c26d9f4ce', 'SSA-HMID': '1'}}

我们随便打开一个hash源链:

也是这种情况。

我们再打开播放地址https://www.kugou.com/song/#hash=FE2A4AC53981BB3FAF077CCA2461401A&album_id=1983482:喔!它出现了一个验证码。这个东西就是罪魁祸首。

验证之后再打开hash源链,数据就出现了。

所以我们可以判断,酷狗音乐需要的cookie不止kg_mid一个。

我们打开浏览器控制台:

这里有恶心的cookie

把它们复制到程序中去。

乃得:

class KugouSpider():
    """Collect song URLs from a Kugou playlist page using browser cookies."""

    HEADER = {  # Spoofed request headers, to avoid getting the IP banned
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    # Cookie set sent with every request. The ``Hm_lvt_...`` key previously
    # carried a pasted tab plus a stray "1582" fragment, which made it an
    # invalid cookie name; the key is now the bare cookie name.
    cookies = {
        'kg_mid': '44c117b70a8abbe992b50ee90d19784f',
        'kg_dfid': '2SoQgm4W6BHT0eH1EP3qAt3X',
        'kg_dfid_collect': 'd41d8cd98f00b204e9800998ecf8427e',
        'KuGooRandom': '66781582627339920',
        'Hm_lvt_aedee6983d4cfc62f509129360d6bb3d': '1582627291,1582630139,1582674521',
        'kg_mid_temp': '44c117b70a8abbe992b50ee90d19784f',
        'Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d': '1582676297',
        'ACK_SERVER_10016': '%7B%22list%22%3A%5B%5B%22gzreg-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10015': '%7B%22list%22%3A%5B%5B%22bjlogin-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10017': '%7B%22list%22%3A%5B%5B%22gzverifycode.service.kugou.com%22%5D%5D%7D',
    }

    def getUrl(self, artlist_url):
        """Return a dict mapping each song's ``audio_name`` to its ``play_url``.

        Both the playlist page request and every ``play/getdata`` request are
        sent with ``self.HEADER`` and ``self.cookies``. Songs whose JSON
        answer lacks ``data.audio_name`` or ``data.play_url`` are skipped.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER, cookies=self.cookies)
        songs = {}
        for song_hash in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            jsonfile = requests.get(url=jsonUrl.format(song_hash), headers=self.HEADER, cookies=self.cookies)
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Answer without song data (e.g. an error payload): skip it
                # rather than hiding every possible error.
                continue
        return songs

这个cookie是可以重复使用的,我们只需要在数据无法提取时去验证一下即可。

下载音乐可以去看我的这篇文章:

从零开始写一个音乐爬虫-2-网易云音乐:下载音乐文件

所以我们完成的程序如下:

#!/usr/bin/env python
#-*- coding:utf8 -*-
'''
@Author  :   Ray
@Contact :   ray@raycoder.me
@Software:   PyCharm
@File    :   KugouSpider.py
@Time    :   2019/11/4 21:19
'''

import requests, re
from fake_useragent import *
from random import randint

class KugouSpider():
    """Collect playable song URLs from a Kugou playlist page."""

    HEADER = {  # Spoofed request headers, to avoid getting the IP banned
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,gl;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://kugou.com/',
        'User-Agent': UserAgent().random
    }

    # Cookie set sent with the per-song requests. The ``Hm_lvt_...`` key
    # previously carried a pasted tab plus a stray "1582" fragment, which
    # made it an invalid cookie name; the key is now the bare cookie name.
    cookies = {
        'kg_mid': 'd7d2db42653ccb5262c6b3a53d3e513e',
        'kg_dfid': '4E1aaE0U1ZlH0aaWYz3Z7N6n',
        'kg_dfid_collect': 'd41d8cd98f00b204e9800998ecf8427e',
        'KuGooRandom': '66781582627339920',
        'Hm_lvt_aedee6983d4cfc62f509129360d6bb3d': '1583409383',
        'kg_mid_temp': 'd7d2db42653ccb5262c6b3a53d3e513e',
        'Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d': '1583409803',
        'ACK_SERVER_10016': '%7B%22list%22%3A%5B%5B%22gzreg-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10015': '%7B%22list%22%3A%5B%5B%22bjlogin-user.kugou.com%22%5D%5D%7D',
        'ACK_SERVER_10017': '%7B%22list%22%3A%5B%5B%22gzverifycode.service.kugou.com%22%5D%5D%7D',
    }

    def getArtUrl(self, artlist_url):
        """Return a dict mapping each song's ``audio_name`` to its ``play_url``.

        The playlist page at ``artlist_url`` is fetched with ``self.HEADER``,
        every song hash in the page is extracted, and the ``play/getdata``
        endpoint is queried once per hash with ``self.HEADER`` and
        ``self.cookies``. Songs whose JSON answer lacks ``data.audio_name``
        or ``data.play_url`` are skipped.
        """
        jsonUrl = 'http://www.kugou.com/yy/index.php?r=play/getdata&hash={0:s}'
        response = requests.get(url=artlist_url, headers=self.HEADER)
        songs = {}
        for song_hash in re.findall(r'<a title=".+" hidefocus="true" href="javascript:;" data="(\w+)\|\d+">', response.text):
            jsonfile = requests.get(url=jsonUrl.format(song_hash), headers=self.HEADER, cookies=self.cookies)
            jsondata = jsonfile.json()
            try:
                songs[jsondata['data']['audio_name']] = jsondata['data']['play_url']
            except (KeyError, TypeError):
                # Answer without song data (e.g. an error payload): skip it
                # rather than hiding every possible error.
                continue
        return songs

    # Alias so callers using the shorter name ``getUrl`` keep working.
    getUrl = getArtUrl

测试:

from KugouSpider import KugouSpider
from download import downloadMusic

def _test():
    """Fetch the song URLs of one sample playlist and hand them to the downloader."""
    playlist_url = 'https://www.kugou.com/yy/special/single/1094256.html'
    crawler = KugouSpider()
    found_songs = crawler.getUrl(playlist_url)
    downloadMusic(found_songs)


if __name__ == '__main__':
    _test()

这个也要写为一个单独的文件:test.py。 P.S.我发现貌似随机Cookie有效:

    # Random integer between 1 and 10**large (both ends inclusive).
    randten = lambda large: randint(1, 10**large)
    # Cookie set filled with random numeric strings instead of values copied
    # from a browser. The ``Hm_lvt_...`` key previously carried a pasted tab
    # plus a stray "1582" fragment, which made it an invalid cookie name; the
    # key is now the bare cookie name.
    cookies = {
        'kg_mid': str(randten(30)),
        'kg_dfid': str(randten(30)),
        'kg_dfid_collect': str(randten(30)),
        'KuGooRandom': str(randten(12)),
        'Hm_lvt_aedee6983d4cfc62f509129360d6bb3d': str(randten(10)),
        'kg_mid_temp': str(randten(30)),
        'Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d': str(randten(30)),
        'ACK_SERVER_10016': str(randten(30)),
        'ACK_SERVER_10015': str(randten(30)),
        'ACK_SERVER_10017': str(randten(30)),
    }

可以试试。