Python 爬取QQ音樂個人單曲排行榜

分析網頁成分，歌曲信息都存在圖中所示xhr中

詳細代碼實現如下：

import os
import threading
import jieba
import numpy
import html
import requests
import openpyxl
from wordcloud import WordCloud
import PIL.Image as Image


# Semaphore that caps the number of concurrently running download threads at 10
thread_lock = threading.BoundedSemaphore(10)

# Base address that a song's `mid` is appended to when building its play link
domain_url = 'https://y.qq.com/n/yqq/song/'

# Request headers sent with every HTTP call in this script
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36 Edg/80.0.361.109',
    'Referer': 'https://y.qq.com/n/yqq/song/0039MnYb0qxYhV.html',
}

# Create the workbook, take its active worksheet and rename it
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'song'

# Header row: song name / album / play link go into A1, B1 and C1
sheet['A1'], sheet['B1'], sheet['C1'] = '歌曲名', '所屬專輯', '播放鏈接'


# Output folders for comments, lyrics, hot comments and word clouds.
# The constants are defined unconditionally: previously they were only
# assigned when the folder had to be created, so every run after the first
# failed with NameError. Raw strings keep the backslashes literal without
# relying on invalid escape sequences.
COMMENT_PATH = r'H:\PythonWorks\QQmusic\評論'
LYRIC_PATH = r'H:\PythonWorks\QQmusic\歌詞'
HOT_COMMENT_PATH = r'H:\PythonWorks\QQmusic\熱評'
CIYUN_PATH = r'H:\PythonWorks\QQmusic\詞雲'

# Create any folder that does not exist yet; existing ones are left alone
for _output_dir in (COMMENT_PATH, LYRIC_PATH, HOT_COMMENT_PATH, CIYUN_PATH):
    os.makedirs(_output_dir, exist_ok=True)


def cut(text):
    """Segment ``text`` with jieba and return the tokens joined by single spaces."""
    return " ".join(jieba.cut(text))


# Generate a word cloud from a saved hot-comment file
def ciyun(path_name, music_name):
    """Render the text file at ``path_name`` as a word-cloud PNG.

    The file content is segmented with ``cut`` and drawn using the mask image
    ``kkx.png`` (looked up relative to the current working directory). The
    result is written to ``CIYUN_PATH/<music_name>.png``.

    Args:
        path_name: Path of the UTF-8 text file to visualise.
        music_name: Song name, used as the output file name (without ".png").
    """
    # Only keep the file open for as long as it takes to read it
    with open(path_name, encoding='utf-8') as f:
        text = cut(f.read())

    # Copy the mask pixels into an array, then release the image file handle
    with Image.open('kkx.png') as mask_image:
        mask_pic = numpy.array(mask_image)

    wordcloud = WordCloud(
        # Raw string: the backslashes are part of the Windows path
        font_path=r'C:\Windows\Fonts\simfang.ttf',
        collocations=False,
        max_words=100,
        min_font_size=10,
        max_font_size=500,
        mask=mask_pic
    ).generate(text)

    wordcloud.to_file(CIYUN_PATH + '/' + music_name + '.png')


# Download the lyric of one song
def download_lyric(singer, music_name, music_id):
    """Fetch the lyric for ``music_id`` and append it to a text file.

    The lyric is read from the ``lyric`` field of the JSON response,
    HTML-unescaped, and appended to
    ``LYRIC_PATH/<singer>-<music_name>.txt`` (UTF-8).

    Args:
        singer: Singer name, used in the output file name.
        music_name: Song name, used in the output file name.
        music_id: Song id sent as the ``musicid`` query parameter.
    """
    lrc_url = 'https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg'
    params = {
        'nobase64': '1',
        'musicid': music_id,  # id obtained from the search result
        '-': 'jsonp1',
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'utf-8',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0'
    }
    response = requests.get(lrc_url, params=params, headers=HEADERS)
    lyric_text = html.unescape(response.json()['lyric'])

    # `with` guarantees the file is closed even if the write fails
    lyric_path = LYRIC_PATH + '/' + singer + '-' + music_name + '.txt'
    with open(lyric_path, 'a', encoding='utf-8') as f:
        f.write(lyric_text)
    print("歌詞下載完成...")


# Scrape the regular comments of one song
def parse_comments(singer, music_name, music_id):
    """Fetch 20 pages of comments for ``music_id`` and append them to a file.

    Each comment is written as ``<running number>.<text>`` followed by a
    separator line, to ``COMMENT_PATH/<singer>-<music_name>_評論.txt`` (UTF-8).
    Numbering continues across pages.

    Args:
        singer: Singer name, used in the output file name.
        music_name: Song name, used in the output file name.
        music_id: Song id sent as the ``topid`` query parameter.
    """
    comment_url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg'
    # Only `pagenum` varies between requests, so the dict is built once
    params = {
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'GB2312',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0',
        'cid': '205360772',
        'reqtype': '2',
        'biztype': '1',
        'topid': music_id,
        'cmd': '8',
        'needmusiccrit': '0',
        'pagenum': 0,
        'pagesize': '25',
        'lasthotcommentid': '',
        'domain': 'qq.com',
        'ct': '24',
        'cv': '10101010'
    }
    comment_path = COMMENT_PATH + '/' + singer + '-' + music_name + '_評論.txt'
    n = 1
    for page in range(20):
        params['pagenum'] = page
        response = requests.get(comment_url, params=params, headers=HEADERS)
        # A null/absent list (e.g. a page past the last comment) is treated
        # as "no comments" instead of crashing the loop below
        comments = response.json()['comment']['commentlist'] or []

        # `with` closes the file even if a write fails part-way through
        with open(comment_path, 'a', encoding='utf-8') as f:
            for item in comments:
                content = item.get('rootcommentcontent')
                if content is None:
                    # Defensive: skip entries without root text rather than
                    # aborting the whole download with KeyError
                    continue
                f.write(str(n) + '.' + content + '\n------------------------------\n')
                n += 1
    print("評論下載完成...")


# Scrape the hot comments of one song
def parse_hot_comments(singer, music_name, music_id):
    """Fetch the hot comments for ``music_id``, save them and draw a word cloud.

    Comments are read from ``hot_comment.commentlist`` of the first result
    page and appended, numbered from 1, to
    ``HOT_COMMENT_PATH/<singer>-<music_name>_熱評.txt`` (UTF-8). When at least
    one comment was written, ``ciyun`` is called on that file.

    Args:
        singer: Singer name, used in the output file name.
        music_name: Song name, used in the output and word-cloud file names.
        music_id: Song id sent as the ``topid`` query parameter.
    """
    comment_url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg'
    params = {
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'GB2312',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0',
        'cid': '205360772',
        'reqtype': '2',
        'biztype': '1',
        'topid': music_id,
        'cmd': '8',
        'needmusiccrit': '0',
        'pagenum': 0,
        'pagesize': '25',
        'lasthotcommentid': '',
        'domain': 'qq.com',
        'ct': '24',
        'cv': '10101010'
    }
    response = requests.get(comment_url, params=params, headers=HEADERS)
    # A null/absent list is treated as "no hot comments"
    comments = response.json()['hot_comment']['commentlist'] or []

    path = HOT_COMMENT_PATH + '/' + singer + '-' + music_name + '_熱評.txt'
    written = 0
    # `with` closes the file even if a write fails part-way through
    with open(path, 'a', encoding='utf-8') as f:
        for item in comments:
            content = item.get('rootcommentcontent')
            if content is None:
                # Defensive: skip entries without root text instead of
                # raising KeyError
                continue
            written += 1
            f.write(str(written) + '.' + content + '\n------------------------------\n')
    print("熱評下載完成...")

    # Only build the word cloud when this call actually saved some text
    if written:
        ciyun(path, music_name)


def download(singer, music_name, music_id, album, music_url, sheet):
    """Thread target: record one song in ``sheet`` and download its data.

    Appends ``[music_name, album, music_url]`` as a new row, then downloads
    the lyric, the comments and the hot comments. The caller is expected to
    have acquired ``thread_lock`` before starting the thread; it is released
    here when the work ends.

    Args:
        singer: Singer name, forwarded to the download helpers.
        music_name: Song name.
        music_id: Song id used by the lyric/comment requests.
        album: Album name written to the sheet.
        music_url: Play link written to the sheet.
        sheet: Worksheet that receives the row.
    """
    try:
        # Write name, album and link as one row of the Excel sheet
        sheet.append([music_name, album, music_url])

        download_lyric(singer, music_name, music_id)
        parse_comments(singer, music_name, music_id)
        parse_hot_comments(singer, music_name, music_id)
    finally:
        # Always free the slot, even when a step raised; otherwise failed
        # downloads would use up the semaphore and block the producer forever
        thread_lock.release()


def parse_list(singer, page_number):
    """Search songs for ``singer`` and download each result in a thread.

    Requests ``page_number`` result pages (10 songs per page), starts one
    ``download`` thread per song (throttled by ``thread_lock``), waits for all
    of them, saves the workbook and asks the user whether to continue.

    Args:
        singer: Search keyword, sent as the ``w`` query parameter.
        page_number: Number of result pages to fetch.

    Returns:
        The text the user typed at the "continue? (y/n)" prompt.
    """
    # Search endpoint
    search_url = 'https://c.y.qq.com/soso/fcgi-bin/client_search_cp'

    workers = []
    for x in range(page_number):
        params = {
            'ct': '24',
            'qqmusic_ver': '1298',
            'new_json': '1',
            'remoteplace': 'txt.yqq.song',
            'searchid': '56438082219898629',
            't': '0',
            'aggr': '1',
            'cr': '1',
            'catZhida': '1',
            'lossless': '0',
            'flag_qc': '0',
            'p': str(x + 1),
            'n': '10',
            'w': singer,
            'g_tk_new_20200303': '5381',
            'g_tk': '5381',
            'loginUin': '0',
            'hostUin': '0',
            'format': 'json',
            'inCharset': 'utf8',
            # NOTE(review): the embedded spaces look like a typo for 'utf-8';
            # left unchanged because it is what the script has always sent
            'outCharset': 'utf - 8',
            'notice': '0',
            'platform': 'yqq.json',
            'needNewCode': '0'
        }

        response = requests.get(search_url, params=params, headers=HEADERS)
        # Song entries of this result page
        music_list = response.json()['data']['song']['list']

        for music in music_list:
            music_id = music['id']
            album = music['album']['name']
            music_url = domain_url + music['mid'] + '.html'

            # Take a slot before spawning; `download` releases it when done
            thread_lock.acquire()
            t = threading.Thread(target=download, args=(singer, music['name'], music_id, album, music_url, sheet))
            t.start()
            workers.append(t)

    # Wait for every worker: otherwise the workbook would be saved (and
    # "download complete" shown) while rows are still being appended
    for t in workers:
        t.join()

    # NOTE: `wb`/`sheet` are module-level, so rows added by earlier calls are
    # still present when a later call saves its file.
    wb.save(singer + '個人單曲排行前' + str(page_number * 10) + '清單.xlsx')
    return input('下載完成，是否繼續下載(y/n):')


if __name__ == '__main__':
    # Keep prompting for a singer and a page count until the user answers 'n'
    while True:
        name = input('請輸入要查詢的歌手姓名：')
        page = int(input('請輸入要查詢的歌曲頁數：'))

        if parse_list(name, page) == 'n':
            break
    print('下載已全部完成，退出程序。')

總結：

通過XHR爬取數據一般要使用json，格式爲：

response = requests.get(url)
json = response.json()
list = json['']['']...

Python 爬取QQ音樂個人單曲排行榜

通過HPA+CronHPA組合應對業務複雜彈性伸縮場景

#導出Python爬蟲工程所用的庫

Python爬蟲爬取糖堆網指定圖片

# PyCharm 常用快捷鍵

# 關於爬蟲常見HTTP基礎原理

Python 爬取QQ音樂個人單曲排行榜

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結