- 分析網頁成分,歌曲信息都存在圖中所示xhr中
- 詳細代碼實現如下:
import os
import threading
import jieba
import numpy
import html
import requests
import openpyxl
from wordcloud import WordCloud
import PIL.Image as Image
# Semaphore that caps the number of simultaneously running download threads at 10.
thread_lock = threading.BoundedSemaphore(10)

# Base address; a song's "mid" plus ".html" is appended to build its page link.
domain_url = 'https://y.qq.com/n/yqq/song/'

# HTTP headers sent with every request made by this script.
HEADERS = {}
HEADERS['User-Agent'] = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/80.0.3987.162 Safari/537.36 Edg/80.0.361.109'
)
HEADERS['Referer'] = 'https://y.qq.com/n/yqq/song/0039MnYb0qxYhV.html'
# Create a workbook, grab its active worksheet and rename that sheet to 'song'.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = 'song'

# Header row: song name (A1), album (B1), playback link (C1).
sheet['A1'], sheet['B1'], sheet['C1'] = '歌曲名', '所屬專輯', '播放鏈接'
# Output directories for comments, lyrics, hot comments and word clouds.
# Raw strings keep the Windows backslashes literal instead of relying on
# Python leaving unrecognised escape sequences (\P, \Q, ...) untouched,
# which newer interpreters warn about.
COMMENT_PATH = r'H:\PythonWorks\QQmusic\評論'
LYRIC_PATH = r'H:\PythonWorks\QQmusic\歌詞'
HOT_COMMENT_PATH = r'H:\PythonWorks\QQmusic\熱評'
CIYUN_PATH = r'H:\PythonWorks\QQmusic\詞雲'

# Create any directory that is missing; exist_ok avoids the separate
# exists() check and the race between checking and creating.
for _output_dir in (COMMENT_PATH, LYRIC_PATH, HOT_COMMENT_PATH, CIYUN_PATH):
    os.makedirs(_output_dir, exist_ok=True)
def cut(text):
    """Segment *text* with jieba and return the tokens joined by single spaces."""
    return " ".join(jieba.cut(text))
# Generate the hot-comment word cloud
def ciyun(path_name, music_name):
    """Render a word cloud from a UTF-8 text file and save it as a PNG.

    The text in *path_name* is segmented with ``cut`` and drawn with the
    'kkx.png' image (resolved against the current working directory) as
    the mask. The result is written to ``CIYUN_PATH/<music_name>.png``.

    Args:
        path_name: Path of the text file to read.
        music_name: Song name, used as the output file's base name.
    """
    with open(path_name, encoding='utf-8') as f:
        text = cut(f.read())

    # WordCloud refuses to plot when it has no words at all, so skip
    # songs whose comment file turned out to be empty.
    if not text.strip():
        print("沒有可用的熱評文本,跳過詞雲生成...")
        return

    # Load the mask inside a context manager so the image file is closed
    # as soon as its pixels have been copied into the array.
    with Image.open('kkx.png') as mask_image:
        mask_pic = numpy.array(mask_image)

    wordcloud = WordCloud(
        # Raw string: the backslashes are path separators, not escapes.
        font_path=r'C:\Windows\Fonts\simfang.ttf',
        collocations=False,
        max_words=100,
        min_font_size=10,
        max_font_size=500,
        mask=mask_pic
    ).generate(text)
    wordcloud.to_file(CIYUN_PATH + '/' + music_name + '.png')
# Download lyrics
def download_lyric(singer, music_name, music_id):
    """Fetch a song's lyric and append it to a text file.

    Requests the lyric endpoint with *music_id*, HTML-unescapes the
    ``lyric`` field of the JSON reply and appends it to
    ``LYRIC_PATH/<singer>-<music_name>.txt``.

    Args:
        singer: Singer name, used in the output file name.
        music_name: Song name, used in the output file name.
        music_id: Value sent as the ``musicid`` query parameter.
    """
    lrc_url = 'https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg'
    params = {
        'nobase64': '1',
        'musicid': music_id,  # id taken from the search result
        '-': 'jsonp1',
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'utf-8',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0'
    }
    # A timeout keeps a stalled connection from blocking this worker
    # thread (and the semaphore permit it holds) forever.
    response = requests.get(lrc_url, params=params, headers=HEADERS, timeout=10)
    lyric_text = html.unescape(response.json()['lyric'])

    lyric_file = LYRIC_PATH + '/' + singer + '-' + music_name + '.txt'
    # `with` closes the file even if the write fails; write() is the
    # right call for a single string (writelines would iterate it
    # character by character).
    with open(lyric_file, 'a', encoding='utf-8') as f:
        f.write(lyric_text)
    print("歌詞下載完成...")
# Scrape song comments
def parse_comments(singer, music_name, music_id):
    """Fetch 20 pages of comments for a song and append them to a text file.

    Each comment's ``rootcommentcontent`` is written as a numbered entry
    followed by a dashed separator line to
    ``COMMENT_PATH/<singer>-<music_name>_評論.txt``. Numbering runs
    continuously across pages.

    Args:
        singer: Singer name, used in the output file name.
        music_name: Song name, used in the output file name.
        music_id: Value sent as the ``topid`` query parameter.
    """
    comment_url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg'
    comment_file = COMMENT_PATH + '/' + singer + '-' + music_name + '_評論.txt'
    # Only 'pagenum' changes between requests, so build the dict once.
    params = {
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'GB2312',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0',
        'cid': '205360772',
        'reqtype': '2',
        'biztype': '1',
        'topid': music_id,
        'cmd': '8',
        'needmusiccrit': '0',
        'pagenum': 0,
        'pagesize': '25',
        'lasthotcommentid': '',
        'domain': 'qq.com',
        'ct': '24',
        'cv': '10101010'
    }
    n = 1
    for page in range(20):
        params['pagenum'] = page
        # A timeout keeps a stalled connection from hanging the worker.
        response = requests.get(comment_url, params=params, headers=HEADERS, timeout=10)
        comment_json = response.json()
        # NOTE(review): presumably the list can be null once the pages
        # run out - treat that as "no comments"; confirm against the API.
        comments = comment_json['comment']['commentlist'] or []
        # Append this page's comments; `with` closes the file on error too.
        with open(comment_file, 'a', encoding='utf-8') as f:
            for item in comments:
                f.write(str(n) + '.' + item['rootcommentcontent'] + '\n------------------------------\n')
                n += 1
    print("評論下載完成...")
# Scrape the song's hot comments
def parse_hot_comments(singer, music_name, music_id):
    """Fetch a song's hot comments, save them, then build a word cloud.

    Requests page 0 of the comment endpoint, appends every entry of
    ``hot_comment.commentlist`` as a numbered item followed by a dashed
    separator to ``HOT_COMMENT_PATH/<singer>-<music_name>_熱評.txt``, and
    finally passes that file to ``ciyun``.

    Args:
        singer: Singer name, used in the output file name.
        music_name: Song name, used in the file name and word-cloud name.
        music_id: Value sent as the ``topid`` query parameter.
    """
    comment_url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg'
    params = {
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'GB2312',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0',
        'cid': '205360772',
        'reqtype': '2',
        'biztype': '1',
        'topid': music_id,
        'cmd': '8',
        'needmusiccrit': '0',
        'pagenum': 0,
        'pagesize': '25',
        'lasthotcommentid': '',
        'domain': 'qq.com',
        'ct': '24',
        'cv': '10101010'
    }
    # A timeout keeps a stalled connection from hanging the worker.
    response = requests.get(comment_url, params=params, headers=HEADERS, timeout=10)
    comment_json = response.json()
    # NOTE(review): presumably the list can be null when a song has no
    # hot comments - treat that as empty; confirm against the API.
    comments = comment_json['hot_comment']['commentlist'] or []
    path = HOT_COMMENT_PATH + '/' + singer + '-' + music_name + '_熱評.txt'
    # `with` closes the file even if a write fails.
    with open(path, 'a', encoding='utf-8') as f:
        for n, item in enumerate(comments, start=1):
            f.write(str(n) + '.' + item['rootcommentcontent'] + '\n------------------------------\n')
    print("熱評下載完成...")
    ciyun(path, music_name)
def download(singer, music_name, music_id, album, music_url, sheet):
    """Record one song in the worksheet and download its lyric and comments.

    Intended to run as a worker thread whose starter has already acquired
    ``thread_lock``; the permit is released here when the work ends.

    Args:
        singer: Singer name, forwarded to the download helpers.
        music_name: Song name.
        music_id: Song id, forwarded to the download helpers.
        album: Album name written to the worksheet.
        music_url: Playback link written to the worksheet.
        sheet: Worksheet that receives the ``[name, album, link]`` row.
    """
    try:
        # Append name, album and link as one new worksheet row.
        sheet.append([music_name, album, music_url])
        download_lyric(singer, music_name, music_id)
        parse_comments(singer, music_name, music_id)
        parse_hot_comments(singer, music_name, music_id)
    finally:
        # Always hand the permit back: without this, every failed
        # download would permanently consume one of the semaphore's
        # slots and the thread starting the workers would eventually
        # block forever on acquire().
        thread_lock.release()
def parse_list(singer, page_number):
    """Search for a singer's songs, download each one, and save an Excel list.

    For every result page a worker thread running ``download`` is started
    per song, throttled by ``thread_lock``. Once all workers have
    finished, the song list is saved to
    ``<singer>個人單曲排行前<page_number * 10>清單.xlsx``.

    Args:
        singer: Search keyword (singer name).
        page_number: Number of result pages to fetch, 10 songs per page.

    Returns:
        The user's answer to the "continue downloading (y/n)" prompt.
    """
    # Search endpoint
    search_url = 'https://c.y.qq.com/soso/fcgi-bin/client_search_cp'

    # Use a fresh workbook per query. Reusing one module-wide sheet would
    # carry the rows of earlier queries into every later singer's file.
    book = openpyxl.Workbook()
    song_sheet = book.active
    song_sheet.title = 'song'
    song_sheet['A1'] = '歌曲名'
    song_sheet['B1'] = '所屬專輯'
    song_sheet['C1'] = '播放鏈接'

    # Only 'p' (the page index) changes between requests.
    params = {
        'ct': '24',
        'qqmusic_ver': '1298',
        'new_json': '1',
        'remoteplace': 'txt.yqq.song',
        'searchid': '56438082219898629',
        't': '0',
        'aggr': '1',
        'cr': '1',
        'catZhida': '1',
        'lossless': '0',
        'flag_qc': '0',
        'p': '1',
        'n': '10',
        'w': singer,
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '0',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        # Was 'utf - 8' (stray spaces); use the same well-formed charset
        # name the lyric request sends.
        'outCharset': 'utf-8',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0'
    }

    workers = []
    for x in range(page_number):
        params['p'] = str(x + 1)
        # A timeout keeps a stalled connection from hanging the program.
        response = requests.get(search_url, params=params, headers=HEADERS, timeout=10)
        music_json = response.json()
        # Song list of this result page
        music_list = music_json['data']['song']['list']
        for music in music_list:
            music_id = music['id']
            album = music['album']['name']
            music_url = domain_url + music['mid'] + '.html'
            # Take a permit before starting the worker; download()
            # releases it when the song is done.
            thread_lock.acquire()
            t = threading.Thread(
                target=download,
                args=(singer, music['name'], music_id, album, music_url, song_sheet),
            )
            t.start()
            workers.append(t)

    # Wait for every worker before saving. Otherwise rows appended by
    # still-running threads would be missing from the file and the
    # "download complete" prompt would appear too early.
    for t in workers:
        t.join()

    # Finally save and name the Excel file
    book.save(singer + '個人單曲排行前' + str(page_number * 10) + '清單.xlsx')
    return input('下載完成,是否繼續下載(y/n):')
if __name__ == '__main__':
    # Keep querying until the user answers 'n' to the continue prompt.
    answer = 'y'
    while answer != 'n':
        name = input('請輸入要查詢的歌手姓名:')
        # A non-numeric page count would raise ValueError and kill the
        # program; treat it like any other invalid count and ask again.
        try:
            page = int(input('請輸入要查詢的歌曲頁數:'))
        except ValueError:
            page = 0
        if page < 1:
            print('頁數必須是正整數,請重新輸入。')
            continue
        answer = parse_list(name, page)
    print('下載已全部完成,退出程序。')
- 總結:
通過XHR爬取數據一般要使用json,格式爲:
response = requests.get(url)
json = response.json()
list = json['']['']...