[TextMatch框架] 生成詞雲

TextMatch

TextMatch is a semantic matching model library for QA & text search … It’s easy to train models and to export representation vectors.

[TextMatch框架] : 文本匹配/文本分類/文本embedding/文本聚類/文本檢索(bow/ifidf/ngramtf-df/bert/albert/bm25/…/nn/gbdt/xgb/kmeans/dscan/faiss/….):https://github.com/MachineLP/TextMatch

 

git clone https://github.com/MachineLP/TextMatch
cd TextMatch
pip install -r requirements.txt
cd tests/tools_test
python generate_word_cloud.py

code:


# -*- coding:utf-8 -*-
# 網易雲音樂 通過歌手ID,生成該歌手的詞雲
import requests
import sys
import re
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from PIL import Image
import numpy as np
from lxml import etree

headers = {
		'Referer'	:'http://music.163.com',
		'Host'	 	:'music.163.com',
		'Accept' 	:'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
		'User-Agent':'Chrome/10'
	}

# 得到某一首歌的歌詞
def get_song_lyric(headers, lyric_url):
	res = requests.request('GET', lyric_url, headers=headers)
	if 'lrc' in res.json():
		lyric = res.json()['lrc']['lyric']
		new_lyric = re.sub(r'[\d:.[\]]','',lyric)
		return new_lyric
	else:
		return ''
		print(res.json())

# 去掉停用詞
def remove_stop_words(f):
	stop_words = ['作詞', '作曲', '編曲', 'Arranger', '錄音', '混音', '人聲', 'Vocal', '絃樂', 'Keyboard', '鍵盤', '編輯', '助理', 'Assistants', 'Mixing', 'Editing', 'Recording', '音樂', '製作', 'Producer', '發行', 'produced', 'and', 'distributed']
	for stop_word in stop_words:
		f = f.replace(stop_word, '')
	return f

# 生成詞雲
def create_word_cloud(f):
	print('根據詞頻,開始生成詞雲!')
	f = remove_stop_words(f)
	cut_text = " ".join(jieba.cut(f,cut_all=False, HMM=True))
	wc = WordCloud(
		font_path="./wc.ttf",
		max_words=100,
		width=2000,
		height=1200,
    )
	print(cut_text)
	wordcloud = wc.generate(cut_text)
	# 寫詞雲圖片
	wordcloud.to_file("wordcloud.jpg")
	# 顯示詞雲文件
	plt.imshow(wordcloud)
	plt.axis("off")
	plt.show()


# 得到指定歌手頁面 熱門前50的歌曲ID,歌曲名
def get_songs(artist_id):
	page_url = 'https://music.163.com/artist?id=' + artist_id
	# 獲取網頁HTML
	res = requests.request('GET', page_url, headers=headers)
	# 用XPath解析 前50首熱門歌曲
	html = etree.HTML(res.text)
	href_xpath = "//*[@id='hotsong-list']//a/@href"
	name_xpath = "//*[@id='hotsong-list']//a/text()"
	hrefs = html.xpath(href_xpath)
	names = html.xpath(name_xpath)
	# 設置熱門歌曲的ID,歌曲名稱
	song_ids = []
	song_names = []
	for href, name in zip(hrefs, names):
		song_ids.append(href[9:])
		song_names.append(name)
		print(href, '  ', name)
	return song_ids, song_names

# 設置歌手ID,毛不易爲12138269
artist_id = '12138269'
[song_ids, song_names] = get_songs(artist_id)

# 所有歌詞
all_word = ''
# 獲取每首歌歌詞
for (song_id, song_name) in zip(song_ids, song_names):
	# 歌詞API URL
	lyric_url = 'http://music.163.com/api/song/lyric?os=pc&id=' + song_id + '&lv=-1&kv=-1&tv=-1'
	lyric = get_song_lyric(headers, lyric_url)
	all_word = all_word + ' ' + lyric
	print(song_name)

#根據詞頻 生成詞雲
create_word_cloud(all_word)



 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章