TextMatch is a semantic matching model library for QA & text search … It’s easy to train models and to export representation vectors.
[TextMatch框架] : 文本匹配/文本分類/文本embedding/文本聚類/文本檢索 (bow/tfidf/ngram-tfidf/bert/albert/bm25/…/nn/gbdt/xgb/kmeans/dbscan/faiss/…): https://github.com/MachineLP/TextMatch
git clone https://github.com/MachineLP/TextMatch
cd TextMatch
pip install -r requirements.txt
cd tests/tools_test
python generate_word_cloud.py
code:
# -*- coding:utf-8 -*-
# 網易雲音樂 通過歌手ID,生成該歌手的詞雲
import requests
import sys
import re
import os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from PIL import Image
import numpy as np
from lxml import etree
# HTTP request headers shared by every request this script sends to
# music.163.com (the artist page and the lyric endpoint).
headers = {
    'Referer': 'http://music.163.com',
    'Host': 'music.163.com',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    'User-Agent': 'Chrome/10',
}
# 得到某一首歌的歌詞
def get_song_lyric(headers, lyric_url):
    """Fetch the lyric of a single song and strip its timestamp markup.

    Args:
        headers: HTTP headers forwarded to ``requests.request``.
        lyric_url: URL of the lyric endpoint to GET.

    Returns:
        The string found at ``['lrc']['lyric']`` in the JSON response with
        every digit, ``:``, ``.``, ``[`` and ``]`` removed. An empty string
        is returned when the response has no ``lrc`` entry or that entry
        carries no lyric text.

    Note:
        Digits are removed everywhere, not only inside ``[mm:ss.xx]`` tags,
        so numbers that are part of the lyric text are dropped as well.
    """
    res = requests.request('GET', lyric_url, headers=headers)
    # Decode the body once and reuse the result.
    payload = res.json()
    lrc = payload.get('lrc') if isinstance(payload, dict) else None
    lyric = lrc.get('lyric') if isinstance(lrc, dict) else None
    if not lyric:
        return ''
    # '[' is escaped so the character class is not reported as a possible
    # nested set (FutureWarning from the re module).
    return re.sub(r'[\d:.\[\]]', '', lyric)
# 去掉停用詞
def remove_stop_words(f):
    """Delete every occurrence of the credit/metadata terms from a text.

    The terms are removed one after another, in the order listed, as plain
    case-sensitive substrings (no word-boundary check), so a term is also
    removed when it appears inside a longer word.

    Args:
        f: The text to clean.

    Returns:
        The text with all listed terms removed.
    """
    credit_terms = (
        '作詞', '作曲', '編曲', 'Arranger', '錄音', '混音', '人聲', 'Vocal',
        '絃樂', 'Keyboard', '鍵盤', '編輯', '助理', 'Assistants', 'Mixing',
        'Editing', 'Recording', '音樂', '製作', 'Producer', '發行',
        'produced', 'and', 'distributed',
    )
    cleaned = f
    for term in credit_terms:
        if term in cleaned:
            cleaned = cleaned.replace(term, '')
    return cleaned
# 生成詞雲
def create_word_cloud(f, font_path="./wc.ttf", output_path="wordcloud.jpg"):
    """Build a word cloud image from a text, save it and display it.

    The text is first passed through ``remove_stop_words``, then segmented
    with ``jieba.cut`` and joined with spaces before being handed to
    ``WordCloud.generate``.

    Args:
        f: The raw text (all lyrics concatenated).
        font_path: Font file given to ``WordCloud``. Defaults to the
            previously hard-coded ``./wc.ttf``.
        output_path: File the rendered image is written to. Defaults to the
            previously hard-coded ``wordcloud.jpg``.

    Returns:
        None. The image is written to ``output_path`` and shown with
        matplotlib. Nothing is written or shown when the segmented text is
        empty.
    """
    print('根據詞頻,開始生成詞雲!')
    f = remove_stop_words(f)
    cut_text = " ".join(jieba.cut(f, cut_all=False, HMM=True))
    if not cut_text.strip():
        # WordCloud.generate raises ValueError when it is given no words,
        # so stop here instead of failing deep inside the library.
        print('No lyric text available; skipping word cloud generation.')
        return
    wc = WordCloud(
        font_path=font_path,
        max_words=100,
        width=2000,
        height=1200,
    )
    print(cut_text)
    wordcloud = wc.generate(cut_text)
    # Write the word cloud image to disk.
    wordcloud.to_file(output_path)
    # Display the word cloud without axes.
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
# 得到指定歌手頁面 熱門前50的歌曲ID,歌曲名
def get_songs(artist_id):
    """Collect the song IDs and titles linked from an artist page.

    The page ``https://music.163.com/artist?id=<artist_id>`` is fetched with
    the module-level ``headers`` and every ``<a>`` element beneath the
    element whose id is ``hotsong-list`` is inspected.

    Args:
        artist_id: The artist identifier; converted with ``str`` so both
            strings and integers are accepted.

    Returns:
        A ``(song_ids, song_names)`` tuple of two equally long lists. Each
        ID is the part of the link after ``/song?id=`` and each name is the
        text of the same anchor.
    """
    song_link_prefix = '/song?id='
    page_url = 'https://music.163.com/artist?id=' + str(artist_id)
    # Download the artist page HTML.
    res = requests.request('GET', page_url, headers=headers)
    html = etree.HTML(res.text)

    song_ids = []
    song_names = []
    # Read href and text from the same anchor so an anchor without text (or
    # with several text nodes) cannot shift names against IDs.
    for anchor in html.xpath("//*[@id='hotsong-list']//a"):
        href = anchor.get('href') or ''
        if not href.startswith(song_link_prefix):
            # Not a song link; slicing it would yield a bogus ID.
            continue
        name = ''.join(anchor.xpath('text()'))
        song_ids.append(href[len(song_link_prefix):])
        song_names.append(name)
        print(href, ' ', name)
    return song_ids, song_names
# Artist to process; 12138269 is the ID of Mao Buyi (毛不易).
artist_id = '12138269'
song_ids, song_names = get_songs(artist_id)

# Fetch the lyric of every song, printing each title once it is fetched.
lyric_parts = []
for song_id, song_name in zip(song_ids, song_names):
    # Lyric API URL for this song.
    lyric_url = (
        'http://music.163.com/api/song/lyric?os=pc&id='
        + song_id
        + '&lv=-1&kv=-1&tv=-1'
    )
    lyric = get_song_lyric(headers, lyric_url)
    lyric_parts.append(lyric)
    print(song_name)

# All lyrics in one string, each one preceded by a single space.
all_word = ''.join(' ' + part for part in lyric_parts)

# Generate the word cloud from the word frequencies.
create_word_cloud(all_word)