參考

spaCy實踐

語法方面

準備工作

import spacy  # load the spaCy package
from spacy.lang.en import English
# Load the English model and keep it in `nlp`.
# Author's note: spacy.load('en') could be used under Anaconda but not under PyCharm, hence the full model name.
nlp = spacy.load('en_core_web_sm')
text = "The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013."
# Analyse the text with the model and name the result `doc`.
# `doc` looks just like the original text, but spaCy has already run its analysis behind the scenes.
doc = nlp(text)

展示全部詞例（token）

# 1. Show every token in the document.
for token in doc:
    # Each token is printed wrapped in double quotes, e.g. "for".
    print(f'"{token.text}"')

只對前10個詞例（token），輸出token的索引值、詞元、詞性等

# 2. For the first 10 tokens only, print one tab-separated row of attributes.
for token in doc[:10]:
    print(
        token.text,      # surface text
        token.idx,       # index, i.e. the position in the original text
        token.lemma_,    # lemma
        token.is_punct,  # whether the token is punctuation
        token.is_space,  # whether the token is whitespace
        token.shape_,
        token.pos_,      # part of speech
        token.tag_,      # tag
        sep="\t",
    )

不再考慮全部詞性，只關注文本中出現的實體（entity）詞彙

# 3. Ignore the individual tokens and list only the entities found in the text.
for ent in doc.ents:
    # Entity text and its label, separated by a single space.
    print(f"{ent.text} {ent.label_}")

把一段文字拆解爲語句（按.分隔）

# 4. Split the text into sentences.
for sent in doc.sents:
    print(sent)

# `doc.sents` is a generator, not a list, so it has to be turned into a list
# before a single sentence can be picked out by index.
# The list gets its own name: rebinding `doc` to it would break every later
# use of `doc.sents`, because a plain list has no `sents` attribute.
sentences = list(doc.sents)
print('1', sentences[0])

搞清其中每一個詞例（token）之間的依賴關係

# The demo below is limited to the first sentence: take it from `doc.sents`,
# run its text through the model again and keep the result in `newdoc`.
first_sentence = list(doc.sents)[0]
newdoc = nlp(first_sentence.text)

# Show how every token relates to its head: text/tag <--dependency-- head text/head tag.
for token in newdoc:
    head = token.head
    print(f"{token.text}/{token.tag_} <--{token.dep_}-- {head.text}/{head.tag_}")

語義方面

我們利用的工具，叫做詞嵌入（word embedding）模型。

使用spaCy的詞嵌入模型查看單詞對應的向量

# The word-embedding demo needs spaCy to load a separate model file.
nlp = spacy.load('en_core_web_lg')
# Look up "minister" in the vocabulary and print the vector stored for it.
minister = nlp.vocab['minister']
print(minister.vector)

結果顯示，單詞用總長度爲300的浮點數組成向量來表示。

Spacy讀入的這個模型所附帶的詞向量，是用詞嵌入算法（spaCy 2.x的en_core_web_lg採用的是GloVe，與word2vec同屬一類方法）在海量語料上預訓練的結果。

查看spacy的語義近似度判別能力

import spacy#讀入Spacy軟件包
# The word-embedding demo needs spaCy to load a separate model file.
nlp = spacy.load('en_core_web_lg')

# Fetch the vocabulary entry for each of the four words.
dog, cat, apple, orange = (
    nlp.vocab[name] for name in ("dog", "cat", "apple", "orange")
)
# Compare "dog" with "cat", then "dog" with "apple".
print(dog.similarity(cat))  # 0.80168545
print(dog.similarity(apple))  # 0.26339024
# The embedding model evidently gives spaCy some grasp of meaning.

scipy計算相似度的餘弦函數

import spacy#讀入Spacy軟件包
# The word-embedding demo needs spaCy to load a separate model file.
nlp = spacy.load('en_core_web_lg')
dog, cat, apple, orange = (
    nlp.vocab[name] for name in ("dog", "cat", "apple", "orange")
)
# spaCy's built-in similarity() is not enough when the vector to compare may
# not belong to any vocabulary entry, so take the cosine function from scipy.
from scipy.spatial.distance import cosine
cat_distance = cosine(dog.vector, cat.vector)
print(1 - cat_distance)  # 0.8016855120658875
# Apart from the number of decimals shown, this matches spaCy's own similarity().


# Small helper dedicated to comparing raw vectors.
def vector_similarity(x,y):
    """Return 1 minus the scipy cosine distance between ``x`` and ``y``."""
    distance = cosine(x, y)
    return 1 - distance

apple_score = vector_similarity(dog.vector, apple.vector)
print(apple_score)  # 0.2633902430534363

計算guess_word取值（guess_word = king - queen + woman）

import spacy  # load the spaCy package
nlp = spacy.load('en_core_web_lg')  # the word-embedding demo needs spaCy to load this separate model file

# Analogy: ? - woman = king - queen, i.e. guess_word = king - queen + woman.
def make_guess_word(words):
    """Return the vector ``first - second + third`` for a three-word sequence.

    Args:
        words: exactly three words; each is looked up in ``nlp.vocab``.

    Returns:
        The vector of the first word minus the vector of the second word
        plus the vector of the third word.

    Raises:
        ValueError: if ``words`` does not unpack into exactly three items.
    """
    first, second, third = words
    vocab = nlp.vocab
    return vocab[first].vector - vocab[second].vector + vocab[third].vector


# One call is enough: print the resulting vector (a long array of numbers).
print(make_guess_word(['king', 'queen', 'woman']))

用上面計算的 guess_word 取值，與字典詞語逐個覈對近似性，打印最近似的10個候選詞

import spacy  # load the spaCy package
nlp = spacy.load('en_core_web_lg')  # the word-embedding demo needs spaCy to load this separate model file
from scipy.spatial.distance import cosine


def vector_similarity(x, y):
    """Return 1 minus the scipy cosine distance between ``x`` and ``y``."""
    distance = cosine(x, y)
    return 1 - distance
# Compute the guess_word vector for an analogy query.
def make_guess_word(words):
    """Return the vector ``first - second + third`` for a three-word sequence.

    Args:
        words: exactly three words; each is looked up in ``nlp.vocab``.

    Returns:
        The vector of the first word minus the vector of the second word
        plus the vector of the third word.

    Raises:
        ValueError: if ``words`` does not unpack into exactly three items.
    """
    first, second, third = words
    vocab = nlp.vocab
    return vocab[first].vector - vocab[second].vector + vocab[third].vector

def get_similar_word(words, scope=nlp.vocab, topn=10):
    """Print and return the entries of ``scope`` closest to the analogy vector.

    The target vector comes from ``make_guess_word(words)``; every entry of
    ``scope`` that has a vector is scored against it with
    ``vector_similarity``.

    Args:
        words: the three words passed on to ``make_guess_word``.
        scope: iterable of entries exposing ``has_vector``, ``vector`` and
            ``text``; defaults to ``nlp.vocab`` as bound when this function
            is defined.
        topn: how many of the best-scoring entries to report (default 10).

    Returns:
        The texts of the ``topn`` best-scoring entries, best first. The same
        list is also printed.
    """
    # NOTE(review): newer spaCy releases appear to fill `nlp.vocab` lazily, so
    # iterating it may only visit entries that were already looked up —
    # confirm against the installed spaCy version.
    guess_word = make_guess_word(words)
    # Entries without a vector are skipped; each kept entry is stored as one
    # (entry, score) tuple.
    similarities = [
        (word, vector_similarity(guess_word, word.vector))
        for word in scope
        if word.has_vector
    ]
    # Highest similarity first.
    similarities.sort(key=lambda item: -item[1])
    best = [word.text for word, _ in similarities[:topn]]
    print(best)
    return best

# Two analogy queries; each list holds the right-hand-side words in order.
#   ? - woman = king - queen, i.e. guess_word = king - queen + woman
#     result: ['MAN', 'Man', 'mAn', 'MAn', 'MaN', 'man', 'mAN', 'WOMAN', 'womAn', 'WOman']
#   ? - England = Paris - London, i.e. guess_word = Paris - London + England
#     result: ['france', 'FRANCE', 'France', 'Paris', 'paris', 'PARIS', 'EUROPE', 'EUrope', 'europe', 'Europe']
for words in (["king", "queen", "woman"], ["Paris", "London", "England"]):
    get_similar_word(words)  # let spaCy make its guess

把高維度的詞向量（300維）壓縮到二維平面，並用TSNE可視化

# Squeeze the 300-dimensional word vectors onto a sheet of paper (two dimensions)
# to see where the words sit relative to each other.
import numpy as np
import spacy
text = "The sequel, Yes, Prime Minister, ran from 1986 to 1988. In total there were 38 episodes, of which all but one lasted half an hour. Almost all episodes ended with a variation of the title of the series spoken as the answer to a question posed by the same character, Jim Hacker. Several episodes were adapted for BBC Radio, and a stage play was produced in 2010, the latter leading to a new television series on UKTV Gold in 2013."
nlp = spacy.load('en_core_web_lg')
doc = nlp(text)

# Collect the words to plot: walk the tokens of `doc`, drop punctuation and
# drop any word that was already collected, keeping first-appearance order.
word_list = []
seen = set()  # constant-time "already collected?" check
for token in doc:
    if token.is_punct or token.text in seen:
        continue
    seen.add(token.text)
    word_list.append(token.text)
# Print the list itself; print(word_list.append(...)) would only show None.
print(word_list)

# Build the embedding matrix in one step, one row per word. Stacking the rows
# directly avoids growing a flat array with np.append (which copies the whole
# array on every call) and then reshaping it afterwards. The dtype is fixed to
# float64 regardless of the dtype of the stored vectors.
embedding = np.array(
    [nlp.vocab[word].vector for word in word_list],
    dtype=np.float64,
)
print(embedding.shape)  # shape of the embedding matrix: (63, 300) for the sample text

from sklearn.manifold import TSNE  # import the TSNE class from scikit-learn
# Create a lower-case `tsne` instance to call; per the author's note it squeezes
# the high-dimensional (300-d) word vectors onto a two-dimensional plane.
tsne = TSNE()
# Run the reduction; per the author's note `low_dim_embedding` is the
# two-dimensional representation of the 63 words.
low_dim_embedding = tsne.fit_transform(embedding)
# Visualise the reduced word vectors.
import matplotlib.pyplot as plt  # plotting package
# Draw a set of two-dimensional points with their labels and save the picture.
def plot_with_labels(low_dim_embs, labels, filename='tsne.pdf'):
    """Scatter-plot labelled 2-D points and save the figure to ``filename``.

    Args:
        low_dim_embs: array indexed as ``low_dim_embs[i, :]``, where each row
            unpacks into an ``(x, y)`` pair; it must have at least as many
            rows as there are labels.
        labels: one label per plotted point; label ``i`` annotates row ``i``.
        filename: path handed to ``savefig`` (default ``'tsne.pdf'``).

    Raises:
        AssertionError: if there are more labels than rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    fig = plt.figure(figsize=(18, 18))  # in inches
    try:
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            # Put the label next to its point, shifted by a few points.
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        fig.savefig(filename)
    finally:
        # Release the figure so repeated calls do not pile up open figures.
        plt.close(fig)

plot_with_labels(low_dim_embedding, word_list)
# The picture is written as a .pdf file (default name 'tsne.pdf') under the current path.

【NLP】spaCy筆記

參考

spaCy實踐

語法方面

準備工作

展示全部詞例（token）

只對前10個詞例（token），輸出token的索引值、詞元、詞性等

不再考慮全部詞性，只關注文本中出現的實體（entity）詞彙

把一段文字拆解爲語句（按.分隔）

搞清其中每一個詞例（token）之間的依賴關係

語義方面

使用spaCy的詞嵌入模型查看單詞對應的向量

查看spacy的語義近似度判別能力

scipy計算相似度的餘弦函數

計算guess_word取值（guess_word = king - queen + woman）

用上面計算的 `guess_word` 取值，與字典詞語逐個覈對近似性，打印最近似的10個候選詞

把高維度的詞向量（300維）壓縮到二維平面，並用TSNE可視化

Golang爬蟲代理接入的技術與實踐

【Python】100基礎例（1-50）+數據分析例

【NLP】預訓練詞向量

【NLP】LDA2Vec筆記（基於cemoody/lda2vec 未實現）

【Python】面向對象_菜鳥教程

【NLP】英文數據預處理___詞幹/詞元處理

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

【NLP】spaCy筆記

參考

spaCy實踐

語法方面

準備工作

展示全部詞例（token）

只對前10個詞例（token），輸出token的索引值、詞元、詞性等

不再考慮全部詞性，只關注文本中出現的實體（entity）詞彙

把一段文字拆解爲語句（按.分隔）

搞清其中每一個詞例（token）之間的依賴關係

語義方面

使用spaCy的詞嵌入模型查看單詞對應的向量

查看spacy的語義近似度判別能力

scipy計算相似度的餘弦函數

計算guess_word取值（guess_word = king - queen + woman）

用上面計算的 guess_word 取值，與字典詞語逐個覈對近似性，打印最近似的10個候選詞

把高維度的詞向量（300維）壓縮到二維平面，並用TSNE可視化

使用spaCy的詞嵌入模型查看單詞對應的向量

用上面計算的 `guess_word` 取值，與字典詞語逐個覈對近似性，打印最近似的10個候選詞