概述
廣告,敏感詞檢測一直以來都是讓人頭疼的話題,僅僅通過添加敏感詞列表是解決不了問題的。今天封禁了這個詞,明天又會有新的違禁詞冒出來,比起愚公無窮盡的子孫更甚。
敏感詞匹配這種治標不治本的方法,在一定的語義下蠻有效的,但是對高併發、QPS 高的服務來說,這種做法不是很合適。前段時間看到垃圾郵件檢測用到的貝葉斯分類算法,這種“半學習”形式的方法的準確度依賴於先驗概率的準確性,而公司長期以來整理到的違禁詞列表就是一個很好的數據源。隨着貝葉斯分類的數據越來越多,分類的準確性也會越來越高,後期僅僅需要對違禁詞文件進行添加即可,方便又準確。
PHP做貝葉斯分類不能很好的利用內存,針對每一個請求都會創建一個進程,各個請求相互獨立,所以每個請求都會重新來一遍貝葉斯分類數據集構建,這效率可想而知,因此不打算用PHP去實現。
go語言一直以來以快著稱,就用它吧。那麼問題又來了,怎麼讓go作爲PHP的後端實現這個檢測服務呢?進程間的數據通信通常來講有這麼幾種方式:
- http
- rpc
- unix domain socket
- pipe
看完了 https://blog.csdn.net/lengyuezuixue/article/details/79314987 這篇文章後,決定採用unix domain socket的形式,畢竟NGINX和php-fpm之間的通信都是這麼搞起來的,效率應該還不賴。
實現
golang 後端
package main
import (
"src/github.com/ajph/nbclassifier-go"
"log"
"os"
"bufio"
"io"
"net"
"syscall"
"fmt"
"src/github.com/yanyiwu/gojieba"
"strings"
)
// SPAM_CHECK_SOCKET_FILE is the filesystem path of the Unix domain socket
// the detection service listens on; clients must connect to the same path.
const SPAM_CHECK_SOCKET_FILE = "/tmp/spamcheck.sock"
// A simple naive Bayes classifier service implemented in Go.

// getWords reads the word list stored at filepath, one entry per line, and
// returns the entries in file order.
//
// Line terminators ("\n" or "\r\n") are stripped and blank lines are skipped,
// so the classifier is never trained on empty or "\r"-suffixed tokens. Other
// whitespace is preserved because entries may rely on leading or trailing
// spaces. A final line without a trailing newline is still returned.
//
// The word lists are required training data, so the process is terminated
// via log.Fatal if the file cannot be opened or a read fails part-way.
func getWords(filepath string) []string {
	file, err := os.Open(filepath)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	ret := []string{}
	reader := bufio.NewReader(file)
	for {
		line, err := reader.ReadString('\n')
		// ReadString may return data together with an error (for example the
		// last line of a file that lacks a trailing newline), so consume the
		// line before looking at err.
		if word := strings.TrimRight(line, "\r\n"); word != "" {
			fmt.Println("處理單詞:" + word)
			ret = append(ret, word)
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatalf("reading %s: %v", filepath, err)
		}
	}
	return ret
}
// learn builds a fresh classifier from the word-list files in the working
// directory and writes the trained model to "materiel.json".
//
// Each class is registered and then trained on the words returned by
// getWords for its file, in the order normal, forbidden, terror.
// NOTE(review): the results of NewClass, Learn and SaveToFile are not
// checked here — confirm their signatures and decide whether a failure
// should abort start-up.
func learn() {
	// Class id paired with the file holding its training words.
	corpora := []struct {
		class string
		file  string
	}{
		{"normal", "normalwords.txt"},
		{"forbidden", "forbiddenwords.txt"},
		{"terror", "terrorwords.txt"},
	}

	m := nbclassifier.New()
	for _, c := range corpora {
		m.NewClass(c.class)
		m.Learn(c.class, getWords(c.file)...)
	}
	m.SaveToFile("materiel.json")
}
// reloadModel loads the trained classifier from "materiel.json" in the
// working directory and returns it.
//
// The service cannot classify anything without a model, so a load failure
// terminates the process with a descriptive message instead of handing a nil
// model to callers, which would crash later on first use.
func reloadModel() *nbclassifier.Model {
	model, err := nbclassifier.LoadFromFile("materiel.json")
	if err != nil {
		log.Fatalf("loading model from materiel.json: %v", err)
	}
	return model
}
func match(model *nbclassifier.Model, content string) string {
// 分詞
jieba := gojieba.NewJieba()
defer jieba.Free()
words := jieba.Cut(content, true)
cls, unsure,_ := model.Classify(words...)
fmt.Println("檢測到分類爲:" + cls.Id)
result := "normal"
if unsure == false {
result = cls.Id
fmt.Println(cls, unsure)
}
return result
}
func run() {
socket, _ := net.Listen("unix", SPAM_CHECK_SOCKET_FILE)
defer syscall.Unlink(SPAM_CHECK_SOCKET_FILE)
learn()
// 訓練物料
model := reloadModel()
for {
client, _ := socket.Accept()
buf := make([]byte, 1024)
datalength, _ := client.Read(buf)
data := buf[:datalength]
fmt.Println("client msg:" + string(data))
checkret := match(model, string(data))
fmt.Println("check result: " + checkret)
response := []byte("")
if len(checkret) > 0 {
response = []byte(checkret)
}
_,_ = client.Write(response)
}
}
// main is the program entry point; it hands control to run.
func main() {
// Open the socket and start the detection service.
run()
//fmt.Println(reloadModel())
}
php 前端
<?php
// Minimal client for the spam-check service: connects to the service's Unix
// domain socket, sends one message and dumps the class id it answers with.
// Every socket call is checked so a missing or stopped service yields a clear
// error instead of cascading warnings and a dumped `false`.

$msg = "你說謊, 你放屁,你這個傻子";
$SOCKET_FILE = "/tmp/spamcheck.sock";

$socket = socket_create(AF_UNIX, SOCK_STREAM, 0);
if ($socket === false) {
    error_log("socket_create failed: " . socket_strerror(socket_last_error()));
    exit(1);
}

if (!socket_connect($socket, $SOCKET_FILE)) {
    error_log("socket_connect failed: " . socket_strerror(socket_last_error($socket)));
    socket_close($socket);
    exit(1);
}

// strlen() counts bytes, which is what socket_send() expects for UTF-8 text.
if (socket_send($socket, $msg, strlen($msg), 0) === false) {
    error_log("socket_send failed: " . socket_strerror(socket_last_error($socket)));
    socket_close($socket);
    exit(1);
}

$response = socket_read($socket, 1024);
if ($response === false) {
    error_log("socket_read failed: " . socket_strerror(socket_last_error($socket)));
    socket_close($socket);
    exit(1);
}

socket_close($socket);
var_dump($response);
測試
源碼
https://github.com/guoruibiao/spamcheck
總結整理
目前看起來,sock的形式還有蠻多侷限性的。目前只是單機,後續可以考慮繼續進行優化。先這麼着吧,後面應該還要繼續跟進…