Trie樹魔改實現千萬數據秒級排序

Trie 樹的優點是查找只需要 O(len) 的複雜度(len 爲待查找 Key 的長度),本質上也是一棵樹的結構。

花了一天時間,在自己上一篇文章的基礎上實現了一個能排序的 Trie 算法,
經過調整,算法實現了插入排序,以及通過公共前綴進行索引的功能。

在使用重複率非常高的文本(將 512K 的文本複製成 500M)的情況下,排序的速度非常快;因爲此時不需要額外創建節點消耗內存,所以內存佔用率也出奇的低。

內存佔用情況:

下面再使用相對較長、幾乎沒有重複的文本(800 萬條數據)做了下讀取測試,此時性能就下降很多:800 萬條數據用了 20 秒左右,並且此時內存佔用高達 1.78G。

在沒有打印的情況下,查詢擁有共同前綴的 Key 花費了 162 ms,可以看到查詢速度還是可以接受的。

總結:在文本重複率較高、長度較短的情況下,適合使用 Trie 來排序。Trie 的優勢是查詢比較快,查詢一個文本是否包含在 Trie 中只需要幾十微秒以內。

下面給出代碼:

package main

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"strings"
	"time"
)

/**
 * Created by @CaomaoBoy on 2020/3/17.
 *  email:<[email protected]>
 */
// trieIndex is one entry of a node's child index: it pairs the byte that
// labels a child with that child's position in the node's subTrieNode slice.
type trieIndex struct {
	subTrieNodeKey byte // byte that labels the child
	arrindex       int  // position of the child inside subTrieNode
}

// NewtrieIndex returns a pointer to a fresh, zero-valued trieIndex.
func NewtrieIndex() *trieIndex {
	return new(trieIndex)
}

// getTrieKey returns the byte stored in this index entry.
func (entry *trieIndex) getTrieKey() byte {
	return entry.subTrieNodeKey
}
// trieNode is a single node of the trie. Children live in subTrieNode in
// insertion order; subTrieIndex maps child bytes to positions in that slice.
type trieNode struct {
	count        uint32       // number of times the word ending here was inserted
	isWord       bool         // true when a stored word ends at this node
	subTrieNode  []*trieNode  // all child nodes
	subTrieIndex []*trieIndex // index entries describing the children
}

// inserSubTrie attaches trienode as the child labelled key.
//
// The child is appended to subTrieNode, a matching index entry is appended to
// subTrieIndex, and MergeSort then moves that new entry to its place by key.
// It panics if the two slices have different lengths on entry.
func (td *trieNode) inserSubTrie(key byte, trienode *trieNode) {
	// The child list and its index must stay in lock-step.
	if len(td.subTrieIndex) != len(td.subTrieNode) {
		panic("error")
	}

	// The new child lands at the current end of the child list.
	slot := len(td.subTrieNode)
	td.subTrieNode = append(td.subTrieNode, trienode)

	entry := NewtrieIndex()
	entry.subTrieNodeKey = key
	entry.arrindex = slot
	td.subTrieIndex = append(td.subTrieIndex, entry)

	// Only the last entry can be out of place, so one insertion pass suffices.
	MergeSort(td.subTrieIndex)
}
////自動排序
//func(trie *Trie) SortItSelf(){
//	trie.sortInterval ++
//	//每插入一定次數 進行一次排序
//	if trie.sortInterval > SortIntervalTimes{
//		trie.sortInterval = 0
//		//對索引 進行 希爾排序
//		ShellSort(trie.root.subTrieIndex)
//	}
//}

// sortIndexs shell-sorts the child index of root and then does the same,
// recursively, for every child reachable through that index.
func sortIndexs(root *trieNode) {
	ShellSort(root.subTrieIndex)
	for pos := range root.subTrieIndex {
		_, child := root.getByIndex(pos)
		sortIndexs(child)
	}
}

// delTrieNode detaches the child labelled key and returns it, or returns nil
// when no child carries that key.
//
// The child is removed from subTrieNode and its entry from subTrieIndex; the
// remaining entries that pointed at or past the removed slot are shifted down
// by one so they keep referring to the same children.
func (td *trieNode) delTrieNode(key byte) *trieNode {
	pos := binSearch(td.subTrieIndex, key)
	if pos == -1 {
		return nil
	}

	slot := td.subTrieIndex[pos].arrindex
	removed := td.subTrieNode[slot]

	// Close the gap in the child list.
	copy(td.subTrieNode[slot:], td.subTrieNode[slot+1:])
	td.subTrieNode = td.subTrieNode[:len(td.subTrieNode)-1]

	// Close the gap in the index.
	copy(td.subTrieIndex[pos:], td.subTrieIndex[pos+1:])
	td.subTrieIndex = td.subTrieIndex[:len(td.subTrieIndex)-1]

	// Children stored after the removed slot moved one position to the left.
	for i := range td.subTrieNode {
		if entry := td.subTrieIndex[i]; entry.arrindex >= slot {
			entry.arrindex--
		}
	}
	return removed
}


// getByIndex returns the key byte and the child node described by the
// index-th entry of subTrieIndex.
func (td *trieNode) getByIndex(index int) (byte, *trieNode) {
	entry := td.subTrieIndex[index]
	return entry.subTrieNodeKey, td.subTrieNode[entry.arrindex]
}

// binSearch looks up data in arr, which must be sorted ascending by key, and
// returns the position of the matching entry or -1 when there is none.
//
// It is an interpolation search: the probe position is estimated from where
// data falls between the keys at the two ends of the remaining range.
//
// The estimate is computed in integer arithmetic. The previous floating-point
// form divided by zero whenever both ends held the same key (for example a
// one-element range), producing NaN, and converting NaN to int is
// implementation-dependent in Go.
func binSearch(arr []*trieIndex, data byte) int {
	left, right := 0, len(arr)-1
	target := int(data)

	// An empty slice gives right == -1, so the loop is skipped entirely.
	for left <= right {
		lo := int(arr[left].subTrieNodeKey)
		hi := int(arr[right].subTrieNodeKey)

		// data lies outside the remaining key range: it cannot be present.
		if target < lo || target > hi {
			return -1
		}

		// When both ends carry the same key the range check above already
		// proved data equals it, so probing left is exact and the division
		// below is never executed with a zero denominator.
		mid := left
		if hi != lo {
			// 0 <= target-lo <= hi-lo, hence left <= mid <= right.
			mid = left + (target-lo)*(right-left)/(hi-lo)
		}

		probe := arr[mid].subTrieNodeKey
		if probe > data {
			right = mid - 1
		} else if probe < data {
			left = mid + 1
		} else {
			return mid
		}
	}
	return -1
}


// getByKey returns the child labelled key, or nil when binSearch finds no
// index entry for that key.
func (td *trieNode) getByKey(key byte) *trieNode {
	if pos := binSearch(td.subTrieIndex, key); pos != -1 {
		// The index entry tells where the child sits in the child list.
		return td.subTrieNode[td.subTrieIndex[pos].arrindex]
	}
	return nil
}

// MergeSort moves the last element of arr towards the front, swapping it with
// its predecessor for as long as the predecessor has a larger key.
//
// Despite its name this is a single insertion-sort step, not a merge sort:
// only the last element is repositioned.
func MergeSort(arr []*trieIndex) {
	for j := len(arr) - 1; j > 0; j-- {
		if arr[j-1].getTrieKey() <= arr[j].getTrieKey() {
			// The element has reached its place.
			return
		}
		swap(arr, j, j-1)
	}
}

// swap exchanges the elements at positions i and j of arr.
func swap(arr []*trieIndex, i, j int) {
	held := arr[i]
	arr[i] = arr[j]
	arr[j] = held
}
// ShellSort sorts arr in place, ascending by key, using shell sort: the gap
// starts at half the length and is halved after every round, and each round
// runs ShellSortStep once for every start offset below the gap.
func ShellSort(arr []*trieIndex) {
	// Slices of length 0 or 1 start with gap 0, so nothing runs for them.
	for gap := len(arr) / 2; gap > 0; gap /= 2 {
		for start := 0; start < gap; start++ {
			ShellSortStep(arr, start, gap)
		}
	}
}

// ShellSortStep insertion-sorts, ascending by key, the elements of arr found
// at positions start, start+gap, start+2*gap, and so on.
func ShellSortStep(arr []*trieIndex, start, gap int) {
	n := len(arr)
	for i := start + gap; i < n; i += gap {
		// Lift the element out; larger predecessors are shifted into the hole.
		pending := arr[i]
		hole := i
		for hole-gap >= 0 && arr[hole-gap].getTrieKey() > pending.getTrieKey() {
			arr[hole] = arr[hole-gap]
			hole -= gap
		}
		// Drop the lifted element into the position that opened up.
		arr[hole] = pending
	}
}

// Trie is a byte-keyed prefix tree.
type Trie struct {
	sortInterval int       // sort interval counter; only referenced by commented-out code in this file
	size         int       // incremented once per Insert call
	root         *trieNode // root node; holds no key byte itself
}


// NewTrie returns an empty Trie whose root node has empty, non-nil child and
// index slices.
func NewTrie() *Trie {
	return &Trie{
		size: 0,
		root: &trieNode{
			isWord:       false,
			count:        0,
			subTrieNode:  make([]*trieNode, 0),
			subTrieIndex: make([]*trieIndex, 0),
		},
	}
}

// InsertByte inserts content as a key; it simply forwards to Insert.
func (t *Trie) InsertByte(content []byte) {
	t.Insert(content)
}

// InsertStr inserts the bytes of content as a key by forwarding to Insert.
func (t *Trie) InsertStr(content string) {
	key := []byte(content)
	t.Insert(key)
}

// InsertInt inserts content as a 4-byte big-endian key.
//
// The value is converted to uint32 first, so only its low 32 bits are kept.
// The bytes are written directly with binary.BigEndian.PutUint32, which
// produces the same encoding as binary.Write did but has no error to ignore
// and no reflection on the insert path.
func (trie *Trie) InsertInt(content int) {
	var key [4]byte
	binary.BigEndian.PutUint32(key[:], uint32(content))
	trie.Insert(key[:])
}
// Insert stores content as a key, creating any missing nodes along its path,
// and records one more occurrence on the node where the key ends.
//
// It panics when content is empty. A nil root is replaced by a fresh node.
// size is incremented on every call, including repeated keys.
//
// The walk starts at the root and treats every byte the same way. The earlier
// version special-cased the first byte and, when that byte already existed
// under the root, skipped the end-of-key bookkeeping: inserting "ab" and then
// "a" left "a" unmarked and uncounted.
func (trie *Trie) Insert(content []byte) {
	if len(content) == 0 {
		panic("error!")
	}
	if trie.root == nil {
		trie.root = &trieNode{
			isWord:       false,
			count:        0,
			subTrieNode:  make([]*trieNode, 0),
			subTrieIndex: make([]*trieIndex, 0),
		}
	}
	trie.size++

	cur := trie.root
	for _, v := range content {
		next := cur.getByKey(v)
		if next == nil {
			// No child for this byte yet: create it and hang it under cur.
			next = &trieNode{
				isWord:       false,
				count:        0,
				subTrieNode:  make([]*trieNode, 0),
				subTrieIndex: make([]*trieIndex, 0),
			}
			cur.inserSubTrie(v, next)
		}
		cur = next
	}

	// cur is now the node of the last byte, whether new or pre-existing.
	cur.isWord = true
	cur.count++
}

// ContainsStr reports whether content is stored as a key; it converts the
// string to bytes and forwards to contains.
func (t *Trie) ContainsStr(content string) bool {
	key := []byte(content)
	return t.contains(key)
}

// contains reports whether content is stored as a complete key, i.e. every
// byte can be followed from the root and the final node is marked as a word.
//
// It panics when the root is nil. The empty key is reported as absent:
// Insert refuses to store it, yet the earlier version returned true for it
// because its loop body never ran.
func (trie *Trie) contains(content []byte) bool {
	if trie.root == nil {
		panic("error nil")
	}
	if len(content) == 0 {
		return false
	}

	cur := trie.root
	for _, v := range content {
		cur = cur.getByKey(v)
		if cur == nil {
			// The path breaks here, so the key was never inserted.
			return false
		}
	}
	// Reaching the node is not enough: it may only be a prefix of other keys.
	return cur.isWord
}

// PrintAllBytes prints the keys collected by deepSearchx from the root, with
// deep passed through as the depth limit.
//
// When count is 0 every key is printed as a byte slice with its counter.
// Otherwise only keys whose counter equals count are printed, followed by the
// key rendered as a string. It panics when the root is nil.
func (trie *Trie) PrintAllBytes(deep int, count int) {
	if trie.root == nil {
		panic("root nil")
	}

	res := make([][]byte, 0)
	deepSearchx(trie.root, deep, nil, &res)

	for _, rec := range res {
		// Each record is the key followed by a 4-byte counter.
		split := len(rec) - 4
		key := rec[:split]
		occurrences := BytesToInt(rec[split:])

		switch {
		case count == 0:
			fmt.Println("Key:", key, "計數:", occurrences)
		case occurrences == count:
			fmt.Println("Key:", key, "計數:", occurrences, string(key))
		}
	}
}
// IntToBytes returns n encoded as 4 big-endian bytes.
//
// The bytes are written directly with binary.BigEndian.PutUint32. This yields
// the same output as the former bytes.Buffer + binary.Write combination, but
// there is no error return to ignore and no reflection involved.
func IntToBytes(n uint32) []byte {
	out := make([]byte, 4)
	binary.BigEndian.PutUint32(out, n)
	return out
}
// BytesToInt decodes the first 4 bytes of b as a big-endian uint32 and returns
// it as an int. Bytes after the fourth are ignored.
func BytesToInt(b []byte) int {
	var value uint32
	// The read error is dropped, as before: on a failed read (input shorter
	// than 4 bytes) value keeps its zero value and 0 is returned.
	_ = binary.Read(bytes.NewReader(b), binary.BigEndian, &value)
	return int(value)
}

// PrintAllStr prints every key collected by deepSearchx from the root as a
// string together with its counter; deep is passed through as the depth limit.
// It panics when the root is nil.
func (trie *Trie) PrintAllStr(deep int) {
	if trie.root == nil {
		panic("root nil")
	}

	res := make([][]byte, 0)
	deepSearchx(trie.root, deep, nil, &res)

	for _, rec := range res {
		// Each record is the key followed by a 4-byte counter.
		split := len(rec) - 4
		fmt.Println("Key:", string(rec[:split]), "計數:", BytesToInt(rec[split:]))
	}
}


// deepSearchx walks the subtree under root depth-first and appends one record
// to *buf for every node marked as a word. A record is the path in res
// followed by the node's counter as 4 big-endian bytes.
//
// deep limits how many levels below root are visited: 0 stops the descent,
// and a negative value never reaches 0, so it imposes no limit. The function
// panics if res contains a zero byte.
func deepSearchx(root *trieNode, deep int, res []byte, buf *[][]byte) {
	for i := range res {
		if res[i] == 0 {
			panic("error")
		}
	}

	if root.isWord {
		record := make([]byte, 0)
		record = append(record, res...)
		record = append(record, IntToBytes(root.count)...)
		*buf = append(*buf, record)
	}

	// Depth budget used up: do not descend into the children.
	if deep == 0 {
		return
	}

	for i := range root.subTrieNode {
		key, next := root.getByIndex(i)
		// Every child gets its own copy of the path so siblings never share
		// a backing array.
		path := make([]byte, 0)
		path = append(append(path, res...), key)
		deepSearchx(next, deep-1, path, buf)
	}
}
// deepSearchforDelete prints one "clear:" line for every childless node in
// the subtree under root. The printed text is k followed by the path bytes
// accumulated in res on the way down. It is used for display only.
func deepSearchforDelete(root *trieNode, k byte, res []byte) {
	if len(root.subTrieNode) == 0 {
		fmt.Println("clear:", string(k)+string(res))
		return
	}
	for i := range root.subTrieNode {
		key, next := root.getByIndex(i)
		// Copy the path so sibling branches do not overwrite each other.
		path := make([]byte, 0, 0)
		path = append(append(path, res...), key)
		deepSearchforDelete(next, k, path)
	}
}

// Delete removes one occurrence of content and reports whether the key was
// present. It panics when the root is nil.
//
// The counter of the key's node is decremented; when it reaches zero the node
// stops being a word and Adjust prunes nodes that became useless. size is
// decremented to mirror the increment Insert performs per call.
//
// Fixes over the earlier version:
//   - an empty key returns false instead of dereferencing a nil node;
//   - the counter is decremented on the last occurrence too, so a later
//     re-insert starts from 1 again instead of 2;
//   - size no longer stays stale after a deletion;
//   - the full-tree Adjust only runs when a word actually disappeared.
func (trie *Trie) Delete(content string) bool {
	if trie.root == nil {
		panic("error nil")
	}
	if len(content) == 0 {
		return false
	}

	cur := trie.root
	for i := 0; i < len(content); i++ {
		cur = cur.getByKey(content[i])
		if cur == nil {
			// The path breaks here: the key is not stored.
			return false
		}
	}
	// The node exists but is only a prefix of other keys.
	if !cur.isWord {
		return false
	}

	// Guard against underflow of the unsigned counter.
	if cur.count > 0 {
		cur.count--
	}
	if trie.size > 0 {
		trie.size--
	}
	if cur.count == 0 {
		cur.isWord = false
		trie.Adjust() // drop nodes that no longer lead to any word
	}
	return true
}


// Adjust prunes nodes left behind by deletions by running adjust over the
// whole trie, starting at the root with depth 0.
func (t *Trie) Adjust() {
	top := t.root
	adjust(top, top, 0, 0)
}
// adjust prunes dead branches below root. A dead branch ends in a node that
// has no children and is not a word.
//
// ptr and key name the anchor: the nearest ancestor that must survive and the
// byte under it that leads towards root. When a dead leaf is found, the whole
// chain hanging under ptr at key is printed via deepSearchforDelete and then
// detached. deep is the depth of root; 0 means root is the top of the walk.
//
// Fixes over the earlier version:
//   - the anchor is also reset at nodes with more than one child; before,
//     pruning one dead leaf removed every sibling branch that shared the same
//     word-free prefix (deleting "abc" also destroyed "abd");
//   - nothing is done when the anchor has no child for key, which used to
//     panic on an empty trie;
//   - children are visited from last to first so that detaching the current
//     child cannot make the loop skip a sibling.
func adjust(root *trieNode, ptr *trieNode, key byte, deep int) {
	if len(root.subTrieNode) == 0 {
		if root.isWord {
			// A childless word is still live data.
			return
		}
		doomed := ptr.getByKey(key)
		if doomed == nil {
			// Nothing hangs under the anchor for this key (e.g. empty trie).
			return
		}
		deepSearchforDelete(doomed, key, nil)
		ptr.delTrieNode(key)
		return
	}

	// Decided before the loop: removing one child must not change how the
	// remaining children are anchored.
	keepsRoot := root.isWord || len(root.subTrieNode) > 1

	for i := len(root.subTrieNode) - 1; i >= 0; i-- {
		k, subnode := root.getByIndex(i)

		anchor, anchorKey := ptr, key
		if deep == 0 {
			// At the top of the walk the removable chain starts at this child.
			anchorKey = k
		}
		if keepsRoot {
			// root survives regardless of this child, so only the chain
			// starting at k may be removed.
			anchor, anchorKey = root, k
		}
		adjust(subnode, anchor, anchorKey, deep+1)
	}
}

// Match is declared for matching a string against the trie.
// NOTE(review): the body is empty, so calling it currently does nothing.
func(trie *Trie) Match(content string){
	// No matching logic has been written here yet.

}

// Suggess looks up the node reached by following content from the root and
// prints how many stored keys lie at or below it, i.e. how many keys start
// with content.
//
// It prints "no srarch" when the prefix is not in the trie, prints nothing for
// an empty prefix, and panics when the root is nil.
//
// The earlier version rebuilt every full key into a slice that was never
// used; that loop also shadowed the builtin new and appended onto a shared
// backing array. It has been removed; the printed output is unchanged.
func (trie *Trie) Suggess(content string) {
	if trie.root == nil {
		panic("error nil")
	}
	if len(content) == 0 {
		return
	}

	cur := trie.root
	for i := 0; i < len(content); i++ {
		cur = cur.getByKey(content[i])
		if cur == nil {
			fmt.Println("no srarch")
			return
		}
	}

	// Each collected record is a key suffix followed by a 4-byte counter;
	// the full key would be content + record[:len(record)-4].
	res := make([][]byte, 0)
	deepSearchx(cur, -1, nil, &res)
	fmt.Println("查詢到:", len(res))
}
// main builds a trie from a few sample keys plus every space-separated token
// of a text file, prints how long loading took, and then answers membership
// queries typed on standard input until input ends.
//
// Fixes over the earlier version:
//   - the input file is closed once loading is done;
//   - lines are read with ReadString, so long lines are no longer split in the
//     middle of a token, and a read error other than EOF stops the program
//     instead of looping forever;
//   - the typed query is trimmed before the lookup (it used to carry the
//     trailing newline and therefore never matched);
//   - a single stdin reader is reused and the loop ends on EOF;
//   - the unreachable Sleep after the endless loop is gone.
func main() {
	t := NewTrie()
	t.InsertStr("mriemap")
	t.InsertStr("mriema")
	t.InsertStr("tEST00001")
	t.InsertStr("tEST0000")
	t.InsertStr("tEST0001")
	t.InsertStr("zEST0")

	now := time.Now()
	file, err := os.Open("/Users/qiao/go/src/qqsort/trie/CSDNpass.txt")
	if err != nil {
		panic(err)
	}
	rf := bufio.NewReader(file)
	for {
		line, err := rf.ReadString('\n')
		// A final line without a newline arrives together with io.EOF, so the
		// data is processed before the error is inspected.
		for _, m := range strings.Split(line, " ") {
			val := strings.TrimSpace(m)
			if len(val) == 0 {
				continue
			}
			t.InsertStr(val)
		}
		if err == io.EOF {
			break
		}
		if err != nil {
			panic(err)
		}
	}
	// The file was only read; a close error carries no data-loss risk here.
	_ = file.Close()

	fmt.Println(time.Since(now).Milliseconds(), "毫秒")

	fmt.Println("key個數", t.size)

	// One reader for the whole session so buffered input is never discarded.
	reader := bufio.NewReader(os.Stdin)
	for {
		fmt.Println("請輸入一個字符串:")
		str, err := reader.ReadString('\n')
		if err != nil && len(str) == 0 {
			// Input is exhausted (or unreadable) and nothing is left to answer.
			return
		}
		query := strings.TrimSpace(str)
		fmt.Println("當前Trie樹單詞數:", t.size)
		nowx := time.Now()
		fmt.Println("是否包含:", t.ContainsStr(query))
		fmt.Println(time.Since(nowx).Microseconds(), "us")
		if err != nil {
			// The last line had no newline; it was answered, now stop.
			return
		}
	}
}

Trie樹魔改實現千萬數據秒級排序

下面再使用,相對較長的幾乎沒有重複的文本800萬數據讀取做了下測試,此時性能就下降很多。 800萬就用了 20秒左右,並且此時內存佔用,高達1.78G。

總結:文本重複較高,較短的情況下適合使用 Trie 來排序 Trie 的優勢是查詢比較快查詢一個文本是否包含在Trie 中只需要幾十微秒以內。

一文帶你快速瞭解編譯原理

KMP 算法筆記

Trie樹魔改實現千萬數據秒級排序

布隆過濾器原理應用場景推導及Go實現

王爽第三版彙編實驗題實驗(四五)

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

Trie樹魔改 實現千萬數據 秒級排序

下面再使用,相對較長的 幾乎沒有重複的文本800萬數據讀取做了下測試,此時性能就下降很多。 800萬就用了 20秒左右,並且此時 內存佔用,高達1.78G。

總結:文本重複較高,較短的情況下 適合使用 Trie 來排序 Trie 的優勢 是查詢比較快 查詢一個文本是否包含在Trie 中只需要 幾十微秒以內。

Trie樹魔改實現千萬數據秒級排序

下面再使用,相對較長的幾乎沒有重複的文本800萬數據讀取做了下測試,此時性能就下降很多。 800萬就用了 20秒左右,並且此時內存佔用,高達1.78G。

總結:文本重複較高,較短的情況下適合使用 Trie 來排序 Trie 的優勢是查詢比較快查詢一個文本是否包含在Trie 中只需要幾十微秒以內。