（15）字符串

一、統計文本中各字符串出現的個數

利用map容器，代碼實現如下所示：

#include <iostream>
#include <map>
using namespace std;
// Counts how often each string occurs in a fixed five-element sample and
// prints one "<word ,count>" line per distinct string, in key order.
int main()
{
	const std::string samples[5] = {"abc", "dd", "abc", "dd", "dd"};
	std::map<std::string, int> tally;

	// operator[] creates a missing key with value 0 before it is incremented.
	for (int idx = 0; idx != 5; ++idx)
		++tally[samples[idx]];

	std::map<std::string, int>::const_iterator it = tally.begin();
	while (it != tally.end()) {
		std::cout << "<" << it->first << " ," << it->second << ">" << std::endl;
		++it;
	}
	return 0;
}

二、通過計算字符串的散列值並利用散列表來統計字符串個數

爲了減少處理時間，可以建立散列表。其中內存分配函數malloc 被改爲自定義更高效的 nmalloc和 smalloc。實現代碼如下：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* One entry of a singly linked chain: a word, how many times it has been
   seen, and a pointer to the next entry. */
typedef struct node node;
typedef node *nodeptr;
struct node {
	char *word;    /* the word */
	int count;     /* number of occurrences of the word */
	nodeptr next;  /* next entry in the chain */
};
#define NHASH 29989/* hash table size; per the original note, the Bible has 29131 distinct words and a prime close to that count was chosen */
#define MULT 31    /* multiplier used by the hash function */
nodeptr bin[NHASH];// the hash table: one chain head per bucket

/* Hash function: folds every character of the NUL-terminated string p into an
   unsigned accumulator (multiply by MULT, then add the character) and reduces
   the result modulo NHASH, so the returned value lies in [0, NHASH). */
unsigned int hash(char *p)
{
	unsigned int acc = 0;
	while (*p != '\0') {
		acc = MULT * acc + *p;
		++p;
	}
	return acc % NHASH;
}

#define NODEGROUP 1000
int nodesleft = 0;
nodeptr freenode;

nodeptr nmalloc()
{	if (nodesleft == 0) {
		freenode = malloc(NODEGROUP*sizeof(node));
		nodesleft = NODEGROUP;
	}
	nodesleft--;
	return freenode++;
}

#define CHARGROUP 10000  /* extra bytes requested from malloc on each refill */
int charsleft = 0;       /* unused bytes remaining in the current chunk */
char *freechar;          /* next unused byte in the current chunk */

/* Chunk allocator for strings: returns n bytes carved from a larger malloc'ed
   chunk and requests a new chunk of n+CHARGROUP bytes only when the current
   one cannot satisfy the request (what is left of the old chunk is abandoned).
   Storage is never released individually.
   Returns NULL for a negative n; prints a message and terminates the program
   when malloc fails. */
char *smalloc(int n)
{
	char *result;

	if (n < 0)
		return NULL;   /* a negative size would corrupt the bookkeeping below */
	if (charsleft < n) {
		freechar = (char *)malloc((size_t)n + CHARGROUP);
		if (freechar == NULL) {
			fprintf(stderr, "smalloc: out of memory\n");
			exit(EXIT_FAILURE);
		}
		charsleft = n + CHARGROUP;
	}
	result = freechar;
	freechar += n;
	charsleft -= n;
	return result;
}

/* Records one occurrence of the word s: if the word is already present in its
   bucket the stored count is incremented; otherwise a new entry with count 1
   and a private copy of the word is inserted at the head of the bucket's chain. */
void incword(char *s)
{
	int slot = hash(s);            /* bucket this word belongs to */
	nodeptr cur = bin[slot];
	nodeptr fresh;

	while (cur != NULL) {
		if (strcmp(s, cur->word) == 0) {
			cur->count += 1;
			return;
		}
		cur = cur->next;
	}

	/* Not found: allocate from the pooled allocators instead of plain malloc. */
	fresh = nmalloc();
	fresh->count = 1;
	fresh->word = smalloc(strlen(s)+1);
	strcpy(fresh->word, s);
	fresh->next = bin[slot];       /* head insertion */
	bin[slot] = fresh;
}

/* Reads whitespace-separated words from standard input, counts them in the
   hash table, then prints every stored word followed by its count, bucket by
   bucket. */
int main()
{	int i;
	nodeptr p;
	char buf[100];
	for (i = 0; i < NHASH; i++)   /* start with every bucket empty */
		bin[i] = NULL;
	/* The field width keeps an over-long token from overflowing buf
	   (99 characters plus the terminating NUL); a longer token is read
	   as several consecutive pieces. */
	while (scanf("%99s", buf) == 1)
		incword(buf);
	for (i = 0; i < NHASH; i++)   /* walk every non-empty chain */
		for (p = bin[i]; p != NULL; p = p->next)
			printf("%s %d\n", p->word, p->count);
	return 0;
}

三、利用set容器，排序輸出各個字符串（按字母表順序）

代碼如下：

#include <iostream>
#include <set>
#include <string>
using namespace std;

// Inserts a fixed five-element sample into a std::set and prints the distinct
// strings one per line, in the set's (sorted) iteration order.
int main()
{
	const std::string samples[5] = {"abc", "dd", "abc", "dd", "dd"};

	// Building the set from the range drops the duplicates, exactly as
	// inserting the elements one by one would.
	std::set<std::string> unique_words(samples, samples + 5);

	std::set<std::string>::const_iterator it = unique_words.begin();
	while (it != unique_words.end()) {
		std::cout << *it << "\n";
		++it;
	}
	return 0;
}

四、短語

輸出短語中的重複子串，例如char *a="banana"，其最長重複子串爲“ana”，通過以下兩種方式來解決問題：

方案一：雙重for循環比較每個字符串，找到最長重複子字符串（效率低，不可取）；

方案二：利用後綴數組，通過對其排序，然後比較相鄰字符串間相同的字符個數。最後得到文本文件最長的重複子字符串。

後綴數組說明如下：

char *a="banana";
a[0]=banana;
a[1]=anana;
a[2]=nana;
a[3]=ana;
a[4]=na;
a[5]=a;

方案二代碼實現如下：

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

/* Returns the length of the common prefix of the NUL-terminated strings
   p and q, i.e. the number of leading characters in which they agree. */
int comlen(char *p, char *q)
{
	int matched;
	for (matched = 0; p[matched] != '\0' && p[matched] == q[matched]; ++matched)
		;
	return matched;
}

/* M: distance between the two sorted suffixes that main compares (a[i] against a[i+M]). */
#define M 1
/* MAXN: capacity of the text buffer c and of the suffix pointer array a. */
#define MAXN 5000000
/* c holds the input text; a[i] is set by main to point at c[i], the start of the i-th suffix. */
char c[MAXN], *a[MAXN];

int main()
{   int i, ch, n = 0, maxi, maxlen = -1;
    while ((ch = getchar()) != EOF) {
        a[n] = &c[n];			//生成後綴數組
        c[n++] = ch;
    }
    c[n] = 0;
    qsort(a, n, sizeof(char *), pstrcmp);//快速排序 
    for (i = 0; i < n-M; i++)
        if (comlen(a[i], a[i+M]) > maxlen) {//比較相鄰字符串相同個數 
            maxlen = comlen(a[i], a[i+M]);  //記錄最大長度值
            maxi = i;
        }
    printf("%.*s\n", maxlen, a[maxi]);
    return 0;
}

五、生成文本

生成隨機文本的方法：1、基於字母：下一個字符設置爲前一個字母的隨機函數。或者是下一個字母是前n個字符的隨機函數；2、基於單詞：a隨機輸出字典中單詞，b隨機打開一頁選一個字母，再隨機打開另一頁找到第一個字母后的單詞爲輸出單詞。

六、原理

（1）字符串的數據結構，set、map、hash表等；

（2）散列，平均速度快，易於實現；

（3）平衡樹，C++標準模板庫的set和map的大部分實現就採用平衡樹；

（4）後綴數組，初始化指向文本中每個字符的指針數組，對其排序後得到後綴數組，再遍歷該數組以查找相近的字符串，也可以使用二分搜索查找單詞或短語。

七、習題

（1）習題3：採用專用的內存分配器來提高散列函數的速度，即一次申請多個內存空間。只有上一次用光時，再次申請。減少了申請內存次數，代碼如下：

#define NODEGROUP 1000
int nodesleft = 0;
nodeptr freenode;

nodeptr nmalloc()
{	if (nodesleft == 0) {
		freenode = malloc(NODEGROUP*sizeof(node));
		nodesleft = NODEGROUP;
	}
	nodesleft--;
	return freenode++;
}

#define CHARGROUP 10000  /* extra bytes requested from malloc on each refill */
int charsleft = 0;       /* unused bytes remaining in the current chunk */
char *freechar;          /* next unused byte in the current chunk */

/* Chunk allocator for strings: returns n bytes carved from a larger malloc'ed
   chunk and requests a new chunk of n+CHARGROUP bytes only when the current
   one cannot satisfy the request (what is left of the old chunk is abandoned).
   Storage is never released individually.
   Returns NULL for a negative n; prints a message and terminates the program
   when malloc fails. */
char *smalloc(int n)
{
	char *result;

	if (n < 0)
		return NULL;   /* a negative size would corrupt the bookkeeping below */
	if (charsleft < n) {
		freechar = (char *)malloc((size_t)n + CHARGROUP);
		if (freechar == NULL) {
			fprintf(stderr, "smalloc: out of memory\n");
			exit(EXIT_FAILURE);
		}
		charsleft = n + CHARGROUP;
	}
	result = freechar;
	freechar += n;
	charsleft -= n;
	return result;
}

（2）習題5：將單詞按頻率遞減的順序輸出，如何實現？若僅輸出M個最常見的單詞呢？

可在C++（map）程序中添加另一個映射，將一組單詞跟他們的計數聯繫起來。在C程序中，我們可以根據計數對數組進行排序，然後對其迭代（由於一些單詞的計數會比較大，數組應該比輸入文件小得多）。對於常見的文檔，我們可以用關鍵字索引，並保存一個在一定範圍內（如1~1000）的計數的鏈表數組，從而實現對M個常見單詞的輸出。

（3）習題8：如何找出出現超過M次的最長字符串。

子數組a[i...i+M]表示M+1個字符串。由於數組是有序的，我們可以通過在第一個和最後一個字符串上調用comlen函數來快速確定這M+1個字符串共有的字符數：comlen(a[i], a[i+M]);

（4）習題9：給定兩個輸入文本，找到他們共有的最長字符串。

當第一個字符串讀入數組c，記錄其結束的位置並在其最後填入空字符；然後讀入第二個字符串並進行相同的處理。跟以前一樣進行排序。掃描數組時，使用“異或”操作來確保恰有一個字符串是從過渡點前面開始的。

利用本章的方法：給出兩個字符串的後綴，標記每個後綴屬於第一個字符串還是第二個字符串。從相鄰的N個字符串中找公共字符串，並保證這N個字符串在兩個輸入中都有出現。代碼實現如下：

//求兩字符串的最長公共子串
#include<stdio.h>
#include<string.h>

// Returns the longest substring that occurs in both s1 and s2.
//
// s1, s2: NUL-terminated strings; a null pointer is treated as an empty string.
// Returns a newly allocated (new[]) NUL-terminated copy of the substring, which
// the caller must release with delete[]. When several common substrings share
// the maximal length, the first one found while scanning s1 and then s2 from
// left to right is returned. With no common character the result is "".
char * maxsamesubstring(char *s1,char *s2)
{
    int maxlen=0;     // length of the longest common substring found so far
    int maxindex=0;   // where that substring starts in s2

    if(s1!=NULL&&s2!=NULL)
        for(int i=0;s1[i]!='\0';i++)
            for(int j=0;s2[j]!='\0';j++)
            {
                // Extend the match that starts at s1[i] / s2[j]. Equality with a
                // non-NUL s1 character implies s2 has not ended either.
                int len=0;
                while(s1[i+len]!='\0'&&s1[i+len]==s2[j+len])
                    len++;
                if(len>maxlen)
                {
                    maxlen=len;
                    maxindex=j;
                }
            }

    char *p=new char[maxlen+1];
    if(maxlen>0)
        memcpy(p,s2+maxindex,maxlen);  // copy the common substring out of s2
    p[maxlen]='\0';                    // terminator goes at index maxlen, the last allocated byte
    return p;

}

// Demo driver: prints the longest common substring of two sample strings.
int main()
{
    // Writable arrays rather than pointers to string literals: maxsamesubstring
    // takes char*, and standard C++ does not allow a literal to bind to a
    // non-const char*.
    char s1[]="president hujintao",s2[]="times jin";
    char *sub;

    sub=maxsamesubstring(s1,s2);

    printf("%s\n",sub);

    delete[] sub;   // the result is allocated with new[] by maxsamesubstring

    return 0;

}

（5）習題14：使用散列函數對馬爾科夫程序提速：

下面的函數對k個單詞組成的序列進行了散列，其中每個單詞都以空字符結束：

unsigned int hash(char* p)
{
    unsigned int h = 0;
    int n;
    for(n = k; n > 0; p++)
    {
        h = NULT * h + *p;
        if(*p == 0)
            n--;
     }
     return h % NHASH;
}

可用這個散列函數取代馬爾科夫文本生成算法中的二分搜索，使得平均時間從O(nlogn)降到了O(n)。該程序在散列表中爲元素使用了鏈表表示法，只增加了nwords個32位整數的額外空間，其中nwords是輸入中的單詞個數。

修改後的馬爾科夫程序如下所示：

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Number of buckets in the hash table (size of bin).
#define NHASH 49979
// Multiplier used by the hash function.
#define MULT 31
// Capacity of the word and next arrays.
#define MAXWORDS 80000
char inputchars[4300000];// storage for the raw input text
char *word[MAXWORDS];// word[i] points at the start of the i-th word inside inputchars
int nword=0;// number of words read so far
int k=2;// order: how many consecutive words are hashed/compared as one unit
int next[MAXWORDS];// next[i] is the index of the next word in the same hash chain; main ends chains with -1
// bin[h] is the index of the first word in chain h; main initialises every entry to -1.
int bin[NHASH];
// Hashes the k consecutive NUL-terminated words that start at str. Every
// character, including each word's terminating NUL, is folded into the
// accumulator; scanning stops once k terminators have been consumed.
// The result lies in [0, NHASH).
unsigned int hash(char* str){
	unsigned int acc=0;
	int remaining=k;
	char* cur=str;
	while(remaining>0){
		acc=MULT*acc+*cur;
		if(*cur=='\0')
			--remaining;
		++cur;
	}
	return acc%NHASH;
}
// Compares the first k NUL-terminated words starting at p with those starting
// at q. Returns 0 when all k words (terminators included) are identical,
// otherwise the difference of the first pair of characters that differ.
int wordncmp(char* p,char *q){
	int remaining=k;
	while(*p==*q){
		if(*p=='\0'){
			--remaining;
			if(remaining==0)
				return 0;
		}
		++p;
		++q;
	}
	return *p-*q;
}
// Starting at p, steps over n NUL-terminated words and returns a pointer to
// the character right after the n-th terminator. For n <= 0 it returns p.
char* skip(char* p,int n){
	while(n>0){
		if(*p++=='\0')
			--n;
	}
	return p;
}
 

int main(){
	int i,j;
	//步驟1：構建後綴數組
	word[0]=inputchars;
	//scanf以空格作爲分隔符, 並且自動加上'\0'
	while((scanf("%s",word[nword]))!=EOF){
		word[nword+1]=word[nword]+strlen(word[nword])+1;
		++nword;
	}
	//附加k個空字符,保證wordncmp()正確（感覺不需要這個）
	for(i=0;i<k;++i)
		word[nword][i]='\0';
	//步驟2：構建hash table
	//初始化hash table
	for(i=0;i<NHASH;++i)
		bin[i]=-1;
	//hash表採用前插的方式。例如：word[0], word[1], word[5]擁有相同的hash值15
	//則：  bin[15](5)->next[5](1)->next[1](0)->next[0](-1) 
	for(i=0;i<=nword-k;++i)	{
		j=hash(word[i]);
		next[i]=bin[j];
		bin[j]=i;
	}
	//步驟3：生成隨機文本
	int wordsleft;//生成單詞數
	int psofar;
	char *phrase,*p;
	phrase=inputchars;
	for(wordsleft=10000;wordsleft>0;--wordsleft){
		psofar=0;        
		for(j=bin[hash(phrase)];j>=0;j=next[j])
              //在hash值相同的項中找出字符串值相同的後綴數組表項，根據概率選擇一個
			if(wordncmp(phrase,word[j])==0&&rand()%(++psofar)==0)
				p=word[j];
		//將phrase重新設置
		phrase=skip(p,1);
		//輸出符合要求單詞的後面第k個單詞
		if(strlen(skip(phrase,k-1))==0)
			break;
		printf("%s\n",skip(phrase,k-1));
	}
	return 0;    
}

（5）RARP：逆地址解析協議

有關Linux文件描述符中的close on exec標誌位

（9）IP選路

Linux下管道使用的一些限制

（10）動態選路協議

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結