/* html2txt h2t.c */

/*//////////////////////

文件名:	h2t.c v0.2

作者:	蘇曉(suxiaojack)

日期:	2008.7

用途:	轉換HTML內容爲TXT文本

許可 ( License ):	GPL



v0.2

 處理Bug

1、修正無法識別&#數字;問題 UNICODE=>GB2312

2、添加&copy;和&reg;處理

3、修正&處理死循環.



v0.1



//////////////////////*/



#include <ctype.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <windows.h>



/*
 * Convert one 16-bit Unicode code unit to a multibyte character.
 *
 * pOut  - destination; must have room for at least two bytes. Nothing is
 *         NUL-terminated here, so callers that print the result as a string
 *         must zero the buffer first (num2txt does).
 * uData - the code unit to convert.
 *
 * Despite the name, the conversion targets CP_ACP (the system ANSI code
 * page), so the result is only GB2312/GBK when that is the system code page.
 *
 * The original definition relied on implicit int and ended with a bare
 * "return;", which is invalid since C99; the function never produced a value,
 * so it is now declared void.
 */
void UnicodeToGB2312(char* pOut,unsigned short uData)
{
	/* dwFlags is a DWORD, not a pointer, so pass 0 rather than NULL.
	 * The output size is a byte count: two bytes hold one double-byte
	 * character. */
	WideCharToMultiByte(CP_ACP,0,&uData,1,pOut,2,NULL,NULL);
}



/* Size in bytes of the line buffer handed to fgets. Parenthesized so the
 * macro expands safely inside larger expressions (e.g. x / BUFSIZE). */
#define BUFSIZE (1024*1024*2)

/* Entire input document, NUL-terminated (filled by read2buf). */
char buf[1024*1024*20];

/* Lower-cased copy of buf, used for case-insensitive tag searches. */
char shadowbuf[1024*1024*20];

/* Scratch buffer for one fgets call in read2buf. */
char buffer[BUFSIZE];

/* Number of bytes stored in buf, excluding the terminator. */
long size;

/* Newline style seen in the input, set by h2t and read by printline:
 * 0 = not seen yet, 1 = CR, 2 = CR LF, 3 = LF. */
int type=0;



/* Number of entries in toc: entity/replacement pairs, so always even. */
#define tocsize 14

/*
 * Named character entities and the text printed in their place, stored as
 * consecutive pairs: toc[2k] is the entity, toc[2k+1] its replacement.
 * There are far more entities than this; these are the commonly used ones.
 * The Windows console cannot display symbols such as the copyright and
 * registered-trademark signs, hence the textual replacements for them.
 *
 * The replacement for &quot; must be the escaped double quote "\""; the
 * previous "/"" ended the string literal early and broke the initializer.
 */
char* toc[tocsize]={
	"&nbsp;"," ",
	"&lt;","<",
	"&gt;",">",
	"&quot;","\"",
	"&amp;","&",
	"&copy;","◎版權",
	"&reg;","◎註冊"
};



/*
 * Print the program banner and command-line help to stdout.
 *
 * argv - the program's argument vector; only argv[0] (the program name) is
 *        read.
 *
 * Every line now ends in a real newline escape "\n"; the previous "/n"
 * was printed literally and ran all the help text together on one line.
 */
void usage(char** argv)
{
	const char *us="用來轉換html =>txt. ver0.2\n"
			"suxiaojack寫於2008.7\n";
	const char *ue="tstart_in_tag_text:開始的Tag標記中的特徵文字,好理解end_in_tag_text了。\n"
	"jump_num:跳過幾次開始找到的,默認爲0.\n"
	"注意不支持正則式!未曾處理水印文字。\n";

	printf("%s",us);
	printf("使用方法:%s <file> [ <start_in_tag_text> [jump_num] <end_in_tag_text> ] \n",argv[0]);
	printf("%s",ue);
}



/*
 * Quick bounded substring search: look for f within the first 15 characters
 * of s only.
 *
 * s - text to search; must be NUL-terminated.
 * f - NUL-terminated string to look for.
 *
 * Returns the offset of the first occurrence of f inside that 15-character
 * window (0 means "s starts with f"), or -1 when f does not occur there.
 *
 * Fixes over the previous version:
 *  - the copy stops at the terminator of s instead of always reading 15
 *    bytes, so short strings are no longer over-read;
 *  - a failed search returns -1 instead of subtracting from a NULL pointer,
 *    which was undefined behaviour. Callers only test for zero/non-zero.
 */
int ministrstr(char* s,char* f)
{
	char minibuf[16];
	char* hit;
	size_t n=0;

	while(n<15 && s[n]!='\0')
	{
		minibuf[n]=s[n];
		n++;
	}
	minibuf[n]='\0';

	hit=strstr(minibuf,f);
	return hit ? (int)(hit-minibuf) : -1;
}



/*
 * Case-insensitive variant of ministrstr: lower-case the first 15 characters
 * of s, then search that window for f.
 *
 * s - text to search; must be NUL-terminated. It is not modified.
 * f - NUL-terminated string to look for; it is compared as given, so it
 *     must already be lower case.
 *
 * Returns the offset of the first occurrence of f inside the window
 * (0 means "s starts with f"), or -1 when there is none.
 *
 * Fixes over the previous version:
 *  - the copy stops at the terminator of s instead of always reading 15
 *    bytes;
 *  - a failed search returns -1 instead of subtracting from a NULL pointer
 *    (undefined behaviour). Callers only test for zero/non-zero;
 *  - the non-standard strlwr is replaced by the standard tolower.
 */
int ministrstri(char* s,char *f)
{
	char minibuf[16];
	char* hit;
	size_t n=0;

	while(n<15 && s[n]!='\0')
	{
		/* The cast keeps bytes >= 0x80 (double-byte text) valid for tolower. */
		minibuf[n]=(char)tolower((unsigned char)s[n]);
		n++;
	}
	minibuf[n]='\0';

	hit=strstr(minibuf,f);
	return hit ? (int)(hit-minibuf) : -1;
}



// 等標記轉換

/*
 * Translate the named character entity at the start of streamstart.
 *
 * The toc table is walked pair by pair. When streamstart begins with one of
 * the listed entities, its replacement text is printed and the length of the
 * entity is returned. Otherwise a literal "&" is printed and 1 is returned.
 *
 * streamstart - text positioned on an '&'.
 *
 * Returns the number of input characters consumed (always at least 1).
 */
int isintoc(char* streamstart)
{
	int consumed=0;
	int idx;

	/* Even slots hold entity names, odd slots their replacements. */
	for(idx=0;idx<tocsize;idx+=2)
	{
		if(ministrstr(streamstart,toc[idx])==0)
		{
			printf("%s",toc[idx+1]);
			consumed=strlen(toc[idx]);
			break;
		}
	}

	if(consumed!=0)
	{
		return consumed;
	}

	/* Not a known entity: pass the ampersand through unchanged. */
	printf("&");
	return 1;
}





/*
 * Print the character named by a decimal numeric character reference.
 *
 * numstart - text positioned just after "&#", i.e. on the first digit.
 *
 * Returns the number of characters consumed from numstart: the digits plus
 * the terminating ';' when one is present.
 *
 * Changes from the previous version:
 *  - the digits are accumulated directly instead of being copied into a
 *    fixed 256-byte buffer that a long digit run could overflow, and the
 *    value can no longer overflow as it could with atoi;
 *  - a value above 0xFFFF does not fit the 16-bit converter and is printed
 *    as "?" instead of being silently truncated;
 *  - only a real ';' is skipped after the digits; previously the following
 *    character was always swallowed, even a NUL terminator;
 *  - when no digit follows "&#", the two characters are printed back
 *    literally and 0 is returned, so nothing from the input is lost.
 */
int num2txt(char* numstart)
{
	char* s=numstart;
	unsigned long value=0;
	char os[3]={0,0,0};	/* at most two output bytes plus terminator */

	while(*s>='0' && *s<='9')
	{
		/* Stop accumulating once out of range so value cannot overflow. */
		if(value<=0xFFFFUL)
		{
			value=value*10+(unsigned long)(*s-'0');
		}
		s++;
	}

	if(s==numstart)
	{
		printf("&#");
		return 0;
	}

	if(value>0xFFFFUL)
	{
		printf("?");
	}
	else
	{
		UnicodeToGB2312(os,(unsigned short)value);
		printf("%s",os);
	}

	if(*s==';')
	{
		s++;
	}
	return (int)(s-numstart);
}



/*
 * Load the whole of fp into the global buf, and a lower-cased copy into
 * shadowbuf.
 *
 * fp - open, readable stream.
 *
 * On return, size holds the number of bytes stored and both buffers are
 * NUL-terminated. Input that does not fit in buf is truncated.
 *
 * Changes from the previous version:
 *  - the loop is driven by the result of fgets. The old "while(!feof(fp))"
 *    form ran one extra iteration after the last successful read and
 *    appended the stale contents of buffer, duplicating the last line;
 *  - writes into buf are bounded by its capacity;
 *  - the non-standard strlwr is replaced by the standard tolower.
 */
void read2buf(FILE* fp)
{
	const size_t capacity=sizeof(buf)-1;	/* keep room for the terminator */
	size_t used=0;
	size_t i;

	buf[0]=0;
	while(used<capacity && fgets(buffer,sizeof(buffer),fp)!=NULL)
	{
		size_t chunk=strlen(buffer);
		if(chunk>capacity-used)
		{
			chunk=capacity-used;
		}
		memcpy(buf+used,buffer,chunk);
		used+=chunk;
	}
	buf[used]=0;
	size=(long)used;

	memcpy(shadowbuf,buf,used+1);
	for(i=0;i<used;i++)
	{
		shadowbuf[i]=(char)tolower((unsigned char)shadowbuf[i]);
	}
}



/*
 * Locate the start of the region to convert.
 *
 * start - text expected inside the opening tag. It is lower-cased in place
 *         before the search, as before.
 * jump  - number of earlier matches to skip; negative values are treated
 *         as 0.
 *
 * Searches shadowbuf for the (jump+1)-th occurrence of start, then moves
 * forward to the '>' that closes that tag.
 *
 * Returns the index of the first character after that '>', or -1 when the
 * text, or a following '>', cannot be found.
 *
 * Changes from the previous version:
 *  - a failed strstr is tested against NULL instead of subtracting pointers
 *    from NULL (undefined behaviour);
 *  - the scan for '>' stops at the terminator instead of running off the
 *    end of the buffer;
 *  - the non-standard strlwr is replaced by the standard tolower.
 */
int findstart(char* start,int jump)
{
	char* pos=shadowbuf;
	char* p;

	for(p=start;*p!='\0';p++)
	{
		*p=(char)tolower((unsigned char)*p);
	}
	if(jump<0)
	{
		jump=0;
	}

	do
	{
		pos=strstr(pos,start);
		if(pos==NULL)
		{
			return -1;
		}
		pos++;	/* resume after this match when another one is wanted */
	}while(jump--);

	while(*pos!='\0' && *pos!='>')
	{
		pos++;
	}
	if(*pos=='\0')
	{
		return -1;
	}
	return (int)(pos+1-shadowbuf);
}

/*
 * Locate the end of the region to convert.
 *
 * end   - text expected inside the closing tag. It is lower-cased in place
 *         before the search, as before.
 * start - index in shadowbuf at which to begin searching.
 *
 * Finds the first occurrence of end at or after start, then moves backwards
 * to the '<' that opens the tag containing it.
 *
 * Returns the index of the character immediately before that '<', or -1
 * when the text or a preceding '<' cannot be found. A '<' at index 0 also
 * yields -1, exactly as it did before.
 *
 * Changes from the previous version:
 *  - a failed strstr is tested against NULL instead of subtracting pointers
 *    from NULL (undefined behaviour);
 *  - the backward scan stops at the first byte of shadowbuf instead of
 *    walking off the front of the buffer;
 *  - the non-standard strlwr is replaced by the standard tolower.
 */
int findend(char* end,int start)
{
	char* pos;
	char* p;

	for(p=end;*p!='\0';p++)
	{
		*p=(char)tolower((unsigned char)*p);
	}

	pos=strstr(shadowbuf+start,end);
	if(pos==NULL)
	{
		return -1;
	}

	while(pos>shadowbuf && *pos!='<')
	{
		pos--;
	}
	if(*pos!='<')
	{
		return -1;
	}
	return (int)(pos-shadowbuf)-1;
}



/*
 * Print one line break in the newline style recorded in the global type:
 * 1 = CR, 2 = CR LF, 3 = LF. Any other value (including 0, "not seen yet")
 * prints nothing.
 *
 * The escapes must use a backslash. The previous '/r' and '/n' were
 * multi-character constants, so the wrong bytes were printed.
 */
void printline()
{
	switch(type)
	{
		case 1:
			printf("%c",'\r');
			break;
		case 2:
			printf("%s","\r\n");
			break;
		case 3:
			printf("%c",'\n');
			break;
		default:
			break;
	}
}



/*
 * Skip a raw-text element (<script> or <style>) including its closing tag.
 *
 * p        - points at the '<' of the opening tag.
 * end      - one past the last character that may be examined.
 * closetag - lower-case prefix of the closing tag, e.g. "</script".
 *
 * Returns a pointer just past the '>' of the closing tag, or end when the
 * element is not closed within the range.
 */
static char* h2t_skip_raw_element(char* p,char* end,char* closetag)
{
	p++;	/* step over the '<' of the opening tag */
	for(;;)
	{
		while(p<end && *p!='<')
		{
			p++;
		}
		if(p>=end)
		{
			return end;
		}
		if(ministrstri(p,closetag)==0)
		{
			break;
		}
		p++;	/* some other '<' inside the element: keep looking */
	}
	while(p<end && *p!='>')
	{
		p++;
	}
	if(p<end)
	{
		p++;
	}
	return p;
}

/*
 * Convert HTML to plain text and print it to stdout.
 *
 * s   - start of the HTML to convert.
 * len - number of characters of s to process. s[len] must still be a
 *       readable byte, because one character of look-ahead is used.
 *
 * Behaviour:
 *  - text outside tags is printed unchanged;
 *  - "&#NNN;" references go through num2txt, other "&..." entities through
 *    isintoc;
 *  - <script> and <style> elements are dropped together with their content;
 *  - </br>, </p>, <br>, <br/> and <br /> produce a line break via
 *    printline; every other tag is dropped;
 *  - the first CR or LF met in the input sets the global type
 *    (1 = CR, 2 = CR LF, 3 = LF), which selects what printline emits.
 *
 * Changes from the previous version:
 *  - '\r' and '\n' are real escapes. The old '/r' and '/n' were
 *    multi-character constants that never matched, so type stayed 0 and
 *    printline never printed anything;
 *  - the scans for '>' after </script and </style are bounded by len;
 *  - the goto loops for script/style are replaced by one helper;
 *  - the self-closing forms <br/> and <br /> are recognised.
 */
void h2t(char* s,int len)
{
	char* ss=s;
	char* end;

	if(len<=0)
	{
		return;
	}
	end=s+len;

	while(ss<end)
	{
		/* Detect the document's newline style once. */
		if(type==0 && (*ss=='\r' || *ss=='\n'))
		{
			if(*ss=='\r' && *(ss+1)=='\n')
			{
				type=2;
			}
			else if(*ss=='\n')
			{
				type=3;
			}
			else
			{
				type=1;
			}
		}

		if(*ss!='<')
		{
			/* Plain text or a character reference. */
			if(*ss!='&')
			{
				printf("%c",*ss);
				ss++;
			}
			else if(*(ss+1)=='#')
			{
				ss+=2;
				ss+=num2txt(ss);
			}
			else
			{
				int used=isintoc(ss);
				/* Always advance, so a zero result cannot loop forever. */
				ss+=(used>0)?used:1;
			}
			continue;
		}

		if(!ministrstri(ss,"<script"))
		{
			ss=h2t_skip_raw_element(ss,end,"</script");
		}
		else if(!ministrstri(ss,"<style"))
		{
			ss=h2t_skip_raw_element(ss,end,"</style");
		}
		else if(!ministrstri(ss,"</br>"))
		{
			printline();
			ss+=5;
		}
		else if(!ministrstri(ss,"</p>"))
		{
			printline();
			ss+=4;
		}
		else if(!ministrstri(ss,"<br>"))
		{
			printline();
			ss+=4;
		}
		else if(!ministrstri(ss,"<br/>"))
		{
			printline();
			ss+=5;
		}
		else if(!ministrstri(ss,"<br />"))
		{
			printline();
			ss+=6;
		}
		else
		{
			/* Any other tag: drop everything up to and including '>'. */
			while(ss<end && *ss!='>')
			{
				ss++;
			}
			if(ss<end)
			{
				ss++;
			}
		}
	}
}



/*
 * Entry point.
 *
 *   h2t <file>
 *       convert the whole file;
 *   h2t <file> <start_in_tag_text> <end_in_tag_text>
 *       convert only the text between the tag containing the start text
 *       and the tag containing the end text;
 *   h2t <file> <start_in_tag_text> <jump_num> <end_in_tag_text>
 *       same, skipping the first jump_num matches of the start text.
 *
 * Any other argument count prints the usage text.
 *
 * Returns 0 on success or after printing usage for a bad argument count,
 * and 1 when the file cannot be opened or a marker cannot be found.
 *
 * Changes from the previous version:
 *  - messages end in a real "\n" instead of a literal "/n";
 *  - the three copy-pasted branches share one code path;
 *  - findend returns the index of the last character of the region, so the
 *    length handed to h2t is end-start+1. The old end-start dropped the
 *    final character, and a one-character region was rejected;
 *  - a file that cannot be opened now yields exit status 1 rather than 0,
 *    matching the other failure paths;
 *  - the file is closed as soon as it has been read, on every path.
 */
int main(int argc,char *argv[])
{
	FILE* fp;
	char* starttext;
	char* endtext;
	int start,end;
	int jump=0;

	if(argc!=2 && argc!=4 && argc!=5)
	{
		usage(argv);
		return 0;
	}

	fp=fopen(argv[1],"r");
	if(!fp)
	{
		usage(argv);
		return 1;
	}
	read2buf(fp);
	fclose(fp);

	if(argc==2)
	{
		h2t(buf,(int)size);
		return 0;
	}

	starttext=argv[2];
	if(argc==5)
	{
		jump=atoi(argv[3]);
		endtext=argv[4];
	}
	else
	{
		endtext=argv[3];
	}

	start=findstart(starttext,jump);
	if(start<0)
	{
		printf("can't find:%s\n",starttext);
		return 1;
	}

	end=findend(endtext,start);
	if(end<0)
	{
		printf("can't find:%s\n",endtext);
		return 1;
	}

	if(start<=end)
	{
		h2t(buf+start,end-start+1);
	}
	else
	{
		/* The end marker lies before the start marker. */
		usage(argv);
	}

	return 0;
}



/*

與noblank聯合使用

h2t filename.htm |noblank >out.txt

*/
/*
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章
*/