October 21
convert html to txt
html parser
C++;
本程序可以用來處理spider爬下來的網頁,提取其文本,去除html的tag;
本程序用於批量轉換某個文件夾下面所有的html文件;
另一方面是將html轉換爲txt
- October 21
- Html to Txt in C++
- convert html to txt
- html parser
- C++;
- 本程序可以用來處理spider爬下來的網頁,提取其文本,去除html的tag;
- 本程序用於批量轉換某個文件夾下面所有的html文件;
- 程序總共分爲2部分,一部分是列出該文件夾下所有的文件(用C++遍歷一個文件夾下面所有的文件 Zz)
- 另一部分是將html轉換爲txt。
- 點擊此處下載完整版本
- --------------------------------------------------------------------------------
- #include <stdio.h>
- #include <stdlib.h>
- #include <windows.h>
- #include <locale.h>
- #include <string>
- #include "ffsco.h"
- void UnicodeToGB2312(char* pOut,unsigned short uData)
- {
- WideCharToMultiByte(CP_ACP,NULL,&uData,1,pOut,sizeof(WCHAR),NULL,NULL);
- return;
- }
// Size in bytes of the per-read scratch buffer (2 MiB). Parenthesized so the
// macro expands as a single value inside larger expressions.
#define BUFSIZE (1024*1024*2)
// Text of the HTML file currently being converted (20 MiB including the NUL).
char buf[1024*1024*20];
// Lower-cased copy of buf, filled by read2buf and searched by findstart/findend.
char shadowbuf[1024*1024*20];
// Scratch buffer for a single fgets() call in read2buf.
char buffer[BUFSIZE];
// Number of characters currently stored in buf (set by read2buf).
long size;
// Line-ending style recorded by h2t: 0 = none seen yet, 1 = CR, 2 = CRLF, 3 = LF.
int type=0;
// Input (HTML) and output (text) streams of the file being processed; main opens them.
FILE* fin=0,*fout=0;
// Number of entries in toc. Entries come in (entity, replacement) pairs, so it is even.
#define tocsize 14
// Named character references and the text written in their place, stored as
// consecutive pairs: toc[2k] is the entity as it appears in the HTML and
// toc[2k+1] is what isintoc writes instead. isintoc is only reached when the
// input is positioned on '&', so every key has to start with '&'.
// The element type stays char* because ministrstr takes non-const pointers;
// the strings are literals and must never be written to.
char* toc[tocsize]={"&nbsp;"," ","&lt;","<","&gt;",">","&quot;","\"","&amp;","&","&copy;","◎版權","&reg;","◎註冊"};
- typedef helper_coffs::ffsco::typeT filesc;
- filesc getfile(string mypath)
- {
- helper_coffs::ffsco myfile;
- myfile.dirs(1);
- myfile.find(mypath);
- filesc content;
- content=myfile.co_file();
- return content;
- }
// Case-sensitive search for `f` inside the first 15 characters of `s`.
//
// s : NUL-terminated text to look in; only its first 15 characters are examined.
// f : NUL-terminated text to look for.
// returns: offset of the first occurrence of `f` within that 15-character
//          window, or -1 when there is none. Callers use "== 0" to ask
//          "does s start with f?".
int ministrstr(const char* s,const char* f)
{
    char minibuf[16];
    size_t n=0;
    // Copy at most 15 characters and stop at the terminator, so nothing past
    // the end of a short string is ever read.
    while(n<15 && s[n]!=0)
    {
        minibuf[n]=s[n];
        ++n;
    }
    minibuf[n]=0;
    const char* hit=strstr(minibuf,f);
    // Subtracting from a null pointer is undefined; report "not found" explicitly.
    if(hit==NULL)return -1;
    return (int)(hit-minibuf);
}
// Case-insensitive (ASCII) search for `f` inside the first 15 characters of `s`.
//
// s : NUL-terminated text to look in; only its first 15 characters are
//     examined, and they are lower-cased in a private copy before the search.
// f : NUL-terminated text to look for; it is compared as given, so pass it in
//     lower case.
// returns: offset of the first occurrence within that window, or -1 when there
//          is none. Callers use "== 0" to ask "does s start with f?".
int ministrstri(const char* s,const char* f)
{
    char minibuf[16];
    size_t n=0;
    // Bounded copy that stops at the terminator and folds A-Z to a-z on the
    // way. Bytes outside A-Z (including multi-byte text) are left untouched.
    while(n<15 && s[n]!=0)
    {
        char c=s[n];
        if(c>='A' && c<='Z')c=(char)(c-'A'+'a');
        minibuf[n]=c;
        ++n;
    }
    minibuf[n]=0;
    const char* hit=strstr(minibuf,f);
    // Subtracting from a null pointer is undefined; report "not found" explicitly.
    if(hit==NULL)return -1;
    return (int)(hit-minibuf);
}
- int isintoc(char* streamstart)
- {
- int i=0;
- int ret=0;
- while(i<tocsize)
- {
- if(!ministrstr(streamstart,toc[i]))
- {
- fprintf(fout,"%s",toc[i+1]);
- ret=strlen(toc[i]);
- break;
- };
- i+=2;
- };
- if(ret==0)
- {
- fprintf(fout,"&");
- ret=1;
- };
- return ret;
- };
- int num2txt(char* numstart)
- {
- char tmp[256];
- int pos=0;
- char* s=numstart;
- unsigned short word;
- char os[3];
- while( *s>='0' && *s <='9' )
- {
- tmp[pos++]=*s++;
- };
- tmp[pos]=0;
- word=atoi(tmp);
- memset(os,0,3);
- UnicodeToGB2312(os,word);
- fprintf(fout,"%s",os);
-
- s++;
- return s-numstart;
- }
- void read2buf(FILE* fp)
- {
- buf[0]=0;
- size=0;
- while(!feof(fp))
- {
- fgets(buffer,sizeof(buffer),fp);
- strcat(buf+size,buffer);
- size+=strlen(buffer);
- };
- buf[size]=0;
- memcpy(shadowbuf,buf,size+1);
- strlwr(shadowbuf);
- };
- int findstart(char* start,int jump)
- {
- char* pos=shadowbuf;
- strlwr(start);
- do
- {
- pos=strstr(pos,start);
- if(pos-shadowbuf < 0 )return -1;
- pos++;
- }while(jump--);
- while(*pos++ != '>')
- {};
- return pos-shadowbuf;
- };
- int findend(char* end,int start)
- {
- char* pos=shadowbuf+start;
- strlwr(end);
- pos=strstr(pos,end);
- if(pos-shadowbuf<0)return -1;
- while(*pos-- != '<')
- {};
- return pos-shadowbuf;
- };
- void printline()
- {
- switch(type)
- {
- case 1:
- fprintf(fout,"%c",'/r');
- break;
- case 2:
- fprintf(fout,"%s","/r/n");
- break;
- case 3:
- fprintf(fout,"%c",'/n');
- break;
- default:
- break;
- };
- };
- void h2t(char* s,int len)
- {
- char* ss=s;
- while(ss-s<len)
- {
-
- if(type==0 && ( *ss=='/r'|| *ss=='/n'))
- {
- if(*ss=='/r' &&*(ss+1)=='/n')
- {
- type=2;
- }else if(*ss=='/n')
- {
- type=3;
- }else
- {
- type=1;
- };
- };
- if(*ss!='<')
- {
- if(*ss=='&')
- {
- if(*(ss+1)=='#')
- {
- ss+=2;
- int may=num2txt(ss);
- ss+=may;
- }else
- {
- int may=isintoc(ss);
- if(may>0)
- {
- ss+=may;
- };
- }
- }
- else
- {
- fprintf(fout,"%c",*ss);
- ss++;
- };
- }
- else
- {
-
- if(!ministrstri(ss,"<script"))
- {
- ss++;
- findnext:
- while(*ss!='<' && ss-s <len)
- {
- ss++;
- };
- if(ss-s>=len)break;
- while(ministrstri(ss,"</script")!=0 && ss-s<len)
- {
- ss++;
- goto findnext;
- };
- if(ss-s>=len)break;
- while(*ss!='>')ss++;
- ss++;
- }else if(!ministrstri(ss,"<style"))
- {
- ss++;
- findnext2:
- while(*ss!='<' && ss-s <len)
- {
- ss++;
- };
- if(ss-s>=len)break;
- while(ministrstri(ss,"</style")!=0 && ss-s<len)
- {
- ss++;
- goto findnext2;
- };
- if(ss-s>=len)break;
- while(*ss!='>')ss++;
- ss++;
- }else if(!ministrstri(ss,"</br>"))
- {
-
- ss+=5;
- }else if(!ministrstri(ss,"</p>"))
- {
-
- ss+=4;
- }else if(!ministrstri(ss,"<br>"))
- {
-
- ss+=4;
- }
- else
- {
- while(*ss!='>' && ss-s<len)
- {
- ss++;
- };
- if(ss-s>=len)break;
- ss++;
- };
- };
- };
- };
- int main(void)
- {
- std::string mypath="C://WT2G//";
- filesc content=getfile(mypath);
- for (helper_coffs::ffsco::typeT::iterator it = content.begin(); content.end() != it; it ++)
- {
- std::string filename;
- filename=*it;
- std::string writefile=filename+".txt";
- fin=fopen(filename.c_str(),"r");
- fout=fopen(writefile.c_str(),"w");
- if(!fin)
- {
- exit(0);
- };
- read2buf(fin);
- h2t(buf,size);
- fclose(fin);
- fclose(fout);
- }
- system("pause");
- return 0;
- }