VC MSHTML解析HTML獲得RSS源CparseHTML類

原創

2020-02-21 16:20

//CparseHTML.h
#include <afx.h>
#include <iostream>
#include <comdef.h>
#include <CString>
#include <mshtml.h>
#include <String>
#pragma   warning (disable: 4786)
#include <vector>
#pragma warning(disable : 4146) //see Q231931 for explanation
#import <mshtml.tlb> no_auto_exclude
using namespace std;
// Finds the RSS feeds advertised by an HTML page, using MSHTML as the parser.
class CparseHTML
{
public:
    CparseHTML();

    // Creates an MSHTML document object and returns it as IHTMLDocument2.
    IHTMLDocument2Ptr anylyseHTML(char *);

    // Returns the feed URLs collected in V_RSSURL.
    std::vector<std::string> getRSSURL(char *);

    // Returns the feed titles collected in V_RSSTitle.
    std::vector<std::string> getRSSTitle(char *);

    // int getErrorCode();

private:
    std::vector<std::string> V_RSSURL;    // storage behind getRSSURL()
    std::vector<std::string> V_RSSTitle;  // storage behind getRSSTitle()
};

//CparseHTML.cpp
#include "CparseHTML.h"
using namespace std;
// File-scope MSHTML smart pointers: a document (IHTMLDocument3 view), an
// element collection and a single element. A function that declares a local
// with the same name shadows the corresponding global.
// NOTE(review): as globals they release their interfaces only during static
// destruction -- confirm COM is still initialized at that point.
MSHTML::IHTMLDocument3Ptr pDoc3;
MSHTML::IHTMLElementCollectionPtr pCollection;
MSHTML::IHTMLElementPtr pElement;
// Default constructor: nothing to set up, both result vectors start empty.
CparseHTML::CparseHTML() {}
/**
 * Collects the URL of every RSS feed advertised by the parsed document.
 *
 * Each <link> element whose "type" attribute starts with
 * "application/rss+xml" contributes its "href" value. A value that does not
 * start with "http://" or "https://" is treated as relative and gets strHTML
 * prepended. A matching element without "href" contributes strHTML alone, so
 * the result keeps one entry per feed link (same indexing as getRSSTitle()).
 *
 * @param strHTML  Base URL of the page; passed on to anylyseHTML() and used as
 *                 prefix for relative links. NULL is treated as "".
 * @return The feed URLs found by this call, in document order. Empty when no
 *         document or no <link> collection could be obtained.
 * @throws _com_error  Propagated unchanged from the MSHTML wrapper calls.
 */
vector<string> CparseHTML::getRSSURL(char * strHTML)
{
    static const char kRssType[] = "application/rss+xml";
    const string baseUrl = (strHTML != NULL) ? strHTML : "";

    // Results are rebuilt on every call; otherwise a second call would return
    // the previous entries again in front of the new ones.
    V_RSSURL.clear();

    // Locals (shadowing the file-scope pointers) so that no interface outlives
    // this call.
    MSHTML::IHTMLDocument3Ptr pDoc3;
    pDoc3 = anylyseHTML(strHTML);
    if (pDoc3 == NULL)
        return V_RSSURL;

    MSHTML::IHTMLElementCollectionPtr pCollection;
    pCollection = pDoc3->getElementsByTagName("link");
    if (pCollection == NULL)
        pCollection = pDoc3->getElementsByTagName("LINK");
    if (pCollection == NULL)
        return V_RSSURL;

    MSHTML::IHTMLElementPtr pElement;
    const long count = pCollection->length;
    for (long i = 0; i < count; ++i)
    {
        pElement = pCollection->item(i, 0L);
        if (pElement == NULL)
            continue;

        // An absent attribute comes back as a non-string VARIANT; converting
        // that to _bstr_t would raise _com_error, so test the type first.
        const _variant_t typeAttr = pElement->getAttribute("type", 2);
        if (typeAttr.vt != VT_BSTR || typeAttr.bstrVal == NULL)
            continue;
        const _bstr_t typeText(typeAttr);
        const char *typeChars = typeText;
        if (typeChars == NULL)
            continue;

        // Prefix comparison on a std::string: never reads past the end of a
        // value shorter than the expected MIME type.
        const string type = typeChars;
        if (type.compare(0, sizeof(kRssType) - 1, kRssType) != 0)
            continue;

        string feedUrl;
        const _variant_t hrefAttr = pElement->getAttribute("href", 2);
        if (hrefAttr.vt == VT_BSTR && hrefAttr.bstrVal != NULL)
        {
            const _bstr_t hrefText(hrefAttr);
            const char *hrefChars = hrefText;
            if (hrefChars != NULL)
                feedUrl = hrefChars;
        }

        const bool isAbsolute = feedUrl.compare(0, 7, "http://") == 0
                             || feedUrl.compare(0, 8, "https://") == 0;
        if (!isAbsolute)
            feedUrl = baseUrl + feedUrl;

        V_RSSURL.push_back(feedUrl);
    }
    return V_RSSURL;
}
/**
 * Collects the title of every RSS feed advertised by the parsed document.
 *
 * Each <link> element whose "type" attribute starts with
 * "application/rss+xml" contributes its "title" value. A matching element
 * without "title" contributes an empty string, so the result keeps one entry
 * per feed link (same indexing as getRSSURL()).
 *
 * @param strHTML  Passed on to anylyseHTML().
 * @return The feed titles found by this call, in document order. Empty when no
 *         document or no <link> collection could be obtained.
 * @throws _com_error  Propagated unchanged from the MSHTML wrapper calls.
 */
vector<string> CparseHTML::getRSSTitle(char * strHTML)
{
    static const char kRssType[] = "application/rss+xml";

    // Results are rebuilt on every call; otherwise a second call would return
    // the previous entries again in front of the new ones.
    V_RSSTitle.clear();

    // Locals (shadowing the file-scope pointers) so that no interface outlives
    // this call.
    MSHTML::IHTMLDocument3Ptr pDoc3;
    pDoc3 = anylyseHTML(strHTML);
    if (pDoc3 == NULL)
        return V_RSSTitle;

    MSHTML::IHTMLElementCollectionPtr pCollection;
    pCollection = pDoc3->getElementsByTagName("link");
    if (pCollection == NULL)
        pCollection = pDoc3->getElementsByTagName("LINK");
    if (pCollection == NULL)
        return V_RSSTitle;

    MSHTML::IHTMLElementPtr pElement;
    const long count = pCollection->length;
    for (long i = 0; i < count; ++i)
    {
        pElement = pCollection->item(i, 0L);
        if (pElement == NULL)
            continue;

        // An absent attribute comes back as a non-string VARIANT; converting
        // that to _bstr_t would raise _com_error, so test the type first.
        const _variant_t typeAttr = pElement->getAttribute("type", 2);
        if (typeAttr.vt != VT_BSTR || typeAttr.bstrVal == NULL)
            continue;
        const _bstr_t typeText(typeAttr);
        const char *typeChars = typeText;
        if (typeChars == NULL)
            continue;

        // Prefix comparison on a std::string: never reads past the end of a
        // value shorter than the expected MIME type.
        const string type = typeChars;
        if (type.compare(0, sizeof(kRssType) - 1, kRssType) != 0)
            continue;

        string feedTitle;
        const _variant_t titleAttr = pElement->getAttribute("title", 2);
        if (titleAttr.vt == VT_BSTR && titleAttr.bstrVal != NULL)
        {
            const _bstr_t titleText(titleAttr);
            const char *titleChars = titleText;
            if (titleChars != NULL)
                feedTitle = titleChars;
        }

        V_RSSTitle.push_back(feedTitle);
    }
    return V_RSSTitle;
}

/**
 * Loads HTML markup into a fresh MSHTML document object.
 *
 * The markup is read from the fixed file D:\test1.html, written into a new
 * CLSID_HTMLDocument instance through IHTMLDocument2::write(), and the
 * document is then closed so that the markup is applied.
 *
 * NOTE(review): strHTML is not used here; the input always comes from the
 * hard-coded file. Confirm whether the parameter was meant to select it.
 *
 * COM is initialized on the calling thread and deliberately NOT uninitialized
 * here: the returned document, and every interface obtained from it, is only
 * usable while COM stays initialized. The owner of the thread may call
 * CoUninitialize() after all MSHTML pointers have been released.
 *
 * @param strHTML  Currently unused.
 * @return The populated document, or an empty (NULL) smart pointer when the
 *         file cannot be opened, COM cannot be initialized, or the document
 *         or its argument array cannot be created.
 */
IHTMLDocument2Ptr CparseHTML::anylyseHTML(char *strHTML )
{
    (void)strHTML;

    const CString m_csFilename = "D:\\test1.html";
    CFile f;
    // modeRead | shareDenyNone: read-only, other processes may keep reading
    // and writing the file.
    if (!f.Open(m_csFilename, CFile::modeRead | CFile::shareDenyNone))
        return IHTMLDocument2Ptr();   // "return -1" built a pointer from a non-zero int, which throws

    // Read the whole file into a CString, keeping only the bytes actually read.
    CString csWholeFile;
    const int fileSize = static_cast<int>(f.GetLength());
    char *buf = csWholeFile.GetBuffer(fileSize);
    const UINT bytesRead = f.Read(buf, static_cast<UINT>(fileSize));
    csWholeFile.ReleaseBuffer(static_cast<int>(bytesRead));
    f.Close();

    // RPC_E_CHANGED_MODE means COM is already up on this thread in another
    // concurrency model, which is still good enough to create the document.
    const HRESULT hrInit = CoInitialize(NULL);
    if (FAILED(hrInit) && hrInit != RPC_E_CHANGED_MODE)
        return IHTMLDocument2Ptr();

    IHTMLDocument2Ptr pDoc;
    HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
                                  IID_IHTMLDocument2, reinterpret_cast<void**>(&pDoc));
    if (FAILED(hr) || pDoc == NULL)
        return IHTMLDocument2Ptr();

    // IHTMLDocument2::write() takes a SAFEARRAY of VARIANTs; one BSTR element
    // carries the complete markup.
    SAFEARRAY *psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
    if (psa == NULL)
        return IHTMLDocument2Ptr();

    VARIANT *param = NULL;
    hr = SafeArrayAccessData(psa, reinterpret_cast<LPVOID*>(&param));
    if (FAILED(hr) || param == NULL)
    {
        SafeArrayDestroy(psa);
        return IHTMLDocument2Ptr();
    }

    const bstr_t bsData = static_cast<LPCTSTR>(csWholeFile);
    param->vt = VT_BSTR;
    // The array gets its own copy: SafeArrayDestroy() frees the element, and
    // bsData frees its string independently.
    param->bstrVal = bsData.copy();
    // Unlock before use; a locked array cannot be destroyed and would leak.
    SafeArrayUnaccessData(psa);

    hr = pDoc->write(psa);  // feed the markup to the parser
    hr = pDoc->close();     // close the stream, "applying" the markup
    SafeArrayDestroy(psa);

    return pDoc;
}

/*int CparseHTML::getErrorCode()
{

}*/

主函數

#include "CparseHTML.h"
#include <iostream>
using namespace std;
int main()
{
CparseHTML pHTML;
cout<<"please input HTMLURL"<<endl;
char HTMLURL[100];
cin>>HTMLURL;
vector <string> V_RSSURL;
vector <string> V_RSSTitle;
V_RSSURL=pHTML.getRSSURL(HTMLURL);
V_RSSTitle=pHTML.getRSSTitle(HTMLURL);
for(int i=0;i<V_RSSURL.size();i++){
cout<<V_RSSTitle[i]<<endl;
cout<<V_RSSURL[i]<<endl;
}
return 0;
}

zhihu008

發佈了56 篇原創文章 · 獲贊 5 · 訪問量 81萬+

私信關注

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

VC MSHTML解析HTML獲得RSS源CparseHTML類

linux安裝cuda和cudnn

測試人員都是畫畫大神，讓我看看誰還不會用代碼圖？

Object.values()對象遍歷

我拍了拍Redis，被移出了羣聊···

網絡現代化通向雲原生應用的高速公路

面試官：說說你對序列化的理解

我宣佈，這是我找到的史上AI最全論文體系！

LockHandle

threadmanage

filewatcherservice

netproperty

使用XSL樣式表對XML文件排序

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結