(源碼來自 http://ifilter.codeplex.com/ 。源碼其實只是比較簡單地使用了IFilter接口的函數;直接使用源碼會使解析出來的中文標點符號出現錯誤,所以要去掉ValidUnicode()和CleanUpCharacters()函數。)
/**
 * Extracts the text content of a file through the IFilter registered for it.
 *
 * @param fileName  Path of the file to read; handed to LoadIFilter(). Must be
 *                  a non-NULL, non-empty BSTR.
 * @param maxLength Soft limit on the amount of text gathered (compared with
 *                  the write position of the internal wide stream); 0 turns
 *                  the limit off. It is checked after each GetText() buffer
 *                  is appended, so the result can overshoot by one buffer.
 * @param fileText  Receives a BSTR allocated with SysAllocString() holding
 *                  the gathered text. It is set to NULL on entry and stays
 *                  NULL unless at least one text chunk was read completely.
 *
 * @return E_POINTER / E_INVALIDARG for bad arguments; the failing HRESULT of
 *         LoadIFilter(), IFilter::Init(), GetChunk() or GetText(); E_FAIL if
 *         the loaded object does not expose IFilter or a C++ exception was
 *         caught; E_OUTOFMEMORY if the result BSTR cannot be allocated;
 *         S_FALSE if extraction stopped because maxLength was exceeded.
 *
 * NOTE: chunk enumeration only ends when GetChunk() fails, and that HRESULT
 * is returned unchanged. A completely processed document therefore returns
 * GetChunk()'s final code (FILTER_E_END_OF_CHUNKS according to the IFilter
 * documentation) with *fileText populated; callers should treat that code
 * as normal completion.
 */
HRESULT CIfilter::ExtractText(BSTR fileName, long maxLength, BSTR * fileText)
{
    //CoInitialize(NULL);
    if (NULL == fileName)
        return E_POINTER;
    if (0 == ::SysStringLen(fileName))
        return E_INVALIDARG;
    if (NULL == fileText)
        return E_POINTER;
    *fileText = NULL;

    HRESULT hr = E_UNEXPECTED;
    try
    {
        CComPtr<IUnknown> spIUnk;
        hr = LoadIFilter(fileName, NULL, reinterpret_cast<void**>(&spIUnk));
        if (FAILED(hr))
            return hr;

        CComQIPtr<IFilter> spIFilter = spIUnk;
        if (!spIFilter)
            return E_FAIL;

        DWORD dwFlags = 0;
        hr = spIFilter->Init(IFILTER_INIT_CANON_PARAGRAPHS |
                             IFILTER_INIT_CANON_HYPHENS |
                             IFILTER_INIT_CANON_SPACES |
                             IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
                             IFILTER_INIT_INDEXING_ONLY |
                             IFILTER_INIT_FILTER_OWNED_VALUE_OK,
                             0, NULL, &dwFlags);
        AtlTrace(_T("Init() hr=%x, dwFlags=%x\n"), hr, dwFlags);
        if (FAILED(hr))
            return hr;

        std::wstringstream wout;
        STAT_CHUNK statChunk;
        memset(&statChunk, 0, sizeof(statChunk));

        // True once at least one text chunk has been drained successfully;
        // only then is a BSTR handed back to the caller.
        bool haveText = false;

        for (;;)
        {
            if (maxLength && wout.tellp() > maxLength)
            {
                hr = S_FALSE;
                break;
            }

            hr = spIFilter->GetChunk(&statChunk);
            AtlTrace(_T("GetChunk() hr=%x, breakType=%d, flags=%x\n"), hr, statChunk.breakType, statChunk.flags);
            if (FAILED(hr))
                break;  // end of chunks or a real error: reported as-is

            // ignore non-text chunks...
            if (CHUNK_TEXT != (CHUNK_TEXT & statChunk.flags))
                continue;

            // Emit the separator implied by the chunk's break type.
            switch (statChunk.breakType)
            {
            case CHUNK_NO_BREAK:
                break;
            case CHUNK_EOW:
                wout << L' ';
                break;
            case CHUNK_EOS:
            case CHUNK_EOC:
            case CHUNK_EOP:
                wout << L"\r\n";
                break;
            }

            // Drain the text of the current chunk, one buffer at a time.
            for (;;)
            {
                static const int cChunkSize = 1024;
                wchar_t buf[cChunkSize + 1];
                unsigned long chBuf = cChunkSize;
                // Zero-fill (with one spare slot) so the buffer is always
                // NUL-terminated when it is streamed below.
                memset(buf, 0, sizeof(buf));

                hr = spIFilter->GetText(&chBuf, buf);
                AtlTrace(_T("GetText() hr=%x, chBuf=%d\n"), hr, chBuf);

                if (FILTER_E_NO_MORE_TEXT == hr)
                {
                    // Normal end of this chunk.
                    hr = S_OK;
                    break;
                }
                if (FAILED(hr))
                    break;  // FILTER_E_NO_TEXT or any other error

                //CleanUpCharacters(chBuf, buf);
                wout << buf;

                if (maxLength && wout.tellp() > maxLength)
                {
                    hr = S_FALSE;
                    break;
                }
                if (FILTER_S_LAST_TEXT == hr)
                {
                    // The filter announced that this was the final buffer.
                    hr = S_OK;
                    break;
                }
            }

            if (FAILED(hr))
                break;  // GetText() failed: stop and report that error

            haveText = true;
        }

        // Allocate the result exactly once; allocating inside the chunk loop
        // would leak every BSTR except the last one.
        if (haveText)
        {
            *fileText = ::SysAllocString(wout.str().c_str());
            if (NULL == *fileText && SUCCEEDED(hr))
                hr = E_OUTOFMEMORY;
        }
    }
    catch (...)
    {
        return E_FAIL;
    }
    return hr;
}
IFilter裏面使用了其他組件的IFilter接口。經測試,要解析Office 2003文件不需要安裝相應的Office軟件;而要解析Office 2007文件則需要安裝相應的Office軟件,至少需要其IFilter組件,這是因爲Office 2007文件使用的是OOXML格式。
同樣,要解析PDF、ZIP也都需要相應的IFilter組件。其中,如果使用的是64位機,則需要PDFFilter.dll(從PDF官網下載)。
在這裏我遇到了無法成功loadfilter(DLL路徑)的錯誤,找了好多天都沒找出原因,最後通過重寫該函數、直接使用LoadLibrary()函數時才發現了問題:原來我的機子是64位的,而編譯環境是WIN32,使用的ifilter組件卻是64位的,所以32位程序無法成功加載該動態鏈接庫。
只需將編譯環境改爲X64就行。雖然這麼簡單的一個問題花了我好多天,但是我因此弄懂了LoadFilter函數的流程。
對於LoadFilter函數流程的示例,可以看程序 http://www.codeproject.com/Articles/13391/Using-IFilter-in-C# 。
LoadFilter的流程如下(以.pdf爲例,打開註冊表):
1. 在HKEY_LOCAL_MACHINE\SOFTWARE\Classes\ 找到.pdf, .pdf中子鍵PersistentHandler 值爲{F6594A6D-D57F-4EFD-B2C3-DCD9779E382E}
2. 在HKEY_LOCAL_MACHINE\SOFTWARE\Classes\CLSID中查找{F6594A6D-D57F-4EFD-B2C3-DCD9779E382E}, 找到其下的PersistentAddinsRegistered結點,PersistentAddinsRegistered含有結點{89BCB740-6119-101A-BCB7-00DD010655AF}。
{89BCB740-6119-101A-BCB7-00DD010655AF} 爲Ifilter接口的GUID, 取出該結點的值{E8978DA6-047F-4E3D-9C78-CDBE46041603}。
3. 在HKEY_LOCAL_MACHINE\SOFTWARE\Classes\CLSID中查找{E8978DA6-047F-4E3D-9C78-CDBE46041603}結點, 該結點的子結點InprocServer32的默認值即爲PDF ifilter DLL的路徑。
4. 後續任務就是對該DLL進行註冊和 load,詳見http://www.codeproject.com/Articles/13391/Using-IFilter-in-C#