html源码解析急解决思路
html源码解析急急急。
小弟有个程序要解析html to txt 用了mshtml.dll
能解析,但是遇到一些网页就会弹出一些提示,比如“安全隐患”、“未找到xxx.js”等。
不知道有没有什么别的好点的办法来解析html,
部分代码
如果可以在程序中忽略这些提示也可以,或者用别的方式也可以(不要用正则)。哪位大虾帮帮我啊!
------解决方案--------------------
参考
小弟有个程序要解析html to txt 用了mshtml.dll
能解析,但是遇到一些网页就会弹出一些提示,比如“安全隐患”、“未找到xxx.js”等。
不知道有没有什么别的好点的办法来解析html,
部分代码
- C/C++ code
HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,IID_IHTMLDocument2, (void**)&pDoc); SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1); VARIANT *param; bstr_t bsData = (LPCTSTR)html; AfxMessageBox(bsData); hr = SafeArrayAccessData(psa, (LPVOID*)¶m); param->vt = VT_BSTR; param->bstrVal = (BSTR)bsData; hr = pDoc->write(psa); hr = pDoc->close(); SafeArrayDestroy(psa); BSTR bstr; pDoc->body->get_outerText(&bstr); pElement->get_outerText(&bstr);
如果可以在程序中忽略这些提示也可以,或者用别的方式也可以(不要用正则)。哪位大虾帮帮我啊!
------解决方案--------------------
参考
- C/C++ code
void CIllegalCheck::ParseHtml(byte* lpOrigBuffers, size_t dwBufferCount,byte* pOutbuf,size_t& nOutBufSize,CConverAndOrigPosHelp& caoPosHelp)
{
caoPosHelp.m_pOrigHead = lpOrigBuffers;
byte* pCurrentOutIndex = pOutbuf;
nOutBufSize = 0;
byte* lpBuffers = new byte[dwBufferCount + 1 + m_strPreNeedCheckStream.length()];
if( lpBuffers == NULL)
{
return ;
}
lpBuffers[dwBufferCount+m_strPreNeedCheckStream.length()]='\0';
memcpy(lpBuffers + m_strPreNeedCheckStream.length(),lpOrigBuffers,dwBufferCount);
byte* pPoshelpOrigHead = lpBuffers + m_strPreNeedCheckStream.length();
//if( NULL == strlwr((char*)(lpBuffers + m_strPreNeedCheckStream.length())))
//{
// if( lpBuffers)
// {
// delete[] lpBuffers;
// }
// return;
//}
CUtility::FastStrLwr( ( char*)(lpBuffers + m_strPreNeedCheckStream.length()) );
memcpy(lpBuffers,m_strPreNeedCheckStream.c_str(),m_strPreNeedCheckStream.length());
dwBufferCount += (DWORD)m_strPreNeedCheckStream.length();
m_strPreNeedCheckStream="";
byte* pStart = lpBuffers ;
byte* pFindPos = NULL;
byte* pEndFindPos=NULL;
byte* pWholeBufEnd = lpBuffers+dwBufferCount;
while (pStart < pWholeBufEnd)
{
pFindPos = (byte*)memchr(pStart,' <',dwBufferCount - (pStart - lpBuffers) );
if( NULL == pFindPos )
{
while( aHtmlRemoveChar[*pStart])
{
++pStart;
}
size_t nNewContentSize = dwBufferCount - (pStart - lpBuffers);
if( nNewContentSize > 0 )
{
memcpy(pCurrentOutIndex,pStart,nNewContentSize);
if( m_bIsUtf8 == CHART_SET_UTF8)
{
pCurrentOutIndex[nNewContentSize] = '\0';
nNewContentSize = CUtility::ConvertUtf8ToGBK((char*)pCurrentOutIndex,nNewContentSize);
if( CConfig::GetInstance().m_bCheckTraditional )
{
size_t nCountSize = CUtility::ConvertGBKToGB2312((char*)pCurrentOutIndex);
}
}
caoPosHelp.m_vecRangeConver.push_back(pCurrentOutIndex-pOutbuf);
caoPosHelp.m_vecRangeOrigal.push_back(lpOrigBuffers + (pStart -pPoshelpOrigHead) );
pCurrentOutIndex +=nNewContentSize;
}
break;
}
if( pStart != pFindPos)
{
while( aHtmlRemoveChar[*pStart] )
{
++pStart;
}
size_t nNewContentSize = pFindPos-pStart;
if( nNewContentSize > 0 )
{
memcpy(pCurrentOutIndex,pStart,nNewContentSize);
if( m_bIsUtf8 == CHART_SET_UTF8)