html源码解析急解决思路

html源码解析急急急。
小弟有个程序要把 html 解析成 txt，用了 mshtml.dll。
能解析，但是遇到一些网页就会弹出一些提示，比如“安全隐患”、“未找到xxx.js”等。
不知道有没有什么别的更好的办法来解析 html。
部分代码

C/C++ code

        HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,IID_IHTMLDocument2, (void**)&pDoc);
        SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
        VARIANT *param;
        bstr_t bsData = (LPCTSTR)html;
        AfxMessageBox(bsData);
        hr = SafeArrayAccessData(psa, (LPVOID*)&param);
        param->vt = VT_BSTR;
        param->bstrVal = (BSTR)bsData;
        hr = pDoc->write(psa);
        hr = pDoc->close();
        SafeArrayDestroy(psa);
        BSTR bstr;
        pDoc->body->get_outerText(&bstr);
        pElement->get_outerText(&bstr);

如果能在程序中忽略这些提示也可以，或者用别的方式也可以（不要用正则）。哪位大虾帮帮我啊！

------解决方案--------------------
参考

C/C++ code

 
 void CIllegalCheck::ParseHtml(byte* lpOrigBuffers,  size_t dwBufferCount,byte* pOutbuf,size_t& nOutBufSize,CConverAndOrigPosHelp& caoPosHelp) 
 { 
 	caoPosHelp.m_pOrigHead = lpOrigBuffers; 

 	byte* pCurrentOutIndex = pOutbuf; 
 	nOutBufSize = 0; 

 	byte* lpBuffers = new byte[dwBufferCount + 1 + m_strPreNeedCheckStream.length()]; 
 	if( lpBuffers == NULL) 
 	{ 
 		return ; 
 	} 
 	lpBuffers[dwBufferCount+m_strPreNeedCheckStream.length()]='\0'; 

 	memcpy(lpBuffers +  m_strPreNeedCheckStream.length(),lpOrigBuffers,dwBufferCount); 

 	byte* pPoshelpOrigHead = lpBuffers + m_strPreNeedCheckStream.length(); 

 	//if( NULL == strlwr((char*)(lpBuffers +  m_strPreNeedCheckStream.length()))) 
 	//{ 
 	//	if( lpBuffers) 
 	//	{ 
 	//		delete[] lpBuffers; 
 	//	} 
 	//	return; 
 	//} 
 	CUtility::FastStrLwr( ( char*)(lpBuffers +  m_strPreNeedCheckStream.length()) ); 


 	memcpy(lpBuffers,m_strPreNeedCheckStream.c_str(),m_strPreNeedCheckStream.length()); 

 	dwBufferCount += (DWORD)m_strPreNeedCheckStream.length(); 
 	m_strPreNeedCheckStream=""; 

 	byte* pStart = lpBuffers ;  
 	byte* pFindPos = NULL; 
 	byte* pEndFindPos=NULL; 
 	byte* pWholeBufEnd = lpBuffers+dwBufferCount; 
 	while (pStart  < pWholeBufEnd) 
 	{ 
 		pFindPos = (byte*)memchr(pStart,' <',dwBufferCount - (pStart - lpBuffers) ); 
 		if( NULL == pFindPos ) 
 		{ 
 			while( aHtmlRemoveChar[*pStart]) 
 			{ 
 				++pStart; 
 			} 

 			size_t nNewContentSize = dwBufferCount - (pStart - lpBuffers); 
 			if( nNewContentSize > 0 ) 
 			{ 
 				memcpy(pCurrentOutIndex,pStart,nNewContentSize); 

 				if( m_bIsUtf8 == CHART_SET_UTF8) 
 				{ 
 					pCurrentOutIndex[nNewContentSize] = '\0'; 
 					nNewContentSize = CUtility::ConvertUtf8ToGBK((char*)pCurrentOutIndex,nNewContentSize); 
 					if( CConfig::GetInstance().m_bCheckTraditional ) 
 					{ 
 						size_t nCountSize = CUtility::ConvertGBKToGB2312((char*)pCurrentOutIndex); 
 					} 
 				} 

 				caoPosHelp.m_vecRangeConver.push_back(pCurrentOutIndex-pOutbuf); 
 				caoPosHelp.m_vecRangeOrigal.push_back(lpOrigBuffers + (pStart -pPoshelpOrigHead) ); 

 				pCurrentOutIndex +=nNewContentSize; 

 			} 
 			break; 
 		} 

 		if( pStart != pFindPos) 
 		{ 
 			while( aHtmlRemoveChar[*pStart] ) 
 			{ 
 				++pStart; 
 			} 

 			size_t nNewContentSize = pFindPos-pStart; 
 			if( nNewContentSize > 0 ) 
 			{ 
 				memcpy(pCurrentOutIndex,pStart,nNewContentSize); 

 				if( m_bIsUtf8 == CHART_SET_UTF8)

相关推荐