html源码解析急解决思路

html源码解析急急急。
小弟有个程序要把 HTML 解析成纯文本(html to txt),用的是 mshtml.dll。
能解析,但是遇到一些网页就会弹出提示,比如“安全隐患”、“未找到 xxx.js”等。
不知道有没有别的更好的办法来解析 HTML?
部分代码
C/C++ code
        // Fragment: feed an HTML string to an in-process MSHTML document and read
        // the text back. pDoc, html and pElement are declared outside this excerpt.

        // Create the HTMLDocument COM object in-process and request its
        // IHTMLDocument2 interface into pDoc.
        HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,IID_IHTMLDocument2, (void**)&pDoc);
        // NOTE(review): hr is reassigned below but never checked in this excerpt.
        // One-element SAFEARRAY of VARIANT (lower bound 0); it is the argument
        // passed to pDoc->write() below.
        SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
        VARIANT *param;
        // Wrap the HTML source in a bstr_t, which owns the resulting BSTR.
        bstr_t bsData = (LPCTSTR)html;
        // Shows the HTML in a message box before parsing (presumably debug output).
        AfxMessageBox(bsData);
        // Obtain a pointer to the array's single VARIANT element.
        hr = SafeArrayAccessData(psa, (LPVOID*)&param);
        // Store the BSTR held by bsData in the element; no copy is made here.
        param->vt = VT_BSTR;
        param->bstrVal = (BSTR)bsData;
        // Write the HTML into the document, then close the output stream.
        hr = pDoc->write(psa);
        hr = pDoc->close();
        // NOTE(review): no SafeArrayUnaccessData call is visible before this
        // destroy, so the array may still be locked and the destroy may fail -
        // confirm. If it does succeed, the element's BSTR is presumably released
        // here and again by bsData's destructor; verify ownership (e.g. pass a
        // SysAllocString copy instead).
        SafeArrayDestroy(psa);
        // Receives the extracted text from the calls below.
        // NOTE(review): presumably the caller must release it with SysFreeString;
        // no release is visible in this excerpt.
        BSTR bstr;
        // NOTE(review): `pDoc->body` is written as posted; IHTMLDocument2 normally
        // exposes the body through get_body(IHTMLElement**) - confirm intent.
        pDoc->body->get_outerText(&bstr);
        // pElement is not declared in this excerpt; this call overwrites bstr
        // without releasing the value stored by the previous line.
        pElement->get_outerText(&bstr);


如果能在程序中忽略这些提示也可以,或者用别的方式也可以(不要用正则)。哪位大虾帮帮我啊!

------解决方案--------------------
参考

C/C++ code
 
void CIllegalCheck::ParseHtml(byte* lpOrigBuffers,  size_t dwBufferCount,byte* pOutbuf,size_t& nOutBufSize,CConverAndOrigPosHelp& caoPosHelp)
{
caoPosHelp.m_pOrigHead = lpOrigBuffers;

byte* pCurrentOutIndex = pOutbuf;
nOutBufSize = 0;

byte* lpBuffers = new byte[dwBufferCount + 1 + m_strPreNeedCheckStream.length()];
if( lpBuffers == NULL)
{
return ;
}
lpBuffers[dwBufferCount+m_strPreNeedCheckStream.length()]='\0';

memcpy(lpBuffers +  m_strPreNeedCheckStream.length(),lpOrigBuffers,dwBufferCount);

byte* pPoshelpOrigHead = lpBuffers + m_strPreNeedCheckStream.length();

//if( NULL == strlwr((char*)(lpBuffers +  m_strPreNeedCheckStream.length())))
//{
// if( lpBuffers)
// {
// delete[] lpBuffers;
// }
// return;
//}
CUtility::FastStrLwr( ( char*)(lpBuffers +  m_strPreNeedCheckStream.length()) );


memcpy(lpBuffers,m_strPreNeedCheckStream.c_str(),m_strPreNeedCheckStream.length());

dwBufferCount += (DWORD)m_strPreNeedCheckStream.length();
m_strPreNeedCheckStream="";

byte* pStart = lpBuffers ;
byte* pFindPos = NULL;
byte* pEndFindPos=NULL;
byte* pWholeBufEnd = lpBuffers+dwBufferCount;
while (pStart < pWholeBufEnd)
{
pFindPos = (byte*)memchr(pStart,' <',dwBufferCount - (pStart - lpBuffers) );
if( NULL == pFindPos )
{
while( aHtmlRemoveChar[*pStart])
{
++pStart;
}

size_t nNewContentSize = dwBufferCount - (pStart - lpBuffers);
if( nNewContentSize > 0 )
{
memcpy(pCurrentOutIndex,pStart,nNewContentSize);

if( m_bIsUtf8 == CHART_SET_UTF8)
{
pCurrentOutIndex[nNewContentSize] = '\0';
nNewContentSize = CUtility::ConvertUtf8ToGBK((char*)pCurrentOutIndex,nNewContentSize);
if( CConfig::GetInstance().m_bCheckTraditional )
{
size_t nCountSize = CUtility::ConvertGBKToGB2312((char*)pCurrentOutIndex);
}
}

caoPosHelp.m_vecRangeConver.push_back(pCurrentOutIndex-pOutbuf);
caoPosHelp.m_vecRangeOrigal.push_back(lpOrigBuffers + (pStart -pPoshelpOrigHead) );

pCurrentOutIndex +=nNewContentSize;

}
break;
}

if( pStart != pFindPos)
{
while( aHtmlRemoveChar[*pStart] )
{
++pStart;
}

size_t nNewContentSize = pFindPos-pStart;
if( nNewContentSize > 0 )
{
memcpy(pCurrentOutIndex,pStart,nNewContentSize);

if( m_bIsUtf8 == CHART_SET_UTF8)