怎么从html代码中获取纯文本?在线,多谢
如何从html代码中获取纯文本?在线,谢谢
我想问一下:保存到数据库的字段中含有 <p></p> 之类的 HTML 标签,如何去掉这些标签,只保留纯文本?
qq:543644213
e-mail:xufy576@163.com
------最佳解决方案--------------------
已发到你的邮箱:代码需要用到 Boost 正则表达式库(boost::regex),邮件里已附上。
我想问一下:保存到数据库的字段中含有 <p></p> 之类的 HTML 标签,如何去掉这些标签,只保留纯文本?
qq:543644213
e-mail:xufy576@163.com
------最佳解决方案--------------------
已发到你的邮箱:代码需要用到 Boost 正则表达式库(boost::regex),邮件里已附上。
#include <process.h>
#include <iostream>
using namespace std;
#include <vector>
#include <boost/regex.hpp>
using namespace boost;
#include <afxinet.h>
#include <fstream>
// Character-width switch: when _UNICODE is defined, String/COUT/CIN map to the
// wide-character standard types and streams; otherwise they map to the narrow
// ones. Code in this file is written against these aliases.
#ifdef _UNICODE
#define String std::wstring
#define COUT wcout
#define CIN wcin
#else
#define String std::string
#define COUT cout
#define CIN cin
#endif
// Size, in bytes, of the stack buffer GetHtml passes to each Read() call.
#define SIZE_HTTPCACHE 1024
// Plain record of four text fields describing one software entry.
// NOTE(review): the field meanings below are read off the member names only;
// the code that fills them in is not visible in this section -- confirm.
struct tag_SoftTopInfo
{
    String szSoftName;   // presumably the software's display name
    String szGallery;    // presumably the category/gallery the entry belongs to
    String szSynopsis;   // presumably a short description of the software
    String szUrl;        // presumably the URL associated with the entry
};

// Value and pointer aliases for the record above.
typedef tag_SoftTopInfo SoftTopInfo;
typedef tag_SoftTopInfo* lpSoftTopInfo;
typedef struct tag_WebPageData
{
int nSoftIndex;
WCHAR* pData;
tag_WebPageData()
{
pData = NULL;
}
}WebPageData, *lpWebPageData;
// File-scope state.
// NOTE(review): the Base / Second / Final prefixes suggest three successive
// processing stages, and "GetHttp" / "Ans" suggest download and analysis
// steps; the code that uses these globals is not visible in this section, so
// the per-variable notes below are assumptions to confirm.

// URL strings (presumably the starting pages to fetch).
std::vector<String> g_vBaseUrl;
// SoftTopInfo records, one vector per stage.
std::vector<SoftTopInfo> g_vBaseSoftInfo;
std::vector<SoftTopInfo> g_vSecondSoftInfo;
std::vector<SoftTopInfo> g_vFinalSoftInfo;
// Raw wide-character pointers; nothing here shows who frees them.
std::vector<WCHAR*> g_vBaseWebPageDataSet;
// WebPageData records for the second and final stages.
std::vector<WebPageData> g_vSecondWebPageDataSet;
std::vector<WebPageData> g_vFinalWebPageDataSet;
// Site root string; presumably prepended to relative links.
String g_strUrlHead = _T("http://www.onlinedown.net");
// Two arrays of three handles each (presumably one event per stage).
HANDLE g_hAnsEvent[3], g_hGetHttpEvent[3];
// Items whose download presumably failed, per stage.
std::vector<String> g_vBaseGetHttpFailedItem;
std::vector<SoftTopInfo> g_vSecondGetHttpFailedItem;
std::vector<SoftTopInfo> g_vFinalGetHttpFailedItem;
// Integer entries (presumably indices) for items whose analysis failed.
std::vector<int> g_vSecondAnsFailedItem;
std::vector<int> g_vFinalAnsFailedItem;
// Integer entries (presumably indices) for items lacking an official download.
std::vector<int> g_vNoOfficalDownItem;
//////////////////////////////////////////////////////////////////////////
/// Fetches the resource at strUrl and appends its raw bytes to vBaseInfo.
///
/// The URL's cache entry is deleted first, then the URL is opened through a
/// CInternetSession and read in SIZE_HTTPCACHE-byte chunks until Read()
/// reports no more data.
///
/// @param strUrl     URL to open.
/// @param vBaseInfo  Receives the downloaded bytes. Data is appended after any
///                   content already present; bytes read before a failure are
///                   left in place.
/// @return TRUE when the URL was opened and read to the end; FALSE when
///         OpenURL returned NULL or a CInternetException was caught.
BOOL GetHtml(String strUrl,std::vector<char> &vBaseInfo)
{
    COUT<<endl;
    COUT<<_T("提取网页数据:")<<strUrl.c_str()<<endl;

    // Delete the cache entry for this URL before fetching it.
    DeleteUrlCacheEntry(strUrl.c_str());

    BOOL bReturn = FALSE;
    CInternetSession session(_T("SoftTop"));
    CInternetFile* httpFile = NULL;
    try
    {
        httpFile = (CInternetFile*) session.OpenURL(strUrl.c_str());
        if (httpFile)
        {
            char byContent[SIZE_HTTPCACHE];
            while (TRUE)
            {
                int nReadSize = httpFile->Read(byContent,SIZE_HTTPCACHE);
                if (nReadSize <= 0)
                    break;
                // Append at the vector's current end. The previous code wrote
                // at an offset counted from zero, which overwrote existing
                // content whenever the caller passed a non-empty vector.
                vBaseInfo.insert(vBaseInfo.end(), byContent, byContent + nReadSize);
            }
            httpFile->Close();
            delete httpFile;
            httpFile = NULL;
            bReturn = TRUE;
        }
    }
    catch (CInternetException* m_pException)
    {
        // Read() or Close() threw: release the file object instead of just
        // dropping the pointer. Abort() is used because it closes the file
        // without throwing.
        if (httpFile)
        {
            httpFile->Abort();
            delete httpFile;
            httpFile = NULL;
        }
        m_pException->Delete();
    }
    return bReturn;
}
WCHAR* UTF8ToUnicode(char* pszUTF8)
{
COUT<<_T("转换UTF8为UNICODE...")<<endl;
DWORD dwUnicodeLen;