curl下载网页
场景:利用CURL下载网页内容,中文显示为乱码解决思路
利用CURL下载网页内容,中文显示为乱码
利用libcurl连接网络并下载网页内容,有些网页内容中文显示为乱码,但浏览器中打开的网页为正常,我的本地系统编码是utf-8的,网页编码也是utf-8的,可为什么显示为乱码呢,请高手指教
附我的curl设置:
// Fetch the page at sUrl (optionally through sProxy) into sHTMLContent.
// NOTE(review): mojibake despite UTF-8 on both ends usually means the
// server actually replied in another charset (check the Content-Type
// response header) — TODO confirm; byte-stripping below does no conversion.
CURL *curl = curl_easy_init();
if (!curl) {
    return NULL;
}

WriteData *pWriteData = (WriteData *)malloc(sizeof(WriteData));
if (!pWriteData) {
    curl_easy_cleanup(curl); // BUG FIX: the easy handle leaked on this path
    return NULL;
}
memset(pWriteData->pcPageBuf, '\0', MAX_PAGE_SIZE + 1);
pWriteData->iPageBufOffset = 0;

curl_easy_setopt(curl, CURLOPT_URL, sUrl.c_str());
if (!sProxy.empty())
    curl_easy_setopt(curl, CURLOPT_PROXY, sProxy.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, Curl_Write_Func);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, pWriteData);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);    // don't let a stall block us forever
// The next three options control redirect handling.
curl_easy_setopt(curl, CURLOPT_AUTOREFERER, 1L);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 1L);
curl_easy_setopt(curl, CURLOPT_UNRESTRICTED_AUTH, 1L);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);    // needed for timeouts in threaded use
/* some servers don't like requests that are made without a user-agent
   field, so we provide one */
curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0");

CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
    // BUG FIX: the transfer result was previously ignored, so a failed or
    // timed-out download was processed as if it had succeeded.
    free(pWriteData);
    curl_easy_cleanup(curl);
    return NULL;
}

// Copy the page while dropping embedded '\0' bytes (some pages, e.g. the
// NetEase search results mentioned by the author, contain them and would
// otherwise truncate the C string).
char pcPageBuf[MAX_PAGE_SIZE + 1];
memset(pcPageBuf, 0, MAX_PAGE_SIZE + 1);
int j = 0;
// BUG FIX: the old bound (j > MAX_PAGE_SIZE, tested *after* the write)
// could fill pcPageBuf[MAX_PAGE_SIZE] — the buffer's final byte — leaving
// no NUL terminator, so the string construction below read out of bounds.
// Bounding with j < MAX_PAGE_SIZE always keeps the last byte as '\0'.
for (int i = 0; i < pWriteData->iPageBufOffset && j < MAX_PAGE_SIZE; i++) {
    char c = pWriteData->pcPageBuf[i];
    if (c != '\0') {
        pcPageBuf[j++] = c;
    }
}
string sHTMLContent = pcPageBuf;
free(pWriteData);
curl_easy_cleanup(curl);
------解决方案--------------------
应该还是编码方式的问题:检查一下服务器实际返回的编码(看 HTTP 响应头 Content-Type 里的 charset,未必和页面声明一致),以及你把内容写成文件时所用的编码。
利用CURL下载网页内容,中文显示为乱码
利用libcurl连接网络并下载网页内容,有些网页内容中文显示为乱码,但浏览器中打开的网页为正常,我的本地系统编码是utf-8的,网页编码也是utf-8的,可为什么显示为乱码呢,请高手指教
附我的curl设置:
// Fetch the page at sUrl (optionally through sProxy) into sHTMLContent.
// NOTE(review): mojibake despite UTF-8 on both ends usually means the
// server actually replied in another charset (check the Content-Type
// response header) — TODO confirm; byte-stripping below does no conversion.
CURL *curl = curl_easy_init();
if (!curl) {
    return NULL;
}

WriteData *pWriteData = (WriteData *)malloc(sizeof(WriteData));
if (!pWriteData) {
    curl_easy_cleanup(curl); // BUG FIX: the easy handle leaked on this path
    return NULL;
}
memset(pWriteData->pcPageBuf, '\0', MAX_PAGE_SIZE + 1);
pWriteData->iPageBufOffset = 0;

curl_easy_setopt(curl, CURLOPT_URL, sUrl.c_str());
if (!sProxy.empty())
    curl_easy_setopt(curl, CURLOPT_PROXY, sProxy.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, Curl_Write_Func);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, pWriteData);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);    // don't let a stall block us forever
// The next three options control redirect handling.
curl_easy_setopt(curl, CURLOPT_AUTOREFERER, 1L);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 1L);
curl_easy_setopt(curl, CURLOPT_UNRESTRICTED_AUTH, 1L);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);    // needed for timeouts in threaded use
/* some servers don't like requests that are made without a user-agent
   field, so we provide one */
curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0");

CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
    // BUG FIX: the transfer result was previously ignored, so a failed or
    // timed-out download was processed as if it had succeeded.
    free(pWriteData);
    curl_easy_cleanup(curl);
    return NULL;
}

// Copy the page while dropping embedded '\0' bytes (some pages, e.g. the
// NetEase search results mentioned by the author, contain them and would
// otherwise truncate the C string).
char pcPageBuf[MAX_PAGE_SIZE + 1];
memset(pcPageBuf, 0, MAX_PAGE_SIZE + 1);
int j = 0;
// BUG FIX: the old bound (j > MAX_PAGE_SIZE, tested *after* the write)
// could fill pcPageBuf[MAX_PAGE_SIZE] — the buffer's final byte — leaving
// no NUL terminator, so the string construction below read out of bounds.
// Bounding with j < MAX_PAGE_SIZE always keeps the last byte as '\0'.
for (int i = 0; i < pWriteData->iPageBufOffset && j < MAX_PAGE_SIZE; i++) {
    char c = pWriteData->pcPageBuf[i];
    if (c != '\0') {
        pcPageBuf[j++] = c;
    }
}
string sHTMLContent = pcPageBuf;
free(pWriteData);
curl_easy_cleanup(curl);
------解决方案--------------------
应该还是编码方式的问题:检查一下服务器实际返回的编码(看 HTTP 响应头 Content-Type 里的 charset,未必和页面声明一致),以及你把内容写成文件时所用的编码。