Crawler scraping Baidu returns fewer links than expected
Problem description:
I'm a crawler beginner using Java's WebMagic framework to scrape Baidu search results. The search keyword is passed in through a servlet, and the parsing and scheduling logic is already in place.
I want to crawl the first 10 pages of Baidu results. Each page has 10 entries, so I expect 100 entries back, but every run only returns about 90, and I can't figure out why links go missing. Any help would be appreciated. The key code is below:
public void processWithException(Page page) throws MalformedURLException {
    Html html = page.getHtml();
    String requestUrl = page.getRequest().getUrl();

    if (requestUrl.endsWith("&pn=0&ie=utf8")) {
        // First results page: remember the keyword passed in from the servlet.
        name = (String) page.getRequest().getExtra("name");

        // Queue the pagination links (at most 9, for pages 2-10). Guarding
        // with Math.min avoids the IndexOutOfBoundsException the old
        // size()>=5 / i<9 combination could throw when Baidu exposes
        // between 5 and 8 pagination links.
        List<String> pag = html.xpath("//div[@id='page']/a/@href").all();
        int pages = Math.min(pag.size(), 9);
        for (int i = 0; i < pages; i++) {
            page.addTargetRequest(pag.get(i));
        }

        // Queue the organic results on this page.
        addResultLinks(page, html);
    } else if (requestUrl.startsWith("http://www.baidu.com/s?wd=")) {
        // Results pages 2-10: queue their organic results the same way.
        addResultLinks(page, html);
    } else if (requestUrl.startsWith("https://www.baidu.com/link")) {
        // Baidu redirect page: extract the real target URL and queue it.
        String urlReal = StringUtils.substringBetween(html.toString(), "URL='", "'");
        page.addTargetRequest(urlReal);
    } else {
        // Anything else is a landing page: run the content filters.
        Baidu_FilterUtils baiduFilterUtils = new Baidu_FilterUtils();
        baiduFilterUtils.BaiduPassKeyWord(page, name, programList);
        baiduFilterUtils.BaiduContainWebsites(page, name, programList_websites);
        baiduFilterUtils.BaiduContainDownload(page, name, programList_download);
        baiduFilterUtils.BaiduContainsPass(page, name, programList_pass);
    }
}

// Extract the organic result links, append the eqid that Baidu needs to
// resolve its redirect URLs, and queue them.
private void addResultLinks(Page page, Html html) {
    String eqid = StringUtils.substringBetween(html.toString(), "bds.comm.eqid = \"", "\";");
    List<String> urls = html.xpath("//div[@class='c-container']/h3/a/@href").all();
    for (String url : urls) {
        // Only upgrade a plain http scheme; a blanket replace("http", "https")
        // turns links that are already https into broken "httpss" URLs.
        String target = url.startsWith("http://")
                ? "https://" + url.substring("http://".length())
                : url;
        page.addTargetRequest(target + "&wd=&eqid=" + eqid);
    }
}
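For context, a minimal sketch of how the spider is started, assuming a standard WebMagic setup (the processor class name and the hard-coded keyword are placeholders; in the real code the servlet supplies the keyword):

import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Spider;

public class BaiduSearchLauncher {
    public static void main(String[] args) {
        // Placeholder keyword; the real one comes from the servlet.
        String keyword = "webmagic";
        // Build the first-page URL so it matches the endsWith("&pn=0&ie=utf8")
        // branch in processWithException above.
        String startUrl = "http://www.baidu.com/s?wd=" + keyword + "&pn=0&ie=utf8";

        Request request = new Request(startUrl);
        // Read back later via page.getRequest().getExtra("name").
        request.putExtra("name", keyword);

        Spider.create(new BaiduPageProcessor()) // placeholder PageProcessor wrapping processWithException
                .addRequest(request)
                .thread(5)
                .run();
    }
}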
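To pin down where the missing ~10 links go, a small diagnostic helper (hypothetical, not part of the current code) could log how many organic links each results page actually yields before anything is queued:

import java.util.List;
import us.codecraft.webmagic.Page;

public class BaiduDebugUtils {
    // Hypothetical helper: call it at the top of each results-page branch.
    public static void logOrganicCount(Page page) {
        List<String> links = page.getHtml()
                .xpath("//div[@class='c-container']/h3/a/@href").all();
        System.out.println(page.getRequest().getUrl()
                + " -> " + links.size() + " organic links");
    }
}

If some pages already report fewer than 10 here, the shortfall may come from extraction (ads and special result blocks are not div.c-container); if every page reports 10, the links are being dropped after queueing, for example by the scheduler's duplicate-URL removal.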