百度百科爬虫
WebMagic的配置:http://www.cnblogs.com/justcooooode/p/7913365.html
爬取目标
最近需要对一些领域概念做分析,选择利用百度百科爬取对应的词条,从中获取信息和知识。
我将查询结果分为两类,一种是百科中已收录,另一种就是未被收录,虽然未被收录但还是能从中获取一些信息。
对于未被收录的词条,会给出与其相关的若干词条,所以这一部分也作为爬取内容。
而对于已收录的词条,我需要的信息是标题、同义词、摘要、Infobox、百度知心内容。
代码
1 /** 2 * @Title: BaiduBaikeSpider.java 3 * @Package: edu.heu.kg.spider 4 * @Description: TODO 5 * @author:LJH 6 * @date:2017年12月1日 上午9:43:16 7 */ 8 package edu.heu.kg.spider; 9 10 import java.io.BufferedReader; 11 import java.io.BufferedWriter; 12 import java.io.FileOutputStream; 13 import java.io.FileReader; 14 import java.io.IOException; 15 import java.io.OutputStreamWriter; 16 import java.io.UnsupportedEncodingException; 17 import java.util.LinkedHashMap; 18 import java.util.LinkedList; 19 import java.util.List; 20 import java.util.Map; 21 import java.util.regex.Matcher; 22 import java.util.regex.Pattern; 23 24 import us.codecraft.webmagic.Page; 25 import us.codecraft.webmagic.Site; 26 import us.codecraft.webmagic.Spider; 27 import us.codecraft.webmagic.pipeline.ConsolePipeline; 28 import us.codecraft.webmagic.processor.PageProcessor; 29 import us.codecraft.webmagic.selector.Html; 30 import us.codecraft.webmagic.selector.JsonPathSelector; 31 32 public class BaiduBaikeSpider implements PageProcessor { 33 34 // 抓取配置 35 private Site site = Site.me().setRetryTimes(3).setSleepTime(500).setCharset("UTF-8").setUserAgent( 36 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"); 37 // 检索词 38 private static String word; 39 // 存储路经 40 private static final String PATH = "D:\百科词条\百度百科\"; 41 42 @Override 43 public Site getSite() { 44 return site; 45 } 46 47 @Override 48 public void process(Page page) { 49 50 if (page.getUrl().regex("https://baike.baidu.com/item/.*").match()) { 51 52 Html html = page.getHtml(); 53 54 if (html.xpath("/html/body/div[3]/div[2]/div/div[2]/dl[1]/dd/h1/text()").match()) { 55 56 // 标题 57 String title = html.xpath("/html/body/div[3]/div[2]/div/div[2]/dl[1]/dd/h1/text()").toString(); 58 // 同义词 59 String synonym = html.xpath("/html/body/div[3]/div[2]/div/div[2]/span/span/text()").all().toString(); 60 // 摘要 61 String summary = html.xpath("/html/body/div[3]/div[2]/div/div[2]/div[@class='lemma-summary']/allText()") 62 .all().toString().replaceAll(",", ""); 63 // 信息框 64 String infobox = html 65 .xpath("/html/body/div[3]/div[2]/div/div[2]/div[@class='basic-info cmn-clearfix']/dl/allText()") 66 .all().toString(); 67 68 StringBuilder sb = new StringBuilder(); 69 sb.append(word + " " + title + " " + synonym + " " + summary + " " + infobox + " "); 70 try { 71 outPut(PATH + "百科词条.txt", sb.toString()); 72 } catch (IOException e) { 73 e.printStackTrace(); 74 } 75 76 // 百度知心,由于是动态加载,先获取id 77 String zhixinID = html.xpath("//*[@>).toString(); 78 String zhixinURL = "https://baike.baidu.com/wikiui/api/zhixinmap?lemmaId=" + zhixinID; 79 // 添加任务 80 page.addTargetRequest(zhixinURL); 81 82 } 83 // 如果不存在词条则获取最相关的词条,添加url到任务 84 else { 85 page.addTargetRequest("https://baike.baidu.com/search/none?word=" + word); 86 } 87 } 88 // 解析百度知心Json 89 else if (page.getUrl().regex("https://baike.baidu.com/wikiui/.*").match()) { 90 91 Map<String, List<String>> resultMap = new LinkedHashMap<>(); 92 // 获取相关类型的数量,因为有的词是两个,有些是三个 93 List<String> tempList = new JsonPathSelector("$.[*]").selectList(page.getRawText()); 94 int typeNums = tempList.size(); 95 int count = typeNums; 96 while (count > 0) { 97 resultMap.put(new JsonPathSelector("$.[" + (typeNums - count) + "].tipTitle").select(page.getRawText()), 98 new JsonPathSelector("$.[" + (typeNums - count) + "].data[*].title") 99 .selectList(page.getRawText())); 100 count--; 101 } 102 103 StringBuilder sb = new StringBuilder(); 104 sb.append(word + " "); 105 resultMap.forEach((key, value) -> { 106 sb.append(key + ":" + value + " "); 107 }); 108 sb.append(" "); 109 try { 110 outPut(PATH + "相关词条_知心.txt", sb.toString()); 111 } catch (IOException e) { 112 e.printStackTrace(); 113 } 114 } 115 // 百度百科尚未收录词条,获取相关词条 116 else if (page.getUrl().regex("https://baike.baidu.com/search/none\?word=.*").match()) { 117 118 List<String> list = page.getHtml().xpath("//*[@>).all(); 119 try { 120 list = UrlDecode(list); 121 } catch (UnsupportedEncodingException e) { 122 e.printStackTrace(); 123 } 124 125 StringBuilder sb = new StringBuilder(); 126 sb.append(word + " " + list + " "); 127 try { 128 outPut(PATH + "相关词条_未收录.txt", sb.toString()); 129 } catch (IOException e) { 130 // TODO Auto-generated catch block 131 e.printStackTrace(); 132 } 133 134 } else 135 System.out.println("Nothing!"); 136 137 } 138 139 // URL解码 140 @SuppressWarnings("unused") 141 private static List<String> UrlDecode(List<String> rawList) throws UnsupportedEncodingException { 142 143 List<String> resultList = new LinkedList<>(); 144 String reg = "https://baike.baidu.com/item/(.*)/\d+"; 145 Pattern p = Pattern.compile(reg); 146 Matcher m; 147 for (String str : rawList) { 148 m = p.matcher(str); 149 if (m.find()) { 150 resultList.add(java.net.URLDecoder.decode(m.group(1), "UTF-8")); 151 } 152 } 153 return resultList; 154 155 } 156 157 // 存储 158 private static void outPut(String path, String content) throws IOException { 159 160 FileOutputStream fos = new FileOutputStream(path, true); 161 OutputStreamWriter osw = new OutputStreamWriter(fos); 162 BufferedWriter bw = new BufferedWriter(osw); 163 bw.write(content); 164 bw.close(); 165 166 } 167 168 public static void main(String[] args) throws IOException { 169 170 try { 171 BufferedReader br = new BufferedReader(new FileReader("D:\nba.txt")); 172 while ((word = br.readLine()) != null) { 173 // 创建Spider,addUrl的参数可以为可变参数,但是会有问题 174 Spider.create(new BaiduBaikeSpider()).addPipeline(new ConsolePipeline()) 175 .addUrl("https://baike.baidu.com/item/" + word).run(); 176 } 177 br.close(); 178 } catch (IOException e1) { 179 e1.printStackTrace(); 180 } 181 182 } 183 184 }
结果
词条的基本信息:
词条相关信息:
转载请注明原文链接:http://www.cnblogs.com/justcooooode/p/8068045.html