java爬虫 京东商品页 简单案例
HttpClient + htmlcleaner + xpath +MySQL Java语言
要爬的数据


数据库表结构

数据库建表语句
项目的包结构

pom.xml 文件中的jar包依赖
编写实体类
spider类
Downloadable接口类
DownloadImpl实现类
PageUtil页面工具类
Processable.java
ProcessImpl.java
ProcessImpl.java代码中的几个注意点:
获取商品名称、图片URL的xpath路径

在京东商品页面获取商品价格的方式


得到如下的连接地址:

商品参数规格的Xpath

Storeable.java
StoreImple.java
MyDBUtils.java
运行test测试方法,在数据库中插入了数据

数据库表结构
数据库建表语句
-- Disable foreign-key checks while (re)creating the table.
SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for `spider`
-- ----------------------------
DROP TABLE IF EXISTS `spider`;
CREATE TABLE `spider` (
  `id`           int(10)      NOT NULL AUTO_INCREMENT,  -- surrogate primary key
  `goods_id`     varchar(20)  DEFAULT NULL,             -- JD sku id parsed from the item URL
  `data_url`     varchar(300) DEFAULT NULL,             -- item page URL
  `pic_url`      varchar(300) DEFAULT NULL,             -- main picture URL
  `title`        varchar(300) DEFAULT NULL,             -- item title
  `price`        varchar(10)  DEFAULT NULL,             -- price string from the price API
  `param`        text,                                  -- spec parameters serialized as JSON
  `current_time` datetime     DEFAULT NULL,             -- crawl timestamp
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;
项目的包结构
pom.xml 文件中的jar包依赖
<dependencies>
    <!-- testing -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <!-- HTTP download -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.4</version>
    </dependency>
    <!-- HTML parsing / XPath -->
    <dependency>
        <groupId>net.sourceforge.htmlcleaner</groupId>
        <artifactId>htmlcleaner</artifactId>
        <version>2.16</version>
    </dependency>
    <!-- JSON handling for the price API and spec table -->
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20160212</version>
    </dependency>
    <!-- persistence -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>
    <dependency>
        <groupId>commons-dbutils</groupId>
        <artifactId>commons-dbutils</artifactId>
        <version>1.6</version>
    </dependency>
</dependencies>
编写实体类
import java.util.HashMap;
import java.util.Map;

/**
 * Page entity: carries a downloaded item page through the pipeline.
 * Holds the raw page source plus the fields extracted from it.
 */
public class Page {

    private String goodId;    // item sku id
    private String goodName;  // item title
    private String dataUrl;   // item page URL
    private String picUrl;    // main picture URL
    private String price;     // price string
    private String content;   // raw page source
    // extracted spec parameters, keyed by parameter name
    private Map<String, String> param = new HashMap<String, String>();

    public String getGoodId() {
        return goodId;
    }

    public void setGoodId(String goodId) {
        this.goodId = goodId;
    }

    public String getGoodName() {
        return goodName;
    }

    public void setGoodName(String goodName) {
        this.goodName = goodName;
    }

    public String getDataUrl() {
        return dataUrl;
    }

    public void setDataUrl(String dataUrl) {
        this.dataUrl = dataUrl;
    }

    public String getPicUrl() {
        return picUrl;
    }

    public void setPicUrl(String picUrl) {
        this.picUrl = picUrl;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public Map<String, String> getParam() {
        return param;
    }

    /** Adds one extracted parameter (note: a put, not a map replacement). */
    public void setParam(String key, String value) {
        this.param.put(key, value);
    }
}
spider类
import cn.crxy.maven.Spider.domain.Page; import cn.crxy.maven.Spider.download.Downloadable; import cn.crxy.maven.Spider.process.Processable; import cn.crxy.maven.Spider.store.Storeable; public class Spider { private Downloadable downloadable; private Processable processable; private Storeable storeable; //下载页面源代码 public Page download(String url){ return downloadable.download(url); } //解析页面源代码 public void process(Page page){ processable.process(page); } //将解析后的数据保存到数据库 public void store(Page page){ storeable.store(page); } public Downloadable getDownloadable() { return downloadable; } public void setDownloadable(Downloadable downloadable) { this.downloadable = downloadable; } public Processable getProcessable() { return processable; } public void setProcessable(Processable processable) { this.processable = processable; } public Storeable getStoreable() { return storeable; } public void setStoreable(Storeable storeable) { this.storeable = storeable; } }
Downloadable接口类
import cn.crxy.maven.Spider.domain.Page;

/**
 * Download phase of the pipeline: fetches a URL and wraps the
 * raw source into a {@link Page}.
 */
public interface Downloadable {

    /** Downloads {@code url} and returns it as a populated Page. */
    Page download(String url);
}
DownloadImpl实现类
import cn.crxy.maven.Spider.domain.Page; import cn.crxy.maven.Spider.utils.PageUtil; public class DownloadImpl implements Downloadable { public Page download(String url) { Page page = new Page(); String content=PageUtil.getContent(url);//根据url得到内容 page.setContent(content); page.setDataUrl(url); return page; } }
PageUtil页面工具类
import java.io.IOException; import org.apache.http.HttpEntity; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; /** * 根据URL获取url对应的内容 */ public class PageUtil { public static String getContent(String url){ HttpClientBuilder custom = HttpClients.custom();//创建httpclient //通过构建器构建一个httpclient对象,可以认为是获取到一个浏览器对象 CloseableHttpClient build = custom.build(); //把url封装到get请求中 HttpGet httpGet = new HttpGet(url); String content = null; try { //使用client执行get请求,获取请求的结果,请求的结果被封装到response中 CloseableHttpResponse response = build.execute(httpGet); //表示获取返回的内容实体对象 HttpEntity entity = response.getEntity(); //解析实体中页面的内容,返回字符串形式 content = EntityUtils.toString(entity); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return content; } }
Processable.java
import cn.crxy.maven.Spider.domain.Page;

/**
 * Parse phase of the pipeline: extracts structured fields from a
 * downloaded {@link Page} and writes them back onto it.
 */
public interface Processable {

    /** Parses {@code page}'s raw content in place. */
    void process(Page page);
}
ProcessImpl.java
import java.util.regex.Matcher; import java.util.regex.Pattern; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; import org.json.JSONArray; import org.json.JSONObject; import cn.crxy.maven.Spider.domain.Page; import cn.crxy.maven.Spider.utils.HtmlUtil; import cn.crxy.maven.Spider.utils.PageUtil; public class ProcessImpl implements Processable { public void process(Page page) { HtmlCleaner htmlCleaner = new HtmlCleaner(); TagNode rootNode = htmlCleaner.clean(page.getContent()); try { String goodName = HtmlUtil.getText(rootNode, "//*[@id='name']/h1");// 得到商品名称 page.setGoodName(goodName); String picUrl = HtmlUtil.getAttributeByName(rootNode, "//*[@id='spec-n1']/img","src");// 获取商品图片url page.setPicUrl("http:"+picUrl); // 获取商品号 String url = page.getDataUrl(); Pattern compile = Pattern.compile("http://item.jd.com/([0-9]+).html"); Matcher matcher = compile.matcher(url); String goodid = null; if (matcher.find()) { goodid = matcher.group(1); page.setGoodId(goodid); } // 获取商品价格 // 得到价格的json格式[{"id":"J_1593512","p":"17988.00","m":"17989.00"}] String pricejson = PageUtil .getContent("http://p.3.cn/prices/get?skuid=J_" + goodid); JSONArray jsonArray = new JSONArray(pricejson); JSONObject jsonObject = jsonArray.getJSONObject(0); String price = jsonObject.getString("p"); page.setPrice(price); // 获取规格参数 // *[@id="product-detail-2"] // *[@id="product-detail-2"]/table/tbody/tr[1]/th Object[] evaluateXPath = rootNode .evaluateXPath("//*[@id='product-detail-2']/table/tbody/tr"); JSONArray jsonArray2 = new JSONArray(); if(evaluateXPath != null && evaluateXPath.length > 0){ for(Object object : evaluateXPath){ TagNode tagnode = (TagNode) object; if(!"".equals(tagnode.getText().toString().trim())){//有数据 Object[] evaluateXPath2 = tagnode.evaluateXPath("/th"); JSONObject jsonObject2 = new JSONObject(); if(evaluateXPath2.length>0){ TagNode tagNode2 = (TagNode) evaluateXPath2[0]; jsonObject2.put("name", 
tagNode2.getText().toString()); jsonObject2.put("value", ""); }else { Object[] evaluateXPath3 = tagnode.evaluateXPath("/td"); TagNode tagNode1 = (TagNode) evaluateXPath3[0]; TagNode tagNode2 = (TagNode) evaluateXPath3[1]; jsonObject2.put("name", tagNode1.getText().toString()); jsonObject2.put("value", tagNode2.getText().toString()); } jsonArray2.put(jsonObject2); } } } page.setParam("spec",jsonArray2.toString()); } catch (XPatherException e) { e.printStackTrace(); } } }
ProcessImpl.java代码中的几个注意点:
获取商品名称、图片URL的xpath路径
在京东商品页面获取商品价格的方式
得到如下的连接地址:
http://p.3.cn/prices/get?type=1&area=1_72_4137&pdtk=&pduid=1112434089&pdpin=&pdbp=0&skuid=J_1593512&callback=cnp
对该连接进行处理(去掉多余参数,只保留 skuid)后得到如下结果:
商品参数规格的Xpath
Storeable.java
package cn.crxy.maven.Spider.store;

import cn.crxy.maven.Spider.domain.Page;

/**
 * Store phase of the pipeline: persists a fully parsed {@link Page}.
 */
public interface Storeable {

    /** Persists the parsed fields of {@code page}. */
    void store(Page page);
}
StoreImple.java
package cn.crxy.maven.Spider.store;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

import cn.crxy.maven.Spider.domain.Page;
import cn.crxy.maven.Spider.utils.MyDBUtils;

/**
 * Default store implementation: flattens a parsed {@link Page} into the
 * parameter list expected by {@link MyDBUtils#INSERT_LOG} and inserts it.
 */
public class StoreImpl implements Storeable {

    public void store(Page page) {
        // Pull the scalar fields off the page.
        String goodId = page.getGoodId();
        String dataUrl = page.getDataUrl();
        String picUrl = page.getPicUrl();
        String goodName = page.getGoodName();
        String price = page.getPrice();

        // The spec table was serialized to JSON under the "spec" key.
        Map<String, String> params = page.getParam();
        String spec = params.get("spec");

        // Crawl timestamp, formatted for the datetime column.
        String now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());

        MyDBUtils.update(MyDBUtils.INSERT_LOG,
                goodId, dataUrl, picUrl, goodName, price, spec, now);
    }
}
MyDBUtils.java
package cn.crxy.maven.Spider.utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ArrayListHandler;

/**
 * Small JDBC helper built on commons-dbutils: one connection per call,
 * a query helper returning first-column strings, and an update helper.
 */
public class MyDBUtils {

    private static String className = "com.mysql.jdbc.Driver";
    private static String url = "jdbc:mysql://localhost:3306/spider?"
            + "useUnicode=true&characterEncoding=utf-8";
    private static String user = "root";
    private static String password = "1234";
    private static QueryRunner queryRunner = new QueryRunner();

    // Fix: column names now match the `spider` DDL (`goods_id`, `title`) —
    // the original used `good_id`/`good_name`, which do not exist in the
    // table, so every insert failed with an SQL error.
    public static final String INSERT_LOG = "INSERT INTO SPIDER(goods_id,"
            + "data_url,pic_url,title,price,param,`current_time`) "
            + "VALUES(?,?,?,?,?,?,?)";

    // Utility class: no instances.
    private MyDBUtils() {
    }

    static {
        // Register the JDBC driver once, when the class is first used.
        try {
            Class.forName(className);
        } catch (Exception e) {
            e.printStackTrace();
            // Fix: preserve the original cause instead of a bare RuntimeException.
            throw new RuntimeException(e);
        }
    }

    /**
     * Runs a query and returns the first column of every row as a string.
     *
     * @param sql a complete SELECT statement
     * @return first-column values, empty on error
     */
    public static List<String> executeQuerySql(String sql) {
        List<String> result = new ArrayList<String>();
        // Fixes: the original never closed the connection (leak) and wrapped
        // ArrayListHandler in an anonymous subclass whose only override just
        // called super — removed as a no-op.
        try (Connection connection = getConnection()) {
            List<Object[]> rows = queryRunner.query(connection, sql, new ArrayListHandler());
            for (Object[] row : rows) {
                result.add(row[0].toString());
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Executes an INSERT/UPDATE/DELETE statement with positional parameters.
     */
    public static void update(String sql, Object... params) {
        // Fix: try-with-resources closes the connection even when the update
        // throws; the original leaked it on the exception path.
        try (Connection connection = getConnection()) {
            queryRunner.update(connection, sql, params);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /** Opens a fresh connection; callers are responsible for closing it. */
    private static Connection getConnection() throws SQLException {
        return DriverManager.getConnection(url, user, password);
    }
}
在src/test/java文件夹下面的包中新建test类
TestSpider.java
package cn.crxy.maven.Spider; import org.junit.Test; import cn.crxy.maven.Spider.domain.Page; import cn.crxy.maven.Spider.download.DownloadImpl; import cn.crxy.maven.Spider.process.ProcessImpl; import cn.crxy.maven.Spider.store.StoreImpl; public class TestSpider { @Test public void test1() throws Exception { Spider spider = new Spider(); //给接口注入实现类 spider.setDownloadable(new DownloadImpl()); spider.setProcessable(new ProcessImpl()); spider.setStoreable(new StoreImpl()); String url = "http://item.jd.com/1593512.html"; Page page = spider.download(url); spider.process(page); spider.store(page); } }
运行test测试方法,在数据库中插入了数据