JAVA原生API读取XML大文件的DOM模式和SAX方式比较
JAVA原生API读取XML大文件的DOM方式和SAX方式比较
一直都在使用dom的方式读取xml文件,但如果稍大点的xml文件那么dom方式就有点不太适合。
研究了下jdk的api,用dom和sax方式的解析结果做了个对比
要解析的xml内容格式如下
<?xml version="1.0" encoding="UTF-8"?> <urlset> <url> <loc>商品链接访问地址</loc> <data> <display> <title>商品名称</title> <price>价格</price> <image> 商品图片访问地址 </image> <description>商品描述</description> <barCode>条形码值</barCode> <area>产地 (北京)</area> <producedate>生产日期 (2011-11-11)</producedate> <manufacturers>生产厂家 (某某某)</manufacturers> </display> </data> </url> //.....更更多 </urlset>
xml文件大小16.5M
首先是dom方式读取,代码如下
package test.xml;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Parses the product XML file with the JDK DOM API.
 *
 * <p>The whole document is loaded into a DOM tree, every {@code <url>} element is converted
 * into a map of its leaf values, and the maps are handed to {@link #doSomeThing()} in batches
 * of {@code statmentSize} records.
 */
public class JDKBigXmlDomParse {

    /** Names of the leaf elements copied from each {@code <url>} element into its record map. */
    private static final String[] FIELD_NAMES = {
        "loc", "title", "price", "image", "description",
        "barCode", "area", "producedate", "manufacturers"
    };

    /** Number of records collected before a batch is handed to {@link #doSomeThing()}. */
    private int statmentSize = 6;

    /** Records collected for the batch currently being built. */
    private List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(statmentSize);

    /**
     * Parses {@code f:\test.xml} and processes every {@code <url>} element in batches.
     *
     * @throws Exception if the parser cannot be created or the document cannot be read or parsed
     */
    public void test() throws Exception {
        String uri = "f:\\test.xml";
        Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(uri);
        NodeList urls = doc.getElementsByTagName("url");
        int length = urls.getLength();
        for (int i = 0; i < length; i++) {
            Node node = urls.item(i);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            Map<String, Object> entry = parseEntity((Element) node);
            if (!entry.isEmpty()) {
                dataList.add(entry);
                if (dataList.size() == statmentSize) {
                    doSomeThing();
                }
            }
        }
        // Flush the final partial batch; without this the trailing records are never
        // processed when the record count is not a multiple of statmentSize.
        if (!dataList.isEmpty()) {
            doSomeThing();
        }
    }

    /**
     * Converts one {@code <url>} element into a map keyed by the names in {@link #FIELD_NAMES}.
     *
     * @param element the {@code <url>} element to read
     * @return a map holding one (possibly empty) string value per field name
     */
    private Map<String, Object> parseEntity(Element element) {
        Map<String, Object> map = new HashMap<String, Object>();
        for (String fieldName : FIELD_NAMES) {
            map.put(fieldName, getElementValueByTagName(element, fieldName));
        }
        return map;
    }

    /**
     * Returns the trimmed text of the first descendant of {@code element} named {@code tagName}.
     *
     * @param element the element to search below
     * @param tagName the tag name to look up
     * @return the trimmed text content, or an empty string if the tag is missing or has no text
     */
    private String getElementValueByTagName(Element element, String tagName) {
        NodeList nodeList = element.getElementsByTagName(tagName);
        if (nodeList.getLength() == 0) {
            return "";
        }
        // getTextContent() is used instead of getFirstChild().getNodeValue(): an empty element
        // has no first child, and a non-text first child has a null node value, so the old
        // form could throw a NullPointerException.
        String text = nodeList.item(0).getTextContent();
        return text == null ? "" : text.trim();
    }

    /**
     * Processes the current batch and then empties it.
     *
     * <p>Placeholder for real batch handling; call {@code printMapList(dataList)} here to dump
     * the batch to standard output.
     */
    private void doSomeThing() {
        dataList.clear();
    }

    /**
     * Prints every record on its own line in a JSON-like {@code {"key":"value",...}} form.
     *
     * @param dataList the records to print
     */
    private void printMapList(List<Map<String, Object>> dataList) {
        for (Map<String, Object> map : dataList) {
            System.out.println();
            System.out.print("{");
            boolean first = true;
            Set<Map.Entry<String, Object>> entries = map.entrySet();
            for (Map.Entry<String, Object> entry : entries) {
                if (!first) {
                    System.out.print(",");
                }
                System.out.print("\"" + entry.getKey() + "\":");
                System.out.print("\"" + entry.getValue() + "\"");
                first = false;
            }
            System.out.print("}");
        }
        System.out.println();
    }

    /**
     * Runs the DOM parse once and prints the elapsed wall-clock time in seconds.
     *
     * @param args unused
     * @throws Exception if parsing fails
     */
    public static void main(String[] args) throws Exception {
        long start = System.nanoTime();
        new JDKBigXmlDomParse().test();
        long end = System.nanoTime();
        System.out.println("耗时:" + (end - start) / 1000000000.0 + "秒");
    }
}
运行结果:
耗时:3.212168172秒
sax方式读取,代码如下:
package test.xml; import java.io.FileInputStream; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.commons.lang.StringUtils; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class JDKBigXmlSaxParse extends DefaultHandler { private int statmentSize = 6; private List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(statmentSize); private Map<String, Object> dataMap; private String currentTag = ""; public void test() throws Exception { SAXParser sax = SAXParserFactory.newInstance().newSAXParser(); InputStream in = new FileInputStream("f:\\test.xml"); sax.parse(in, this); in.close(); } @Override public void characters(char[] ch, int start, int length)throws SAXException { String value = new String(ch, start, length); if(!StringUtils.isBlank(value)){ dataMap.put(currentTag, value.trim()); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if("url".equals(qName)){ dataList.add(dataMap); //dataMap.clear(); } if(dataList.size() == statmentSize){ doSomeThing(); dataList.clear(); } if("urlset".equals(qName) && dataList.size() != 0){ doSomeThing(); dataList.clear(); } } @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { if ("url".equals(qName)) { dataMap = new HashMap<String, Object>(); return; } currentTag = qName; } public static void main(String[] args) throws Exception { long start = System.nanoTime(); new JDKBigXmlSaxParse().test(); long end = System.nanoTime(); System.out.println("耗时:"+(end-start)/1000000000.0+"秒"); } public void doSomeThing(){ //printMapList(dataList); } private void printMapList(List<Map<String,Object>> dataList){ boolean first = true; 
for(Map<String,Object> map:dataList){ System.out.println(); System.out.print("{"); Set<Map.Entry<String, Object>> entries = map.entrySet(); for(Map.Entry<String, Object> entry:entries){ if(!first){ System.out.print(","); } System.out.print("\""+entry.getKey()+"\":"); System.out.print("\""+entry.getValue()+"\""); first = false; } first = true; System.out.print("}"); } System.out.println(); } }
运行结果:
耗时:0.639864769秒
可以看到dom消耗的时间是sax方式的5倍。结论:如果只是读取xml文件,还是sax方式强。。。
而且在eclipse里面用dom方式运行的时候可能会出现eclipse java.lang.OutOfMemoryError: Java heap space这个问题