JAVA原生API读取XML大文件的DOM模式和SAX方式比较

JAVA原生API读取XML大文件的DOM方式和SAX方式比较

一直都在使用dom方式读取xml文件，但如果xml文件稍大一点，dom方式就不太适合了。

研究了下jdk的api，用dom和sax方式的解析结果做了个对比

要解析的xml内容格式如下

<?xml version="1.0" encoding="UTF-8"?>
<urlset>
<url>
<loc>商品链接访问地址</loc>
<data>
<display>
<title>商品名称</title>
<price>价格</price>
<image>
商品图片访问地址
</image>
<description>商品描述</description>
<barCode>条形码值</barCode>
<area>产地 （北京）</area>
<producedate>生产日期 （2011-11-11）</producedate>
<manufacturers>生产厂家  （某某某）</manufacturers>
</display>
</data>
</url>
<!-- .....更多 -->
</urlset>

xml文件大小16.5M

首先是dom方式读取，代码如下

package test.xml;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Reads the product feed with the JDK DOM API and collects one map per
 * {@code <url>} element, handing the maps to {@link #doSomeThing()} in
 * batches of {@code statmentSize} records.
 */
public class JDKBigXmlDomParse {

	/** Number of records collected before a batch is processed. */
	private int statmentSize = 6;
	/** Records of the batch that is currently being collected. */
	private List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(statmentSize);
	
	/**
	 * Parses the XML file into a DOM tree and processes every {@code <url>}
	 * element in document order.
	 *
	 * @throws Exception if the file cannot be read or is not well-formed XML
	 */
	public void test() throws Exception{
		String uri = "f:\\test.xml";
		Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(uri);
		NodeList urls = doc.getElementsByTagName("url");
		int length = urls.getLength();
		for(int i=0;i<length;i++){
			Node node = urls.item(i);
			if(node.getNodeType() != Node.ELEMENT_NODE){
				continue;
			}
			Element urlElement = (Element)node;
			Map<String,Object> entry = parseEntity(urlElement);
			if(!entry.isEmpty()){
				dataList.add(entry);
				if(dataList.size() == statmentSize){
					doSomeThing();
				}
			}
		}
		// The record count is rarely a multiple of the batch size; without
		// this the trailing partial batch would never be processed.
		if(!dataList.isEmpty()){
			doSomeThing();
		}
	}
	
	/**
	 * Extracts the known product fields from one {@code <url>} element.
	 *
	 * @param element the {@code <url>} element to read
	 * @return a map from field name to trimmed text; a field whose element is
	 *         missing or empty maps to the empty string
	 */
	private Map<String,Object> parseEntity(Element element){
		Map<String,Object> map = new HashMap<String, Object>();
		map.put("loc", getElementValueByTagName(element,"loc"));
		map.put("title", getElementValueByTagName(element,"title"));
		map.put("price", getElementValueByTagName(element,"price"));
		map.put("image", getElementValueByTagName(element,"image"));
		map.put("description", getElementValueByTagName(element,"description"));
		map.put("barCode", getElementValueByTagName(element,"barCode"));
		map.put("area", getElementValueByTagName(element,"area"));
		map.put("producedate", getElementValueByTagName(element,"producedate"));
		map.put("manufacturers", getElementValueByTagName(element,"manufacturers"));
		return map;
	}
	
	/**
	 * Returns the trimmed text of the first descendant element with the given
	 * tag name.
	 *
	 * @param element the element to search below
	 * @param tagName the tag name to look for
	 * @return the trimmed text content, or the empty string when there is no
	 *         such element or it has no text
	 */
	private String getElementValueByTagName(Element element,String tagName){
		NodeList nodeList = element.getElementsByTagName(tagName);
		if(nodeList.getLength() == 0){
			return "";
		}
		// getTextContent() copes with an element that has no children (where
		// getFirstChild() is null) and skips comment nodes, so neither case
		// can break or pollute the value.
		String text = nodeList.item(0).getTextContent();
		return text == null ? "" : text.trim();
	}
	
	/**
	 * Processes the collected batch and empties it so the next batch can be
	 * collected. The actual processing is left as a placeholder.
	 */
	private void doSomeThing(){
		//printMapList(dataList);
		dataList.clear();
	}
	
	/**
	 * Prints each map on its own line in a JSON-like form, for debugging.
	 *
	 * @param dataList the records to print
	 */
	private void printMapList(List<Map<String,Object>> dataList){
		boolean first = true;
		for(Map<String,Object> map:dataList){
			System.out.println();
			System.out.print("{");
			Set<Map.Entry<String, Object>> entries = map.entrySet();
			for(Map.Entry<String, Object> entry:entries){
				if(!first){
					System.out.print(",");
				}
				System.out.print("\""+entry.getKey()+"\":");
				System.out.print("\""+entry.getValue()+"\"");
				first = false;
			}
			// Reset so the next record does not start with a comma.
			first = true;
			System.out.print("}");
		}
		System.out.println();
	}
	
	/**
	 * Runs the DOM parse once and prints the elapsed wall-clock time in seconds.
	 */
	public static void main(String[] args) throws Exception{
		long start = System.nanoTime();
		new JDKBigXmlDomParse().test();
		long end = System.nanoTime();
		System.out.println("耗时:"+(end-start)/1000000000.0+"秒");		
	}

}

运行结果：

耗时:3.212168172秒

sax方式读取，代码如下：

package test.xml;

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.lang.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Reads the product feed with the JDK SAX API. Each {@code <url>} element
 * becomes one map from child tag name to trimmed text; the maps are handed to
 * {@link #doSomeThing()} in batches of {@code statmentSize} records.
 */
public class JDKBigXmlSaxParse extends DefaultHandler {

	/** Number of records collected before a batch is processed. */
	private int statmentSize = 6;
	/** Records of the batch that is currently being collected. */
	private List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>(statmentSize);
	/** Record of the {@code <url>} element being read; null outside of one. */
	private Map<String, Object> dataMap;
	/** Name of the element whose text is being collected; empty when none. */
	private String currentTag = "";
	/** Text received since the last element boundary. */
	private StringBuilder textBuffer = new StringBuilder();

	/**
	 * Parses the XML file, using this object as the SAX event handler.
	 *
	 * @throws Exception if the file cannot be read or is not well-formed XML
	 */
	public void test() throws Exception {
		SAXParser sax = SAXParserFactory.newInstance().newSAXParser();
		InputStream in = new FileInputStream("f:\\test.xml");
		try {
			sax.parse(in, this);
		} finally {
			// Close the file even when parsing fails.
			in.close();
		}
	}

	/**
	 * Buffers a chunk of character data. The parser may deliver the text of
	 * one element in several calls, so the value is only stored once the next
	 * element boundary is reached.
	 */
	@Override
	public void characters(char[] ch, int start, int length)throws SAXException {
		textBuffer.append(ch, start, length);
	}

	/**
	 * Stores the text of the element that just ended, queues the record when a
	 * {@code <url>} element closes, and processes the batch when it is full or
	 * when the document's {@code <urlset>} element closes.
	 */
	@Override
	public void endElement(String uri, String localName, String qName)
			throws SAXException {
		flushText();
		// Text that follows a closing tag does not belong to that element.
		currentTag = "";

		if("url".equals(qName) && dataMap != null){
			dataList.add(dataMap);
			// Detach the record so later text cannot modify a queued record.
			dataMap = null;
		}
		
		if(dataList.size() == statmentSize){
			doSomeThing();
			dataList.clear();
		}
		
		if("urlset".equals(qName) && dataList.size() != 0){
			doSomeThing();
			dataList.clear();			
		}
		
	}

	/**
	 * Starts a new record on {@code <url>}; for any other element remembers
	 * its name as the key for the text that follows.
	 */
	@Override
	public void startElement(String uri, String localName, String qName,
			Attributes attributes) throws SAXException {
		// Text seen before a child element opens belongs to the enclosing element.
		flushText();
		if ("url".equals(qName)) {
			dataMap = new HashMap<String, Object>();
			currentTag = "";
			return;
		}

		currentTag = qName;
	}

	/**
	 * Runs the SAX parse once and prints the elapsed wall-clock time in seconds.
	 */
	public static void main(String[] args) throws Exception {
		long start = System.nanoTime();
		new JDKBigXmlSaxParse().test();
		long end = System.nanoTime();
		System.out.println("耗时:"+(end-start)/1000000000.0+"秒");
	}
	
	/**
	 * Placeholder for processing the collected batch. The caller clears the
	 * batch afterwards.
	 */
	public void doSomeThing(){
		//printMapList(dataList);
	}

	/**
	 * Stores the buffered text under the current tag when a record is open and
	 * the text is not blank, then empties the buffer.
	 */
	private void flushText(){
		if(dataMap != null && currentTag.length() != 0 && !isBlank(textBuffer)){
			dataMap.put(currentTag, textBuffer.toString().trim());
		}
		textBuffer.setLength(0);
	}

	/**
	 * Tells whether the text is empty or consists only of whitespace as
	 * defined by {@link Character#isWhitespace(char)}.
	 */
	private static boolean isBlank(CharSequence text){
		for(int i=0;i<text.length();i++){
			if(!Character.isWhitespace(text.charAt(i))){
				return false;
			}
		}
		return true;
	}
	
	/**
	 * Prints each map on its own line in a JSON-like form, for debugging.
	 *
	 * @param dataList the records to print
	 */
	private void printMapList(List<Map<String,Object>> dataList){
		boolean first = true;
		for(Map<String,Object> map:dataList){
			System.out.println();
			System.out.print("{");
			Set<Map.Entry<String, Object>> entries = map.entrySet();
			for(Map.Entry<String, Object> entry:entries){
				if(!first){
					System.out.print(",");
				}
				System.out.print("\""+entry.getKey()+"\":");
				System.out.print("\""+entry.getValue()+"\"");
				first = false;
			}
			// Reset so the next record does not start with a comma.
			first = true;
			System.out.print("}");
		}
		System.out.println();
	}

}

运行结果：

耗时:0.639864769秒

可以看到dom方式消耗的时间约是sax方式的5倍。结论：如果只是读取xml文件，还是sax方式更强。

而且在eclipse里面用dom方式运行的时候，可能会出现 java.lang.OutOfMemoryError: Java heap space 这个错误。

JAVA原生API读取XML大文件的DOM模式和SAX方式比较

相关推荐