基于Spindle的加强HTTP Spider
基于Spindle的增强HTTP Spider
zz:http://www.iteye.com/news/1731
构建于 Lucene 之上的可用的 Java 开源 Spider 少之又少,Spindle 长期没有更新且功能不够完善,故而自己参考其源代码重新编写了一个可扩展的 WebCrawler。
本着开源共享、共同进步的想法发布于此,期冀得到大家的批评指正,有任何意见及建议均可 Email 联系我 (kaninebruno@hotmail.com)。
以下代码基于 lucene-2.3.1、htmlparser-1.6、je-analysis-1.5.3,以及自己修改过的 cpdetector-1.0.5;
下载地址分别为
htmlparser:http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis:http://www.jesoft.cn/je-analysis-1.5.3.jar
lucene就不用说了,cpdetector-1.0.5见附件.
spindle的官方站点:http://www.bitmechanic.com/projects/spindle/
- package com.huizhi.kanine.util;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileNotFoundException;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.io.UnsupportedEncodingException;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.SocketException;
- import java.net.SocketTimeoutException;
- import java.net.URL;
- import java.net.UnknownHostException;
- import java.nio.charset.Charset;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.HashSet;
- import jeasy.analysis.MMAnalyzer;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.document.DateTools;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexReader;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.search.Hits;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.store.RAMDirectory;
- import org.htmlparser.Parser;
- import org.htmlparser.PrototypicalNodeFactory;
- import org.htmlparser.filters.AndFilter;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.tags.BaseHrefTag;
- import org.htmlparser.tags.FrameTag;
- import org.htmlparser.tags.LinkTag;
- import org.htmlparser.tags.MetaTag;
- import org.htmlparser.util.EncodingChangeException;
- import org.htmlparser.util.NodeIterator;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- import org.htmlparser.visitors.HtmlPage;
- import cpdetector.io.ASCIIDetector;
- import cpdetector.io.CodepageDetectorProxy;
- import cpdetector.io.JChardetFacade;
- import cpdetector.io.ParsingDetector;
- import cpdetector.io.UnicodeDetector;
- /**
- * @author 张 波
- * E-mail:kaninebruno@hotmail.com
- * Created On : 2008-03-30
- */
- public class SiteCapturer implements Runnable{
- /* Base (initial) URL of the crawl */
- protected URL mSource;
- /* Location where the index files are stored */
- protected String mTarget;
- /**
- * URLs waiting to be parsed; every newly discovered link is stored here.
- * URLs are taken out linearly in first-in first-out order.
- */
- protected ArrayList mPages;
- /* URLs that have already been parsed, kept to avoid fetching a link twice */
- protected HashSet mFinished;
- protected Parser mParser;
- /* Buffer size of the StringBuffer */
- protected final int TRANSFER_SIZE = 4096 ;
- /* Line separator of the current platform */
- protected static String lineSep = System.getProperty( "line.separator" );
- /* Number of threads the program runs with; 2 by default */
- protected int mthreads;
- protected ArrayList threadList;
- /* IndexWriter backed by the disk */
- protected IndexWriter FSDWriter;
- /* IndexWriter backed by memory */
- protected IndexWriter RAMWriter;
- protected IndexSearcher indexSearcher;
- protected RAMDirectory ramDirectory;
- /* Analyzer (tokenizer) applied to the page content */
- protected Analyzer luceneAnalyzer;
- /* Character encoding used when parsing pages */
- protected String charset;
- /* Running count of the pages captured so far */
- protected int count = 0 ;
- /* Port of the base URL */
- protected int mPort;
- /* Host of the base URL */
- protected String mHost;
- /* Whether to look the current URL up in the index first, to avoid capturing it again */
- protected boolean mCheck;
- /* Lock guarding write operations on the index */
- public static final Object indexLock = new Object();
- public SiteCapturer() {
- mSource = null ;
- mTarget = null ;
- mthreads = 2 ;
- mCheck = false ;
- mPages = new ArrayList();
- mFinished = new HashSet();
- mParser = new Parser();
- PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
- factory.registerTag(new LocalLinkTag());
- factory.registerTag(new LocalFrameTag());
- factory.registerTag(new LocalBaseHrefTag());
- mParser.setNodeFactory(factory);
- }
- public String getSource() {
- return mSource.toString();
- }
- public void setSource(String source) {
- if (source.endsWith( "/" ))
- source = source.substring(0 , source.length() - 1 );
- try {
- mSource = new URL(source);
- } catch (MalformedURLException e) {
- System.err.println("Invalid URL : " + getSource());
- }
- }
- public String getTarget() {
- return (mTarget);
- }
- public void setTarget(String target) {
- mTarget = target;
- }
- public int getThreads() {
- return (mthreads);
- }
- public void setThreads( int threads) {
- mthreads = threads;
- }
- public boolean isMCheck() {
- return mCheck;
- }
- public void setMCheck( boolean check) {
- mCheck = check;
- }
- /**
- * 程 序入口,在此初始化mPages、IndexWriter
- * 通过协调各线程间的活动完成website的抓取工作
- * 任务完成后将所有的索引片段合并为一个以优化检索
- */
- public void capture(){
- mPages.clear();
- mPages.add(getSource());
- int responseCode = 0 ;
- String contentType = "" ;
- try {
- HttpURLConnection uc = (HttpURLConnection) mSource.openConnection();
- responseCode = uc.getResponseCode();
- contentType = uc.getContentType();
- } catch (MalformedURLException mue) {
- System.err.println("Invalid URL : " + getSource());
- } catch (IOException ie) {
- if (ie instanceof UnknownHostException) {
- System.err.println("UnknowHost : " + getSource());
- } else if (ie instanceof SocketException) {
- System.err.println("Socket Error : " + ie.getMessage() + " "
- + getSource());
- } else
- ie.printStackTrace();
- }
- if (responseCode == HttpURLConnection.HTTP_OK
- && contentType.startsWith("text/html" )) {
- mPort = mSource.getPort();
- mHost = mSource.getHost();
- charset = autoDetectCharset(mSource);
- /* 存放索引文件的位置 */
- File indexDir = new File(mTarget);
- /* 标记是否重新建立索引,true为重新建立索引 */
- boolean flag = true ;
- if (!indexDir.exists()) {
- /* 如果文件夹不存在则创建 */
- indexDir.mkdir();
- } else if (IndexReader.indexExists(mTarget)) {
- /* 如果已存在索引,则追加索引 */
- flag = false ;
- File lockfile = new File(mTarget + File.separator + "write.lock" );
- if (lockfile.exists())
- lockfile.delete();
- }
- luceneAnalyzer = new MMAnalyzer();
- ramDirectory = new RAMDirectory();
- try {
- FSDWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
- RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true );
- while (mCheck) {
- IndexReader indexReader = IndexReader.open(mTarget);
- indexSearcher = new IndexSearcher(indexReader);
- }
- long start = System.currentTimeMillis();
- threadList = new ArrayList();
- for ( int i = 0 ; i < mthreads; i++) {
- Thread t = new Thread( this , "K-9 Spider Thread #" + (i + 1 ));
- t.start();
- threadList.add(t);
- }
- while (threadList.size() > 0 ) {
- Thread child = (Thread) threadList.remove(0 );
- try {
- child.join();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- long elapsed = System.currentTimeMillis() - start;
- RAMWriter.close();
- FSDWriter.addIndexes(new Directory[] { ramDirectory });
- FSDWriter.optimize();
- FSDWriter.close();
- System.out.println("Finished in " + (elapsed / 1000 )
- + " seconds" );
- System.out.println("The Count of the Links Captured is "
- + count);
- } catch (CorruptIndexException cie) {
- cie.printStackTrace();
- } catch (LockObtainFailedException lofe) {
- lofe.printStackTrace();
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- }
- }
- public void run() {
- String url;
- while ((url = dequeueURL()) != null ) {
- if (isToBeCaptured(url))
- process(url);
- }
- mthreads--;
- }
- /**
- * 判 断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain
- */
- public boolean isToBeCaptured (String url){
- boolean flag = false ;
- HttpURLConnection uc = null ;
- int responseCode = 0 ;
- String contentType = "" ;
- String host = "" ;
- int port = 0 ;
- try {
- URL source = new URL(url);
- String protocol = source.getProtocol();
- if (protocol != null && protocol.equals( "http" )) {
- host = source.getHost();
- port = source.getPort();
- uc = (HttpURLConnection) source.openConnection();
- uc.setConnectTimeout(8000 );
- responseCode = uc.getResponseCode();
- contentType = uc.getContentType();
- }
- } catch (MalformedURLException mue) {
- System.err.println("Invalid URL : " + url);
- } catch (IOException ie) {
- if (ie instanceof UnknownHostException) {
- System.err.println("UnknowHost : " + url);
- } else if (ie instanceof SocketException) {
- System.err.println("Socket Error : " + ie.getMessage() + " "
- + url);
- } else if (ie instanceof SocketTimeoutException) {
- System.err.println("Socket Connection Time Out : " + url);
- } else if (ie instanceof FileNotFoundException) {
- System.err.println("broken link "
- + ((FileNotFoundException) ie.getCause()).getMessage()
- + " ignored" );
- } else
- ie.printStackTrace();
- }
- if (port == mPort
- && responseCode == HttpURLConnection.HTTP_OK
- && host.equals(mHost)
- && (contentType.startsWith("text/html" ) || contentType
- .startsWith("text/plain" )))
- flag = true ;
- return flag;
- }
- /* 从URL队列mPages里取出单个的URL */
- public synchronized String dequeueURL() {
- while ( true ) {
- if (mPages.size() > 0 ) {
- String url = (String) mPages.remove(0 );
- mFinished.add(url);
- if (isToBeCaptured(url)) {
- int bookmark;
- NodeList list;
- NodeList robots;
- MetaTag robot;
- String content;
- try {
- bookmark = mPages.size();
- /* 获取页面所有节点 */
- mParser.setURL(url);
- try {
- list = new NodeList();
- for (NodeIterator e = mParser.elements(); e
- .hasMoreNodes();)
- list.add(e.nextNode());
- } catch (EncodingChangeException ece) {
- /* 解码出错的异常处理 */
- mParser.reset();
- list = new NodeList();
- for (NodeIterator e = mParser.elements(); e
- .hasMoreNodes();)
- list.add(e.nextNode());
- }
- /**
- * 依 据 http://www.robotstxt.org/wc/meta-user.html 处 理
- * Robots tag
- */
- robots = list
- .extractAllNodesThatMatch(
- new AndFilter( new NodeClassFilter(
- MetaTag.class ),
- new HasAttributeFilter( "name" ,
- "robots" )), true );
- if ( 0 != robots.size()) {
- robot = (MetaTag) robots.elementAt(0 );
- content = robot.getAttribute("content" )
- .toLowerCase();
- if ((- 1 != content.indexOf( "none" ))
- || (-1 != content.indexOf( "nofollow" )))
- for ( int i = bookmark; i < mPages.size(); i++)
- mPages.remove(i);
- }
- } catch (ParserException pe) {
- pe.printStackTrace();
- }
- }
- return url;
- } else {
- mthreads--;
- if (mthreads > 0 ) {
- try {
- wait();
- mthreads++;
- } catch (InterruptedException ie) {
- ie.printStackTrace();
- }
- } else {
- notifyAll();
- return null ;
- }
- }
- }
- }
- /**
- * 处 理单独的URL地址,解析页面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行
- */
- protected void process(String url) {
- String result[];
- String content = null ;
- String title = null ;
- /* 此项操作较耗性能,故默认不予检测 */
- if (mCheck) {
- try {
- TermQuery query = new TermQuery( new Term( "url" , url));
- Hits hits = indexSearcher.search(query);
- if (hits.length() > 0 ) {
- System.out.println("The URL : " + url
- + " has already been captured" );
- } else {
- result = parseHtml(url, charset);
- content = result[0 ];
- title = result[1 ];
- }
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- } else {
- result = parseHtml(url, charset);
- content = result[0 ];
- title = result[1 ];
- }
- if (content != null && content.trim().length() > 0 ) {
- Document document = new Document();
- document.add(new Field( "content" , content, Field.Store.YES,
- Field.Index.TOKENIZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS));
- document.add(new Field( "url" , url, Field.Store.YES,
- Field.Index.UN_TOKENIZED));
- document.add(new Field( "title" , title, Field.Store.YES,
- Field.Index.TOKENIZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS));
- document.add(new Field( "date" , DateTools.timeToString( new Date()
- .getTime(), DateTools.Resolution.DAY), Field.Store.YES,
- Field.Index.UN_TOKENIZED));
- synchronized (indexLock) {
- try {
- RAMWriter.addDocument(document);
- /**
- * 当 存放索引的内存使用大于指定值时将其写入硬盘;采用此方法的目的是
- * 通过内存缓冲避免频繁的IO操作,提高索引创建性能;
- */
- if (RAMWriter.ramSizeInBytes() > 512 * 1024 ) {
- RAMWriter.close();
- FSDWriter.addIndexes(new Directory[] { ramDirectory });
- RAMWriter = new IndexWriter(ramDirectory,
- luceneAnalyzer, true );
- }
- count++;
- System.out.println(Thread.currentThread().getName()
- + ": Finished Indexing URL: " + url);
- } catch (CorruptIndexException cie) {
- cie.printStackTrace();
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- }
- }
- }
- /**
- * Link tag that rewrites the HREF.
- * The HREF is changed to a local target if it matches the source.
- */
- class LocalLinkTag extends LinkTag {
- public void doSemanticAction() {
- String link = getLink();
- if (link.endsWith( "/" ))
- link = link.substring(0 , link.length() - 1 );
- int pos = link.indexOf( "#" );
- if (pos != - 1 )
- link = link.substring(0 , pos);
- /* 将链接加入到处理队列中 */
- if (!(mFinished.contains(link) || mPages.contains(link)))
- mPages.add(link);
- setLink(link);
- }
- }
- /**
- * Frame tag that rewrites the SRC URLs. The SRC URLs are mapped to local
- * targets if they match the source.
- */
- class LocalFrameTag extends FrameTag {
- public void doSemanticAction() {
- String link = getFrameLocation();
- if (link.endsWith( "/" ))
- link = link.substring(0 , link.length() - 1 );
- int pos = link.indexOf( "#" );
- if (pos != - 1 )
- link = link.substring(0 , pos);
- /* 将链接加入到处理队列中 */
- if (!(mFinished.contains(link) || mPages.contains(link)))
- mPages.add(link);
- setFrameLocation(link);
- }
- }
- /**
- * Base tag that doesn't show. The toHtml() method is overridden to return
- * an empty string, effectively shutting off the base reference.
- */
- class LocalBaseHrefTag extends BaseHrefTag {
- public String toHtml() {
- return ( "" );
- }
- }
- /* 自动探测页面编码,避免中文乱码的出现 */
- protected String autoDetectCharset(URL url) {
- CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
- /**
- * ParsingDetector 可用于检查HTML、XML等文件或字符流的编码
- * 构造方法中的参数用于指示是否显示探测过程的详细信息
- * 为false则不显示
- */
- detector.add(new ParsingDetector( false ));
- detector.add(JChardetFacade.getInstance());
- detector.add(ASCIIDetector.getInstance());
- detector.add(UnicodeDetector.getInstance());
- Charset charset = null ;
- try {
- charset = detector.detectCodepage(url);
- } catch (MalformedURLException mue) {
- mue.printStackTrace();
- } catch (IOException ie) {
- ie.printStackTrace();
- }
- if (charset == null )
- charset = Charset.defaultCharset();
- return charset.name();
- }
- /* 按照指定编码解析标准的html页面,为建立索引做准备*/
- protected String[] parseHtml(String url, String charset) {
- String result[] = null ;
- String content = null ;
- tr