ThinkPHP Http工具类(用于远程采集 远程下载) phpSimpleHtmlDom采集类库_Jquery筛选方式 使用phpQuery轻松采集网页内容http://www.thinkphp.cn/extend/541.html
[php]代码库
<?php
// +----------------------------------------------------------------------
// | ThinkPHP [ WE CAN DO IT JUST THINK IT ]
// +----------------------------------------------------------------------
// | Copyright (c) 2009 http://thinkphp.cn All rights reserved.
// +----------------------------------------------------------------------
// | Licensed ( http://www.apache.org/licenses/LICENSE-2.0 )
// +----------------------------------------------------------------------
// | Author: liu21st <liu21st@gmail.com>
// +----------------------------------------------------------------------
/**
 * Http utility class.
 *
 * Static helpers for fetching remote resources (via cURL or a raw socket),
 * sending a file or a string to the browser as a download, listing the
 * request headers and emitting an HTTP status line.
 *
 * @category   ORG
 * @package    ORG
 * @subpackage Net
 * @author     liu21st <liu21st@gmail.com>
 */
class Http {

    /**
     * Fetch a remote file with cURL and store it on disk.
     *
     * @access public
     * @param string $remote URL of the remote file
     * @param string $local  path of the local file to write
     * @return bool true when the transfer completed, false when the local
     *              file could not be opened or the transfer failed
     */
    static public function curlDownload($remote, $local) {
        // Open the target first so an unwritable path fails before any
        // network activity; "b" keeps binary payloads intact on Windows.
        $fp = fopen($local, 'wb');
        if (false === $fp) {
            return false;
        }

        $cp = curl_init($remote);
        if (false === $cp) {
            fclose($fp);
            return false;
        }

        curl_setopt($cp, CURLOPT_FILE, $fp);   // stream the body straight into the file
        curl_setopt($cp, CURLOPT_HEADER, 0);   // body only, no response headers
        $ok = curl_exec($cp);
        curl_close($cp);
        fclose($fp);

        return false !== $ok;
    }

    /**
     * Fetch a remote resource over HTTP/1.0 using fsockopen().
     *
     * Useful when the cURL extension is not available. The URL scheme is
     * never inspected: the connection is always a plain socket to the given
     * port (default 80).
     *
     * @static
     * @access public
     * @param string $url  remote URL
     * @param array  $conf optional settings:
     *                     int    limit   maximum number of body bytes to read, 0 = no limit
     *                     mixed  post    POST body, either a string (key=value&...) or an array
     *                     string cookie  value of the Cookie header
     *                     string ip      connect to this address instead of the URL host
     *                                    (the Host header still carries the URL host)
     *                     int    timeout connect and read timeout in seconds
     *                     bool   block   blocking stream mode, default true
     * @return string the response body, or '' on failure
     */
    static public function fsockopenDownload($url, $conf = array()) {
        $return = '';
        if (!is_array($conf)) {
            return $return;
        }

        // parse_url() returns false for seriously malformed URLs.
        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = array();
        }
        $host  = isset($parts['host']) ? $parts['host'] : '';
        $port  = !empty($parts['port']) ? $parts['port'] : 80;
        $path  = !empty($parts['path']) ? $parts['path'] : '/';
        $query = isset($parts['query']) ? $parts['query'] : '';
        if ('' !== $query) {
            // Keep the query string even when the URL has no path component.
            $path .= '?' . $query;
        }

        // Read the options explicitly instead of importing them as variable
        // variables, so a caller-supplied key can never overwrite a local.
        $conf = array_merge(array(
            'limit'   => 0,
            'post'    => '',
            'cookie'  => '',
            'ip'      => '',
            'timeout' => 15,
            'block'   => true,
        ), $conf);
        $limit   = max(0, (int) $conf['limit']);
        $post    = $conf['post'];
        $cookie  = (string) $conf['cookie'];
        $ip      = $conf['ip'];
        $timeout = $conf['timeout'];
        $block   = (bool) $conf['block'];

        $target = $ip ? $ip : $host;
        if ('' === (string) $target) {
            return '';
        }

        // HTTP_USER_AGENT is absent on the command line and for some clients.
        $userAgent  = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
        $hostHeader = (80 == $port) ? $host : $host . ':' . $port;

        $body  = '';
        $lines = array();
        if (!empty($post)) {
            if (is_array($post)) {
                $post = http_build_query($post);
            }
            $body    = (string) $post;
            $lines[] = "POST $path HTTP/1.0";
            $lines[] = 'Accept: */*';
            $lines[] = 'Accept-Language: zh-cn';
            $lines[] = 'Content-Type: application/x-www-form-urlencoded';
            $lines[] = 'User-Agent: ' . $userAgent;
            $lines[] = 'Host: ' . $hostHeader;
            $lines[] = 'Content-Length: ' . strlen($body);
            $lines[] = 'Connection: Close';
            $lines[] = 'Cache-Control: no-cache';
        } else {
            $lines[] = "GET $path HTTP/1.0";
            $lines[] = 'Accept: */*';
            $lines[] = 'Accept-Language: zh-cn';
            $lines[] = 'User-Agent: ' . $userAgent;
            $lines[] = 'Host: ' . $hostHeader;
            $lines[] = 'Connection: Close';
        }
        if ('' !== $cookie) {
            $lines[] = 'Cookie: ' . $cookie;
        }
        // Header lines end with CRLF; an empty line separates headers and body.
        $out = implode("\r\n", $lines) . "\r\n\r\n" . $body;

        // Best effort by design: a connection failure yields '' rather than a warning.
        $fp = @fsockopen($target, $port, $errno, $errstr, $timeout);
        if (!$fp) {
            return '';
        }

        stream_set_blocking($fp, $block);
        stream_set_timeout($fp, $timeout);
        @fwrite($fp, $out);

        $status   = stream_get_meta_data($fp);
        $timedOut = (bool) $status['timed_out'];

        // Skip the response headers: they end at the first empty line.
        while (!$timedOut && !feof($fp)) {
            $line = @fgets($fp);
            if ("\r\n" === $line || "\n" === $line) {
                break;
            }
            if (false === $line) {
                // feof() stays false after a read timeout, so check explicitly
                // to avoid looping forever on a stalled peer.
                $status   = stream_get_meta_data($fp);
                $timedOut = (bool) $status['timed_out'];
            }
        }

        // Read the body in chunks of at most 8192 bytes, honouring $limit.
        while (!$timedOut && !feof($fp)) {
            $chunk = (0 == $limit || $limit > 8192) ? 8192 : $limit;
            $data  = fread($fp, $chunk);
            if (false === $data) {
                break;
            }
            if ('' === $data) {
                $status   = stream_get_meta_data($fp);
                $timedOut = (bool) $status['timed_out'];
                continue;
            }
            $return .= $data;
            if ($limit) {
                $limit -= strlen($data);
                if ($limit <= 0) {
                    break;
                }
            }
        }

        @fclose($fp);
        return $return;
    }

    /**
     * Send a file, or a string, to the browser as a download and exit.
     *
     * When $content is not empty it is what gets sent, and $filename is only
     * used to derive the display name and the content type. Otherwise
     * $filename is looked up as given and then, if the UPLOAD_PATH constant
     * is defined, relative to UPLOAD_PATH.
     *
     * @static
     * @access public
     * @param string  $filename file to download
     * @param string  $showname file name presented to the user (defaults to $filename)
     * @param string  $content  content to send instead of the file
     * @param integer $expire   browser cache lifetime in seconds
     * @return void
     */
    static public function download($filename, $showname = '', $content = '', $expire = 180) {
        $content = (string) $content;
        if ('' !== $content) {
            // Content wins over the file so Content-Length always matches
            // the bytes that are actually sent.
            $length = strlen($content);
        } elseif (is_file($filename)) {
            $length = filesize($filename);
        } elseif (defined('UPLOAD_PATH') && is_file(UPLOAD_PATH . $filename)) {
            $filename = UPLOAD_PATH . $filename;
            $length   = filesize($filename);
        } else {
            throw_exception($filename . L('下载文件不存在!'));
            return;
        }

        if (empty($showname)) {
            $showname = $filename;
        }
        $showname = basename($showname);

        // Only ask for the type of a file that exists; the built-in
        // mime_content_type() warns and returns false otherwise.
        $type = is_file($filename) ? mime_content_type($filename) : false;
        if (!$type) {
            $type = 'application/octet-stream';
        }

        // Quote the file name so spaces survive, and drop the characters
        // that would terminate the quoted string or the header line.
        $safeName = str_replace(array('"', "\r", "\n"), '', $showname);

        // Send the HTTP headers and start the download.
        header('Pragma: public');
        header('Cache-control: max-age=' . $expire);
        header('Expires: ' . gmdate('D, d M Y H:i:s', time() + $expire) . ' GMT');
        header('Last-Modified: ' . gmdate('D, d M Y H:i:s', time()) . ' GMT');
        header('Content-Disposition: attachment; filename="' . $safeName . '"');
        header('Content-Length: ' . $length);
        header('Content-type: ' . $type);
        header('Content-Encoding: none');
        header('Content-Transfer-Encoding: binary');

        if ('' === $content) {
            readfile($filename);
        } else {
            echo $content;
        }
        exit();
    }

    /**
     * Show the HTTP request headers.
     *
     * Each header is rendered as "Name:value" followed by CRLF.
     *
     * @param string $header name of a single header to show (matched
     *                       case-insensitively); empty for all headers
     * @param bool   $echo   true to print the result with nl2br(), false to return it
     * @return string|null the formatted headers when $echo is false, otherwise nothing
     */
    static public function getHeaderInfo($header = '', $echo = true) {
        $headers = self::requestHeaders();
        $output  = '';

        if (!empty($header)) {
            $info = '';
            foreach ($headers as $key => $val) {
                if (0 === strcasecmp($key, $header)) {
                    $info = $val;
                    break;
                }
            }
            $output = $header . ':' . $info . "\r\n";
        } else {
            foreach ($headers as $key => $val) {
                $output .= "$key:$val\r\n";
            }
        }

        if ($echo) {
            echo nl2br($output);
        } else {
            return $output;
        }
    }

    /**
     * Collect the request headers as a name => value array.
     *
     * Uses getallheaders() where the SAPI provides it and otherwise rebuilds
     * the headers from the HTTP_* entries of $_SERVER.
     *
     * @return array
     */
    static private function requestHeaders() {
        if (function_exists('getallheaders')) {
            $headers = getallheaders();
            return is_array($headers) ? $headers : array();
        }

        $headers = array();
        foreach ($_SERVER as $name => $value) {
            if (0 === strpos($name, 'HTTP_')) {
                // HTTP_ACCEPT_LANGUAGE -> Accept-Language
                $words = ucwords(strtolower(str_replace('_', ' ', substr($name, 5))));
                $headers[str_replace(' ', '-', $words)] = $value;
            }
        }
        return $headers;
    }

    /**
     * Send the HTTP/1.1 status line for a known status code.
     *
     * Codes that are not in the table below are ignored.
     *
     * @param int $code HTTP status code
     * @return void
     */
    static public function sendHttpStatus($code) {
        static $_status = array(
            // Informational 1xx
            100 => 'Continue',
            101 => 'Switching Protocols',
            // Success 2xx
            200 => 'OK',
            201 => 'Created',
            202 => 'Accepted',
            203 => 'Non-Authoritative Information',
            204 => 'No Content',
            205 => 'Reset Content',
            206 => 'Partial Content',
            // Redirection 3xx
            300 => 'Multiple Choices',
            301 => 'Moved Permanently',
            302 => 'Found', // 1.1
            303 => 'See Other',
            304 => 'Not Modified',
            305 => 'Use Proxy',
            // 306 is deprecated but reserved
            307 => 'Temporary Redirect',
            // Client Error 4xx
            400 => 'Bad Request',
            401 => 'Unauthorized',
            402 => 'Payment Required',
            403 => 'Forbidden',
            404 => 'Not Found',
            405 => 'Method Not Allowed',
            406 => 'Not Acceptable',
            407 => 'Proxy Authentication Required',
            408 => 'Request Timeout',
            409 => 'Conflict',
            410 => 'Gone',
            411 => 'Length Required',
            412 => 'Precondition Failed',
            413 => 'Request Entity Too Large',
            414 => 'Request-URI Too Long',
            415 => 'Unsupported Media Type',
            416 => 'Requested Range Not Satisfiable',
            417 => 'Expectation Failed',
            // Server Error 5xx
            500 => 'Internal Server Error',
            501 => 'Not Implemented',
            502 => 'Bad Gateway',
            503 => 'Service Unavailable',
            504 => 'Gateway Timeout',
            505 => 'HTTP Version Not Supported',
            509 => 'Bandwidth Limit Exceeded'
        );

        if (isset($_status[$code])) {
            header('HTTP/1.1 ' . $code . ' ' . $_status[$code]);
        }
    }

} // end of class Http

if (!function_exists('mime_content_type')) {

    /**
     * Fallback for mime_content_type() when the built-in is unavailable.
     *
     * The type is guessed from the file extension only (case-insensitive);
     * the file is never opened and does not have to exist.
     *
     * @param string $filename file name or path
     * @return string the MIME type, 'application/octet-stream' when the
     *                extension is missing or unknown
     */
    function mime_content_type($filename) {
        static $contentType = array(
            'ai'      => 'application/postscript',
            'aif'     => 'audio/x-aiff',
            'aifc'    => 'audio/x-aiff',
            'aiff'    => 'audio/x-aiff',
            'asc'     => 'application/pgp', // changed by skwashd - was text/plain
            'asf'     => 'video/x-ms-asf',
            'asx'     => 'video/x-ms-asf',
            'au'      => 'audio/basic',
            'avi'     => 'video/x-msvideo',
            'bcpio'   => 'application/x-bcpio',
            'bin'     => 'application/octet-stream',
            'bmp'     => 'image/bmp',
            'c'       => 'text/plain', // or 'text/x-csrc', added by skwashd
            'cc'      => 'text/plain', // or 'text/x-c++src', added by skwashd
            'cs'      => 'text/plain', // added by skwashd - for C# src
            'cpp'     => 'text/x-c++src', // added by skwashd
            'cxx'     => 'text/x-c++src', // added by skwashd
            'cdf'     => 'application/x-netcdf',
            'class'   => 'application/octet-stream', // secure but application/java-class is correct
            'com'     => 'application/octet-stream', // added by skwashd
            'cpio'    => 'application/x-cpio',
            'cpt'     => 'application/mac-compactpro',
            'csh'     => 'application/x-csh',
            'css'     => 'text/css',
            'csv'     => 'text/comma-separated-values', // added by skwashd
            'dcr'     => 'application/x-director',
            'diff'    => 'text/diff',
            'dir'     => 'application/x-director',
            'dll'     => 'application/octet-stream',
            'dms'     => 'application/octet-stream',
            'doc'     => 'application/msword',
            'dot'     => 'application/msword', // added by skwashd
            'dvi'     => 'application/x-dvi',
            'dxr'     => 'application/x-director',
            'eps'     => 'application/postscript',
            'etx'     => 'text/x-setext',
            'exe'     => 'application/octet-stream',
            'ez'      => 'application/andrew-inset',
            'gif'     => 'image/gif',
            'gtar'    => 'application/x-gtar',
            'gz'      => 'application/x-gzip',
            'h'       => 'text/plain', // or 'text/x-chdr', added by skwashd
            'h++'     => 'text/plain', // or 'text/x-c++hdr', added by skwashd
            'hh'      => 'text/plain', // or 'text/x-c++hdr', added by skwashd
            'hpp'     => 'text/plain', // or 'text/x-c++hdr', added by skwashd
            'hxx'     => 'text/plain', // or 'text/x-c++hdr', added by skwashd
            'hdf'     => 'application/x-hdf',
            'hqx'     => 'application/mac-binhex40',
            'htm'     => 'text/html',
            'html'    => 'text/html',
            'ice'     => 'x-conference/x-cooltalk',
            'ics'     => 'text/calendar',
            'ief'     => 'image/ief',
            'ifb'     => 'text/calendar',
            'iges'    => 'model/iges',
            'igs'     => 'model/iges',
            'jar'     => 'application/x-jar', // added by skwashd - alternative mime type
            'java'    => 'text/x-java-source', // added by skwashd
            'jpe'     => 'image/jpeg',
            'jpeg'    => 'image/jpeg',
            'jpg'     => 'image/jpeg',
            'js'      => 'application/x-javascript',
            'kar'     => 'audio/midi',
            'latex'   => 'application/x-latex',
            'lha'     => 'application/octet-stream',
            'log'     => 'text/plain',
            'lzh'     => 'application/octet-stream',
            'm3u'     => 'audio/x-mpegurl',
            'man'     => 'application/x-troff-man',
            'me'      => 'application/x-troff-me',
            'mesh'    => 'model/mesh',
            'mid'     => 'audio/midi',
            'midi'    => 'audio/midi',
            'mif'     => 'application/vnd.mif',
            'mov'     => 'video/quicktime',
            'movie'   => 'video/x-sgi-movie',
            'mp2'     => 'audio/mpeg',
            'mp3'     => 'audio/mpeg',
            'mpe'     => 'video/mpeg',
            'mpeg'    => 'video/mpeg',
            'mpg'     => 'video/mpeg',
            'mpga'    => 'audio/mpeg',
            'ms'      => 'application/x-troff-ms',
            'msh'     => 'model/mesh',
            'mxu'     => 'video/vnd.mpegurl',
            'nc'      => 'application/x-netcdf',
            'oda'     => 'application/oda',
            'patch'   => 'text/diff',
            'pbm'     => 'image/x-portable-bitmap',
            'pdb'     => 'chemical/x-pdb',
            'pdf'     => 'application/pdf',
            'pgm'     => 'image/x-portable-graymap',
            'pgn'     => 'application/x-chess-pgn',
            'pgp'     => 'application/pgp', // added by skwashd
            'php'     => 'application/x-httpd-php',
            'php3'    => 'application/x-httpd-php3',
            'pl'      => 'application/x-perl',
            'pm'      => 'application/x-perl',
            'png'     => 'image/png',
            'pnm'     => 'image/x-portable-anymap',
            'po'      => 'text/plain',
            'ppm'     => 'image/x-portable-pixmap',
            'ppt'     => 'application/vnd.ms-powerpoint',
            'ps'      => 'application/postscript',
            'qt'      => 'video/quicktime',
            'ra'      => 'audio/x-realaudio',
            'rar'     => 'application/octet-stream',
            'ram'     => 'audio/x-pn-realaudio',
            'ras'     => 'image/x-cmu-raster',
            'rgb'     => 'image/x-rgb',
            'rm'      => 'audio/x-pn-realaudio',
            'roff'    => 'application/x-troff',
            'rpm'     => 'audio/x-pn-realaudio-plugin',
            'rtf'     => 'text/rtf',
            'rtx'     => 'text/richtext',
            'sgm'     => 'text/sgml',
            'sgml'    => 'text/sgml',
            'sh'      => 'application/x-sh',
            'shar'    => 'application/x-shar',
            'shtml'   => 'text/html',
            'silo'    => 'model/mesh',
            'sit'     => 'application/x-stuffit',
            'skd'     => 'application/x-koan',
            'skm'     => 'application/x-koan',
            'skp'     => 'application/x-koan',
            'skt'     => 'application/x-koan',
            'smi'     => 'application/smil',
            'smil'    => 'application/smil',
            'snd'     => 'audio/basic',
            'so'      => 'application/octet-stream',
            'spl'     => 'application/x-futuresplash',
            'src'     => 'application/x-wais-source',
            'stc'     => 'application/vnd.sun.xml.calc.template',
            'std'     => 'application/vnd.sun.xml.draw.template',
            'sti'     => 'application/vnd.sun.xml.impress.template',
            'stw'     => 'application/vnd.sun.xml.writer.template',
            'sv4cpio' => 'application/x-sv4cpio',
            'sv4crc'  => 'application/x-sv4crc',
            'swf'     => 'application/x-shockwave-flash',
            'sxc'     => 'application/vnd.sun.xml.calc',
            'sxd'     => 'application/vnd.sun.xml.draw',
            'sxg'     => 'application/vnd.sun.xml.writer.global',
            'sxi'     => 'application/vnd.sun.xml.impress',
            'sxm'     => 'application/vnd.sun.xml.math',
            'sxw'     => 'application/vnd.sun.xml.writer',
            't'       => 'application/x-troff',
            'tar'     => 'application/x-tar',
            'tcl'     => 'application/x-tcl',
            'tex'     => 'application/x-tex',
            'texi'    => 'application/x-texinfo',
            'texinfo' => 'application/x-texinfo',
            'tgz'     => 'application/x-gtar',
            'tif'     => 'image/tiff',
            'tiff'    => 'image/tiff',
            'tr'      => 'application/x-troff',
            'tsv'     => 'text/tab-separated-values',
            'txt'     => 'text/plain',
            'ustar'   => 'application/x-ustar',
            'vbs'     => 'text/plain', // added by skwashd - for obvious reasons
            'vcd'     => 'application/x-cdlink',
            'vcf'     => 'text/x-vcard',
            'vcs'     => 'text/calendar',
            'vfb'     => 'text/calendar',
            'vrml'    => 'model/vrml',
            'vsd'     => 'application/vnd.visio',
            'wav'     => 'audio/x-wav',
            'wax'     => 'audio/x-ms-wax',
            'wbmp'    => 'image/vnd.wap.wbmp',
            'wbxml'   => 'application/vnd.wap.wbxml',
            'wm'      => 'video/x-ms-wm',
            'wma'     => 'audio/x-ms-wma',
            'wmd'     => 'application/x-ms-wmd',
            'wml'     => 'text/vnd.wap.wml',
            'wmlc'    => 'application/vnd.wap.wmlc',
            'wmls'    => 'text/vnd.wap.wmlscript',
            'wmlsc'   => 'application/vnd.wap.wmlscriptc',
            'wmv'     => 'video/x-ms-wmv',
            'wmx'     => 'video/x-ms-wmx',
            'wmz'     => 'application/x-ms-wmz',
            'wrl'     => 'model/vrml',
            'wvx'     => 'video/x-ms-wvx',
            'xbm'     => 'image/x-xbitmap',
            'xht'     => 'application/xhtml+xml',
            'xhtml'   => 'application/xhtml+xml',
            'xls'     => 'application/vnd.ms-excel',
            'xlt'     => 'application/vnd.ms-excel',
            'xml'     => 'application/xml',
            'xpm'     => 'image/x-xpixmap',
            'xsl'     => 'text/xml',
            'xwd'     => 'image/x-xwindowdump',
            'xyz'     => 'chemical/x-xyz',
            'z'       => 'application/x-compress',
            'zip'     => 'application/zip',
        );

        // pathinfo() looks at the last path component only, so a dot in a
        // directory name is never mistaken for the start of an extension.
        $type = strtolower((string) pathinfo((string) $filename, PATHINFO_EXTENSION));

        return isset($contentType[$type]) ? $contentType[$type] : 'application/octet-stream';
    }

}
if (!function_exists('image_type_to_extension')) {

    /**
     * Fallback for image_type_to_extension() on PHP builds that lack it.
     *
     * NOTE(review): a few values differ from what the built-in is documented
     * to return ('.jpg', '.jpf', '.aiff', separate '.swc' / '.wbmp'); they are
     * kept as they were so existing callers of this fallback see no change.
     *
     * @param int  $imagetype   one of the IMAGETYPE_* constants
     * @param bool $include_dot whether to prefix the extension with a dot
     *                          (same meaning as the built-in's second argument)
     * @return string|false the extension, or false for an empty or unknown type
     */
    function image_type_to_extension($imagetype, $include_dot = true) {
        if (empty($imagetype) || !is_scalar($imagetype)) {
            return false;
        }

        static $extensions = null;
        if (null === $extensions) {
            $extensions = array(
                IMAGETYPE_GIF     => 'gif',
                IMAGETYPE_JPEG    => 'jpg',
                IMAGETYPE_PNG     => 'png',
                IMAGETYPE_SWF     => 'swf',
                IMAGETYPE_PSD     => 'psd',
                IMAGETYPE_BMP     => 'bmp',
                IMAGETYPE_TIFF_II => 'tiff',
                IMAGETYPE_TIFF_MM => 'tiff',
                IMAGETYPE_JPC     => 'jpc',
                IMAGETYPE_JP2     => 'jp2',
                IMAGETYPE_JPX     => 'jpf',
                IMAGETYPE_JB2     => 'jb2',
                IMAGETYPE_SWC     => 'swc',
                IMAGETYPE_IFF     => 'aiff',
                IMAGETYPE_WBMP    => 'wbmp',
                IMAGETYPE_XBM     => 'xbm',
            );
        }

        $key = (int) $imagetype;
        if (!isset($extensions[$key])) {
            return false;
        }

        return ($include_dot ? '.' : '') . $extensions[$key];
    }

}
以 jQuery 选择器的方式筛选采集内容。相信很多大牛都知道这个类库,可自学出身的我还是找了很久,phpQuery、Snoopy 等一遍一遍地尝试,最后才在无意中找到 phpSimpleHtmlDom,更让人惊喜的是又找到了中文手册。
一个人的学习,漫长而又艰辛,真希望有时候能得到指点,不至于让时间无辜的流失.
基础代码:获取网页建议用 cURL,附加 POST 数据可以在登录后采集。
<?php
// Fetch a page with cURL and list every link inside #leftcolumn using
// simple_html_dom's CSS-style selectors.
require_once('./simple_html_dom.php');

$url = 'http://www.w3cschool.cc/';

$Curl = curl_init();                              // create the cURL handle
curl_setopt($Curl, CURLOPT_URL, $url);            // target URL
curl_setopt($Curl, CURLOPT_RETURNTRANSFER, 1);    // return the response instead of printing it
curl_setopt($Curl, CURLOPT_HEADER, 0);            // keep response headers out of the markup given to the parser
$result = curl_exec($Curl);                       // run the request
curl_close($Curl);                                // release the handle

if (false === $result || '' === $result) {
    exit('Failed to fetch ' . $url);
}

// str_get_html() yields false when it cannot build a DOM from the input.
$html = str_get_html($result);
if (!$html) {
    exit('Failed to parse ' . $url);
}

foreach ($html->find('#leftcolumn a') as $element) {
    echo $element->href . '<br>';       // link URL
    echo $element->plaintext . '<br>';  // link text
}

// Free the DOM explicitly.
$html->clear();
unset($html);
中文手册(作者: S.C. Chen):
http://www.ecartchina.com/php-simple-html-dom/index.htm
采集淘宝测试
// Crawl a Taobao shop: read the category list from the shop home page, then
// collect name, price and image of every item on each category page.
require_once('simple_html_dom.php');
set_time_limit(0);                    // no execution time limit ("time_limit" is not an ini directive)
ini_set("memory_limit", "512M");
$memory = memory_get_usage();
echo 'memory:' . ($memory / 1024) . 'KB<br/>';
echo 'time:' . date('H:i:s', time()) . '<br/>';

/**
 * Fetch a URL with cURL.
 *
 * @param string $url address to fetch
 * @return string|false the response body, or false when the request failed
 */
function curl_get_content($url) {
    $Curl = curl_init();                            // create the cURL handle
    if (false === $Curl) {
        return false;
    }
    curl_setopt($Curl, CURLOPT_URL, $url);          // target URL
    curl_setopt($Curl, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    curl_setopt($Curl, CURLOPT_HEADER, 0);          // body only, no response headers
    $result = curl_exec($Curl);                     // run the request
    curl_close($Curl);                              // release the handle
    return $result;
}

// Step 1: category links on the shop home page.
$cateUrl  = 'http://the-seventh-sense.taobao.com/';
$CateList = array();
$cateCon  = curl_get_content($cateUrl);
$cateHtml = $cateCon ? str_get_html($cateCon) : false;
if ($cateHtml) {
    foreach ($cateHtml->find('.J_TAllCatsTree li .fst-cat-hd a[href*=category]') as $element) {
        $CateList[] = array(
            'url'  => urldecode($element->href),  // category URL
            'name' => $element->plaintext,        // category name
        );
    }
    $cateHtml->clear();
    unset($cateHtml);
}

// Step 2: items of every category.
$goodsList = array();
foreach ($CateList as $goodsUrl) {
    $goodsCon  = curl_get_content($goodsUrl['url']);
    $goodsHtml = $goodsCon ? str_get_html($goodsCon) : false;
    if (!$goodsHtml) {
        continue;  // skip categories that could not be fetched or parsed
    }
    foreach ($goodsHtml->find('.shop-hesper-bd .item') as $goodsElement) {
        // find(..., 0) returns null when the node is missing.
        $nameNode  = $goodsElement->find(".detail .item-name", 0);
        $priceNode = $goodsElement->find(".detail .c-price", 0);
        $imgNode   = $goodsElement->find(".photo a img", 0);
        $goodsList[] = array(
            'name'     => $nameNode ? $nameNode->plaintext : '',
            'price'    => $priceNode ? $priceNode->plaintext : '',
            'img'      => $imgNode ? $imgNode->src : '',
            'catename' => $goodsUrl['name'],
        );
    }
    $goodsHtml->clear();
    unset($goodsHtml);
}

echo '<hr/>';

$n1 = count($CateList);
$n2 = count($goodsList);
echo '采集' . $n1 . '条栏目' . $n2 . '个商品<br/>';

$memory = memory_get_usage();
echo 'memory:' . ($memory / 1024) . 'KB<br/>';
echo 'time:' . date('H:i:s', time()) . '<br/>';
beginmemory:971.953125KB
begintime:05:30:19
overmemory:1352.890625KB
overtime:05:30:39
耗时20s,成功采集9个栏目127个商品
phpQuery是一个基于PHP的服务端开源项目,它可以让PHP开发人员轻松处理DOM文档内容,比如获取某新闻网站的头条信息。更有意思的是,它采用了jQuery的思想,你可以像使用jQuery一样处理页面内容,获取你想要的页面信息。
采集头条
先看一实例,现在我要采集新浪网国内新闻的头条,代码如下:
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('http://news.sina.com.cn/china');
echo pq(".blkTop h1:eq(0)")->html();
简单的三行代码,就可以获取头条内容。首先在程序中包含phpQuery.php核心程序,然后调用读取目标网页,最后输出对应标签下的内容。
pq()是一个功能强大的方法,跟jQuery的$()如出一辙,jQuery的选择器基本上都能使用在phpQuery上,只要把“.”变成“->”。如上例中,pq(".blkTop h1:eq(0)")抓取了页面class属性为blkTop的DIV元素,并找到该DIV内部的第一个h1标签,然后用html()方法获取h1标签里的内容(带html标签),也就是我们要获取的头条信息,如果使用text()方法,则只获取头条的文本内容。当然要使用好phpQuery,关键是要找对文档中对应内容的节点。
采集文章列表
下面再来看一个例子,获取helloweba.com网站的blog列表,请看代码:
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('http://www.helloweba.com/blog.html');
$artlist = pq(".blog_li");
foreach($artlist as $li){
echo pq($li)->find('h2')->html()."";
}
通过循环列表中的DIV,找出文章标题并输出,就是这么简单。
解析XML文档
假设现在有一个这样的test.xml文档:
<?xml version="1.0" encoding="utf-8"?>
<root>
<contact>
<name>张三</name>
<age>22</age>
</contact>
<contact>
<name>王五</name>
<age>18</age>
</contact>
</root>
现在我要获取名字为张三的联系人的年龄,代码如下:
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('test.xml');
echo pq('contact > age:eq(0)');
结果输出:22
像jQuery一样,精准查找文档节点,输出节点下的内容,解析一个XML文档就是这么简单。现在你不必为采集网站内容而使用那些头疼的正则算法、内容替换等繁琐的代码了,有了phpQuery,一切就变得轻松多了。