首先新建一个函数文件 func.php,内容如下:
<?php
/**
 * Download a URL into a local file with cURL, printing a progress bar.
 *
 * The transfer is streamed straight into $filename (opened with 'w+', so an
 * existing file is truncated). Progress is reported through the global
 * function named 'progress'.
 *
 * @param string $url      Address of the resource to download.
 * @param string $filename Local path the response body is written to.
 * @param int    $timeout  Maximum duration of the whole transfer, in seconds.
 *
 * @return bool True when the transfer succeeded, false when the target file
 *              could not be opened or cURL reported an error. (The original
 *              returned nothing, so callers ignoring the result are unaffected.)
 */
function get_image($url,$filename,$timeout=5)
{
    $file = fopen($filename, 'w+');
    if ($file === false) {
        // Without a valid stream CURLOPT_FILE cannot work; report and stop.
        echo " $filename 下载失败\n";
        return false;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);
    // CURLOPT_RETURNTRANSFER is deliberately not set: the body goes to the
    // file handle, and combining both options only works in one order.
    curl_setopt($ch, CURLOPT_FILE, $file);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36SE 2.X MetaSr 1.0');
    curl_setopt($ch, CURLOPT_REFERER, 'http://www.doodle.com');
    // Progress reporting is off by default; enable it and route it to progress().
    curl_setopt($ch, CURLOPT_NOPROGRESS, false);
    curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, 'progress');

    $ok = curl_exec($ch);
    $error = ($ok === false) ? curl_error($ch) : '';

    curl_close($ch);
    fclose($file);

    if ($ok === false) {
        // Only claim success when cURL actually completed the transfer.
        echo " $filename 下载失败: $error\n";
        return false;
    }

    echo " $filename 下载完成\n";
    return true;
}
/**
 * Fetch the body of a URL with cURL and return it as a string.
 *
 * @param string $url Address to request.
 *
 * @return string|false The response body, or false when curl_exec() fails.
 */
function get_html($url)
{
    // Request settings, applied one by one in this order.
    $settings = array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => 1,   // hand the body back instead of printing it
        CURLOPT_TIMEOUT        => 30,  // whole-request limit, in seconds
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36SE 2.X MetaSr 1.0',
        CURLOPT_REFERER        => 'http://www.baidu.com',
    );

    $handle = curl_init();
    foreach ($settings as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Parse an HTML string and evaluate an XPath expression against it.
 *
 * Parser diagnostics for malformed markup are collected internally and
 * discarded instead of being silenced with '@'. Empty or non-scalar input
 * (for example the false returned by a failed download) is treated as an
 * empty document, so the query simply matches nothing.
 *
 * @param string|false $html   HTML markup to parse.
 * @param string       $xprule XPath expression to evaluate.
 *
 * @return DOMNodeList|false Matching nodes; false if the expression is invalid.
 */
function xpath_preg($html,$xprule)
{
    $dom = new DOMDocument();

    $markup = is_scalar($html) ? (string) $html : '';
    if ($markup !== '') {
        // Collect libxml warnings quietly, then restore the caller's setting.
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($markup);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    $xpath = new DOMXPath($dom);
    return $xpath->query($xprule);
}
/**
 * cURL progress callback: draws a 50-character text progress bar.
 *
 * Registered by get_image() through CURLOPT_PROGRESSFUNCTION. Nothing is
 * printed while the total download size is still unknown (zero/empty).
 *
 * @param mixed     $resource The cURL handle passed by the extension (unused).
 * @param int|float $dltotal  Total number of bytes expected to be downloaded.
 * @param int|float $dlnow    Number of bytes downloaded so far.
 * @param int|float $ultotal  Total number of bytes expected to be uploaded (unused).
 * @param int|float $ulnow    Number of bytes uploaded so far (unused).
 */
function progress($resource,$dltotal,$dlnow,$ultotal, $ulnow){
    if (empty($dltotal) || $dltotal < 0) {
        return;
    }

    // Keep the bar within 0..100% even if the counters are inconsistent.
    $done = max(0, min($dlnow, $dltotal));

    // Explicit integer conversion: str_repeat() needs a whole number, and
    // passing a fractional float is deprecated on current PHP versions.
    $bar_length = (int) floor($done * 50 / $dltotal);
    $percent    = (int) floor($done * 100 / $dltotal);

    // "\r" returns to the start of the line so the bar redraws in place.
    printf("progress: [%-50s] %d%% Done\r", str_repeat('#', $bar_length), $percent);
}
/**
 * Check whether a URL is already recorded in a line-based list file and
 * record it when it is not.
 *
 * Each line of $file holds one URL. Lines are compared exactly (after
 * stripping the line ending), so numeric-looking entries such as "1000" and
 * "1e3" are never treated as equal, and a final line without a trailing
 * newline is still recognised. The file is created when it does not exist.
 *
 * @param string $file Path of the list file.
 * @param string $url  URL to look up and, if new, append.
 *
 * @return bool False when the URL was already listed, true when it was new
 *              and has been appended. The script terminates via die() if the
 *              file cannot be opened.
 */
function comp_file($file,$url)
{
    // 'c+': read/write, create if missing, never truncate, start at offset 0.
    $my_file = fopen($file, "c+") or die("can't open this file");
    $url = (string) $url;

    $last_row = null;
    while (($row = fgets($my_file)) !== false) {
        $last_row = $row;
        if (rtrim($row, "\r\n") === $url) {
            echo("已经存在\n");
            fclose($my_file);
            return false;
        }
    }

    echo("从未爬过\n");
    fseek($my_file, 0, SEEK_END);
    if ($last_row !== null && substr($last_row, -1) !== "\n") {
        // Terminate an unfinished last line so the new URL gets its own line.
        fwrite($my_file, "\n");
    }
    fwrite($my_file, $url."\n");
    fclose($my_file);
    return true;
}
/**
 * Open a MySQL connection, switch it to UTF-8 and select a database.
 *
 * NOTE(review): this relies on the legacy mysql_* extension, which was
 * removed in PHP 7. Migrating to mysqli/PDO would change the type of the
 * returned link, so every caller has to be migrated together.
 *
 * @param string $server Host (optionally with port) of the MySQL server.
 * @param string $user   User name.
 * @param string $passwd Password.
 * @param string $db     Name of the database to select.
 *
 * @return resource The MySQL link identifier. The script terminates via
 *                  die() when connecting or selecting the database fails.
 */
function connect_db($server, $user, $passwd, $db)
{
    $conn = mysql_connect($server, $user, $passwd) or die("连接失败");

    // mysql_set_charset() also informs the client library of the charset,
    // which mysql_real_escape_string() depends on; a plain "SET NAMES" query
    // does not. Fall back to the query only if the call is rejected.
    if (!mysql_set_charset('utf8', $conn)) {
        mysql_query("set names 'utf8'", $conn);
    }

    // Fail loudly here rather than on the first query against no database.
    mysql_select_db($db, $conn) or die("选择数据库失败");

    return $conn;
}
/**
 * Tell whether a link is absent from a list of rows.
 *
 * Every row is inspected; the original returned after looking at the first
 * row only. The default value formerly declared on $res was dropped: an
 * optional parameter placed before a required one can never be omitted, and
 * declaring it that way is deprecated in current PHP versions.
 *
 * @param array  $res Rows to search; each row is expected to carry an
 *                    'href' entry. Rows without one are ignored.
 * @param string $str The link to look for.
 *
 * @return bool False when some row's 'href' equals $str, true otherwise
 *              (including when $res is empty).
 */
function comp($res, $str)
{
    $needle = (string) $str;
    foreach ((array) $res as $r)
    {
        if (isset($r['href']) && (string) $r['href'] === $needle)
        {
            return false;
        }
    }
    return true;
}
然后是 main 文件(通过 include 引入上面的 func.php),内容如下:
<?php
include('func.php');
// Crawler entry script: fetch the start page, visit every absolute link of
// the form http://host/segment/..., and download the .jpg images found on
// each linked page into a local directory named after that first segment.

// Regex rules kept from the original script; nothing in the visible code
// uses them.
$artrule = '/<article.*>(.*)<\/article>/isU';
$jpgrule = '/<img[^>]*src=\"[^\"]*(http\:\/\/[^\"]*)\"[^>]*>/isU';
$htrule = '/<a[^>]*href=\"[^\"]*(http\:\/\/[^\"]*)\"[^>]*>/';

// XPath expressions selecting link targets and image sources.
$xprule_href = '//a/@href';
$xprule_imgsrc = '//img/@src';
$url='http://www.hahaha.com';

$htdata = get_html($url);
if ($htdata === false || $htdata === '') {
    // Nothing to parse; stop instead of feeding a failed download to the parser.
    die("获取页面失败: $url\n");
}

$elements = xpath_preg($htdata,$xprule_href);
$i = 1;
$total = $elements->length;
foreach ($elements as $link)
{
    $href = $link->nodeValue;
    echo($i."/".$total." ");
    $i += 1;

    // Keep "http://host/first-segment/" and use the first path segment as
    // the directory name. Links that do not have this shape are skipped.
    if (!preg_match('/http\:\/\/[^\/]*\/([^\/]*)\//', $href, $href_parts) || $href_parts[1] === '') {
        echo "\n";
        continue;
    }
    $true_href = $href_parts[0];
    $dirname = $href_parts[1];
    echo $true_href;
    echo "\n";

    // Pause between requests.
    sleep(5);

    // NOTE(review): an existing directory stops the whole crawl, as in the
    // original. If the intent was only to skip already-downloaded pages,
    // this should be `continue` - confirm before changing.
    if (is_dir($dirname)) {
        break;
    }
    mkdir($dirname);

    $html = get_html($true_href);
    if ($html === false || $html === '') {
        echo "没东西匹配到";
        continue;
    }

    $images = xpath_preg($html, $xprule_imgsrc);
    if (!$images || $images->length === 0) {
        echo "没东西匹配到";
        continue;
    }

    foreach ($images as $img)
    {
        $src = $img->nodeValue;
        echo $src."\n";

        // Only sources ending a path segment in "<name>.jpg" are downloaded;
        // anything else is skipped so no previous file name is reused.
        if (!preg_match('/\/([^\.\/]*\.jpg)/', $src, $name_parts)) {
            continue;
        }
        get_image($src, $dirname."/".$name_parts[1]);
    }
}