PHP小爬虫

首先新建一个函数文件func.php

<?php
//func.php
// Download the resource at $url and save it as the file $filename.
// While the transfer runs, cURL reports progress through the progress() callback.
//   $url      - URL of the image to download
//   $filename - path of the local file to write (created or truncated)
//   $timeout  - maximum duration of the whole transfer, in seconds
// Returns true when the download succeeded, false otherwise. On failure the
// partially written file is removed so no corrupt image is left behind.
function get_image($url,$filename,$timeout=5)
{
  $file = fopen($filename, 'w+');
  if ($file === false)
  {
    // Nothing to download into (missing directory, no permission, ...)
    echo " $filename 无法打开\n";
    return false;
  }
  $ch = curl_init();
  curl_setopt($ch, CURLOPT_URL, $url);//URL to fetch
  curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);//transfer timeout
  curl_setopt($ch, CURLOPT_FILE, $file);//write the response body straight into the file
  curl_setopt($ch, CURLOPT_FAILONERROR, true);//treat HTTP status >= 400 as a failure instead of saving the error page
  curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36SE 2.X MetaSr 1.0');//User-Agent header
  curl_setopt($ch, CURLOPT_REFERER, 'http://www.doodle.com');//Referer header
  curl_setopt($ch, CURLOPT_NOPROGRESS, false);//enable the progress callback
  curl_setopt($ch, CURLOPT_PROGRESSFUNCTION, 'progress');//callback invoked during the transfer
  $ok = curl_exec($ch);
  $error = curl_error($ch);//must be read before the handle is closed
  curl_close($ch);
  fclose($file);
  if ($ok === false)
  {
    unlink($filename);
    echo " $filename 下载失败: $error\n";
    return false;
  }
  echo " $filename 下载完成\n";
  return true;
}

// Fetch the HTML source of a page.
//   $url - address of the page to request
// Returns whatever curl_exec() yields with CURLOPT_RETURNTRANSFER enabled:
// the response body as a string, or false when the request fails.
function get_html($url)
{
  // All request settings in one table: target, return-as-string, 30 second
  // timeout, browser-like User-Agent and a Referer header.
  $options = array(
    CURLOPT_URL            => $url,
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_TIMEOUT        => 30,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36SE 2.X MetaSr 1.0',
    CURLOPT_REFERER        => 'http://www.baidu.com',
  );

  $handle = curl_init();
  curl_setopt_array($handle, $options);
  $body = curl_exec($handle);
  curl_close($handle);

  return $body;
}
// Parse an HTML string and evaluate an XPath expression against it.
//   $html   - HTML source; an empty string or false (e.g. a failed download)
//             is treated as an empty document
//   $xprule - XPath expression to evaluate
// Returns the DOMNodeList of matching nodes (empty when nothing matches or
// when there was no input), or false if the XPath expression is invalid.
function xpath_preg($html,$xprule)
{
  $dom = new DOMDocument();
  $html = (string)$html;
  // loadHTML() rejects an empty string (PHP 8 throws ValueError, which "@"
  // cannot suppress), so only parse when there is something to parse.
  if ($html !== '')
  {
    // Real-world HTML is rarely well-formed; collect parser warnings
    // internally instead of printing them, then restore the previous setting.
    $previous = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
  }
  $xpath = new DOMXPath($dom);
  return $xpath->query($xprule);
}
// Progress callback for cURL transfers; prints a 50-character text progress bar.
//   $resource - the cURL handle the callback belongs to
//   $dltotal  - total number of bytes expected to be downloaded
//   $dlnow    - number of bytes downloaded so far
//   $ultotal  - total number of bytes expected to be uploaded
//   $ulnow    - number of bytes uploaded so far
// Returns 0 so that cURL continues the transfer (a non-zero value aborts it).
function progress($resource,$dltotal,$dlnow,$ultotal, $ulnow){
    // $dltotal is 0 until the size is known; skip to avoid division by zero
    if(!empty($dltotal)){
        // Clamp to [0, 1] so an inconsistent report can neither overflow the
        // bar nor hand str_repeat() a negative count.
        $ratio = max(0, min(1, $dlnow / $dltotal));
        // Explicit int casts: str_repeat() expects an integer count, and
        // implicit float-to-int conversion is deprecated in newer PHP.
        printf("progress: [%-50s] %d%% Done\r", str_repeat('#', (int)($ratio * 50)), (int)($ratio * 100));
    }
    return 0;
}
// Use a plain-text history file (one URL per line) to decide whether a URL
// has been crawled before.
//   $file - path of the history file; it is created if it does not exist yet
//   $url  - URL to look up
// Returns false if $url is already listed. Otherwise appends $url to the
// file and returns true. Terminates the script if the file cannot be opened.
function comp_file($file,$url)
{
    // "c+" opens for reading and writing, creates the file when missing and
    // does not truncate it, so the very first run works too.
    $my_file = fopen($file,"c+") or die("can't open this file");
    $needs_newline = false;
    while(($row = fgets($my_file)) !== false) {
        // Compare without the line terminator so a final line lacking "\n"
        // (or a file with "\r\n" endings) still matches.
        if(rtrim($row, "\r\n") === $url)
        {
            echo("已经存在\n");
            fclose($my_file);
            return false;
        }
        $needs_newline = (substr($row, -1) !== "\n");
    }
    echo("从未爬过\n");
    fseek($my_file, 0, SEEK_END);
    // Start on a fresh line if the existing content did not end with one
    fwrite($my_file, ($needs_newline ? "\n" : "").$url."\n");
    fclose($my_file);
    return true;
}
// Connect to a MySQL server, switch the connection to UTF-8 and select a database.
//   $server - MySQL host (optionally with port)
//   $user   - user name
//   $passwd - password
//   $db     - name of the database to select
// Returns the connection link. Terminates the script if connecting, setting
// the character set or selecting the database fails.
// NOTE(review): the mysql_* extension was removed in PHP 7.0; this function
// only runs on PHP 5.x. Migrating to mysqli or PDO would change the returned
// handle type, so it has to be done together with the callers.
function connect_db($server, $user, $passwd, $db)
{
  $conn = mysql_connect($server,$user,$passwd) or die("连接失败"); //open the connection

  // Connection character set; should match the encoding of the stored data.
  // The link is passed explicitly so another open connection cannot be affected.
  mysql_set_charset('utf8', $conn) or die("设置字符集失败");

  mysql_select_db($db, $conn) or die("打开数据库失败"); //select the database
  return $conn;
}

// Check whether a link is absent from a list of already fetched rows
// (for example rows read from the database).
//   $res - list of rows, each an array with an 'href' entry
//   $str - link to look for
// Returns false if any row's 'href' equals $str, true otherwise (including
// when $res is empty).
// $res carries no default value: it precedes the required $str, so PHP always
// treated it as required anyway, and declaring a default there is deprecated.
function comp($res,$str)
{
    foreach($res as $r)
    {
      // Only a match ends the search early; a non-matching row must not,
      // otherwise just the first row would ever be examined.
      if(isset($r['href']) && (string)$r['href'] === (string)$str)
      {
        return false;
      }
    }
    return true;
}

然后新建主程序文件main.php

<?php
//main.php
// Entry script: fetches the start page, follows every absolute link found on
// it and downloads the .jpg images of each linked page into a directory named
// after the first path segment of the link.

  // Function library; the crawl cannot run without it
  require_once('func.php');
  // Regex matching rules (not used by the XPath-based crawl below)
  $artrule = '/<article.*>(.*)<\/article>/isU';//content inside <article> tags
  $jpgrule = '/<img[^>]*src=\"[^\"]*(http\:\/\/[^\"]*)\"[^>]*>/isU';
  $htrule = '/<a[^>]*href=\"[^\"]*(http\:\/\/[^\"]*)\"[^>]*>/';

  // XPath matching rules
  $xprule_href = '//a/@href';//every link target
  $xprule_imgsrc = '//img/@src';//every image address

  // Page to crawl
  $url='http://www.hahaha.com';//site to crawl
  $htdata = get_html($url);
  if($htdata === false || $htdata === '')
  {
    die("无法获取页面: $url\n");
  }

  $elements = xpath_preg($htdata,$xprule_href);
  $total = $elements->length;
  $i = 0;
  foreach ($elements as $link)
  {
    $i += 1;
    $href = $link->nodeValue;
    echo($i."/".$total." ");
    // Keep only scheme://host/first-segment/ so URLs that differ merely in
    // their parameters collapse to one target. Relative links and anything
    // else that does not fit the pattern are skipped.
    if(!preg_match('/https?\:\/\/[^\/]*\/([^\/]*)\//',$href,$atrue_href))
    {
      echo "跳过 $href\n";
      continue;
    }
    $true_href = $atrue_href[0];
    $dirname = $atrue_href[1];
    echo $true_href."\n";
    // An existing directory means this target was handled before: skip just
    // this link and keep going with the remaining ones.
    if($dirname === '' || is_dir($dirname)){continue;}
    if(!mkdir($dirname))
    {
      echo "无法创建目录 $dirname\n";
      continue;
    }
    sleep(5);//throttle: wait before each page request
    $html = get_html($true_href);
    if($html === false || $html === '')
    {
      echo "无法获取页面: $true_href\n";
      continue;
    }
    $images = xpath_preg($html,$xprule_imgsrc);
    if($images === false || $images->length === 0)
    {
      echo "没东西匹配到";
      continue;
    }
    foreach($images as $img)
    {
      $src = $img->nodeValue;
      echo $src."\n";
      // Only .jpg images are saved; skipping a non-match prevents reusing
      // the file name of the previous image.
      if(!preg_match('/\/([^\.\/]*\.jpg)/',$src,$filename)){continue;}
      get_image($src,$dirname."/".$filename[1]);
    }
  }

猜你喜欢

转载自blog.csdn.net/sinat_39013092/article/details/76283657
今日推荐