PHP通过pcntl扩展使用redis队列多进程爬虫

**注意:如果不使用 PhantomJs 进行多线程采集,请参考「tp5 使用 curl 特性进行定时多线程爬虫(或任务),使用 redis 队列」一文。**
重要扩展
pcntl(推荐使用linux系统)
redis扩展


此外还需要配置好 MySQL、Redis 服务器,
并通过 composer 安装以下扩展程序:

# phantomjs 需要使用二进制可执行文件。
# 注意:如果使用了该文件,会发生文件阻塞,多进程会退化为单进程。
composer require jaeger/querylist 
composer require jaeger/querylist-phantomjs
# 可以不用querylist-curl-multi
composer require jaeger/querylist-curl-multi
composer require topthink/think-queue

主要程序

<?php
namespace app\index\controller;

use QL\Ext\CurlMulti;
use QL\Ext\PhantomJs;
use QL\QueryList;
use think\Db;
use think\Queue;

/**
 * Multi-process crawler: forks worker processes with pcntl, lets each worker
 * crawl a contiguous slice of page ids and pushes every extracted record onto
 * the "PaChongShuJu" queue.
 *
 * Run from the CLI:
 *         /vhs/php/php71/bin/php think do2
 * Run the queue consumer:
 *         /vhs/php/php71/bin/php think queue:work --queue PaChongShuJu --daemon
 *         /vhs/php/php71/bin/php think queue:listen --queue PaChongShuJu
 */
class Pcntl
{
    /**
     * Fork the workers, hand each one its slice of page ids and wait for all
     * of them to finish.
     *
     * @return void
     */
    public function do()
    {
        $end     = 15399999;
        $bengin  = 15000949;
        $workers = 200; // number of worker processes

        // Both ends of the id range are crawled, hence the +1.
        $total = $end - $bengin + 1;
        $chunk = (int) ceil($total / $workers);
        $pids  = [];

        for ($i = 0; $i < $workers; $i++) {
            $pid = pcntl_fork();

            if ($pid === -1) {
                // Stop spawning, but still reap the children already started
                // below instead of abandoning them.
                echo "fork error : {$i} \r\n";
                break;
            }

            if ($pid === 0) {
                // Child process: crawl this worker's slice, then terminate so
                // the child never continues the parent's fork loop.
                $offset = $chunk * $i;
                $param  = [
                    'lastid'    => $offset,
                    'maxid'     => $offset + $chunk,
                    // Slices start at the first id ($bengin); 'url_end' is
                    // exclusive and clamped so the last worker stops at $end.
                    'url_begen' => $bengin + $offset,
                    'url_end'   => min($bengin + $offset + $chunk, $end + 1),
                ];
                $this->executeWorker($param);
                exit(0);
            }

            // Parent process: remember the child so it can be waited on.
            $pids[] = $pid;
        }

        foreach ($pids as $pid) {
            pcntl_waitpid($pid, $status);
        }
    }

    /**
     * Crawl every page id in one worker's slice.
     *
     * @param  array $param  Slice description; reads 'url_begen' (first id,
     *                       inclusive) and 'url_end' (last id, exclusive).
     * @return void
     */
    public function executeWorker($param)
    {
        for ($i = $param['url_begen']; $i < $param['url_end']; $i++) {
            $url = 'http://www.whnews.cn/newss/node_' . $i . '.html';
            $this->startnojs($url, $i);
        }
    }

    /**
     * Fetch one page through the PhantomJs browser plugin, extract the record
     * fields with regular expressions and queue the record when its company
     * name is not yet present in table "pachong2".
     *
     * @param  string $url  Page URL to fetch.
     * @param  int    $i    Page id, stored in the record as 'page'.
     * @return void
     */
    public function startnojs($url, $i)
    {
        $ql = QueryList::getInstance();
        $ql->use(CurlMulti::class);
        // PhantomJs renders the page in a headless browser so that its
        // JavaScript runs (the original author notes not every page needs it).
        $ql->use(PhantomJs::class, '/home/lxx/phantomjs-2.1.1-linux-x86_64/bin/phantomjs');
        $html = $ql->browser($url)->getHtml();

        $company = $this->firstMatch("/alt=\"(.*?)\" onload=/", $html);
        $data    = [
            'compay'  => $company ?? '',
            'mobile'  => $this->firstMatch("/f_rb\">(.*?)<\/span>/", $html) ?? '',
            'name'    => $this->firstMatch("/人:<\/label> <span>(.*?)<\/span><\/li>/", $html) ?? '',
            'address' => $this->firstMatch("/址:<\/label>   <span>(.*?)<a/", $html) ?? '',
            'page'    => $i,
        ];

        // Pages without a company match are skipped entirely.
        if ($company === null) {
            return;
        }

        $check = Db::table('pachong2')->where('compay', $company)->count();
        if ($check < 1) {
            // Not stored yet: hand the record to the queue.
            $this->push($data);
        }
    }

    /**
     * Push one record onto the "PaChongShuJu" queue as a JSON string.
     *
     * @param  array $data  Record to enqueue.
     * @return void
     */
    public function push($data = [])
    {
        $jobData             = json_encode($data);
        $jobHandlerClassName = 'app\index\controller\Job';
        $jobQueueName        = "PaChongShuJu";
        $isPushed            = Queue::push($jobHandlerClassName, $jobData, $jobQueueName);
        if ($isPushed) {
            echo "ok";
        } else {
            dump($isPushed);
        }
    }

    /**
     * Return the first capture group of $pattern in $html, or null when the
     * pattern does not match.
     *
     * @param  string $pattern
     * @param  mixed  $html
     * @return string|null
     */
    private function firstMatch($pattern, $html)
    {
        preg_match($pattern, $html, $m);

        return $m[1] ?? null;
    }
}

队列文件
自行配置Command文件和控制器

<?php
namespace app\index\controller;

use think\Db;
use think\queue\Job as QueueJob;

class Job
{
    /**
     * Queue handler: store the record carried by the job, then delete the job.
     *
     * @param  QueueJob $job   The queue job being processed.
     * @param  mixed    $data  Job payload (the producer pushes a JSON string).
     * @return void
     */
    public function fire(QueueJob $job, $data)
    {
        // attempts() reports how many times this job has been tried. Give up
        // before doing the work again, otherwise the guard can never limit
        // retries (a failing add_db() would throw before reaching it).
        if ($job->attempts() > 3) {
            $job->delete();
            return;
        }

        $this->add_db($data);

        // A finished job must be deleted exactly once; otherwise it is run
        // again until the maximum retry count is hit and failed() is called.
        // The job could also be re-queued with $job->release($delay), where
        // $delay is the delay time.
        $job->delete();
    }

    /**
     * Called after the job has reached the maximum number of retries.
     *
     * @param  mixed $data  Job payload.
     * @return void
     */
    public function failed($data)
    {
        // Intentionally empty.
    }

    /**
     * Insert the record into table "pachong2" unless a row with the same
     * 'compay' value already exists.
     *
     * @param  array|string $data  Record as an array, a JSON string, or a
     *                             doubly JSON-encoded string (the format
     *                             earlier callers passed).
     * @return void
     */
    public function add_db($data = [])
    {
        // Unwrap up to two layers of JSON encoding.
        for ($depth = 0; $depth < 2 && is_string($data); $depth++) {
            $data = json_decode($data, true);
        }

        // Undecodable or incomplete payloads cannot be stored.
        if (!is_array($data) || !isset($data['compay'])) {
            return;
        }

        $count = Db::table('pachong2')->where('compay', $data['compay'])->count();
        if ($count == 0) {
            Db::table('pachong2')->insert($data);
        }
        dump($data);
    }
}

其他
单进程PhantomJs采集

<?php
namespace app\index\controller;

use QL\Ext\CurlMulti;
use QL\Ext\PhantomJs;
use QL\QueryList;
use think\Db;

class Index
{
    /**
     * Print the PHP configuration page.
     *
     * NOTE(review): phpinfo() exposes server details; confirm this action is
     * not reachable in production.
     *
     * @return void
     */
    public function index()
    {
        phpinfo();
    }

    /**
     * Send table "tp_logo_content" to the client as a tab-separated file that
     * is served with an Excel content type, then terminate the script.
     *
     * @return void
     */
    public function more_openexcel()
    {
        header("Content-type:application/vnd.ms-excel");
        // The disposition type is spelled "attachment".
        header("Content-Disposition:attachment;filename=Haoyunyun_" . date("Ymd") . ".xls");

        // Header row first, then the data rows are appended chunk by chunk so
        // no intermediate array of all rows is kept in memory.
        $ReportContent = $this->formatRow(['ID', '名称', '地址', '电话', 'p']);
        Db::table('tp_logo_content')->chunk(100, function ($datas) use (&$ReportContent) {
            foreach ($datas as $data) {
                $ReportContent .= $this->formatRow([
                    $data['id'],
                    $data['title'],
                    $data['keyword'],
                    $data['content'],
                    $data['thumbsup'],
                ]);
            }
        });

        // $ReportContent = mb_convert_encoding($ReportContent, "gb2312", "utf-8");
        die($ReportContent); // the original author recommends die() inside the framework
    }

    /**
     * Single-process crawl of every page id from 15000949 to 15399999.
     * Example page: http://www.whnews.cn/newss/node_15000000.html
     *
     * @return void
     */
    public function do()
    {
        for ($i = 15000949; $i <= 15399999; $i++) {
            $url = 'http://www.whnews.cn/newss/node_' . $i . '.html';
            $this->startnojs($url, $i);
        }
    }

    /**
     * Fetch one page through the PhantomJs browser plugin, extract the record
     * fields with regular expressions and insert the record into table
     * "pachong" when its company name is not stored yet.
     *
     * Run from the CLI: /vhs/php/php71/bin/php think do
     *
     * @param  string $url  Page URL to fetch.
     * @param  int    $i    Page id, stored in the record as 'page'.
     * @return void
     */
    public function startnojs($url, $i)
    {
        $ql = QueryList::getInstance();
        $ql->use(CurlMulti::class);
        $ql->use(PhantomJs::class, '/home/lxx/phantomjs-2.1.1-linux-x86_64/bin/phantomjs');
        $html = $ql->browser($url)->getHtml();

        $company = $this->firstMatch("/alt=\"(.*?)\" onload=/", $html);
        $data    = [
            'compay'  => $company ?? '',
            'mobile'  => $this->firstMatch("/f_rb\">(.*?)<\/span>/", $html) ?? '',
            'name'    => $this->firstMatch("/人:<\/label> <span>(.*?)<\/span><\/li>/", $html) ?? '',
            'address' => $this->firstMatch("/址:<\/label>   <span>(.*?)<a/", $html) ?? '',
            'page'    => $i,
        ];

        // Pages without a company match are skipped entirely.
        if ($company === null) {
            return;
        }

        // Per the original author: the binary file blocks the process, so the
        // record is written straight to the database and redis is not used.
        $check = Db::table('pachong')->where('compay', $company)->count();
        if ($check < 1) {
            Db::table('pachong')->insert($data);
            dump($data);
        }
    }

    /**
     * Render one output row: every cell is wrapped in double quotes (embedded
     * double quotes are doubled) and followed by a tab; the row ends with a
     * newline.
     *
     * @param  array $cells
     * @return string
     */
    private function formatRow(array $cells)
    {
        $line = '';
        foreach ($cells as $cell) {
            $line .= '"' . str_replace('"', '""', (string) $cell) . '"' . "\t";
        }

        return $line . "\n";
    }

    /**
     * Return the first capture group of $pattern in $html, or null when the
     * pattern does not match.
     *
     * @param  string $pattern
     * @param  mixed  $html
     * @return string|null
     */
    private function firstMatch($pattern, $html)
    {
        preg_match($pattern, $html, $m);

        return $m[1] ?? null;
    }
}

猜你喜欢

转载自blog.csdn.net/HD2killers/article/details/82970910