当前位置 博文首页 > 缘来侍你的博客:PHP 抓取网站内容

    缘来侍你的博客:PHP 抓取网站内容

    作者:[db:作者] 时间:2021-09-16 13:33

    这里以guzzle方式为例,curl类似

    使用guzzle需要先安装guzzle依赖,直接用composer安装即可,这里就不再过多阐述

    安装完guzzle后,下面的代码是guzzle的简单用法示例

    一、直接请求

    // Fetch a page with Guzzle, extract the contents of every <p class="title">
    // element, then read the first 10 bytes of the body stream.
    $client = new \GuzzleHttp\Client();
    // Actually send the request: constructing a Psr7\Request object only
    // describes a request and never contacts the server.
    $response = $client->request('GET', "https://m.baidu.com");
    // Response headers (array of header name => list of values)
    $header = $response->getHeaders();
    // Response body as a PSR-7 stream
    $body = $response->getBody();
    // Cast the stream to a string to obtain the full HTML
    $stringBody = (string) $body;

    // Filter the result: collect the inner HTML of every <p> tag whose class
    // attribute contains "title". The result is an array of captured strings.
    $tag = 'p';
    $attr = 'class';
    $value = 'title';
    $html = $stringBody;
    // Quote the dynamic parts and keep each wildcard inside its own tag /
    // attribute so a match cannot spill over into a neighbouring element.
    $quotedTag = preg_quote($tag, '/');
    $regex = '/<' . $quotedTag . '\b[^>]*?\s' . preg_quote($attr, '/')
        . '="[^"]*?' . preg_quote($value, '/') . '[^"]*"[^>]*>(.*?)<\/' . $quotedTag . '>/is';
    preg_match_all($regex, $html, $matches, PREG_PATTERN_ORDER);
    var_dump($matches[1]);

    // Read 10 bytes from the body. The string cast above left the stream
    // positioned at its end, so rewind first or read() returns an empty string.
    $body->rewind();
    $tenBytes = $body->read(10);

    二、同时并发抓取

    <?php
    namespace App\Console\Commands;
    use App\Libs\mCache;
    use GuzzleHttp\Client;
    use GuzzleHttp\Exception\RequestException;
    use GuzzleHttp\Pool;
    use Illuminate\Console\Command;
    use DB;
    use function GuzzleHttp\Psr7\str;
    
    //use App\Models\ArticleModel;
    //use App\Models\ArticleCateModel;
    
    /**
     * Console command that requests listing pages concurrently through a Guzzle
     * request pool and prints the image URL, title and author of every book
     * entry found on each page.
     */
    class Spider extends Command
    {
        /**
         * The name and signature of the console command.
         *
         * @var string
         */
        protected $signature = 'Spider';

        /**
         * The console command description.
         *
         * @var string
         */
        protected $description = 'Spider';

        /**
         * Number of requests that must finish before the run is reported as done.
         *
         * @var int
         */
        private $totalPageCount;

        /**
         * Ordinal of the request currently being waited for (starts at 1).
         *
         * @var int
         */
        private $counter        = 1;

        /**
         * Maximum number of requests in flight at the same time.
         *
         * @var int
         */
        private $concurrency    = 2;

        /**
         * Number of listing pages to request (pages 1..$users).
         *
         * @var int
         */
        private $users = 1;

        public function __construct()
        {
            parent::__construct();
        }

        /**
         * Execute the console command.
         *
         * Builds a lazy generator of request factories, runs them through a
         * Guzzle Pool with the configured concurrency and blocks until every
         * request has either been fulfilled or rejected.
         *
         * @return mixed
         */
        public function handle()
        {
            $this->totalPageCount = $this->users;

            $client = new Client();

            // Yield one request factory per page; honour the $total argument so
            // the generator and totalPageCount can never disagree.
            $requests = function ($total) use ($client) {
                for ($i = 1; $i <= $total; $i++) {
                    $uri = "https://m.xxx.net/xclass/0/{$i}.html";
                    yield function () use ($client, $uri) {
                        return $client->getAsync($uri);
                    };
                }
            };

            $pool = new Pool($client, $requests($this->totalPageCount), [
                'concurrency' => $this->concurrency,
                'fulfilled'   => function ($response, $index) {
                    $res = (string) $response->getBody();

                    // The helper is declared as getLable() in this file.
                    $book = getLable($res, "div", "class", "booklist");
                    foreach ($book as $val) {
                        // <img> has no closing tag, so use the helper's "src"
                        // mode, which returns the src attribute of each image.
                        $img = getLable($val, "src", "", "");
                        echo isset($img[0]) ? $img[0] : '';
                        $title = getLable($val, "p", "class", "title");
                        echo isset($title[0]) ? $title[0] : '';
                        $author = getLable($val, "p", "class", "author");
                        echo isset($author[0]) ? $author[0] : '';

                        // NOTE(review): persist the scraped fields here, inside
                        // DB::transaction() and scoped to this book. Unscoped
                        // whole-table update/delete statements must not run
                        // once per scraped entry.
                    }

                    $this->info("请求第 $index 页数据");

                    $this->countedAndCheckEnded();
                },
                'rejected' => function ($reason, $index) {
                    $this->error("rejected" );
                    $this->error("rejected reason: " . $reason );
                    $this->countedAndCheckEnded();
                },
            ]);

            // Start sending the requests and block until the pool has drained.
            $promise = $pool->promise();
            $promise->wait();
        }

        /**
         * Record that one request has finished and print the end-of-run
         * message once the last expected request has completed.
         *
         * @return void
         */
        public function countedAndCheckEnded()
        {
            if ($this->counter < $this->totalPageCount) {
                $this->counter++;
                return;
            }
            $this->info("请求结束!");
        }
    }
    /**
     * Extract fragments from an HTML string with a regular expression.
     *
     * Two modes:
     *  - $tag === 'src': returns the src attribute value of every <img> tag.
     *  - otherwise: returns the inner HTML of every <$tag> element whose $attr
     *    attribute contains $value. When $attr is empty, every <$tag> element
     *    matches regardless of its attributes.
     *
     * Matching is case-insensitive and the inner HTML may span several lines.
     * Nested elements of the same tag are not balanced: the capture ends at
     * the first closing tag.
     *
     * @param string $html  HTML to search
     * @param string $tag   tag name, or the special value 'src'
     * @param string $attr  attribute name to filter on ('' for no filter)
     * @param string $value substring the attribute value must contain
     * @return array list of captured strings (empty when nothing matches)
     */
    function getLable($html,$tag,$attr,$value){
        if ((string) $tag === 'src') {
            // Keep the wildcards inside the <img ...> tag itself.
            $regex = '/<img\b[^>]*?\ssrc="([^"]*)"[^>]*>/is';
        } else {
            // Quote caller-supplied parts so characters such as "." or "/"
            // are matched literally and cannot break the pattern.
            $quotedTag = preg_quote((string) $tag, '/');
            $attrPattern = '';
            if ((string) $attr !== '') {
                // [^>] / [^"] keep the match inside one tag and one attribute
                // value, so a neighbouring element's attribute cannot satisfy it.
                $attrPattern = '[^>]*?\s' . preg_quote((string) $attr, '/')
                    . '="[^"]*?' . preg_quote((string) $value, '/') . '[^"]*"';
            }
            // \b stops tag "p" from also matching <pre>, <param>, ...
            $regex = '/<' . $quotedTag . '\b' . $attrPattern . '[^>]*>(.*?)<\/' . $quotedTag . '>/is';
        }

        $matches = array();
        if (preg_match_all($regex, (string) $html, $matches, PREG_PATTERN_ORDER) === false) {
            // PCRE failure (e.g. backtrack limit): report "nothing found".
            return array();
        }
        return $matches[1];
    }

    ?

    guzzle的功能很强大,具体的可自己看文档

    cs
    下一篇:没有了