这里以guzzle方式为例,curl类似
使用guzzle需要先安装guzzle依赖,直接用composer安装即可(composer require guzzlehttp/guzzle),这里就不再过多阐述
安装完guzzle后,以下代码是简单的guzzle的使用
一、直接请求
// Example 1: fetch a single page and extract parts of it.
//
// A Psr7\Request object only describes a request; it is the client that
// actually sends it and returns the response, so the request is sent here
// through the client instead of merely being constructed.
$client = new \GuzzleHttp\Client();
$response = $client->request('GET', 'https://m.baidu.com');
// Response headers (array of header name => list of values)
$header = $response->getHeaders();
// Response body as a stream object
$body = $response->getBody();
// echo $body;
// Casting the stream to a string reads it completely, from start to end
$stringBody = (string) $body;
// Filter the result: collect the inner HTML of every <p> tag whose class
// attribute contains "title"; the matches come back as an array
$tag = 'p';
$attr = 'class';
$value = 'title';
$html = $stringBody;
$regex = "/<$tag.*?$attr=\".*?$value.*?\".*?>(.*?)<\/$tag>/is";
preg_match_all($regex, $html, $matches, PREG_PATTERN_ORDER);
var_dump($matches[1]);
// Read the first 10 bytes of the body. The string cast above left the stream
// pointer at the end, so rewind first or read() would return an empty string.
if ($body->isSeekable()) {
    $body->rewind();
}
$tenBytes = $body->read(10);
二、同时并发抓取
<?php
namespace App\Console\Commands;
use App\Libs\mCache;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Pool;
use Illuminate\Console\Command;
use DB;
use function GuzzleHttp\Psr7\str;
//use App\Models\ArticleModel;
//use App\Models\ArticleCateModel;
/**
 * Console command that fetches paginated list pages concurrently through a
 * Guzzle request pool, prints the image URL, title and author found in each
 * "booklist" block, and reports progress on the console.
 */
class Spider extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'Spider';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Spider';

    /** @var int Total number of pages to request; assigned in handle(). */
    private $totalPageCount;

    /** @var int Number of the response currently being accounted for (starts at 1). */
    private $counter = 1;

    /** @var int Maximum number of requests in flight at the same time. */
    private $concurrency = 2;

    /** @var int Number of list pages to crawl (pages 1..N). */
    private $users = 1;

    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * Builds a lazy generator of async GET requests, runs them through a
     * Guzzle Pool limited to $concurrency parallel requests, and blocks until
     * every request has either been fulfilled or rejected.
     *
     * @return mixed
     */
    public function handle()
    {
        $this->totalPageCount = $this->users;
        $client = new Client();

        // Yield one request factory per page. The loop bound is the $total
        // argument so the generator honours the value passed by the caller.
        $requests = function ($total) use ($client) {
            for ($i = 1; $i <= $total; $i++) {
                $uri = "https://m.xxx.net/xclass/0/{$i}.html";
                yield function () use ($client, $uri) {
                    return $client->getAsync($uri);
                };
            }
        };

        $pool = new Pool($client, $requests($this->totalPageCount), [
            'concurrency' => $this->concurrency,
            'fulfilled' => function ($response, $index) {
                $res = (string) $response->getBody();
                // The helper defined in this file is spelled getLable().
                $book = getLable($res, "div", "class", "booklist");
                foreach ($book as $val) {
                    // 'src' selects the helper's dedicated <img src="..."> mode;
                    // a plain "img" tag lookup would require a closing </img>
                    // and therefore never match.
                    $img = getLable($val, "src", "", "");
                    $title = getLable($val, "p", "class", "title");
                    $author = getLable($val, "p", "class", "author");
                    // Print the first match of each field, skipping fields
                    // that were not found instead of reading a missing offset.
                    foreach ([$img, $title, $author] as $found) {
                        if (isset($found[0])) {
                            echo $found[0];
                        }
                    }
                    // NOTE(review): neither statement has a where clause, so
                    // they touch every row of `book` / `posts` on each
                    // iteration. This looks like placeholder code - confirm
                    // before running against real data.
                    DB::table('book')->update(['votes' => 1]);
                    DB::table('posts')->delete();
                }
                $this->info("请求第 $index 页数据");
                $this->countedAndCheckEnded();
            },
            'rejected' => function ($reason, $index) {
                $this->error("rejected");
                $this->error("rejected reason: " . $reason);
                $this->countedAndCheckEnded();
            },
        ]);

        // Start sending the requests and wait for the whole pool to finish.
        $promise = $pool->promise();
        $promise->wait();
    }

    /**
     * Account for one finished request (fulfilled or rejected) and print the
     * end-of-run message once the counter has reached the total page count.
     *
     * @return void
     */
    public function countedAndCheckEnded()
    {
        if ($this->counter < $this->totalPageCount) {
            $this->counter++;
            return;
        }
        $this->info("请求结束!");
    }
}
/**
 * Extract fragments from an HTML string with a regular expression.
 *
 * Two modes:
 *  - $tag === 'src': returns the src attribute value of every <img> tag
 *    ($attr and $value are ignored).
 *  - otherwise: returns the inner HTML of every <$tag>...</$tag> element that
 *    has an attribute whose name ends with $attr and whose double-quoted
 *    value contains $value.
 *
 * The attribute lookup is confined to the opening tag itself ([^>] / [^"]),
 * so an attribute on a later element cannot satisfy the match, and the tag
 * name is followed by a word boundary so that e.g. 'p' does not match <pre>.
 * All inputs are escaped with preg_quote() before being put into the pattern.
 *
 * This is regex-based scraping, not an HTML parser: nested elements with the
 * same tag name are cut at the first closing tag.
 *
 * @param string $html  HTML to search
 * @param string $tag   tag name, or the special value 'src'
 * @param string $attr  attribute name (may be empty to accept any attribute)
 * @param string $value substring the attribute value must contain (may be empty)
 * @return array list of captured strings; empty when nothing matches
 */
function getLable($html, $tag, $attr, $value)
{
    if ($tag === 'src') {
        $regex = '/<img\b[^>]*?\ssrc="([^"]*)"[^>]*>/is';
    } else {
        $quotedTag = preg_quote($tag, '/');
        $quotedAttr = preg_quote($attr, '/');
        $quotedValue = preg_quote($value, '/');
        $regex = "/<{$quotedTag}\\b[^>]*?{$quotedAttr}=\"[^\"]*?{$quotedValue}[^\"]*\"[^>]*>(.*?)<\\/{$quotedTag}>/is";
    }

    // preg_match_all() returns false on a PCRE error; report "no matches".
    if (preg_match_all($regex, $html, $matches, PREG_PATTERN_ORDER) === false) {
        return [];
    }

    return $matches[1];
}
?>
guzzle的功能很强大,具体的用法可以自己查看官方文档
|