一、初始化项目

npm init

二、安装需要的npm包

yarn add superagent cheerio

superagent: 在 Node.js 中模拟浏览器发送 HTTP 请求 (比如模拟登录). 文档: https://www.npmjs.com/package/superagent
cheerio: 解析静态 html, 并可以用类似 jQuery 的语法操作其中的元素. 文档: https://www.npmjs.com/package/cheerio

三、修改package.json script

package.json

"scripts": {
    "start": "node index.js"
},

index.js

console.log('123')

运行一下试试

npm run start

四、引入需要的包, 简单访问一下百度测试一下

运行下面的代码可以看到: 访问 www.baidu.com 的结果可以从 res 获取到, 而整个页面的 html 可以通过 res.text 获取.

const superagent = require("superagent");
const cheerio = require("cheerio");


// Request the Baidu home page and print its raw HTML, which superagent
// exposes as `res.text`.
superagent
    .get('http://www.baidu.com/')
    .end((err, res) => {
        // Bail out early on failure so the success path stays un-nested.
        if (err) {
            console.log(`访问失败 - ${err}`);
            return;
        }
        console.log(res.text);
    });

五、解析获取到的html

通过cheerio.load可以解析出咱们获取到的html, 然后操作各种元素.

比如咱们试一下拿到百度网站的meta标签内容:

const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');


// Request the Baidu home page, parse the HTML with cheerio, then print the
// index and the `content` attribute of every <meta> element.
superagent
    .get('http://www.baidu.com')
    .end((err, res) => {
        if (err) {
            console.log(`访问失败 - ${err}`);
            return;
        }

        const $ = cheerio.load(res.text);
        $('meta').each((position, metaTag) => {
            console.log(position);
            console.log($(metaTag).attr('content'));
        });
    });

六、抓取百度图片

新建 image.handler.js 文件, 专门处理图片逻辑.

检查url

百度搜索一下"哈哈", 看一下url有什么变化？

可以看到比较重要的就是下面的这些字段

https://image.baidu.com/search/index?tn=baiduimage&word=%B9%FE%B9%FE&ie=gbk

tn=baiduimage 百度图片
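word=搜索关键词"哈哈"编码后的结果 (这里的 %B9%FE%B9%FE 是按 GBK 编码后再做 URL 编码)
ie=gbk 应该是指关键词 (word) 使用的编码格式; 后面的代码里我们改用 ie=utf-8, 因为 encodeURIComponent 输出的是 UTF-8 编码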

检查DOM结构

我们要做的是下载图片, 那么现在有个问题, 我们如何拿到图片的Url?

右键查看网页源代码, 这里查看到的就是我们能直接通过superagent爬取到的内容.

查找可以发现, 图片的url是一个叫做objURL的字段, 所以我们待会可以通过正则来匹配到它们.

/"objURL":"(.*?)",/g

写代码访问百度图片

const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');

const word = '哈哈';

// Baidu image-search URL for `word`; the keyword is percent-encoded as UTF-8
// to match the `ie=utf-8` query parameter.
const searchUrl = `http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=${encodeURIComponent(word)}`;

// Request the search page and print the HTML that comes back.
superagent.get(searchUrl).end((err, res) => {
    if (err) {
        console.log(`访问失败 - ${err}`);
        return;
    }

    const htmlText = res.text;
    const $ = cheerio.load(htmlText); // parsed document, not used yet at this step
    console.log(htmlText);
});

运行一下试试

<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="utf-8">
    <title>百度安全验证</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black">
    <meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0">
    <meta name="format-detection" content="telephone=no, email=no">
    <link rel="shortcut icon" href="https://www.baidu.com/favicon.ico" type="image/x-icon">
    <link rel="icon" sizes="any" mask href="https://www.baidu.com/img/baidu.svg">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge">
    <meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">
    <link rel="stylesheet" href="https://ppui-static-wap.cdn.bcebos.com/static/touch/css/api/mkdjump_0635445.css" />
</head>
<body>
    <div class="timeout hide">
        <div class="timeout-img"></div>
        <div class="timeout-title">网络不给力，请稍后重试</div>
        <button type="button" class="timeout-button">返回首页</button>
    </div>
    <div class="timeout-feedback hide">
        <div class="timeout-feedback-icon"></div>
        <p class="timeout-feedback-title">问题反馈</p>
    </div>

<script src="https://wappass.baidu.com/static/machine/js/api/mkd.js"></script>
<script src="https://ppui-static-wap.cdn.bcebos.com/static/touch/js/mkdjump_fbb9952.js"></script>
</body>
</html>

怎么好像不太对劲啊? 怎么出来一个百度安全验证?
盲猜是遇到了百度的反爬策略了.

怎么办? 最大程度模拟浏览器行为!! 咱们把request headers也补上试试.

添加请求头

去浏览器network面板, 把这些请求头的值都复制下来.

const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');

const word = '哈哈';

// Request headers copied from a browser session (Network panel), so that the
// request resembles one sent by a real browser.
const headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
};

// Request the Baidu image-search page for `word` with the browser-like
// headers attached, then print the returned HTML.
superagent
    .get(`http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=${encodeURIComponent(word)}`)
    // `.set()` also accepts an object and sets every field it contains,
    // which is equivalent to one `.set(name, value)` call per header.
    .set(headers)
    .end((err, res) => {
        if (err) {
            console.log(`访问失败 - ${err}`);
            return;
        }

        const htmlText = res.text;
        const $ = cheerio.load(htmlText); // parsed document, not used yet at this step
        console.log(htmlText);
    });

发现可以了, 美滋滋!

获取imgurlList

const htmlText = res.text;
const $ = cheerio.load(htmlText);

// Collect the value of every `"objURL":"...",` entry in the page source.
// The capture group is read directly from each match instead of going through
// the deprecated `RegExp.$1` static, and a page without any match yields an
// empty list instead of throwing on a `null` match result.
const objUrlPattern = /"objURL":"(.*?)",/g;
const imageUrlList = [];
let objUrlMatch;
while ((objUrlMatch = objUrlPattern.exec(htmlText)) !== null) {
    imageUrlList.push(objUrlMatch[1]);
}

console.log(imageUrlList);

获取图片的标题列表

// Collect the value of every `"fromPageTitle":"...",` entry in the page
// source. The capture group is read directly from each match instead of going
// through the deprecated `RegExp.$1` static, and a page without any match
// yields an empty list instead of throwing on a `null` match result.
const titlePattern = /"fromPageTitle":"(.*?)",/g;
const titleList = [];
let titleMatch;
while ((titleMatch = titlePattern.exec(htmlText)) !== null) {
    titleList.push(titleMatch[1]);
}

console.log(titleList);

提取公共函数

是不是觉得获取图片url和获取title的代码几乎一模一样, 咱们封装一下

注意这里要写动态的正则表达式了, 因为咱们要传入动态的key

/**
 * Collect the value of every `"<key>":"<value>"` pair found in a piece of text.
 *
 * @param {string} str - Text to scan, e.g. the HTML source of a page.
 * @param {string} key - Property name to look for; it is matched literally.
 * @returns {string[]} The captured values in order of appearance, or an empty
 *   array when the key does not occur.
 */
function getValueListByReg(str, key) {
    // Escape regex metacharacters so the key is matched as plain text.
    const escapedKey = String(key).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const reg = new RegExp(`"${escapedKey}":"(.*?)"`, 'g');

    const resList = [];
    let found;
    // Read capture group 1 from each match directly; this avoids the
    // deprecated `RegExp.$1` static and the `null` that `String.match`
    // returns when nothing matches.
    while ((found = reg.exec(str)) !== null) {
        resList.push(found[1]);
    }
    return resList;
}


// Take the page source from the response and parse it with cheerio.
const { text: htmlText } = res;
const $ = cheerio.load(htmlText);

// Extract and print the image URLs.
const imageUrlList = getValueListByReg(htmlText, 'objURL');
console.log(imageUrlList);

// Extract and print the titles of the pages the images come from.
const titleList = getValueListByReg(htmlText, 'fromPageTitle');
console.log(titleList);

去除标题中的冗余内容

可以看到, 咱们获取到的title里是包含html标签的 (比如 <strong>), 咱们通过 replace 把它去掉.

// Matches `<strong>`, `</strong>` and the JSON-escaped `<\/strong>` form that
// appears in the page source. The `g` flag matters: `replace()` with a plain
// string pattern only removes the first occurrence, which would leave tags
// behind in titles that highlight the keyword more than once.
const strongTagPattern = /<(?:\\?\/)?strong>/g;

const titleList = getValueListByReg(htmlText, 'fromPageTitle')
    .map(item => item.replace(strongTagPattern, ''));
console.log(titleList);

最终成型的简易代码

const superagent = require("superagent");
const cheerio = require('cheerio');
console.log(1123);
// Search keyword; it is passed through encodeURIComponent() when the request
// URL is built.
const word = '猫咪'
// encodeURIComponent() encodes a string so it can be used as a URI component.
// Request headers sent with the image-search request.
// NOTE(review): the name looks like a typo for `headers_default`; it is
// referenced under this spelling further down, so rename both together.
const headers_defalult = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"'
    // cookie: not needed
    // sec-*: related to security policy
}
/**
 * Build a regular expression for `key` at call time and collect the value of
 * every `"<key>":"<value>"` pair found in `str`.
 *
 * @param {string} str - Text to scan, e.g. the HTML source of a page.
 * @param {string} key - Property name to look for; it is matched literally.
 * @returns {string[]} The captured values in order of appearance, or an empty
 *   array when the key does not occur.
 */
function getValueListByReg(str, key) {
    // Escape regex metacharacters so the key is matched as plain text.
    const escapedKey = String(key).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // `(.*?)` is lazy: it captures as little as possible, i.e. everything up
    // to the next double quote.
    const reg = new RegExp(`"${escapedKey}":"(.*?)"`, 'g');

    const reslist = [];
    let found;
    // Read capture group 1 from each match directly; this avoids the
    // deprecated `RegExp.$1` static and the `null` that `String.match`
    // returns when nothing matches.
    while ((found = reg.exec(str)) !== null) {
        reslist.push(found[1]);
    }
    return reslist;
}
// Request the Baidu image-search page for `word` with browser-like headers,
// then print the image URLs and the cleaned-up source-page titles.
superagent
    .get(`https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=${encodeURIComponent(word)}`)
    // `.set()` also accepts an object and sets every field it contains.
    .set(headers_defalult)
    .end((err, res) => {
        if (err) {
            console.log(`访问失败- ${err}`);
            return;
        }

        const htmlText = res.text;
        const imagelist = getValueListByReg(htmlText, 'objURL');
        // Strip `<strong>`, `</strong>` and the JSON-escaped `<\/strong>`.
        // A global regex is required: `replace()` with a plain string only
        // removes the first occurrence, leaving tags behind in titles that
        // highlight the keyword more than once.
        const titlelist = getValueListByReg(htmlText, 'fromPageTitle')
            .map(item => item.replace(/<(?:\\?\/)?strong>/g, ''));

        console.log(imagelist);
        console.log(titlelist);
    });