一、 初始化项目
npm init
二、 安装需要的npm包
yarn add superagent cheerio
- superagent 模拟浏览器端发请求, 比如登录 https://www.npmjs.com/package/superagent
- cheerio 解析静态html https://www.npmjs.com/package/cheerio
三、 修改package.json script
- package.json
"scripts": {
"start": "node index.js"
},
- index.js
// Temporary entry point: prints a marker so that `npm run start` can be verified.
console.log('123')
- 运行一下试试
npm run start
四、引入需要的包, 简单访问一下百度测试一下
可以看到访问www.baidu.com的结果可以从res获取到, 而整个网站的html可以通过res.text获取.
// Load the HTTP client and the HTML parser (cheerio is not used yet in this step).
const superagent = require("superagent");
const cheerio = require("cheerio");

// Print either the failure reason or the raw HTML body of the response.
const onHomePageLoaded = (err, res) => {
  if (err) {
    console.log(`访问失败 - ${err}`);
    return;
  }
  console.log(res.text);
};

superagent.get('http://www.baidu.com/').end(onHomePageLoaded);
五、解析获取到的html
通过cheerio.load可以解析出咱们获取到的html, 然后操作各种元素.
比如咱们试一下拿到百度网站的meta标签内容:
const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');

superagent.get('http://www.baidu.com').end((err, res) => {
  if (err) {
    console.log(`访问失败 - ${err}`);
    return;
  }

  // Parse the downloaded markup so it can be queried with CSS selectors.
  const $ = cheerio.load(res.text);

  // Print the position and the `content` attribute of every <meta> element.
  $('meta').each((index, ele) => {
    console.log(index);
    console.log($(ele).attr('content'));
  });
});
六、抓取百度图片
新建 image.handler.js 文件, 专门处理图片逻辑.
- 检查url
百度搜索一下"哈哈", 看一下url有什么变化?
可以看到比较重要的就是下面的这些字段
https://image.baidu.com/search/index?tn=baiduimage&word=%B9%FE%B9%FE&ie=gbk
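tn=baiduimage 表示百度图片; word=%B9%FE%B9%FE 是"哈哈"按 GBK 编码后再做 URL 编码的结果; ie=gbk 指定关键词(word)使用的编码格式. 下面的代码改用 ie=utf-8, 并配合 encodeURIComponent(按 UTF-8 编码)来生成 word.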
- 检查DOM结构
我们要做的是下载图片, 那么现在有个问题, 我们如何拿到图片的Url?
右键查看网页源代码, 这里查看到的就是我们能直接通过superagent爬取到的内容.
查找可以发现, 图片的url是一个叫做objURL的字段, 所以我们待会可以通过正则来匹配到它们.
/"objURL":"(.*?)",/g
- 写代码 访问百度图片
const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');

const word = '哈哈';
// The keyword is percent-encoded by encodeURIComponent before being placed in the query string.
const searchUrl = `http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=${encodeURIComponent(word)}`;

superagent.get(searchUrl).end((err, res) => {
  if (err) {
    console.log(`访问失败 - ${err}`);
    return;
  }

  // Dump the raw page so its structure can be inspected.
  const htmlText = res.text;
  const $ = cheerio.load(htmlText);
  console.log(htmlText);
});
运行一下试试
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<title>百度安全验证</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0">
<meta name="format-detection" content="telephone=no, email=no">
<link rel="shortcut icon" href="https://www.baidu.com/favicon.ico" type="image/x-icon">
<link rel="icon" sizes="any" mask href="https://www.baidu.com/img/baidu.svg">
<meta http-equiv="X-UA-Compatible" content="IE=Edge">
<meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">
<link rel="stylesheet" href="https://ppui-static-wap.cdn.bcebos.com/static/touch/css/api/mkdjump_0635445.css" />
</head>
<body>
<div class="timeout hide">
<div class="timeout-img"></div>
<div class="timeout-title">网络不给力,请稍后重试</div>
<button type="button" class="timeout-button">返回首页</button>
</div>
<div class="timeout-feedback hide">
<div class="timeout-feedback-icon"></div>
<p class="timeout-feedback-title">问题反馈</p>
</div>
<script src="https://wappass.baidu.com/static/machine/js/api/mkd.js"></script>
<script src="https://ppui-static-wap.cdn.bcebos.com/static/touch/js/mkdjump_fbb9952.js"></script>
</body>
</html>
怎么好像不太对劲啊? 怎么出来一个百度安全验证? 盲猜是遇到了百度的反爬策略了.
怎么办? 最大程度模拟浏览器行为!! 咱们把request headers也补上试试.
- 添加请求头
去浏览器network面板, 把这些请求头的值都复制下来.
const superagent = require("superagent");
const cheerio = require("cheerio");
const fs = require('fs');

const word = '哈哈';

// Request headers copied from the browser's network panel so the request
// looks like one sent by a real browser.
const headers = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  'Accept-Encoding': 'gzip, deflate, br',
  'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
  'Cache-Control': 'no-cache',
  'Connection': 'keep-alive',
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
  'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"'
};

superagent
  .get(`http://image.baidu.com/search/index?tn=baiduimage&ie=utf-8&word=${encodeURIComponent(word)}`)
  // set() also accepts an object of header fields, so every header is applied
  // in one call instead of one call per field.
  .set(headers)
  .end((err, res) => {
    if (err) {
      console.log(`访问失败 - ${err}`);
    } else {
      const htmlText = res.text;
      const $ = cheerio.load(htmlText);
      console.log(htmlText);
    }
  });
发现可以了, 美滋滋!
- 获取imgurlList
const htmlText = res.text;
const $ = cheerio.load(htmlText);
// matchAll yields one match object per hit and nothing when there is no hit,
// so an empty page produces [] instead of a TypeError on null. The captured
// URL is read from group 1 rather than from the deprecated RegExp.$1 global.
const imageUrlList = Array.from(
  htmlText.matchAll(/"objURL":"(.*?)",/g),
  (match) => match[1]
);
console.log(imageUrlList);
- 获取图片的标题列表
// matchAll yields one match object per hit and nothing when there is no hit,
// so a page without titles produces [] instead of a TypeError on null. The
// captured title is read from group 1 rather than from the deprecated
// RegExp.$1 global.
const titleList = Array.from(
  htmlText.matchAll(/"fromPageTitle":"(.*?)",/g),
  (match) => match[1]
);
console.log(titleList);
- 提取公共函数
是不是觉得获取图片url和获取title的代码几乎一模一样, 咱们封装一下
注意这里要写动态的正则表达式了, 因为咱们要传入动态的key
/**
 * Collect every string value stored under `key` in JSON-like text.
 *
 * The text is scanned for `"key":"value"` pairs; the value is everything up to
 * the next double quote.
 *
 * @param {string} str - Text to scan, e.g. the HTML source of a result page.
 * @param {string} key - Property name to look for; it is matched literally.
 * @returns {string[]} Captured values in order of appearance; an empty array
 *   when the key never occurs.
 */
function getValueListByReg(str, key) {
  // Escape regex metacharacters so the key cannot alter the pattern.
  const escapedKey = String(key).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const reg = new RegExp(`"${escapedKey}":"(.*?)"`, 'g');
  // matchAll exposes the capture group of every hit directly. This avoids the
  // deprecated RegExp.$1 global and the TypeError that occurred when match()
  // returned null for text without any hit.
  return Array.from(str.matchAll(reg), (match) => match[1]);
}
// Raw HTML of the search result page returned by the request.
const htmlText = res.text;
// Parsed document (the extraction below works on the raw text, not on `$`).
const $ = cheerio.load(htmlText);
// Image URLs are stored under the "objURL" key in the page source.
const imageUrlList = getValueListByReg(htmlText, 'objURL')
console.log(imageUrlList);
// Image titles are stored under the "fromPageTitle" key.
const titleList = getValueListByReg(htmlText, 'fromPageTitle')
console.log(titleList);
- 去除标题中的冗余内容
可以看到, 咱们获取到的title里是包含html标签的, 咱们通过正则把它去掉.
// Strip every <strong> / </strong> tag from each title. The page source escapes
// the closing slash ("<\/strong>"), so the backslash is optional in the pattern.
// A global regex is required: replace() with a string pattern removes only the
// first occurrence, which left tags behind in titles with several highlights.
const titleList = getValueListByReg(htmlText, 'fromPageTitle')
  .map((item) => item.replace(/<\\?\/?strong>/g, ''));
console.log(titleList);
最终成型的简易代码
// HTTP client used to send the search request.
const superagent = require("superagent");
// HTML parser; required here but not referenced by the rest of this script.
const cheerio = require('cheerio');
// Start-up marker printed when the script runs.
console.log(1123);
// Search keyword sent to Baidu image search.
const word = '猫咪'
// Browser-like request headers sent with the search request.
const headers_defalult = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"'
}
/**
 * Collect every string value stored under `key` in JSON-like text.
 *
 * The text is scanned for `"key":"value"` pairs; the value is everything up to
 * the next double quote.
 *
 * @param {string} str - Text to scan, e.g. the HTML source of a result page.
 * @param {string} key - Property name to look for; it is matched literally.
 * @returns {string[]} Captured values in order of appearance; an empty array
 *   when the key never occurs.
 */
function getValueListByReg(str, key) {
  // Escape regex metacharacters so the key cannot alter the pattern.
  const escapedKey = String(key).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const reg = new RegExp(`"${escapedKey}":"(.*?)"`, 'g');
  // Reading group 1 from each matchAll result replaces the deprecated
  // RegExp.$1 global and returns [] (instead of throwing on null) when the
  // text contains no hit.
  return Array.from(str.matchAll(reg), (match) => match[1]);
}
// Query Baidu image search with browser-like headers, then print the image
// URLs and the cleaned-up titles found in the returned page source.
superagent.get(`https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=${encodeURIComponent(word)}`)
  // set() also accepts an object of header fields, so every header is applied
  // in one call instead of one call per field.
  .set(headers_defalult)
  .end((err, res) => {
    if (err) {
      console.log(`访问失败- ${err}`);
      return;
    }

    const htmlText = res.text;
    const imagelist = getValueListByReg(htmlText, 'objURL');
    // Strip every <strong> / </strong> tag; the page source escapes the closing
    // slash ("<\/strong>"), so the backslash is optional. A global regex is
    // needed because replace() with a string pattern removes only the first
    // occurrence.
    const titlelist = getValueListByReg(htmlText, 'fromPageTitle')
      .map((item) => item.replace(/<\\?\/?strong>/g, ''));
    console.log(imagelist);
    console.log(titlelist);
  });
|