网络爬虫是大数据时代收集数据的一种有效手段;合理合法地运用技术手段获取网络数据、实现数据的再利用,也是程序员的一项重要技能。本节我们使用 Java 环境下的 jsoup 实现网络数据的爬取(主要是图片数据的异步爬取),并实现网络图片的下载及预览功能,其中预览功能使用功能丰富的 v-viewer 实现。
<!-- Apache HttpClient. It is not referenced by the Java code shown in this article; -->
<!-- presumably it is used by the image download helper (MzituImageSpider) - TODO confirm. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<!-- jsoup: HTML fetching/parsing library used by SpiderDataServiceImpl to crawl the pages. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.2</version>
</dependency>
package com.yundi.atp.platform.module.test.controller;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.yundi.atp.platform.common.Result;
import com.yundi.atp.platform.module.test.entity.SpiderData;
import com.yundi.atp.platform.module.test.service.SpiderDataService;
import io.swagger.annotations.Api;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
import sun.misc.BASE64Encoder;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.util.Base64;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ThreadPoolExecutor;
/**
 * REST endpoints of the image-crawler demo: start a crawl, page through the
 * crawled records, download images and preview a downloaded image.
 *
 * @author yanp
 * @since 2021-09-10
 */
@Api(tags = {"爬虫抓取案例API"})
@RestController
@RequestMapping("/test/spiderData")
public class SpiderDataController {
    @Autowired
    private SpiderDataService spiderDataService;

    @Autowired
    private ThreadPoolExecutor threadPoolExecutor;

    /**
     * Submits the crawl to the injected thread pool and returns immediately.
     * The response therefore only confirms that the crawl was scheduled, not
     * that it has finished.
     *
     * @return a success result
     */
    @GetMapping(value = "startSpiderData")
    public Result startSpiderData() {
        CompletableFuture.runAsync(() -> {
            spiderDataService.startSpiderData();
        }, threadPoolExecutor);
        return Result.success();
    }

    /**
     * Returns one page of crawled records, optionally filtered by a
     * "contains" match on the source image name.
     *
     * @param spiderData carries the paging parameters and the optional name filter
     * @return a success result wrapping the page
     */
    @PostMapping(value = "listPage")
    public Result listPage(@RequestBody SpiderData spiderData) {
        Page page = spiderDataService.page(spiderData.getPage(), new QueryWrapper<SpiderData>().like(StringUtils.isNotBlank(spiderData.getSrcImageName()), "src_image_name", spiderData.getSrcImageName()));
        return Result.success(page);
    }

    /**
     * Downloads the image of a single record.
     *
     * @param id record id
     * @return a success result
     */
    @GetMapping(value = "download/{id}")
    public Result download(@PathVariable(value = "id") String id) {
        spiderDataService.download(id);
        return Result.success();
    }

    /**
     * Triggers the service's batch download.
     *
     * @return a success result
     */
    @GetMapping(value = "batchDownload")
    public Result batchDownload() {
        spiderDataService.batchDownload();
        return Result.success();
    }

    /**
     * Reads the stored image of a record and returns it Base64-encoded.
     *
     * @param id record id
     * @return a success result whose payload is the Base64 text of the image file
     * @throws IOException if the record does not exist, has no stored file,
     *                     or the file cannot be read
     */
    @GetMapping(value = "preview/{id}")
    public Result preview(@PathVariable(value = "id") String id) throws IOException {
        SpiderData spiderData = spiderDataService.getById(id);
        if (spiderData == null || StringUtils.isBlank(spiderData.getStoreAddress())) {
            // Fail with a descriptive error instead of a bare NullPointerException.
            throw new IOException("No downloaded image is stored for id " + id);
        }
        File file = new File(spiderData.getStoreAddress());
        // readAllBytes reads the complete file and always releases the handle;
        // available() followed by one read() guarantees neither.
        byte[] bytes = Files.readAllBytes(file.toPath());
        // java.util.Base64 is the supported replacement for the JDK-internal
        // sun.misc.BASE64Encoder and emits no line breaks, which suits the
        // data URI the client builds from this payload.
        return Result.success(Base64.getEncoder().encodeToString(bytes));
    }
}
package com.yundi.atp.platform.module.test.service.impl;
import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import com.yundi.atp.platform.module.test.entity.SpiderData;
import com.yundi.atp.platform.module.test.mapper.SpiderDataMapper;
import com.yundi.atp.platform.module.test.service.SpiderDataService;
import com.yundi.atp.platform.spider.MzituImageSpider;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.List;
/**
* <p>
* 服务实现类
* </p>
*
* @author yanp
* @since 2021-09-10
*/
@Slf4j
@Service
public class SpiderDataServiceImpl extends ServiceImpl<SpiderDataMapper, SpiderData> implements SpiderDataService {
@Override
public void startSpiderData() {
try {
//1.创建连接
String url = "https://www.mzitu.com";
Connection conn = Jsoup.connect(url).timeout(50000);
conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
conn.header("referer", url);
//2.获取栏目图片连接
Document document = conn.get();
Element element = document.getElementById("menu-nav");
Elements elements = element.getElementsByTag("a");
List<String> linkList = elements.eachAttr("abs:href");
linkList.remove(0);
linkList.forEach(it -> {
Connection connect = Jsoup.connect(it).timeout(50000);
connect.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
connect.header("referer", it);
try {
Document doc = connect.get();
//3.获取栏目图片最大分页页数
Elements docElements = doc.getElementsByClass("nav-links");
Element docElement = docElements.get(0);
Elements tag = docElement.getElementsByTag("a");
List<String> tagValues = tag.eachText();
tagValues.remove(tagValues.size() - 1);
Integer maxPage = Integer.parseInt(tagValues.get(tagValues.size() - 1));
log.info("--------------------开始解析下载图片---------------------------");
String imageUrl = it.substring(0, it.length() - 1);
String imageCategory = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
//4.开始爬取图片
for (int i = 1; i <= maxPage; i++) {
String imgUrl = it + "page/" + i + "/";
Connection imgConn = Jsoup.connect(imgUrl).timeout(50000);
imgConn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36");
imgConn.header("referer", imgUrl);
try {
Document imgDocument = imgConn.get();
Element pins = imgDocument.getElementById("pins");
Elements imgTag = pins.getElementsByTag("img");
for (Element ele : imgTag) {
SpiderData spiderData = new SpiderData();
spiderData.setSrcLink(imgUrl);
spiderData.setSrcImageName(ele.attr("alt"));
spiderData.setSrcImageAddress(ele.attr("data-original"));
spiderData.setSrcImageCategory(imageCategory);
this.save(spiderData);
}
} catch (IOException ioException) {
log.error("获取资源失败:" + ioException);
}
Thread.sleep(5000);
}
} catch (IOException | InterruptedException ioException) {
log.error("获取资源失败:" + ioException);
}
});
} catch (IOException ioException) {
log.error("获取资源失败:" + ioException);
}
}
@Override
public void download(String id) {
SpiderData spiderData = this.getById(id);
String path = MzituImageSpider.downloadImage(spiderData.getSrcImageAddress());
if (path != null) {
spiderData.setStoreAddress(path);
spiderData.setStatus(true);
this.updateById(spiderData);
}
}
@Override
public void batchDownload() {
List<SpiderData> list = this.list(new QueryWrapper<SpiderData>().eq("status", 0));
for (SpiderData spiderData : list) {
String path = MzituImageSpider.downloadImage(spiderData.getSrcImageAddress());
if (path != null) {
spiderData.setStoreAddress(path);
spiderData.setStatus(true);
this.updateById(spiderData);
}
}
}
}
命令:npm i v-viewer -S
- 爬虫案例前端 main.js 中引入 v-viewer
import Vue from 'vue';
import ElementUI from 'element-ui';
import 'element-ui/lib/theme-chalk/index.css';
import router from '@/router';
import {http} from '@/axios/index';
import qs from 'qs';
import '@/util/derective'
import App from '@/App.vue';
import Print from 'vue-print-nb'
import VideoPlayer from 'vue-video-player'
require('video.js/dist/video-js.css')
require('vue-video-player/src/custom-theme.css')
import VueQuillEditor from 'vue-quill-editor'
// require styles
import 'quill/dist/quill.core.css'
import 'quill/dist/quill.snow.css'
import 'quill/dist/quill.bubble.css'
import mavonEditor from 'mavon-editor'
import 'mavon-editor/dist/css/index.css'
import TinymceVueH from 'tinymce-vue-h'
import Viewer from 'v-viewer'
import 'viewerjs/dist/viewer.css'
// Register the v-viewer plugin globally.
Vue.use(Viewer);
// Set global viewer defaults.
// NOTE(review): viewer options such as "inline" or "url" are normally passed at the
// top level of setDefaults(); confirm that nesting them under an `Options` key is
// honoured by the installed v-viewer version before relying on any of these values.
Viewer.setDefaults({
Options: { "inline": true, "button": true, "navbar": true, "title": true, "toolbar": true, "tooltip": true, "movable": true, "zoomable": true, "rotatable": true, "scalable": true, "transition": true, "fullscreen": true, "keyboard": true, "url": "data-source" }
});
// Register the remaining UI plugins: editors, video player, print support and Element UI.
Vue.use(TinymceVueH);
// use
Vue.use(mavonEditor);
Vue.use(VueQuillEditor, /* { default global options } */);
Vue.use(VideoPlayer);
Vue.use(Print);
Vue.use(ElementUI);
// Expose the HTTP client and the qs helper on every component instance.
Vue.prototype.$http = http;
Vue.prototype.$qs = qs;
// Turn off Vue's production tip on startup.
Vue.config.productionTip = false;
// Create the root instance with the router and mount it on #app.
new Vue({
router,
render: h => h(App),
}).$mount('#app')
<template>
  <div class="container">
    <!-- Page header with a link back to the home route -->
    <div class="title">
      <span>爬虫案例(以https://www.mzitu.com网站为例)</span>
      <el-divider direction="vertical"></el-divider>
      <router-link to="home">
        <span style="font-size: 18px;">退出</span>
      </router-link>
    </div>
    <el-divider>Test Staring</el-divider>
    <!-- Toolbar: name filter, start crawl, search, batch download -->
    <el-form :inline="true" :model="query">
      <el-form-item>
        <el-input v-model="query.srcImageName" placeholder="源图片名称" clearable></el-input>
      </el-form-item>
      <el-form-item>
        <el-button type="primary" @click="startSpider">启动爬虫</el-button>
      </el-form-item>
      <el-form-item>
        <el-button type="success" @click="search">查询</el-button>
      </el-form-item>
      <el-form-item>
        <el-button type="success" @click="batchDownload">批量下载</el-button>
      </el-form-item>
    </el-form>
    <!-- Crawled records -->
    <el-table
      :data="data"
      border
      stripe
      v-loading="loading"
      element-loading-text="数据加载中...">
      <el-table-column
        prop="id"
        label="ID">
      </el-table-column>
      <el-table-column
        prop="srcLink"
        label="源网址">
      </el-table-column>
      <el-table-column
        prop="srcImageName"
        label="源图片名称">
      </el-table-column>
      <el-table-column
        prop="srcImageAddress"
        label="源图片地址">
      </el-table-column>
      <el-table-column
        prop="srcImageCategory"
        label="源图片分类">
      </el-table-column>
      <el-table-column
        label="状态">
        <template slot-scope="scope">
          <el-tag size="medium" v-show="scope.row.status" type="success" effect="dark">已下载</el-tag>
          <el-tag size="medium" v-show="!scope.row.status" type="warning" effect="dark">未下载</el-tag>
        </template>
      </el-table-column>
      <el-table-column
        prop="storeAddress"
        label="存储地址">
      </el-table-column>
      <!-- Rows without a stored file offer "download"; rows with one offer "preview" -->
      <el-table-column label="操作" align="left">
        <template slot-scope="scope">
          <el-button type="text" @click="download(scope.row)" v-if="!scope.row.storeAddress">下载</el-button>
          <el-button type="text" @click="preview(scope.row)" v-if="scope.row.storeAddress">预览</el-button>
        </template>
      </el-table-column>
    </el-table>
    <!-- page-sizes must read query.page.pageSizes, the name declared in data();
         query.page.sizes does not exist, so the configured sizes were ignored -->
    <el-pagination layout="total,sizes,prev,pager,next,jumper"
                   @size-change="handlerSizeChange"
                   @current-change="handlerCurrentChange"
                   :current-page="query.page.current"
                   :page-sizes="query.page.pageSizes"
                   :page-size="query.page.size"
                   :total="query.page.total"
                   class="page"
                   background>
    </el-pagination>
    <!-- Preview dialog: the image is wrapped in v-viewer -->
    <el-dialog
      :visible.sync="show">
      <div style="text-align: center;">
        <viewer>
          <img :src="imgSrc">
        </viewer>
      </div>
    </el-dialog>
  </div>
</template>
<script>
// Crawler demo page: lists crawled image records and lets the user start a
// crawl, download images (single or batch) and preview a downloaded image.
export default {
    name: "Spider",
    data() {
        return {
            // Rows of the current page.
            data: [],
            // Drives the table's v-loading overlay while a page is being fetched.
            loading: false,
            // Request body of the list endpoint: name filter plus paging state.
            query: {
                // Declared up front so the filter is reactive from the start.
                srcImageName: '',
                page: {
                    total: 0,
                    current: 1,
                    size: 10,
                    pageSizes: [10, 50, 100, 500],
                }
            },
            // Preview dialog visibility and the image shown in it.
            show: false,
            imgSrc: '',
        }
    },
    created() {
        this.listPage();
    },
    methods: {
        // Shows a readable error toast. $message.error expects a string (or an
        // options object), so the text is extracted from Error instances.
        showError(error) {
            this.$message.error(error && error.message ? error.message : String(error));
        },
        // Asks the backend to start the crawl.
        startSpider() {
            this.$http.get('/test/spiderData/startSpiderData').then(res => {
                if (res.data.code === 1) {
                    // The endpoint only schedules the crawl and returns at once,
                    // so report "started" rather than "finished".
                    this.$message.success("爬虫已启动,正在后台抓取数据!");
                    this.search();
                } else {
                    this.$message.warning(res.data.msg);
                }
            }).catch(error => {
                this.showError(error);
            });
        },
        // Runs the query from the first page.
        search() {
            this.query.page.current = 1;
            this.listPage();
        },
        // Loads the current page into the table.
        listPage() {
            this.loading = true;
            this.$http.post('/test/spiderData/listPage', this.query).then(res => {
                this.loading = false;
                if (res.data.code === 1) {
                    this.data = res.data.data.records;
                    this.query.page.total = res.data.data.total;
                } else {
                    this.$message.warning(res.data.msg);
                }
            }).catch(error => {
                this.loading = false;
                this.showError(error);
            });
        },
        // Downloads the image of one row, then refreshes the list.
        download(data) {
            this.$http.get('/test/spiderData/download/' + data.id).then(res => {
                if (res.data.code === 1) {
                    this.$message.success("下载完成!");
                    this.search();
                } else {
                    this.$message.warning(res.data.msg);
                }
            }).catch(error => {
                this.showError(error);
            });
        },
        // Triggers the backend batch download, then refreshes the list.
        batchDownload() {
            this.$http.get('/test/spiderData/batchDownload').then(res => {
                if (res.data.code === 1) {
                    this.$message.success("下载完成!");
                    this.search();
                } else {
                    this.$message.warning(res.data.msg);
                }
            }).catch(error => {
                this.showError(error);
            });
        },
        // Fetches the Base64 payload of a row's image and opens the preview dialog.
        preview(data) {
            this.$http.get('/test/spiderData/preview/' + data.id).then(res => {
                if (res.data.code === 1) {
                    this.show = true;
                    this.imgSrc = "data:image/png;base64," + res.data.data;
                } else {
                    this.$message.warning(res.data.msg);
                }
            }).catch(error => {
                this.showError(error);
            });
        },
        // Pagination callback: page size changed.
        handlerSizeChange(data) {
            this.query.page.size = data;
            this.listPage();
        },
        // Pagination callback: current page changed.
        handlerCurrentChange(data) {
            this.query.page.current = data;
            this.listPage();
        }
    }
}
</script>
<style scoped lang="scss">
// Scoped styles for the crawler demo page.
.container {
  padding: 10px;

  // Page header text.
  .title {
    font-weight: bold;
    font-size: 20px;
  }

  // Pagination bar, aligned right below the table.
  .page {
    margin-top: 20px;
    float: right;
  }

  // Links inside the page carry no underline.
  a {
    text-decoration: none;
  }
}
</style>