java获取百度图片学习记录
主要用到的jar包
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
分析百度加载url,每次请求加载30张图片,主要由参数pn控制,gsm为pn的16进制,最后是时间戳,搜索内容由queryWord控制
https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7312553249612451609&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E7%BE%8E%E6%99%AF&queryWord=%E7%BE%8E%E6%99%AF&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn=30&rn=30&gsm=1e&1646700092850=
可以写一个实体类用来设置url
/**
 * Builds the request URL for one page of Baidu image-search JSON results
 * (the {@code /search/acjson} endpoint).
 *
 * <p>The search keyword is URL-encoded once at construction time. The offset
 * {@code pn} and its hexadecimal twin {@code gsm} are always updated together.
 * {@link #toString()} renders the complete URL, ending with the current time in
 * milliseconds as a trailing, value-less parameter.
 */
public class PageUrl {
    /** URL-encoded (UTF-8) search keyword, sent as both {@code queryWord} and {@code word}. */
    private final String queryWord;
    /** Result offset, sent as the {@code pn} parameter. */
    private int pn;
    /** Hexadecimal rendering of {@link #pn}, sent as the {@code gsm} parameter. */
    private String gsm;

    /**
     * @param queryWord raw (un-encoded) search keyword
     * @param pn        initial result offset
     * @throws IllegalStateException if the UTF-8 encoding is unavailable
     */
    public PageUrl(String queryWord, int pn) {
        try {
            this.queryWord = URLEncoder.encode(queryWord, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Fail loudly instead of leaving queryWord null, which would put the
            // literal text "null" into every generated URL.
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
        this.pn = pn;
        this.gsm = Integer.toHexString(pn);
    }

    /**
     * Moves to another result offset; {@code gsm} is recomputed so the two stay in sync.
     *
     * @param pn new result offset
     */
    public void setPn(int pn) {
        this.pn = pn;
        this.gsm = Integer.toHexString(pn);
    }

    /**
     * Renders the full request URL for the current offset.
     *
     * @return the URL, with the current time in milliseconds appended as the last parameter
     */
    @Override
    public String toString() {
        // The '?' after "acjson" starts the query string; without it every
        // parameter would be read as part of the path.
        return "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord="
                + queryWord + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word="
                + queryWord + "&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn="
                + pn + "&rn=30&gsm=" + gsm + "&" + System.currentTimeMillis() + "=";
    }
}
编写实现类
public class PicSpider {
public void crawlePicture(String queryWord, int page) throws Exception {
if (page < 1) {
throw new Exception("page set error.");
}
PageUrl pageUrl = new PageUrl(queryWord, 30); /
for (int i = 1; i <= page; i++) {
pageUrl.setPn(i * 30);
getJson(pageUrl.toString());
}
}
public String getJson(String url) {
String json = null;
try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
HttpGet getMethod = new HttpGet(url);
setHeaders(getMethod);
try (CloseableHttpResponse response = httpClient.execute(getMethod)) {
int statusCode = response.getStatusLine().getStatusCode();
if (statusCode == HttpStatus.SC_OK) {
HttpEntity entity = response.getEntity();
if (entity != null) {
json = EntityUtils.toString(entity, "UTF-8");
resolveJson(json);
}
} else {
throw new IOException("请求失败:" + statusCode);
}
}
} catch (IOException e) {
e.printStackTrace();
}
return json;
}
public List<String> resolveJson(String json) {
// 使用正则表达式,进行匹配,获取 objURL
String regx = "\"thumbURL\":\"(.*?)\",";
Pattern p = Pattern.compile(regx);
Matcher m = p.matcher(json);
List<String> strs = new LinkedList<>();
while (m.find()) {
strs.add(m.group(0));
}
// 使用 Stream API 进行处理并返回。
return strs.stream().map(s -> s.substring(12, s.length() - 2)).collect(Collectors.toList());
}
public void download(List<String> urlList) {
// 用于统计一些数据
AtomicInteger successCount = new AtomicInteger(0), failCount = new AtomicInteger(0),
exceptionCount = new AtomicInteger(0);
// 设置超时时间
RequestConfig config = RequestConfig.custom().setSocketTimeout(10 * 1000).setConnectTimeout(10 * 1000)
.setConnectionRequestTimeout(10 * 1000).setRedirectsEnabled(false) // 不允许自动重定向,否则会把html页面当成图片下载下来
.build();
try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
urlList.forEach(url -> {
HttpGet getMethod = new HttpGet(url);
try (CloseableHttpResponse response = httpClient.execute(getMethod)) {
int statusCode = response.getStatusLine().getStatusCode();
if (statusCode == HttpStatus.SC_OK) {
HttpEntity entity = response.getEntity();
if (entity != null) {
String filename = this.getFileName(url);
File file = new File("F:/baiduImage/" + filename);
if (!file.exists()) {
file.getParentFile().mkdirs();
}
try (OutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
entity.writeTo(out);
successCount.getAndIncrement();
System.out.println(statusCode + " success: " + url + "\n" + filename);
}
}
} else {
failCount.getAndIncrement();
System.out.println(statusCode + " fail: " + url);
}
} catch (IOException e) {
e.printStackTrace();
exceptionCount.getAndIncrement();
System.out.println("IOException: " + url);
}
});
} catch (IOException e1) {
e1.printStackTrace();
}
System.out.println("statistic data[ " + "Success: " + successCount.get() + "\n" + "Fail: " + failCount.get()
+ "\n" + "Exception: " + exceptionCount.get() + " ]");
}
private String getFileName(String url) {
String suffix = url.substring(url.lastIndexOf("/") + 1);
if (suffix.contains("?")) {
suffix = suffix.split("[?]")[0]; // 这个 ? ,不能直接使用,必须转义一下
}
// 后缀默认就是 jpeg
suffix = -1 != suffix.lastIndexOf(".") ? suffix.substring(suffix.lastIndexOf(".")) : ".jpeg";
return UUID.randomUUID().toString() + suffix;
}
public static void setHeaders(HttpGet get) {
get.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
get.setHeader("Accept-Encoding", "gzip, deflate, br");
get.setHeader("Cache-Control", "max-age=0");
get.setHeader("Connection", "keep-alive");
get.setHeader("Cookie","自己登录的cookie信息");// **自己登录的cookie信息**
get.setHeader("Host", "image.baidu.com");
get.setHeader("Upgrade-Insecure-Requests", "1");
get.setHeader("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
}
}
编写main方法获取图片
/**
 * Entry point: collects thumbnail URLs for a fixed keyword over five result
 * pages and downloads them.
 *
 * @param args unused
 * @throws Exception if crawling fails
 */
public static void main(String[] args) throws Exception {
    PicSpider spider = new PicSpider();
    List<String> urls = spider.crawlePicture("高清美景", 5);
    // download is an instance method, so it must be called on the spider;
    // a bare download(urls) does not compile from this static context.
    spider.download(urls);
}
跑起来在F:/baiduImage/文件夹可以找到下载的图片; 仅供学习使用
|