1. 爬取概述
通过 Java 爬取图片资源:请求图片搜索接口,解析返回结果获取图片 URL,并批量下载到本地。
2. 实现验证
HttpClientUtils
package com.zrj.unit.reptile;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class HttpClientUtils {
public static Map<String, List<String>> convertHeaders(Header[] headers) {
Map<String, List<String>> results = new HashMap<String, List<String>>();
for (Header header : headers) {
List<String> list = results.get( header.getName() );
if (list == null) {
list = new ArrayList<String>();
results.put( header.getName(), list );
}
list.add( header.getValue() );
}
return results;
}
public static String get(String url) {
return get( url, "UTF-8" );
}
public static String get(String url, String charset) {
HttpGet httpGet = new HttpGet( url );
return executeRequest( httpGet, charset );
}
public static String ajaxGet(String url) {
return ajaxGet( url, "UTF-8" );
}
public static String ajaxGet(String url, String charset) {
HttpGet httpGet = new HttpGet( url );
httpGet.setHeader( "X-Requested-With", "XMLHttpRequest" );
return executeRequest( httpGet, charset );
}
public static String ajaxGet(CloseableHttpClient httpclient, String url) {
HttpGet httpGet = new HttpGet( url );
httpGet.setHeader( "X-Requested-With", "XMLHttpRequest" );
return executeRequest( httpclient, httpGet, "UTF-8" );
}
public static String post(String url, Map<String, String> dataMap) {
return post( url, dataMap, "UTF-8" );
}
public static String post(String url, Map<String, String> dataMap, String charset) {
HttpPost httpPost = new HttpPost( url );
try {
if (dataMap != null) {
List<NameValuePair> nvps = new ArrayList<NameValuePair>();
for (Map.Entry<String, String> entry : dataMap.entrySet()) {
nvps.add( new BasicNameValuePair( entry.getKey(), entry.getValue() ) );
}
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity( nvps, charset );
formEntity.setContentEncoding( charset );
httpPost.setEntity( formEntity );
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return executeRequest( httpPost, charset );
}
public static String ajaxPost(String url, Map<String, String> dataMap) {
return ajaxPost( url, dataMap, "UTF-8" );
}
public static String ajaxPost(String url, Map<String, String> dataMap, String charset) {
HttpPost httpPost = new HttpPost( url );
httpPost.setHeader( "X-Requested-With", "XMLHttpRequest" );
try {
if (dataMap != null) {
List<NameValuePair> nvps = new ArrayList<NameValuePair>();
for (Map.Entry<String, String> entry : dataMap.entrySet()) {
nvps.add( new BasicNameValuePair( entry.getKey(), entry.getValue() ) );
}
UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity( nvps, charset );
formEntity.setContentEncoding( charset );
httpPost.setEntity( formEntity );
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
return executeRequest( httpPost, charset );
}
public static String ajaxPostJson(String url, String jsonString) {
return ajaxPostJson( url, jsonString, "UTF-8" );
}
public static String ajaxPostJson(String url, String jsonString, String charset) {
HttpPost httpPost = new HttpPost( url );
httpPost.setHeader( "X-Requested-With", "XMLHttpRequest" );
StringEntity stringEntity = new StringEntity( jsonString, charset );
stringEntity.setContentEncoding( charset );
stringEntity.setContentType( "application/json" );
httpPost.setEntity( stringEntity );
return executeRequest( httpPost, charset );
}
public static String executeRequest(HttpUriRequest httpRequest) {
return executeRequest( httpRequest, "UTF-8" );
}
public static String executeRequest(HttpUriRequest httpRequest, String charset) {
CloseableHttpClient httpclient;
if ("https".equals( httpRequest.getURI().getScheme() )) {
httpclient = createSSLInsecureClient();
} else {
httpclient = HttpClients.createDefault();
}
String result = "";
try {
try {
CloseableHttpResponse response = httpclient.execute( httpRequest );
HttpEntity entity = null;
try {
entity = response.getEntity();
result = EntityUtils.toString( entity, charset );
} finally {
EntityUtils.consume( entity );
response.close();
}
} finally {
httpclient.close();
}
} catch (IOException ex) {
ex.printStackTrace();
}
return result;
}
public static String executeRequest(CloseableHttpClient httpclient, HttpUriRequest httpRequest, String charset) {
String result = "";
try {
try {
CloseableHttpResponse response = httpclient.execute( httpRequest );
HttpEntity entity = null;
try {
entity = response.getEntity();
result = EntityUtils.toString( entity, charset );
} finally {
EntityUtils.consume( entity );
response.close();
}
} finally {
httpclient.close();
}
} catch (IOException ex) {
ex.printStackTrace();
}
return result;
}
public static CloseableHttpClient createSSLInsecureClient() {
try {
SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial( new TrustStrategy() {
@Override
public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
return true;
}
} ).build();
SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory( sslContext, new HostnameVerifier() {
@Override
public boolean verify(String hostname, SSLSession session) {
return true;
}
} );
return HttpClients.custom().setSSLSocketFactory( sslsf ).build();
} catch (GeneralSecurityException ex) {
throw new RuntimeException( ex );
}
}
}
PipelineImage
package com.zrj.unit.reptile;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Downloads images from a list of URLs into {@code <path>/<category>/} and keeps running
 * counts of successful and failed downloads.
 *
 * <p>The counters are atomic, so one instance can be shared by the worker threads started in
 * {@link #processSync(List, String)} and {@link #processSync2(List, String, int)}.
 */
public class PipelineImage {

    private static final String DEFAULT_PATH = "D:/pipeline/sougou";
    private static final String DEFAULT_EXTENSION = ".jpg";
    /** Connect and read timeout for each download, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;
    /** Upper bound on the URL-derived part of a generated file name. */
    private static final int MAX_BASE_NAME_LENGTH = 64;

    /** Extension used when the URL path does not end in a usable one. */
    private String extension = DEFAULT_EXTENSION;
    private String path = "";
    private final AtomicInteger suc = new AtomicInteger();
    private final AtomicInteger fails = new AtomicInteger();

    /** Creates a pipeline that stores files under {@code D:/pipeline/sougou}. */
    public PipelineImage() {
        this(DEFAULT_PATH);
    }

    /**
     * Creates a pipeline that stores files under the given root directory.
     *
     * @param path root directory for downloaded files
     */
    public PipelineImage(String path) {
        setPath(path);
    }

    /**
     * Creates a pipeline with a custom root directory and fallback extension.
     *
     * @param path root directory for downloaded files
     * @param extension extension (including the dot) used when a URL has none;
     *     {@code null} keeps the default {@code .jpg}
     */
    public PipelineImage(String path, String extension) {
        setPath(path);
        if (extension != null) {
            this.extension = extension;
        }
    }

    /**
     * Changes the root directory for downloaded files.
     *
     * @param path new root directory
     */
    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Downloads one image to {@code <path>/<cate>/<name><ext>}.
     *
     * <p>Hyphens are removed from the file name. If the target file already exists the download
     * is skipped. A partially written file is deleted when the transfer fails, so a later run
     * does not mistake it for a finished download.
     *
     * @param url image URL
     * @param cate sub-directory (category) name
     * @param name file name without extension
     * @throws Exception if the URL is malformed or the transfer fails
     */
    private void downloadImg(String url, String cate, String name) throws Exception {
        String dirPath = this.path + "/" + cate + "/";
        File dir = new File(dirPath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        URL source = new URL(url);
        String fileName = (name + resolveExtension(source.getPath())).replace("-", "");
        String filePath = dirPath + fileName;
        File img = new File(filePath);
        if (img.exists()) {
            System.out.println(String.format("文件%s已存在本地目录", fileName));
            return;
        }
        URLConnection con = source.openConnection();
        con.setConnectTimeout(TIMEOUT_MILLIS);
        con.setReadTimeout(TIMEOUT_MILLIS);
        boolean complete = false;
        // Both streams are closed before the finally block runs, so the delete below
        // never races with an open handle.
        try (InputStream inputStream = con.getInputStream();
                FileOutputStream os = new FileOutputStream(img)) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = inputStream.read(buffer)) != -1) {
                os.write(buffer, 0, len);
            }
            complete = true;
        } finally {
            if (!complete) {
                img.delete();
            }
        }
        System.out.println("filePath: " + filePath);
        System.out.println("picUrl: " + url);
        System.out.println(String.format("正在下载第%s张图片", suc.incrementAndGet()));
    }

    /**
     * Downloads every non-null URL sequentially on the calling thread and prints a summary.
     * File names are derived from each URL, so sub-lists processed in parallel do not collide.
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category name, used as the sub-directory
     */
    public void process(List<String> data, String word) {
        long start = System.currentTimeMillis();
        for (String picUrl : data) {
            if (picUrl == null) {
                continue;
            }
            tryDownload(picUrl, word, baseNameFor(picUrl));
        }
        System.out.println("下载成功: " + suc.get());
        System.out.println("下载失败: " + fails.get());
        long end = System.currentTimeMillis();
        System.out.println("耗时:" + (end - start) / 1000 + "秒");
    }

    /**
     * Downloads every non-null URL on its own pooled thread, waits up to 60 seconds for the
     * downloads to finish and prints a summary. Files are named after the zero-padded list
     * index ({@code 0000}, {@code 0001}, ...).
     *
     * @param data image URLs; {@code null} entries are skipped
     * @param word category name, used as the sub-directory
     */
    public void processSync(List<String> data, String word) {
        long start = System.currentTimeMillis();
        ExecutorService executorService = Executors.newCachedThreadPool();
        for (int i = 0; i < data.size(); i++) {
            final String picUrl = data.get(i);
            if (picUrl == null) {
                continue;
            }
            final String name = indexName(i);
            executorService.execute(() -> tryDownload(picUrl, word, name));
        }
        executorService.shutdown();
        try {
            if (executorService.awaitTermination(60, TimeUnit.SECONDS)) {
                System.out.println("AwaitTermination Finished");
            } else {
                System.out.println("AwaitTermination Timed Out");
            }
            System.out.println("下载成功: " + suc);
            System.out.println("下载失败: " + fails);
            // list() returns null when the directory was never created (nothing downloaded).
            String[] entries = new File(this.path + "/" + word + "/").list();
            int len = entries == null ? 0 : entries.length;
            System.out.println("当前共有文件: " + len);
            long end = System.currentTimeMillis();
            System.out.println("耗时:" + (end - start) / 1000.0 + "秒");
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Splits the URL list into {@code threadNum} contiguous chunks and runs
     * {@link #process(List, String)} on each chunk in a pooled thread. The last chunk also
     * receives the remainder. This method returns without waiting for the workers.
     *
     * <p>When {@code threadNum} is not positive, or the list has fewer entries than
     * {@code threadNum}, the list is processed sequentially on the calling thread instead.
     *
     * @param data image URLs
     * @param word category name, used as the sub-directory
     * @param threadNum number of chunks / worker tasks
     */
    public void processSync2(List<String> data, final String word, int threadNum) {
        if (threadNum <= 0 || data.size() < threadNum) {
            process(data, word);
            return;
        }
        ExecutorService executorService = Executors.newCachedThreadPool();
        int chunkSize = data.size() / threadNum;
        for (int i = 0; i < threadNum; i++) {
            int from = i * chunkSize;
            int to = (i == threadNum - 1) ? data.size() : (i + 1) * chunkSize;
            final List<String> chunk = data.subList(from, to);
            executorService.execute(() -> process(chunk, word));
        }
        executorService.shutdown();
    }

    /** Runs one download, counting and reporting a failure instead of propagating it. */
    private void tryDownload(String url, String cate, String name) {
        try {
            downloadImg(url, cate, name);
        } catch (Exception e) {
            fails.incrementAndGet();
            System.err.println("download failed: " + url + " (" + e + ")");
        }
    }

    /**
     * Returns the extension (with dot) of the last segment of {@code urlPath} when it is a
     * short alphanumeric suffix, otherwise the configured fallback extension.
     */
    private String resolveExtension(String urlPath) {
        String segment = urlPath.substring(urlPath.lastIndexOf('/') + 1);
        int dot = segment.lastIndexOf('.');
        if (dot >= 0) {
            String candidate = segment.substring(dot);
            if (candidate.matches("\\.[A-Za-z0-9]{1,5}")) {
                return candidate;
            }
        }
        return extension;
    }

    /**
     * Builds a file-system-safe base name from a URL: the sanitized last path segment without
     * its extension, followed by a hash of the whole URL to keep names distinct.
     */
    private static String baseNameFor(String url) {
        int cut = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            cut = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < cut) {
            cut = fragment;
        }
        String withoutQuery = url.substring(0, cut);
        String segment = withoutQuery.substring(withoutQuery.lastIndexOf('/') + 1);
        int dot = segment.lastIndexOf('.');
        if (dot > 0) {
            segment = segment.substring(0, dot);
        }
        String safe = segment.replaceAll("[^A-Za-z0-9_]", "_");
        if (safe.length() > MAX_BASE_NAME_LENGTH) {
            safe = safe.substring(0, MAX_BASE_NAME_LENGTH);
        }
        if (safe.isEmpty()) {
            safe = "image";
        }
        return safe + "_" + Integer.toHexString(url.hashCode());
    }

    /** Left-pads a list index with zeros to at least four digits. */
    private static String indexName(int index) {
        StringBuilder name = new StringBuilder(Integer.toString(index));
        while (name.length() < 4) {
            name.insert(0, '0');
        }
        return name.toString();
    }
}
ReptileProcessor
package com.zrj.unit.reptile;
import com.alibaba.fastjson.JSONObject;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
/**
 * Fetches pages of image search results, collects the {@code picUrl} of every returned item
 * and hands the collected URLs to a {@link PipelineImage} for download.
 */
public class ReptileProcessor {

    /** URL template with three {@code %s} placeholders: start index, page size, query word. */
    private final String url;
    private final PipelineImage pipeline;
    /** Every result item fetched so far. */
    private final List<JSONObject> dataList;
    /** The {@code picUrl} value of every fetched item, in fetch order. */
    private final List<String> urlList;
    private final String word;

    /**
     * @param url URL template with {@code %s} placeholders for start index, page size and
     *     query word, in that order
     * @param word search keyword; it is URL-encoded before being inserted into the template
     */
    public ReptileProcessor(String url, String word) {
        this.url = url;
        this.word = word;
        this.pipeline = new PipelineImage();
        this.dataList = new ArrayList<>();
        this.urlList = new ArrayList<>();
    }

    /** Fetches two pages of five results each for a fixed keyword and downloads the images. */
    public static void main(String[] args) {
        String url = "https://pic.sogou.com/napi/pc/searchList?mode=1&start=%s&xml_len=%s&query=%s";
        ReptileProcessor processor = new ReptileProcessor(url, "美女");
        int start = 0, size = 5, limit = 10;
        for (int i = start; i < start + limit; i += size) {
            processor.process(i, size);
        }
        processor.pipelineData();
    }

    /**
     * Fetches one page of results and appends its items and picture URLs to the collected
     * lists. A page with an empty body, or without a {@code data.items} array, is reported on
     * standard error and skipped. A body that is not valid JSON still fails in the parser.
     *
     * @param idx start index of the page
     * @param size number of results requested
     */
    public void process(int idx, int size) {
        String res = HttpClientUtils.get(String.format(this.url, idx, size, encodeQuery(this.word)));
        if (res == null || res.isEmpty()) {
            // HttpClientUtils.get returns "" when the request fails.
            System.err.println("empty response for start=" + idx + ", size=" + size);
            return;
        }
        JSONObject object = JSONObject.parseObject(res);
        JSONObject data = object == null ? null : object.getJSONObject("data");
        List<Object> items = data == null ? null : data.getJSONArray("items");
        if (items == null) {
            System.err.println("no data.items in response for start=" + idx + ", size=" + size);
            return;
        }
        for (Object raw : items) {
            if (!(raw instanceof JSONObject)) {
                continue;
            }
            JSONObject item = (JSONObject) raw;
            this.urlList.add(item.getString("picUrl"));
            this.dataList.add(item);
        }
    }

    /** Downloads every collected picture URL into the pipeline's directory for this keyword. */
    public void pipelineData() {
        pipeline.processSync(this.urlList, this.word);
    }

    /**
     * URL-encodes the keyword as UTF-8 so non-ASCII text, spaces and reserved characters
     * survive as a query parameter.
     */
    private static String encodeQuery(String value) {
        try {
            return URLEncoder.encode(value, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException(e);
        }
    }
}
|