1. 所需的 Maven 依赖（下方代码还引用了 Spring 的 CollectionUtils 和 Lombok，需确保项目中已引入相应依赖）
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.5</version>
</dependency>
2. 具体代码如下所示：
package com.example.demo;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.springframework.util.CollectionUtils;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
/**
 * Scrapes the article links from a CSDN blog list page once, then repeatedly
 * issues an HTTP GET against every article URL on a small fixed thread pool.
 *
 * <p>The program runs until the main thread is interrupted or the process is
 * killed; the worker pool is shut down when {@code main} exits.
 */
public class BlogVisits {

    /** Blog list page that is scraped for article links. */
    private static final String BLOG_LIST_URL = "https://blog.csdn.net/Lilayzzz?type=blog";

    /** Only links containing this fragment are treated as article detail pages. */
    private static final String ARTICLE_URL_FRAGMENT = "https://blog.csdn.net/Lilayzzz/article/details";

    /** User-Agent header sent when fetching the blog list page. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)";

    /** Number of worker threads issuing requests concurrently. */
    private static final int WORKER_THREADS = 5;

    /** Pause before each round of requests, in milliseconds. */
    private static final long ROUND_INTERVAL_MILLIS = 5000L;

    /**
     * Entry point: collects the article URLs once and then requests each of
     * them every {@link #ROUND_INTERVAL_MILLIS} milliseconds.
     *
     * @param args unused
     * @throws Exception if the blog list page cannot be fetched, or if the main
     *                   thread is interrupted while sleeping between rounds
     */
    public static void main(String[] args) throws Exception {
        List<String> blogDetailUrl = getBlogDetailUrl(BLOG_LIST_URL);
        if (CollectionUtils.isEmpty(blogDetailUrl)) {
            return;
        }
        ExecutorService fixedThreadPool = Executors.newFixedThreadPool(WORKER_THREADS);
        try {
            // The sleep and the dispatch loop must share one braced body:
            // without braces only the sleep repeats and no request is ever sent.
            while (!Thread.currentThread().isInterrupted()) {
                Thread.sleep(ROUND_INTERVAL_MILLIS);
                for (String url : blogDetailUrl) {
                    fixedThreadPool.execute(() -> {
                        try {
                            httpGet(url);
                        } catch (Exception e) {
                            // Best effort: one failed request must not stop the other tasks.
                            System.err.println("Request to " + url + " failed: " + e);
                        }
                    });
                }
            }
        } finally {
            // Non-daemon worker threads would otherwise keep the JVM alive.
            fixedThreadPool.shutdownNow();
        }
    }

    /**
     * Sends an HTTP GET request to the given URL.
     *
     * @param url the address to request
     * @return {@code true} if the response status code is 200, {@code false} otherwise
     * @throws Exception if the request cannot be executed or the resources cannot be closed
     */
    public static boolean httpGet(String url) throws Exception {
        // try-with-resources closes the response and the client even when execute() throws.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(new HttpGet(url))) {
            return response.getStatusLine().getStatusCode() == 200;
        }
    }

    /**
     * Fetches the page at {@code url} and extracts the distinct article detail links from it.
     *
     * <p>A link is kept when its absolute {@code href} contains
     * {@link #ARTICLE_URL_FRAGMENT}.
     *
     * @param url the blog list page to fetch
     * @return the distinct matching absolute URLs, in document order; possibly empty
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static List<String> getBlogDetailUrl(String url) throws Exception {
        // NOTE(review): the request is sent as a POST with form field query=Java,
        // as in the original code; confirm whether a plain GET was intended.
        Document doc = Jsoup.connect(url)
                .data("query", "Java")
                .userAgent(USER_AGENT)
                .post();
        // Elements.attr(name, value) is a setter, not a filter, so it must not be
        // used to narrow the selection. Every anchor under a <div> is taken here
        // and the URL check below does the actual narrowing.
        Elements links = doc.select("div a[href]");
        return links.stream()
                .map(link -> link.attr("abs:href"))
                .filter(href -> href.contains(ARTICLE_URL_FRAGMENT))
                .distinct()
                .collect(Collectors.toList());
    }
}
|