集成 jsoup(在 pom.xml 中添加如下依赖)
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
解析 HTML 页面
/**
 * Scrapes product entries from the JD.com search result page using jsoup.
 */
@Component
public class HtmlParseUtil {

    /**
     * Fetches the JD search page for the given keywords and extracts one
     * {@link Content} (title, image URL, price) per product list item.
     *
     * @param keywords search keywords; must not be null. They are URL-encoded
     *                 before being appended to the query string.
     * @return the parsed products, in page order; empty when the page contains
     *         no {@code J_goodsList} container
     * @throws Exception if the page cannot be fetched or parsed
     */
    public List<Content> parseJd(String keywords) throws Exception {
        // Encode the keywords so that spaces, '&', '#' and non-ASCII text
        // cannot corrupt the query string.
        String url = "https://search.jd.com/Search?keyword="
                + java.net.URLEncoder.encode(keywords, "UTF-8");
        Document document = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows NT 5.1; zh-CN) AppleWebKit/535.12 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/535.12")
                .timeout(30000)
                .get();

        List<Content> contentList = new ArrayList<Content>();

        // getElementById returns null when the container is absent
        // (e.g. the site served a different page); treat that as "no results".
        Element goodsList = document.getElementById("J_goodsList");
        if (goodsList == null) {
            return contentList;
        }

        for (Element item : goodsList.getElementsByTag("li")) {
            // getElementsByTag also returns nested <li> elements that are not
            // product entries; those carry no price block and are skipped.
            Elements priceElements = item.getElementsByClass("p-price");
            if (priceElements.isEmpty()) {
                continue;
            }

            // Price text looks like "¥123.00"; keep the part after the
            // currency sign, or the raw text when no sign is present.
            String priceText = priceElements.eq(0).text();
            String[] priceParts = priceText.split("¥");
            String price = priceParts.length > 1 ? priceParts[1] : priceText;

            String img = item.getElementsByTag("img").attr("data-lazy-img");
            String title = item.getElementsByClass("p-name").eq(0).text();

            Content content = new Content();
            content.setTitle(title);
            content.setImg(img);
            content.setPrice(price);
            contentList.add(content);
        }
        return contentList;
    }
}
实体类
/**
 * Plain data holder for one scraped product entry.
 *
 * <p>Accessors, {@code equals}/{@code hashCode}/{@code toString} and both the
 * no-argument and all-argument constructors are generated by Lombok.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class Content {

    /** Product title text. */
    private String title;

    /** Product image URL. */
    private String img;

    /** Product price as text. */
    private String price;
}
controller层
/**
 * REST endpoints that delegate crawling and searching to {@link ContentService}.
 */
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    /**
     * Triggers a crawl for the given keyword.
     *
     * @param keyword the keyword taken from the request path
     * @return the value returned by {@code ContentService.parseContent}
     * @throws Exception if the service call fails
     */
    @GetMapping("/parse/{keyword}")
    public Boolean parse(@PathVariable("keyword") String keyword) throws Exception {
        return contentService.parseContent(keyword);
    }

    /**
     * Runs a paged search for the given keyword.
     *
     * @param keyword  the keyword taken from the request path
     * @param pageNo   the requested page number
     * @param pageSize the requested page size
     * @return the hits returned by {@code ContentService.searchContent}
     * @throws IOException if the service call fails
     */
    @GetMapping("/parse/{keyword}/{pageNo}/{pageSize}")
    public List<Map<String, Object>> search(
            @PathVariable("keyword") String keyword,
            @PathVariable("pageNo") int pageNo,
            @PathVariable("pageSize") int pageSize) throws IOException {
        return contentService.searchContent(keyword, pageNo, pageSize);
    }
}
业务层
/**
 * Business logic for crawling JD products into Elasticsearch and searching them.
 */
@Service
public class ContentService {

    /** Name of the Elasticsearch index that stores the crawled products. */
    private static final String INDEX_NAME = "jd_goods";

    @Autowired
    private RestHighLevelClient restHighLevelClient;

    /**
     * Crawls the JD search page for the given keywords and bulk-indexes every
     * parsed product into the {@code jd_goods} index.
     *
     * @param keywords search keywords passed to the HTML parser
     * @return {@code true} when all documents were indexed without failures;
     *         {@code false} when nothing was parsed or at least one bulk item failed
     * @throws Exception if crawling or the bulk request fails
     */
    public boolean parseContent(String keywords) throws Exception {
        List<Content> contentList = new HtmlParseUtil().parseJd(keywords);
        if (contentList.isEmpty()) {
            // An empty bulk request is rejected by request validation,
            // so report "nothing indexed" instead of sending it.
            return false;
        }

        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2m");
        for (Content content : contentList) {
            bulkRequest.add(
                    new IndexRequest(INDEX_NAME)
                            .source(JSON.toJSONString(content), XContentType.JSON));
        }

        BulkResponse bulkResponse = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);
        // hasFailures() is true when something went wrong; success is its negation.
        return !bulkResponse.hasFailures();
    }

    /**
     * Searches the {@code jd_goods} index for documents whose {@code title}
     * matches the keyword via a term query, returning one page of results.
     *
     * @param keyword  the term to look up in the {@code title} field
     * @param pageNo   1-based page number; values below 1 are treated as 1
     * @param pageSize number of hits per page
     * @return the source map of every hit on the requested page
     * @throws IOException if the search request fails
     */
    public List<Map<String, Object>> searchContent(String keyword, int pageNo, int pageSize) throws IOException {
        if (pageNo < 1) {
            pageNo = 1;
        }

        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        // "from" is a zero-based hit offset, not a page number.
        searchSourceBuilder.from((pageNo - 1) * pageSize);
        searchSourceBuilder.size(pageSize);

        // NOTE(review): a term query is not analyzed; whether multi-character
        // keywords match depends on the index mapping of "title" — confirm.
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
        searchSourceBuilder.query(termQueryBuilder);
        searchSourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        SearchRequest searchRequest = new SearchRequest(INDEX_NAME);
        searchRequest.source(searchSourceBuilder);
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);

        List<Map<String, Object>> list = new ArrayList<>();
        for (SearchHit searchHit : searchResponse.getHits()) {
            list.add(searchHit.getSourceAsMap());
        }
        return list;
    }
}
最终实现结果:
|