一、引入jsoup依赖
<!-- jsoup HTML parser used by the crawler below.
     Releases before 1.14.2 are affected by CVE-2021-37714 (parser denial of
     service on crafted input), which matters here because the crawler parses
     pages fetched from a remote site. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>
二、爬虫程序
/**
 * Crawls the five-level administrative division tables (province, city,
 * county, town, village) and converts every table row into a {@link SysArea}.
 *
 * <p>This example only descends into the province named by
 * {@link #TARGET_PROVINCE_NAME}. Pages that cannot be fetched and rows that do
 * not contain the expected cells are skipped, so one broken page does not abort
 * the whole crawl.
 */
@Service
public class JavaJsoupService {

    /** Name of the only province whose hierarchy is crawled by this example. */
    private static final String TARGET_PROVINCE_NAME = "宁夏回族自治区";

    /** Replaces ".html" in a province link to turn its leading digits into a full area code. */
    private static final String PROVINCE_CODE_PADDING = "0000000000";

    /** Connect/read timeout handed to jsoup, in milliseconds (same value as before: 400 000). */
    private static final int CONNECT_TIMEOUT_MILLIS = 200 * 2000;

    /**
     * Fetches and parses the page at {@code url}.
     *
     * @param url address of the page to download
     * @return the parsed document, or {@code null} when the page could not be fetched
     * @throws IllegalArgumentException if {@code url} is {@code null} or empty
     */
    private static Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("无效的url");
        }
        try {
            return Jsoup.connect(url).timeout(CONNECT_TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            // Keep the original message but append the cause so timeouts and HTTP
            // errors can be told apart from a genuinely missing page.
            System.out.println(url + "地址不存在: " + e);
            return null;
        }
    }

    /**
     * Downloads {@code url} and returns the rows matching {@code rowSelector}.
     * Returns an empty selection when the page could not be fetched, so callers
     * can iterate without a null check.
     */
    private static Elements selectRows(String url, String rowSelector) {
        Document page = connect(url);
        return page == null ? new Elements() : page.select(rowSelector);
    }

    /** Returns the trimmed text of the cell at {@code index}, or "" when the row has no such cell. */
    private static String cellText(Elements cells, int index) {
        return index < cells.size() ? cells.get(index).text().trim() : "";
    }

    /** Builds a {@link SysArea} with the status and delete flag every crawled record uses. */
    private static SysArea newArea(String areaCode, String areaName, String parentCode,
                                   String level, String fullName) {
        SysArea area = new SysArea();
        area.setAreaCode(areaCode);
        area.setId(area.getAreaCode());
        area.setAreaName(areaName);
        area.setParentCode(parentCode);
        area.setLevel(level);
        area.setDelFlag("1");
        area.setStatus("1");
        area.setFullName(fullName);
        return area;
    }

    /**
     * Derives a child page URL by replacing the trailing "&lt;code prefix&gt;.html" of the
     * parent URL with the relative link found in the row.
     *
     * @return the child URL, or {@code null} when the row has no link or the parent
     *         code is too short to provide the prefix
     */
    private static String resolveChildUrl(String parentUrl, String parentCode,
                                          int prefixLength, String href) {
        if (href.isEmpty() || parentCode == null || parentCode.length() < prefixLength) {
            return null;
        }
        return parentUrl.replace(parentCode.substring(0, prefixLength) + ".html", href);
    }

    /**
     * Crawls the province index page and, for the target province only, the
     * complete hierarchy below it.
     *
     * @param url address of the index page, e.g.
     *            http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html
     * @return the province record followed by all of its descendants; empty when
     *         the page could not be fetched or the province was not found
     * @throws IllegalArgumentException if {@code url} is {@code null} or empty
     */
    public List<SysArea> getProvinces(String url) {
        List<SysArea> areas = new ArrayList<>();
        for (Element provinceRow : selectRows(url, "tr.provincetr")) {
            for (Element link : provinceRow.select("a")) {
                String name = link.text();
                String href = link.attr("href");
                if (!TARGET_PROVINCE_NAME.equals(name) || href.isEmpty()) {
                    continue;
                }
                String provinceCode = href.replace(".html", PROVINCE_CODE_PADDING);
                areas.add(newArea(provinceCode, name, "0", "1", name));

                String provinceUrl = url.replace("index.html", href);
                System.err.println("++++++++++++++++++++++++++开始获取"+ name +"下属市区行政区划信息++++++++++++++++++++++++");
                areas.addAll(getCityAreaCode(provinceUrl, provinceCode, name));
            }
        }
        return areas;
    }

    /**
     * Crawls the cities of one province together with everything below them.
     *
     * @param provinceUrl address of the province page
     * @param parentCode  area code of the province, stored as the parent of each city
     * @param upAreaName  full name of the province, used as prefix of each full name
     * @return city records, each followed by its descendants; empty when the page
     *         could not be fetched
     */
    public static List<SysArea> getCityAreaCode(String provinceUrl,String parentCode,String upAreaName){
        List<SysArea> areas = new ArrayList<>();
        for (Element cityRow : selectRows(provinceUrl, "tr.citytr")) {
            Elements cells = cityRow.select("td");
            String cityCode = cellText(cells, 0);
            String cityName = cellText(cells, 1);
            // The city page name is built from the first four digits of the code.
            if (cityCode.length() < 4 || cityName.isEmpty()) {
                continue;
            }
            String fullName = upAreaName + cityName;
            areas.add(newArea(cityCode, cityName, parentCode, "2", fullName));

            String cityUrl = provinceUrl.replace(".html", "/" + cityCode.substring(0, 4) + ".html");
            System.err.println("-------------------开始获取"+cityName+"下属区县行政区划信息-----------------------");
            areas.addAll(getDownAreaCode(cityUrl, cityCode, fullName));
        }
        return areas;
    }

    /**
     * Crawls the counties/districts of one city together with everything below
     * them. Rows named "市辖区" are skipped entirely.
     *
     * @param cityUrl    address of the city page
     * @param parentCode area code of the city, stored as the parent of each county
     * @param upAreaName full name of the city, used as prefix of each full name
     * @return county records, each followed by its descendants; empty when the
     *         page could not be fetched
     */
    public static List<SysArea> getDownAreaCode(String cityUrl,String parentCode,String upAreaName){
        List<SysArea> areas = new ArrayList<>();
        for (Element countyRow : selectRows(cityUrl, "tr.countytr")) {
            Elements cells = countyRow.select("td");
            String countyCode = cellText(cells, 0);
            String countyName = cellText(cells, 1);
            if (countyCode.isEmpty() || countyName.isEmpty() || "市辖区".equals(countyName)) {
                continue;
            }
            String fullName = upAreaName + countyName;
            areas.add(newArea(countyCode, countyName, parentCode, "3", fullName));

            // A row without a link has no town page to descend into.
            String href = countyRow.select("a").attr("href");
            String downUrl = resolveChildUrl(cityUrl, parentCode, 4, href);
            if (downUrl == null) {
                continue;
            }
            System.err.println("====================开始获取"+countyName+"下属区划信息");
            areas.addAll(getCountryAreaCodeList(downUrl, countyCode, fullName));
        }
        return areas;
    }

    /**
     * Crawls the towns of one county together with their villages.
     *
     * @param downUrl    address of the county page
     * @param parentCode area code of the county, stored as the parent of each town
     * @param upAreaName full name of the county, used as prefix of each full name
     * @return town records, each followed by its villages; empty when the page
     *         could not be fetched
     */
    public static List<SysArea> getCountryAreaCodeList(String downUrl,String parentCode,String upAreaName){
        List<SysArea> areas = new ArrayList<>();
        for (Element townRow : selectRows(downUrl, "tr.towntr")) {
            Elements cells = townRow.select("td");
            String townCode = cellText(cells, 0);
            String townName = cellText(cells, 1);
            if (townCode.isEmpty() || townName.isEmpty()) {
                continue;
            }
            String fullName = upAreaName + townName;
            areas.add(newArea(townCode, townName, parentCode, "4", fullName));

            // A row without a link has no village page to descend into.
            String href = townRow.select("a").attr("href");
            String countryUrl = resolveChildUrl(downUrl, parentCode, 6, href);
            if (countryUrl == null) {
                continue;
            }
            System.err.println("====================开始获取"+townName+"下属区划信息");
            areas.addAll(getVillageAreaCodeList(countryUrl, townCode, fullName));
        }
        return areas;
    }

    /**
     * Crawls the villages of one town. Village rows carry three cells; the first
     * is used as the area code and the third as the name.
     *
     * @param countryUrl address of the town page
     * @param parentCode area code of the town, stored as the parent of each village
     * @param upAreaName full name of the town, used as prefix of each full name
     * @return village records; empty when the page could not be fetched
     */
    public static List<SysArea> getVillageAreaCodeList(String countryUrl,String parentCode,String upAreaName){
        List<SysArea> villageAreaCodeList = new ArrayList<>();
        for (Element villageRow : selectRows(countryUrl, "tr.villagetr")) {
            Elements cells = villageRow.select("td");
            String villageCode = cellText(cells, 0);
            String villageName = cellText(cells, 2);
            if (villageCode.isEmpty() || villageName.isEmpty()) {
                continue;
            }
            villageAreaCodeList.add(
                    newArea(villageCode, villageName, parentCode, "5", upAreaName + villageName));
        }
        return villageAreaCodeList;
    }
}
三、单元测试
// Demonstrates crawling the area data and bulk-inserting it into the database.
@Test
public void 爬虫批量写入数据(){
    String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/index.html";
    List<SysArea> sysAreas = javaJsoupService.getProvinces(url);
    // Fail fast with a clear message: handing an empty list to the bulk insert
    // would render a VALUES clause with no tuples and surface as a SQL error.
    if (sysAreas == null || sysAreas.isEmpty()) {
        throw new IllegalStateException("No area data was crawled from " + url);
    }
    System.err.println("爬虫相应数据为:"+JSON.toJSONString(sysAreas));
    int result = surveyCommonMapper.insertAreaInfo(sysAreas);
    System.err.println("插入数据条数:"+result);
}
四、批量插入数据库
<!--
  Bulk-inserts the area records produced by the crawler into sys_area_20220304.
  The foreach renders one "(...)" value tuple per list element, separated by
  commas, so the whole list is written by a single INSERT statement.
  NOTE(review): with an empty list no tuple is rendered after "values";
  callers should presumably not invoke this statement with an empty list.
-->
<insert id="insertAreaInfo" parameterType="java.util.List">
insert into sys_area_20220304(id, area_code, area_name,parent_code, full_name,level,status,del_flag) values
<foreach collection="list" item="item" index="index" separator=",">
(#{item.id},#{item.areaCode},#{item.areaName},#{item.parentCode},#{item.fullName},#{item.level},#{item.status},#{item.delFlag})
</foreach>
</insert>
|