欢迎各位高三的同学报考厂大💪
使用 Java 工具 Jsoup 爬取广州大学计算机科学与网络工程学院师资队伍页面中的公开信息,并存储到 CSV 文件中。
案例分析
测试结果
测试截图1 测试截图2
爬取信息工具HTMLParseUtil类
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
/**
 * Scrapes the faculty listing page of jsj.gzhu.edu.cn with jsoup, visits every
 * linked teacher page, and writes one CSV row per teacher to
 * {@code TeacherInfo.csv} in the working directory.
 */
public class HTMLParseUtil {
    /** Site root; image paths found on teacher pages are appended to it. */
    private static final String BASE_URL = "http://jsj.gzhu.edu.cn";
    /** Listing page whose "mclb" elements contain the links to the teacher pages. */
    private static final String LISTING_URL = BASE_URL + "/szdw1/jsjkxywlgcxysz.htm";
    /** Timeout, in milliseconds, passed to every jsoup fetch. */
    private static final int TIMEOUT_MILLIS = 30000;
    /** Name of the CSV file that is (re)created on every run. */
    private static final String OUTPUT_FILE = "TeacherInfo.csv";

    /**
     * Runs the scrape and writes the CSV file.
     *
     * <p>Any exception (network failure, malformed URL, write error) aborts the
     * run and is printed to standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Map<String, String> profileLinks = new HashMap<>();
            Map<String, TeacherInfo> teachers = new HashMap<>();

            Document listing = Jsoup.parse(new URL(LISTING_URL), TIMEOUT_MILLIS);
            for (Element block : listing.getElementsByClass("mclb")) {
                for (Element anchor : block.getElementsByTag("a")) {
                    String name = anchor.text();
                    // Links whose text is 10 or more characters are skipped
                    // (presumably they are not teacher names).
                    if (name.length() >= 10) {
                        continue;
                    }
                    // Resolve the link against the page's own URL instead of
                    // chopping a fixed "../"-style prefix off the raw attribute,
                    // which throws on short hrefs and mangles other forms.
                    String href = anchor.absUrl("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    profileLinks.put(name, href);
                    TeacherInfo teacherInfo = new TeacherInfo();
                    teacherInfo.setName(name);
                    teachers.put(name, teacherInfo);
                }
            }

            System.out.println("请求解析网页获得对应老师的信息");
            for (Map.Entry<String, String> link : profileLinks.entrySet()) {
                TeacherInfo teacherInfo = teachers.get(link.getKey());
                // Fetch each teacher page once and read both the image and the text from it.
                Document profile = Jsoup.parse(new URL(link.getValue()), TIMEOUT_MILLIS);
                for (Element img : profile.getElementsByTag("img")) {
                    String src = img.attr("src");
                    if (src.startsWith("/__local/")) {
                        teacherInfo.setImg_url(BASE_URL + src);
                    }
                }
                for (Element paragraph : profile.select("[id=vsb_content]").select("p")) {
                    applyProfileLine(teacherInfo, paragraph.text());
                }
            }

            File file = new File(OUTPUT_FILE);
            // FileWriter truncates an existing file, so no explicit delete is needed;
            // try-with-resources flushes and closes the writer even if a write fails.
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
                for (TeacherInfo teacher : teachers.values()) {
                    System.out.println(teacher);
                    writer.write(toCsvRow(teacher));
                    // Terminate the row after writing it so the file has no leading blank line.
                    writer.newLine();
                }
            }
            // Report the number of distinct teachers, which equals the number of rows written.
            System.out.println("一共有教师人数:" + teachers.size());
            System.out.println("输出已经写入到路径" + file.getAbsolutePath() + "中");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Stores one paragraph of a teacher page in the field selected by its label prefix. */
    private static void applyProfileLine(TeacherInfo teacherInfo, String text) {
        if (text.isEmpty()) {
            return;
        }
        if (text.startsWith("职称") || text.startsWith("职务")) {
            teacherInfo.setTitle(text);
        } else if (text.startsWith("系、研究所") || text.startsWith("部门")) {
            teacherInfo.setDepartment(text);
        } else if (text.startsWith("研究领域")) {
            teacherInfo.setDomain(text);
        } else if (text.startsWith("讲授课程")) {
            teacherInfo.setCourse(text);
        } else if (text.startsWith("电子邮箱") || text.startsWith("电子邮件")) {
            teacherInfo.setEmail(text);
        } else if (text.startsWith("办公电话")) {
            teacherInfo.setPhone(text);
        } else if (text.startsWith("个人主页")) {
            teacherInfo.setHome(text);
        }
    }

    /** Builds one CSV row: name, image URL, title, department, domain, course, email, home, phone. */
    private static String toCsvRow(TeacherInfo teacher) {
        return csvField(teacher.getName()) + ","
                + csvField(teacher.getImg_url()) + ","
                + csvField(teacher.getTitle()) + ","
                + csvField(teacher.getDepartment()) + ","
                + csvField(teacher.getDomain()) + ","
                + csvField(teacher.getCourse()) + ","
                + csvField(teacher.getEmail()) + ","
                + csvField(teacher.getHome()) + ","
                + csvField(teacher.getPhone());
    }

    /**
     * Renders a single CSV field: null becomes an empty field, and a value
     * containing a comma, double quote or line break is wrapped in double quotes
     * with embedded quotes doubled, so it cannot shift the columns.
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}
抽象出一个 TeacherInfo 类,用于存储教师的个人信息
/**
 * Mutable holder for the details collected about one teacher. Every field is a
 * plain string that stays {@code null} until its setter is called.
 */
public class TeacherInfo {
    private String name;
    private String img_url;
    private String title;
    private String department;
    private String domain;
    private String course;
    private String email;
    private String home;
    private String phone;

    /** Creates an instance with all fields unset. */
    public TeacherInfo() {
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getImg_url() {
        return this.img_url;
    }

    public void setImg_url(String img_url) {
        this.img_url = img_url;
    }

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDepartment() {
        return this.department;
    }

    public void setDepartment(String department) {
        this.department = department;
    }

    public String getDomain() {
        return this.domain;
    }

    public void setDomain(String domain) {
        this.domain = domain;
    }

    public String getCourse() {
        return this.course;
    }

    public void setCourse(String course) {
        this.course = course;
    }

    public String getEmail() {
        return this.email;
    }

    public void setEmail(String email) {
        this.email = email;
    }

    public String getHome() {
        return this.home;
    }

    public void setHome(String home) {
        this.home = home;
    }

    public String getPhone() {
        return this.phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    /**
     * Lists every field as {@code key='value'}, comma-separated, in declaration
     * order; unset fields appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("name='").append(name).append('\'');
        text.append(", img_url='").append(img_url).append('\'');
        text.append(", title='").append(title).append('\'');
        text.append(", department='").append(department).append('\'');
        text.append(", domain='").append(domain).append('\'');
        text.append(", course='").append(course).append('\'');
        text.append(", email='").append(email).append('\'');
        text.append(", home='").append(home).append('\'');
        text.append(", phone='").append(phone).append('\'');
        return text.toString();
    }
}
|