1. 2021大学排名词云图
2. 2021大学排名附件
大学排名附件
3,使用的技术
<!-- Selenium Java bindings; the version is taken from the selenium.version property -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>${selenium.version}</version>
</dependency>
- hutool 5.7.14 工具包
- kumo 1.28 词云
<!-- Word cloud (kumo core); the version is taken from the kumo.version property -->
<dependency>
<groupId>com.kennycason</groupId>
<artifactId>kumo-core</artifactId>
<version>${kumo.version}</version>
</dependency>
<!-- Multi-language word tokenizers for kumo; shares the kumo.version property -->
<dependency>
<groupId>com.kennycason</groupId>
<artifactId>kumo-tokenizers</artifactId>
<version>${kumo.version}</version>
</dependency>
4,核心代码
4.1,创建Driver
/**
 * Creates a {@code ChromeDriver} that drives the 360 browser.
 *
 * <p>The driver executable and the browser binary are read from the
 * {@code webdriver.360.driver} and {@code webdriver.360.bin} properties.</p>
 *
 * @param debug           when {@code true} and at least one address is supplied, the driver
 *                        attaches to an already running browser instead of a fresh session
 * @param debuggerAddress optional remote-debugging address ({@code host:port}); only the
 *                        first element is used
 * @return the newly created driver
 */
public static WebDriver createWebDriver(boolean debug, String... debuggerAddress) {
    // Driver executable for the 360 browser. The "webdriver.qq.driver" property is the
    // alternative the original author kept around for the QQ browser.
    final String driverPath = getProps().getStr("webdriver.360.driver");
    System.setProperty("webdriver.chrome.driver", driverPath);

    final ChromeOptions chromeOptions = new ChromeOptions();

    // Point the driver at the installed browser binary.
    final String browserBinary = getProps().getStr("webdriver.360.bin");
    chromeOptions.setBinary(browserBinary);

    // --no-sandbox: per the original note, used when running with administrator privileges.
    // (--headless would be the switch to run without opening a window; it is not enabled.)
    chromeOptions.addArguments("--no-sandbox");

    // Accept insecure (e.g. self-signed) certificates.
    chromeOptions.setAcceptInsecureCerts(true);

    // To reuse an already opened browser, start it from a shell with
    //   chrome.exe --remote-debugging-port=44444
    // and pass the matching address as debuggerAddress.
    final boolean attachToRunningBrowser = debug && debuggerAddress.length > 0;
    if (attachToRunningBrowser) {
        chromeOptions.setExperimentalOption("debuggerAddress", debuggerAddress[0]);
    }

    return new ChromeDriver(chromeOptions);
}
4.2,爬取大学数据并生成词云
package com.lcj.selenium.instance;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.lang.Console;
import cn.hutool.core.text.csv.*;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.CircleBackground;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.font.scale.SqrtFontScalar;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
import com.kennycason.kumo.nlp.tokenizers.ChineseWordTokenizer;
import com.kennycason.kumo.palette.LinearGradientColorPalette;
import com.lcj.selenium.utils.SeleniumUtil;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import java.awt.*;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
/**
 * <p>
 * Crawler for the 2021 Chinese university ranking published at
 * https://www.shanghairanking.cn/rankings/bcur/2021
 * </p>
 * <p>
 * Scrapes every page of the ranking table, stores the rows in a CSV file under the user's
 * home directory and renders a word cloud from the province and category columns.
 * </p>
 *
 * @author: liuchangng@qq.com
 * @since: 2021/10/20 10:00
 */
public class ChinaUniversityRankCreawler {
    /** Address of the ranking page. */
    private static final String URL = "https://www.shanghairanking.cn/rankings/bcur/2021";
    /** Number of rows the ranking table is assumed to show per page. */
    private static final int PAGE_SIZE = 30;
    /** Number of columns stored per university (ranking, logo, zhName, enName, province, category, score). */
    private static final int COLUMN_COUNT = 7;
    /** Index of the province column within a stored row. */
    private static final int PROVINCE_COLUMN = 4;
    /** Index of the category column within a stored row. */
    private static final int CATEGORY_COLUMN = 5;
    /** All scraped (or re-loaded) university rows. */
    private static final List<String[]> UNIVERSITY_INFO_LIST = new ArrayList<>(528);
    /** Directory in which the CSV file and the word-cloud image are stored. */
    private static final String USER_HOME = System.getProperty("user.home");

    /**
     * Opens the ranking page, scrapes it into a CSV file unless that file already exists,
     * and generates the word cloud from the CSV file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        WebDriver driver = SeleniumUtil.createWebDriver(false, "127.0.0.1:44444");
        try {
            // Maximize the browser window.
            driver.manage().window().maximize();
            // Global implicit wait used by element lookups.
            driver.manage().timeouts().implicitlyWait(30, TimeUnit.SECONDS);
            // Open the page.
            driver.get(URL);
            Console.log("当前网页地址: {} ,标题: {}", driver.getCurrentUrl(), driver.getTitle());

            // The page heading becomes the CSV file name.
            String pageTitle = driver.findElement(By.cssSelector("#content > div.content-title > h1")).getText().trim();
            String fileName = StrUtil.format("{}{}{}.csv", USER_HOME, File.separator, pageTitle);
            Console.log(fileName);

            // Only crawl when the CSV file has not been produced by an earlier run.
            if (!FileUtil.exist(fileName)) {
                crawlAllPages(driver);
                // Persist the scraped rows.
                try (CsvWriter csvWriter = CsvUtil.getWriter(new File(fileName), StandardCharsets.UTF_8)) {
                    csvWriter.write(UNIVERSITY_INFO_LIST);
                    csvWriter.flush();
                }
            }

            generateWordCloud(fileName);
        } finally {
            // Always shut the browser session down, even when scraping or rendering failed.
            driver.quit();
        }
        System.exit(0);
    }

    /**
     * Walks through every page of the ranking table and collects its rows into
     * {@link #UNIVERSITY_INFO_LIST}.
     *
     * @param driver driver with the ranking page already loaded
     * @throws IllegalStateException if the total number of universities cannot be read from the page
     */
    private static void crawlAllPages(WebDriver driver) {
        // Total number of universities, taken from the first number in the tool-box text.
        String countText = driver.findElement(By.cssSelector("#content-box > div.tool-box > div > div:nth-child(3)")).getText().trim();
        Integer universityCount = ReUtil.getFirstNumber(countText);
        if (universityCount == null) {
            throw new IllegalStateException("Cannot read the number of universities from: " + countText);
        }
        // Integer ceiling of universityCount / PAGE_SIZE.
        int pages = (universityCount + PAGE_SIZE - 1) / PAGE_SIZE;

        for (int page = 1; page <= pages; page++) {
            if (page > 1) {
                // NOTE(review): nothing here waits for the table to refresh after the click -
                // confirm that rows of the previous page are never parsed twice.
                WebElement nextPage = driver.findElement(By.cssSelector("#content-box > ul > li.ant-pagination-next > a"));
                nextPage.click();
            }
            parseUniversityInfo(driver);
        }
    }

    /**
     * Generates a word cloud image from the province and category columns of the CSV file.
     * The image is written to the user's home directory.
     *
     * @param fileName path of the CSV file to read
     * @author liuchangjun
     * @since 2021/10/20 11:19
     */
    private static void generateWordCloud(String fileName) {
        final FrequencyAnalyzer frequencyAnalyzer = new FrequencyAnalyzer();
        // Keep the 600 most frequent words.
        frequencyAnalyzer.setWordFrequenciesToReturn(600);
        // Minimum word length.
        frequencyAnalyzer.setMinWordLength(2);
        // Chinese word segmentation.
        frequencyAnalyzer.setWordTokenizer(new ChineseWordTokenizer());

        // Reload the rows from the CSV file so the same path works whether or not this run crawled.
        CsvReader reader = CsvUtil.getReader();
        UNIVERSITY_INFO_LIST.clear();
        CsvData data = reader.read(FileUtil.file(fileName), StandardCharsets.UTF_8);
        for (CsvRow csvRow : data.getRows()) {
            String[] columns = new String[COLUMN_COUNT];
            for (int i = 0; i < COLUMN_COUNT; i++) {
                columns[i] = csvRow.get(i);
            }
            UNIVERSITY_INFO_LIST.add(columns);
        }

        // Provinces first, then categories. An explicit ArrayList is collected because the
        // list is appended to below and Collectors.toList() gives no mutability guarantee.
        List<String> words = UNIVERSITY_INFO_LIST.stream()
                .map(row -> row[PROVINCE_COLUMN])
                .collect(Collectors.toCollection(ArrayList::new));
        List<String> categoryList = UNIVERSITY_INFO_LIST.stream()
                .map(row -> row[CATEGORY_COLUMN])
                .collect(Collectors.toList());
        words.addAll(categoryList);
        List<WordFrequency> wordFrequencies = frequencyAnalyzer.load(words);

        // Image resolution.
        Dimension dimension = new Dimension(500, 500);
        WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
        // Per the original author's note, Chinese text is garbled unless a font is set here.
        Font font = new Font("STSong-Light", Font.ITALIC, 18);
        wordCloud.setKumoFont(new KumoFont(font));
        // Font size range.
        wordCloud.setFontScalar(new SqrtFontScalar(12, 42));
        wordCloud.setPadding(2);
        // Circular shape; the argument is the circle radius.
        wordCloud.setBackground(new CircleBackground(255));
        // Three gradient colors; per the original note, earlier colors go to more frequent words.
        wordCloud.setColorPalette(new LinearGradientColorPalette(Color.RED, Color.BLUE, Color.GREEN, 30, 30));
        // White image background.
        wordCloud.setBackgroundColor(new Color(255, 255, 255));
        wordCloud.build(wordFrequencies);
        // FileUtil.getName keeps the ".csv" extension, so the image is named "<title>.csv.png".
        wordCloud.writeToFile(StrUtil.format("{}{}{}.png", USER_HOME, File.separator, FileUtil.getName(fileName)));
    }

    /**
     * Parses the university rows of the currently displayed page and appends them to
     * {@link #UNIVERSITY_INFO_LIST}.
     *
     * @param driver driver showing the ranking table
     * @author liuchangjun
     * @since 2021/10/20 10:39
     */
    private static void parseUniversityInfo(WebDriver driver) {
        List<WebElement> tableRows = driver.findElements(By.cssSelector("#content-box > div.rk-table-box > table > tbody > tr"));
        for (WebElement tr : tableRows) {
            // Ranking
            String ranking = tr.findElement(By.cssSelector("td:nth-child(1) > div")).getText();
            // Logo image address
            String logo = tr.findElement(By.cssSelector("td.align-left > div > div.logo > img")).getAttribute("src");
            // Chinese and English names
            String zhName = tr.findElement(By.cssSelector("td.align-left > div > div.univname > div:nth-child(1) > div > div > a")).getText();
            String enName = tr.findElement(By.cssSelector("td.align-left > div > div.univname > div:nth-child(2) > div > div > a")).getText();
            // Province / city
            String province = tr.findElement(By.cssSelector("td:nth-child(3)")).getText().trim();
            // Category
            String category = tr.findElement(By.cssSelector("td:nth-child(4)")).getText().trim();
            // Total score
            String score = tr.findElement(By.cssSelector("td:nth-child(5)")).getText().trim();
            Console.log("排名:{} ,logo:{} ,中文名称:{} ,英文名称:{} ,省份:{} ,类型:{} ,总分:{}", ranking, logo, zhName, enName, province, category, score);
            UNIVERSITY_INFO_LIST.add(new String[]{ranking, logo, zhName, enName, province, category, score});
        }
    }
}
|