1. 背景
由于项目需要支持通过拼音搜索中文内容,本文为此给出两种解决方案。
2. 方案1
使用pinyin4j将中文处理成拼音的形式存入数据库中
- 引入依赖
<dependency>
<groupId>com.belerweb</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.1</version>
</dependency>
- 拼音处理源代码
/**
 * Converts text containing Chinese characters into lower-case, tone-less pinyin
 * keys (via pinyin4j) that can be stored and matched against pinyin search input.
 *
 * <p>A Chinese character may have several readings, so every conversion returns
 * the set of all combinations of the readings of each character. Characters
 * outside the range U+4E00..U+9FA5, and characters pinyin4j has no reading for,
 * are copied into the result unchanged, wherever they appear in the input.
 */
public class PinYinUtils {

    /** First character of the range treated as Chinese (U+4E00). */
    private static final char HANZI_FIRST = 0x4E00;

    /** Last character of the range treated as Chinese (U+9FA5). */
    private static final char HANZI_LAST = 0x9FA5;

    /** Shared output format: lower case, no tone marks. */
    private static final HanyuPinyinOutputFormat hanyuPinyinOutputFormat;

    static {
        hanyuPinyinOutputFormat = new HanyuPinyinOutputFormat();
        hanyuPinyinOutputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        hanyuPinyinOutputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    }

    /** Selects how each Chinese character is rendered. */
    private enum Mode {
        /** Every Chinese character as its full pinyin. */
        FULL,
        /** Every Chinese character as the first letter of its pinyin. */
        FIRST_LETTER,
        /** First Chinese character as full pinyin, the remaining ones as first letters. */
        FIRST_FULL_THEN_LETTERS
    }

    /**
     * Small demo that prints the three conversions of a sample phrase.
     *
     * @param args unused
     * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the output format
     */
    public static void main(String[] args) throws BadHanyuPinyinOutputFormatCombination {
        String chinese = "给我一首歌的时间";
        System.out.println("中文:>>" + chinese);
        System.out.println("全拼:>>" + convertToFullPinYin(chinese));
        System.out.println("第一个汉字全拼:>>" + convertToFullAbbMixPinYin(chinese));
        System.out.println("全部汉字首字符:>>" + convertToFirstLetterPinYin(chinese));
    }

    /**
     * Renders every Chinese character as its full pinyin.
     *
     * @param chinese text to convert; {@code null} or empty yields an empty set
     * @return all reading combinations of the input
     * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the output format
     */
    public static Set<String> convertToFullPinYin(String chinese) throws BadHanyuPinyinOutputFormatCombination {
        return convert(chinese, Mode.FULL);
    }

    /**
     * Renders every Chinese character as the first letter of its pinyin.
     *
     * @param chinese text to convert; {@code null} or empty yields an empty set
     * @return all first-letter combinations of the input
     * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the output format
     */
    public static Set<String> convertToFirstLetterPinYin(String chinese) throws BadHanyuPinyinOutputFormatCombination {
        return convert(chinese, Mode.FIRST_LETTER);
    }

    /**
     * Renders the first Chinese character as full pinyin and every following
     * Chinese character as the first letter of its pinyin.
     *
     * @param chinese text to convert; {@code null} or empty yields an empty set
     * @return all combinations of the mixed form of the input
     * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the output format
     */
    public static Set<String> convertToFullAbbMixPinYin(String chinese) throws BadHanyuPinyinOutputFormatCombination {
        return convert(chinese, Mode.FIRST_FULL_THEN_LETTERS);
    }

    /** Shared conversion loop behind the three public entry points. */
    private static Set<String> convert(String chinese, Mode mode) throws BadHanyuPinyinOutputFormatCombination {
        List<String> combinations = new ArrayList<>();
        if (chinese != null && !chinese.isEmpty()) {
            // Seed with the empty prefix so characters that precede the first
            // Chinese character are kept instead of being silently dropped.
            combinations.add("");
            // Tracks the first Chinese character by occurrence, not by index 0,
            // so leading digits/letters do not disable the full-pinyin syllable.
            boolean firstHanziPending = true;
            for (int i = 0; i < chinese.length(); i++) {
                char current = chinese.charAt(i);
                List<String> readings = isHanzi(current) ? readingsOf(current) : new ArrayList<String>();
                if (readings.isEmpty()) {
                    // Not Chinese, or no known reading: copy the character through.
                    for (int j = 0; j < combinations.size(); j++) {
                        combinations.set(j, combinations.get(j) + current);
                    }
                    continue;
                }
                boolean abbreviate = mode == Mode.FIRST_LETTER
                        || (mode == Mode.FIRST_FULL_THEN_LETTERS && !firstHanziPending);
                firstHanziPending = false;
                if (abbreviate) {
                    readings = firstLetters(readings);
                }
                combinations = getCartesian(combinations, readings);
            }
        }
        return combinations.stream().collect(Collectors.toSet());
    }

    /** Tells whether {@code c} lies in the range U+4E00..U+9FA5. */
    private static boolean isHanzi(char c) {
        return c >= HANZI_FIRST && c <= HANZI_LAST;
    }

    /**
     * Returns the distinct, non-empty readings of {@code c}; empty when pinyin4j
     * reports none (it may return {@code null} for such characters).
     */
    private static List<String> readingsOf(char c) throws BadHanyuPinyinOutputFormatCombination {
        String[] raw = PinyinHelper.toHanyuPinyinStringArray(c, hanyuPinyinOutputFormat);
        if (raw == null) {
            return new ArrayList<>();
        }
        // Tone-less output can repeat a syllable; de-duplicating here keeps the
        // Cartesian product small without changing the resulting set.
        return Arrays.stream(raw)
                .filter(reading -> reading != null && !reading.isEmpty())
                .distinct()
                .collect(Collectors.toList());
    }

    /** Maps each reading to its first letter, dropping duplicates. */
    private static List<String> firstLetters(List<String> readings) {
        return readings.stream()
                .map(reading -> String.valueOf(reading.charAt(0)))
                .distinct()
                .collect(Collectors.toList());
    }

    /** Concatenates every element of {@code list1} with every element of {@code list2}. */
    private static List<String> getCartesian(List<String> list1, List<String> list2) {
        List<String> product = new ArrayList<>(list1.size() * list2.size());
        for (String prefix : list1) {
            for (String suffix : list2) {
                product.add(prefix + suffix);
            }
        }
        return product;
    }
}
- 程序运行截图
本代码效率可能较低,希望有大神可以优化一下
3. 方案2
如果数据存储使用 Elasticsearch,可以为其安装拼音分词插件;但该插件的分词结果有时不尽如人意,不能满足相关需求,此时需要有能力者对该拼音分词插件进行调整。(这里贴上拼音分词插件 GitHub 地址:https://github.com/medcl/elasticsearch-analysis-pinyin/releases)
|