package whu.extract.pubtime.core;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import whu.utils.TimeUtil;
/**
 * Created on 2014-03-13 14:49:05.
 * @description Extracts the publication time of a web page.
 */
public class FetchPubTime {
????/** 表示url中连续的8位日期,例如http://www.baidu.com/20140311/2356.html */
????private static String url_reg_whole= "([-|/|_]{1}20\\d{6})";
????/** 表示 用-或者/隔开的日期,有年月日的,例如 http://www.baidu.com/2014-3-11/2356.html? */
????private static String url_reg_sep_ymd = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2}[-|/|_]{1}\\d{1,2})";
????/** 表示 用-或者/隔开的日期,只有年和月份的,例如 http://www.baidu.com/2014-3/2356.html? */
????private static String url_reg_sep_ym = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2})";
????private static Calendar current = Calendar.getInstance();
????/** 格式正确的时间正则表达式*/
????private static String rightTimeReg = "^((\\d{2}(([02468][048])|([13579][26]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])))))|(\\d{2}(([02468][1235679])|([13579][01345789]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|(1[0-9])|(2[0-8]))))))(\\s(((0?[0-9])|([1-2][0-3]))\\:([0-5]?[0-9])((\\s)|(\\:([0-5]?[0-9])))))?$";
?????
????/**
?????* @param url
?????* @param urlContent
?????* @return
?????*/
????public static String getPubTimeVarious(String url,String urlContent) {
?????????
????????String pubTime = getPubTimeFromUrl(url);
?????????
??????//链接里面没有,匹配文本中的
????????if(pubTime == null)
????????{
????????????if(urlContent!=null&&!urlContent.trim().equals(""))
????????????????return extractPageDate(urlContent);
????????}
?????????
????????return pubTime;
????}
?????
????/**从url里面抽取出发布时间,返回YYYY-MM-DD HH:mm:ss格式的字符串
?????* @param url
?????* @return
?????*/
????public static String getPubTimeFromUrl(String url)
????{
????????Pattern p_whole = Pattern.compile(url_reg_whole);
????????Matcher m_whole = p_whole.matcher(url);
????????if(m_whole.find(0)&&m_whole.groupCount()>0)
????????{
???????????String time =? m_whole.group(0);
???????????time = time.substring(1,time.length());
???????????//每一步都不能够超出当前时间?????????
????????if(current.compareTo(TimeUtil.strToCalendar(time, "yyyyMMdd"))>=0)
????????{
???????????return time.substring(0,4)+"-"+time.substring(4,6)+"-"+
??????????????????time.substring(6,8)+" "+"00:00:00";
????????}
????????}
???????
????????p_whole = null;
????????m_whole = null;
????????Pattern p_sep = Pattern.compile(url_reg_sep_ymd);
????????Matcher m_sep = p_sep.matcher(url);
????????if(m_sep.find(0)&&m_sep.groupCount()>0)
????????{
?????????????String time =? m_sep.group(0);
?????????????time = time.substring(1,time.length());
?????????????String[] seg = time.split("[-|/|_]{1}");
?????????????Calendar theTime = Calendar.getInstance();
?????????????theTime.set(Calendar.YEAR,Integer.parseInt(seg[0]));
?????????????theTime.set(Calendar.MONTH, Integer.parseInt(seg[1]));
?????????????theTime.set(Calendar.DAY_OF_MONTH, Integer.parseInt(seg[2]));
?????????????if(current.compareTo(theTime)>=0)
????????????????{
?????????????
????????????return seg[0]+"-"+seg[1]+"-"+seg[2]+" "+"00:00:00";
????????????????}
????????}
????????p_sep = null;
????????m_sep = null;
????????Pattern p_sep_ym = Pattern.compile(url_reg_sep_ym);
????????Matcher m_sep_ym = p_sep_ym.matcher(url);
????????if(m_sep_ym.find(0)&&m_sep_ym.groupCount()>0)
????????{
?????????????String time =? m_sep_ym.group(0);
?????????????time = time.substring(1,time.length());
?????????????Calendar theTime = Calendar.getInstance();
?????????????String[] seg = time.split("[-|/|_]{1}");
?????????????theTime.set(Calendar.YEAR,Integer.parseInt(seg[0]));
?????????????theTime.set(Calendar.MONTH, Integer.parseInt(seg[1]));
?????????????theTime.set(Calendar.DAY_OF_MONTH, 1);
?????????????if(current.compareTo(theTime)>=0)
????????????{
??????????????
????????????return seg[0]+"-"+seg[1]+"-"+"01"+" "+"00:00:00";
????????????}
????????}
?????????
????????return null;
????}
|