package whu.extract.pubtime.core; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import whu.utils.TimeUtil; /** ? * Created On 2014年3月13日 下午2:49:05 ? * @description 获取网页的发布时间 ? */ public class FetchPubTime { ???? /** 表示url中连续的8位日期,例如http://www.baidu.com/20140311/2356.html */ ???? private static String url_reg_whole= "([-|/|_]{1}20\\d{6})" ; ???? /** 表示 用-或者/隔开的日期,有年月日的,例如 http://www.baidu.com/2014-3-11/2356.html? */ ???? private static String url_reg_sep_ymd = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2}[-|/|_]{1}\\d{1,2})" ; ???? /** 表示 用-或者/隔开的日期,只有年和月份的,例如 http://www.baidu.com/2014-3/2356.html? */ ???? private static String url_reg_sep_ym = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2})" ; ???? private static Calendar current = Calendar.getInstance(); ???? /** 格式正确的时间正则表达式*/ ???? private static String rightTimeReg = "^((\\d{2}(([02468][048])|([13579][26]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])))))|(\\d{2}(([02468][1235679])|([13579][01345789]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|(1[0-9])|(2[0-8]))))))(\\s(((0?[0-9])|([1-2][0-3]))\\:([0-5]?[0-9])((\\s)|(\\:([0-5]?[0-9])))))?$" ; ???? ? ???? /** ????? * @param url ????? * @param urlContent ????? * @return ????? */ ???? public static String getPubTimeVarious(String url,String urlContent) { ???????? ? ???????? String pubTime = getPubTimeFromUrl(url); ???????? ? ?????? //链接里面没有,匹配文本中的 ???????? if (pubTime == null ) ???????? { ???????????? if (urlContent!= null &&!urlContent.trim().equals( "" )) ???????????????? return extractPageDate(urlContent); ???????? } ???????? ? ???????? return pubTime; ???? } ???? ? ???? /**从url里面抽取出发布时间,返回YYYY-MM-DD HH:mm:ss格式的字符串 ????? * @param url ????? * @return ????? */ ???? public static String getPubTimeFromUrl(String url) ???? { ???????? Pattern p_whole = Pattern.compile(url_reg_whole); ???????? Matcher m_whole = p_whole.matcher(url); ???????? if (m_whole.find( 0 )&&m_whole.groupCount()> 0 ) ???????? { ??????????? String time =? m_whole.group( 0 ); ??????????? time = time.substring( 1 ,time.length()); ??????????? //每一步都不能够超出当前时间????????? ???????? if (current.compareTo(TimeUtil.strToCalendar(time, "yyyyMMdd" ))>= 0 ) ???????? { ??????????? return time.substring( 0 , 4 )+ "-" +time.substring( 4 , 6 )+ "-" + ?????????????????? time.substring( 6 , 8 )+ " " + "00:00:00" ; ???????? } ???????? } ?????? ? ???????? p_whole = null ; ???????? m_whole = null ; ???????? Pattern p_sep = Pattern.compile(url_reg_sep_ymd); ???????? Matcher m_sep = p_sep.matcher(url); ???????? if (m_sep.find( 0 )&&m_sep.groupCount()> 0 ) ???????? { ????????????? String time =? m_sep.group( 0 ); ????????????? time = time.substring( 1 ,time.length()); ????????????? String[] seg = time.split( "[-|/|_]{1}" ); ????????????? Calendar theTime = Calendar.getInstance(); ????????????? theTime.set(Calendar.YEAR,Integer.parseInt(seg[ 0 ])); ????????????? theTime.set(Calendar.MONTH, Integer.parseInt(seg[ 1 ])); ????????????? theTime.set(Calendar.DAY_OF_MONTH, Integer.parseInt(seg[ 2 ])); ????????????? if (current.compareTo(theTime)>= 0 ) ???????????????? { ???????????? ? ???????????? return seg[ 0 ]+ "-" +seg[ 1 ]+ "-" +seg[ 2 ]+ " " + "00:00:00" ; ???????????????? } ???????? } ???????? p_sep = null ; ???????? m_sep = null ; ???????? Pattern p_sep_ym = Pattern.compile(url_reg_sep_ym); ???????? Matcher m_sep_ym = p_sep_ym.matcher(url); ???????? if (m_sep_ym.find( 0 )&&m_sep_ym.groupCount()> 0 ) ???????? { ????????????? String time =? m_sep_ym.group( 0 ); ????????????? time = time.substring( 1 ,time.length()); ????????????? Calendar theTime = Calendar.getInstance(); ????????????? String[] seg = time.split( "[-|/|_]{1}" ); ????????????? theTime.set(Calendar.YEAR,Integer.parseInt(seg[ 0 ])); ????????????? theTime.set(Calendar.MONTH, Integer.parseInt(seg[ 1 ])); ????????????? theTime.set(Calendar.DAY_OF_MONTH, 1 ); ????????????? if (current.compareTo(theTime)>= 0 ) ???????????? { ????????????? ? ???????????? return seg[ 0 ]+ "-" +seg[ 1 ]+ "-" + "01" + " " + "00:00:00" ; ???????????? } ???????? } ???????? ? ???????? return null ; ???? } ???? ? ???? /** 从网页源码中取出发布时间 ????? *? java中正则表达式提取字符串中日期实现代码 ????? *? 2013年12月19日15:58:42 ????? *? 读取出2013-12-19 15:48:33或者2013-12-19或者2012/3/05形式的时间 ????? * @param text 待提取的字符串 ????? * @return 返回日期 ????? * @author: oschina ????? * @Createtime: Jan 21, 2013 ????? */ ???? public static String extractPageDate(String text) { ???????? boolean ? containsHMS = false ; ???????? String dateStr = text.replaceAll( "r?n" , " " ); ???????? try { ???????????? List matches = null ; ???????????? Pattern p_detail = Pattern.compile( "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2} \\d{1,2}:\\d{1,2}:\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)" , Pattern.CASE_INSENSITIVE|Pattern.MULTILINE); ???????????? //如果是仅仅抽取年月日,则按照上面的,如果是抽取年月日-时分秒,则按照下面的 ???????????? Pattern p = Pattern.compile( "(20\\d{2}[-/]\\d{1,2}[-/]\\d{1,2})|(20\\d{2}年\\d{1,2}月\\d{1,2}日)" , Pattern.CASE_INSENSITIVE|Pattern.MULTILINE); ???????????? //Matcher matcher = p.matcher(dateStr); ???????????? Matcher matcher_detail = p_detail.matcher(dateStr); ???????????? ? ???????????? if (!(matcher_detail.find( 0 ) && matcher_detail.groupCount() >= 1 )) ???????????? { ???????????????? matcher_detail = p.matcher(dateStr); ???????????????? containsHMS? = true ; ???????????? } else ???????????????? matcher_detail = p_detail.matcher(dateStr); ???????????? if (matcher_detail.find() && matcher_detail.groupCount() >= 1 ) { ???????????????? matches = new ArrayList(); ???????????????? for ( int i = 1 ; i <= matcher_detail.groupCount(); i++) { ???????????????????? String temp = matcher_detail.group(i); ???????????????????? matches.add(temp); ???????????????? } ???????????? } else { ???????????????? matches = Collections.EMPTY_LIST; ???????????? }??????????? ???????????? if (matches.size() > 0 ) { ???????????????? for ( int i= 0 ;i<matches.size();i++) ???????????????? { ???????????????????? String pubTime = matches.get(i).toString().trim(); ???????????????????? //取出第一个值 ???????????????????? pubTime = pubTime.replace( "/" , "-" ).replace( "年" , "-" ).replace( "月" , "-" ).replace( "日" , "-" ); ???????????????????? if (current.compareTo(TimeUtil.strToCalendar(pubTime, "yyyy-MM-dd" ))>= 0 ) ???????????????????? { ???????????????????????? if (containsHMS) ???????????????????????????? pubTime+= " " + "00:00:00" ; ???????????????????????? if (pubTime.matches(rightTimeReg)) ???????????????????????? { ???????????????????????????? return pubTime; ???????????????????????? } ???????????????????? } ???????????????? } ???????????? } else { ???????????????? return null ; ???????????? } ???????????? ? ???????? } catch (Exception e) { ???????????? return null ; ???????? } ???????? return null ; ???? } } |