# -*- coding:utf-8 -*-
"""FaceBook 发布时间解析规则"""
import re
import time
import dateparser
from loguru import logger
import traceback
# NOTE(review): leftover debug output that runs on every import. It is kept so
# the module's import-time behaviour is unchanged; consider removing it.
print('hello world')

# ---------------------------------------------------------------------------
# Regex source fragments for absolute timestamps.
# These are plain pattern strings (not compiled objects). The month names and
# the AM/PM marker are written in one case only, so callers are expected to
# compile the combined pattern with re.IGNORECASE.
# ---------------------------------------------------------------------------
hour = r'\d{1,2}'
minute = r'\d{1,2}'
# The trailing empty alternative lets the AM/PM marker be absent.
period = r'AM|PM|'
month = (
    r"Jan(?:uary)?|"
    r"Feb(?:ruary)?|"
    r"Mar(?:ch)?|"
    r"Apr(?:il)?|"
    r"May|"
    r"Jun(?:e)?|"
    r"Jul(?:y)?|"
    r"Aug(?:ust)?|"
    r"Sep(?:tember)?|"
    r"Oct(?:ober)?|"
    r"Nov(?:ember)?|"
    r"Dec(?:ember)?"
)
day_of_month = r"\d{1,2}"
# Month-then-day, optionally followed by a four-digit year.
specific_date_md = f'(?:{month}) {day_of_month}' + r'(?:,? \d{4})?'
# Day-then-month, optionally followed by a four-digit year.
specific_date_dm = f'{day_of_month} (?:{month})' + r'(?:,? \d{4})?'
date = f'{specific_date_md}|{specific_date_dm}|Today|Yesterday'
exact_time = f"(?:{date}) at {hour}:{minute} ?(?:{period})"

# ---------------------------------------------------------------------------
# Regex source fragments for relative timestamps ("<number><unit>").
# Every fragment is anchored with ^...$ so it only matches a string that
# consists of nothing but the relative expression.
# Raw strings are used so that \d and \s are not treated as (invalid) string
# escape sequences.
# ---------------------------------------------------------------------------
relative_time_hours = r'^\d{1,2}\s?h(?:rs?)?$'
relative_time_minutes = r'^\d{1,2}\s?m(?:ins?)?$'
relative_time_days = r'^\d{1,2}\s?d(?:ays?)?$'
# Previously ended in "wk)$": the stray ")" made this fragment, and the
# combined relative_time pattern below, fail to compile.
relative_time_weeks = r'^\d{1,2}\s?wk$'
relative_time_months = r'^\d{1,2}\s?(?:mth|mo)$'
relative_time_years = r'^\d{1,2}\s?yr$'
relative_time = (
    f'{relative_time_years}|{relative_time_months}|{relative_time_days}|'
    f'{relative_time_hours}|{relative_time_minutes}|{relative_time_weeks}'
)

# Sample inputs consumed by the manual test_* helpers in this module.
hours_test_case_list = ['19 h', '19h', '19 hr', '19 hrs']
minutes_test_case_list = ['10m', '10 m', '10 mins', '1 min']
days_test_case_list = ['1 d', '2d']
def test_hours():
    """Print how each sample in ``hours_test_case_list`` matches the hours rule.

    Every sample is matched case-insensitively against
    ``relative_time_hours``; the match object and, on success, the matched
    text are printed. Nothing is returned.
    """
    # The pattern is the same for every sample, so compile it once.
    hours_com = re.compile(relative_time_hours, re.IGNORECASE)
    print('hour_com:', hours_com)
    for test_case in hours_test_case_list:
        # Match through the compiled object so re.IGNORECASE is applied.
        test_case_res = hours_com.match(test_case)
        print('test_case_res:', test_case_res)
        if test_case_res:
            # The pattern has no capturing group, so group(1) would raise
            # IndexError; group(0) is the whole matched text.
            print(f'test_case:{test_case}--hours_result:{test_case_res.group(0)}')
def test_minutes():
    """Print how each sample in ``minutes_test_case_list`` matches the minutes rule.

    Every sample is matched case-insensitively against
    ``relative_time_minutes``; the match object and, on success, the matched
    text are printed. Nothing is returned.
    """
    # The pattern is the same for every sample, so compile it once.
    minute_com = re.compile(relative_time_minutes, re.IGNORECASE)
    print('minute_com:', minute_com)
    for test_case in minutes_test_case_list:
        # Match through the compiled object so re.IGNORECASE is applied.
        test_case_res = minute_com.match(test_case)
        print('test_case_res:', test_case_res)
        if test_case_res:
            # The pattern has no capturing group, so group(1) would raise
            # IndexError; group(0) is the whole matched text.
            print(f'test_case:{test_case}--minutes_result:{test_case_res.group(0)}')
def test_days():
    """Print how each sample in ``days_test_case_list`` matches the days rule.

    Every sample is matched case-insensitively against
    ``relative_time_days``; the match object and, on success, the matched
    text are printed. Nothing is returned.
    """
    # The pattern is the same for every sample, so compile it once.
    day_com = re.compile(relative_time_days, re.IGNORECASE)
    print('day_com:', day_com)
    for test_case in days_test_case_list:
        # Match through the compiled object so re.IGNORECASE is applied.
        test_case_res = day_com.match(test_case)
        print('test_case_res:', test_case_res)
        if test_case_res:
            # The pattern has no capturing group, so group(1) would raise
            # IndexError; group(0) is the whole matched text.
            print(f'test_case:{test_case}--days_result:{test_case_res.group(0)}')
def _build_datetime_regex():
    """Compile the case-insensitive pattern that finds a timestamp in text.

    The pattern is the alternation of, in this order: an absolute date with a
    clock time, a stand-alone relative expression (number plus unit), and an
    absolute date without a clock time.
    """
    hour = r'\d{1,2}'
    minute = r'\d{1,2}'
    # The trailing empty alternative lets the AM/PM marker be absent.
    period = r'AM|PM|'
    month = (
        r"Jan(?:uary)?|"
        r"Feb(?:ruary)?|"
        r"Mar(?:ch)?|"
        r"Apr(?:il)?|"
        r"May|"
        r"Jun(?:e)?|"
        r"Jul(?:y)?|"
        r"Aug(?:ust)?|"
        r"Sep(?:tember)?|"
        r"Oct(?:ober)?|"
        r"Nov(?:ember)?|"
        r"Dec(?:ember)?"
    )
    day_of_month = r"\d{1,2}"
    specific_date_md = f'(?:{month}) {day_of_month}' + r'(?:,? \d{4})?'
    specific_date_dm = f'{day_of_month} (?:{month})' + r'(?:,? \d{4})?'
    date = f'{specific_date_md}|{specific_date_dm}|Today|Yesterday'
    exact_time = f"(?:{date}) at {hour}:{minute} ?(?:{period})"
    exact_date = date
    relative_time_hours = r'^\d{1,2}\s?h(?:rs?)?$'
    relative_time_minutes = r'^\d{1,2}\s?m(?:ins?)?$'
    relative_time_days = r'^\d{1,2}\s?d(?:ays?)?$'
    relative_time_weeks = r'^\d{1,2}\s?wk$'
    relative_time_months = r'^\d{1,2}\s?(?:mth|mo)$'
    relative_time_years = r'^\d{1,2}\s?yr$'
    relative_time = (
        f'{relative_time_years}|{relative_time_months}|{relative_time_days}|'
        f'{relative_time_hours}|{relative_time_minutes}|{relative_time_weeks}'
    )
    return re.compile(fr"({exact_time}|{relative_time}|{exact_date})", re.IGNORECASE)


def _extract_date_str(datetime_regex, publish_time_str):
    """Return the part of the text that looks like a timestamp.

    When the pattern does not match, a warning is logged and the text is
    returned unchanged so the date parser still gets a chance at it.
    """
    time_match = datetime_regex.search(publish_time_str)
    if time_match:
        # "mth" is rewritten to "month" before parsing -- presumably because
        # the parser does not know the abbreviation; TODO confirm.
        return time_match.group(0).replace("mth", "month")
    logger.warning(f'[WARNING]未能匹配发布日期:{publish_time_str}')
    return publish_time_str


def get_publish_date(test_case_list=None):
    """Parse publish-time strings into millisecond Unix timestamps.

    Each input is stripped, reduced to its timestamp-looking part and handed
    to ``dateparser.parse``. Progress and failures are reported through the
    module's ``logger``.

    Args:
        test_case_list: Iterable of raw publish-time strings. When omitted,
            a built-in list of sample strings is used, which is the
            behaviour of calling the function without arguments.

    Returns:
        dict: ``{'publish_time': <int milliseconds>}`` holding the value for
        the last processed input. An input that is blank or cannot be parsed
        yields the current time; so does an empty ``test_case_list``.
    """
    if test_case_list is None:
        test_case_list = [
            '19h ', '19 h', '10m', '10 m', '10 mins', '1 d', '2d', '10 hr',
            '11 hrs', 'December 4 at 11:46 pm', 'December 4 at 11:46 am',
            'February 16, 2013', 'Yesterday at 04:33', '21 December at 02:23',
            '1 September', '21 October 2020', 'January 6',
            'December 27, 2021 at 1:02 AM',
        ]

    # The pattern does not depend on the input, so build it once up front
    # instead of once per string.
    datetime_regex = _build_datetime_regex()

    result = {'publish_time': int(time.time() * 1000)}
    for test_case in test_case_list:
        # Reset the fallback for every input; otherwise a failed parse would
        # silently report the timestamp of the previous input.
        publish_time = int(time.time() * 1000)
        publish_time_str = test_case.strip()
        if publish_time_str:
            logger.info(f'[INFO]publish_time:{publish_time_str}')
            date_str = _extract_date_str(datetime_regex, publish_time_str)
            try:
                date_time = dateparser.parse(date_str)
                if date_time is None:
                    # dateparser signals "could not parse" with None rather
                    # than an exception; keep the current-time fallback.
                    logger.warning(f'[WARNING]未能解析发布日期:{date_str}')
                else:
                    publish_time = int(date_time.timestamp() * 1000)
            except Exception:
                # Best effort: any parser or conversion failure is logged
                # and the current-time fallback is kept.
                logger.error(f'[ERROR]解析发布时间异常{traceback.format_exc()}')
        result['publish_time'] = publish_time
        logger.info(f'[INFO]publish_time: {publish_time}')
    return result
if __name__ == "__main__":
    # Script entry point: run the publish-date demo over its built-in samples.
    # The per-unit helpers (test_hours, test_minutes, test_days) can be
    # called here instead when checking a single pattern by hand.
    get_publish_date()
# 使用 dateparser 模块解析日期字符串,结果可以输出为日期字符串或者时间戳。
# 参考项目:
#     facebook_scraper