import requests
import re
import os
import easygui
import time
def get_mom_url_list(page, home_url):
    """Return the absolute theme URLs found on one listing page.

    Fetches ``home_url + '/home/' + str(page)``, collects the first 30
    ``/mm/...`` links that are followed by a ``target`` attribute, and keeps
    every other one (presumably each theme is linked twice in a row on the
    listing page -- TODO confirm against the site markup).

    Args:
        page: Listing-page number appended to the ``/home/`` path.
        home_url: Site root URL, without a trailing slash.

    Returns:
        A list of at most 15 URLs, each prefixed with ``home_url``.
    """
    listing_html = requests.get(home_url + '/home/' + str(page)).text
    # Raw string: "\s" and "\d" are invalid escapes in an ordinary literal.
    links = re.findall(r'a\shref="(/mm/.*?\d)"\starget', listing_html)
    # First 30 matches, even positions only (0, 2, ..., 28).
    return [home_url + link for link in links[0:30:2]]
def get_jpg_url(f_url):
    """Return the image URL embedded in the page at ``f_url``.

    The result is the ``src`` value of the first ``img src="..." alt``
    occurrence in the fetched page text.

    Args:
        f_url: URL of the page to fetch.

    Returns:
        The image URL as a string, or an empty string when the page contains
        no matching ``img`` tag. Returning ``''`` instead of raising lets a
        caller detect "no image here" with a simple emptiness check.
    """
    page_html = requests.get(f_url).text
    # Raw string: "\s" is an invalid escape in an ordinary literal.
    match = re.search(r'img\ssrc="(.*?)"\salt', page_html)
    return match.group(1) if match else ''
if __name__ == '__main__':
    # Interactive entry point: ask where to resume, then walk listing pages,
    # themes and images, saving each image into a per-theme folder under the
    # current working directory and recording progress after every download.
    base_dir = os.getcwd()
    txt_file_path = os.path.join(base_dir, 'latest.txt')

    # Restore the position of the most recent download.
    # Defaults for a first run: page 1, theme 1, no images downloaded yet.
    mom, son, jpg = '1', '1', '0'
    if os.path.exists(txt_file_path):
        # latest.txt holds three lines: page number, theme number, folder name.
        # NOTE(review): the file is read and written with the platform default
        # encoding; kept that way so existing state files stay readable.
        with open(txt_file_path, 'r') as state_file:
            txt_list = [line.strip() for line in state_file]
        # A truncated state file falls back to the first-run defaults.
        if len(txt_list) >= 3:
            mom, son = txt_list[0], txt_list[1]
            jpg_path = os.path.join(base_dir, txt_list[2])
            # The number of files already in the theme folder is offered as
            # the image index to resume from.
            if os.path.isdir(jpg_path):
                jpg = str(len(os.listdir(jpg_path)))

    # Ask for the site address and the position to start downloading from.
    msg = "输入开始下载的页码及主题序号"
    title = "确定开始下载的位置"
    fieldNames = ["最新网址:", "开始下载的页码:(1~306)", "主题序号:(1~15)", "本主题已下载的照片数量:"]
    Values = ['https://www.mm5mm5.com', mom, son, jpg]
    fieldValues = easygui.multenterbox(msg, title, fieldNames, Values)
    if fieldValues is None:
        # Dialog cancelled. The site-provided exit() helper is meant for the
        # interactive shell, so raise SystemExit directly.
        raise SystemExit

    # Unpack the answers: site URL, listing page, theme, images already on disk.
    url_home = fieldValues[0]
    start_mom_page = int(fieldValues[1]) - 1
    start_son_page = int(fieldValues[2]) - 1
    start_jpg_page = int(fieldValues[3])

    first_run = True      # True only while processing the first listing page
    jpg_first_run = True  # True only while processing the first theme
    for page_num in range(start_mom_page, 306):
        # All theme URLs on the current listing page.
        mom_url_list = get_mom_url_list(page_num, url_home)

        # Only the first page resumes from the requested theme; every later
        # page starts at its first theme. Clamped so an input below 1 cannot
        # turn into a negative slice index.
        start_son_p = max(start_son_page, 0) if first_run else 0
        first_run = False

        # enumerate() gives the theme's position on the page directly.
        remaining = mom_url_list[start_son_p:]
        for theme_num, son_url in enumerate(remaining, start=start_son_p):
            # Build the folder name from the theme page's <h2> title.
            text = requests.get(son_url).text
            theme = re.findall(r'<h2>\s*(.*?)\s*</h2>', text)
            folder_name = '【' + theme[0] + '】'

            # Create the theme folder if it does not exist yet.
            path = os.path.join(base_dir, folder_name)
            os.makedirs(path, exist_ok=True)

            # Only the first theme resumes from the requested image index.
            start_jpg_p = start_jpg_page if jpg_first_run else 0
            jpg_first_run = False

            # Download the images of this theme, one sub-page per image.
            for i in range(start_jpg_p, 100):
                son_jpg_url = son_url + '/' + str(i)
                # Both an empty result and an IndexError from the lookup mean
                # "no image on this sub-page", i.e. the theme is finished.
                try:
                    jpg_url = get_jpg_url(son_jpg_url)
                except IndexError:
                    break
                if not jpg_url:
                    break

                jpg_name = re.findall(r"\d/(.*?jpg)", jpg_url)[0]
                jpg_file = os.path.join(path, jpg_name)
                # Check before fetching so existing images cost no download.
                if os.path.exists(jpg_file):
                    print(f'{folder_name}的{jpg_name}照片已存在')
                    continue

                content = requests.get(jpg_url).content
                with open(jpg_file, 'wb') as f:
                    f.write(content)

                # Timestamped progress message, echoed and appended to the log.
                s = time.strftime('%Y-%m-%d %H:%M:%S\t', time.localtime())
                out_mess = f'{s}成功下载第{page_num + 1}页的第{theme_num + 1}个主题{folder_name}的{jpg_name}照片'
                print(out_mess)
                log_path = os.path.join(base_dir, '下载记录.txt')
                with open(log_path, 'a', encoding='utf-8') as q:
                    q.write(out_mess + '\n')

                # Persist the resume position (1-based page and theme numbers).
                with open(txt_file_path, 'w') as b:
                    b.write(f'{page_num + 1}\n{theme_num + 1}\n{folder_name}')
|