零组文库已经关闭了,在网上找到了一个备份站点:主页 · 资料文库
http://book.iwonder.run/index.html
网页太多,就写了个脚本批量抓取,但是请求太频繁,结果就访问不了了,脚本如下
#!C:\Python3.7
# -*- coding:utf-8 -*-
import os
import re
import time

import requests
from lxml import etree
def _save_page(base_url, href, out_dir, sub_name, delay, level_tag):
    """Fetch one article (relative href), save its HTML and images under out_dir."""
    if "http" in href:
        # absolute/external links are not part of this mirror — skip them
        return
    page_url = base_url + href
    print(level_tag, page_url)
    time.sleep(delay)  # throttle so the mirror does not block us for flooding
    data, file_name = page(page_url)  # strip nav + derive a safe filename
    save_file(os.path.join(out_dir, file_name + ".html"), data.encode())
    img_list = get_img_url(page_url, sub_name)
    for img_url in img_list:
        save_img(out_dir, img_url)


def get_date(url, delay=1.0):
    """Crawl the site's navigation tree and mirror every page locally.

    url:   site root, e.g. "http://book.iwonder.run/".
    delay: seconds slept before each page download (new keyword argument,
           backward-compatible; the previous un-throttled crawl got the
           client blocked by the server).

    Creates, under ./osec/:
        <level1>/<level2>[/<level3>]/<page>.html  plus an img/ subfolder.
    Errors are reported at this top-level boundary instead of propagating.
    """
    try:
        req = requests.get(url)
        req.encoding = "UTF-8"
        html = etree.HTML(req.text)
        # first-level nav entries: /html/body/div/div[1]/nav/ul/li
        for directory in html.xpath("/html/body/div/div[1]/nav/ul/li"):
            directory_name = directory.xpath("./span/text()")
            if not directory_name:
                continue
            # strip whitespace plus one decorative character on each side
            directory_name = directory_name[0].replace("\n", "").strip()[1:-1]
            mkdir(directory_name)  # level-1 folder
            for sub in directory.xpath("./ul/li"):
                sub_name = sub.xpath("./span/text()")
                if not sub_name:
                    continue
                sub_name = sub_name[0].replace("\n", "").strip()
                sub_dir = os.path.join(directory_name, sub_name)
                mkdir(sub_dir)              # level-2 folder
                mkdir(os.path.join(sub_dir, 'img'))
                href = sub.xpath("./ul/li/a/@href")
                if len(href) > 0:
                    # this level-2 node links pages directly
                    for u in href:
                        _save_page(url, u, sub_dir, sub_name, delay, "2")
                else:
                    # otherwise there is a third directory level below
                    print("3")
                    for third in sub.xpath("./ul/li"):
                        third_name = third.xpath("./span/text()")
                        # check emptiness BEFORE indexing [0] (was IndexError)
                        if not third_name:
                            continue
                        third_name = third_name[0].replace("\n", "").strip()
                        third_dir = os.path.join(sub_dir, third_name)
                        mkdir(third_dir)    # level-3 folder
                        mkdir(os.path.join(third_dir, 'img'))
                        for u in third.xpath("./ul/li/a/@href"):
                            _save_page(url, u, third_dir, sub_name, delay, "3")
        print("[****] finish!")
    except Exception as e:
        # script-level boundary: report rather than crash the whole run
        print(e)
def mkdir(name):
    """Create directory <cwd>/osec/<name>, creating missing parents too.

    Fix: the original used os.mkdir, which raises FileNotFoundError on the
    very first call because the intermediate "osec" folder does not exist
    yet; os.makedirs creates the whole chain.
    """
    path = os.path.join(os.getcwd(), "osec", name)
    if not os.path.exists(path):
        os.makedirs(path)  # also creates the top-level "osec" parent
        print("[D*]mkdir ", path, "success!")
    else:
        print("[D-]", path, "exists!")
def save_file(name, data):
    """Write raw bytes to <cwd>/osec/<name>; never overwrite an existing file."""
    target = os.path.join(os.getcwd(), "osec", name)
    if os.path.exists(target):
        print("[f-] write ", target, " exists!")
        return
    with open(target, "wb") as fh:
        fh.write(data)
    print("[f*] write ", target, " success!")
def page(url):
    """Fetch an article page; return (html_without_nav, safe_filename).

    The navigation sidebar (<nav role="navigation"> ... </nav>) is removed
    from the returned HTML.  The filename comes from the page's <h1> when
    present, otherwise from the last URL path component; characters unsafe
    in filenames are replaced.

    Fixes:
    - The original `except Exception: pass` returned None, so the caller's
      `data, file_name = page(...)` crashed with an unrelated TypeError;
      errors now propagate to the caller (get_date catches them).
    - When the nav markers were absent, find() returned -1 and the slice
      `text[-1:stop]` produced garbage to replace; now guarded.
    """
    req = requests.get(url)
    req.encoding = 'UTF-8'
    text = req.text
    start = text.find('<nav role="navigation">')
    stop = text.find('</nav>')
    if start != -1 and stop != -1:
        data = text.replace(text[start:stop], "")
    else:
        data = text  # no nav block found — keep page as-is
    # derive the output filename from the article title when available
    html = etree.HTML(text)
    title = html.xpath(
        "/html/body/div/div[2]/div/div[2]/div/div/div[1]/section/h1/text()")
    if len(title) > 0:
        filename = title[0].strip()
    else:
        filename = url.rsplit("/", 1)[-1].rsplit(".", 1)[0]
    # sanitize characters that would break paths or HTML filenames
    filename = str(filename).replace("<", "").replace("=", "") \
        .replace("/", "_").replace("\\", "_")
    return data, filename
def get_img_url(page_url, sub_name, base_url="http://book.iwonder.run/0day/"):
    """Return absolute URLs of images embedded in an article page.

    page_url: article page to scan.
    sub_name: category folder name inserted into each image URL.
    base_url: image URL prefix — generalized from the previous hard-coded
              constant; the default preserves the old behavior.

    Fixes: removed the dead `img_list = []` assignment and replaced the
    index-mutation loop with a comprehension.
    """
    req = requests.get(page_url)
    req.encoding = 'UTF-8'
    html = etree.HTML(req.text)
    srcs = html.xpath('//*[@id="book-search-results"]/div[1]/section/p/img/@src')
    return [base_url + sub_name + "/" + src for src in srcs]
def save_img(path, img_url):
    """Download img_url into <cwd>/osec/<path>/img/ unless already present.

    Best-effort by design: a failed download must not abort the crawl, but
    the original bare `except: pass` hid every failure — it is now reported.
    """
    img_dir = os.path.join(os.getcwd(), "osec", path, "img")
    filename = str(img_url).rsplit("/", 1)[-1]
    file_path = os.path.join(img_dir, filename)
    if os.path.exists(file_path):
        print("[img--] save img ", file_path, " exists!")
        return
    try:
        file_data = requests.get(img_url).content
        with open(file_path, "wb") as file:
            file.write(file_data)
        print("[img**] save img ", file_path, " success!")
    except Exception as e:
        # keep best-effort semantics, but surface the failure for debugging
        print("[img!!] save img ", file_path, " failed:", e)
if __name__ == '__main__':
    # Root of the backup mirror; get_date walks the whole navigation tree
    # and saves every page/image under ./osec/.  (Removed the unused
    # page_url variable and the commented-out debug calls.)
    url = "http://book.iwonder.run/"
    get_date(url)