站点sitemap.xml内容格式
百度搜索:Xs小屋
Python 3
简单实现:从sitemap.xml中提取所有URL,并保存到url.txt文件
import xml.dom.minidom as xmldom
import urllib.request
import xml
# Download a sitemap.xml, extract every <url>/<loc> value and write the URLs,
# one per line, to a text file.

# Local proxy endpoint.
# NOTE(review): the opener built from this proxy is never installed or used;
# the request below goes through urllib.request.urlopen, which does not use
# this opener. Switch to opener.open(http) if the proxy is actually required.
proxy = '127.0.0.1:10809'
proxy_support = urllib.request.ProxyHandler({
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
})
opener = urllib.request.build_opener(proxy_support)

# Browser-like User-Agent sent with the sitemap request.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
}
sitemap_url = "https://www.nstns.com/sitemap.xml"
output_path = "D:\\url.txt"

http = urllib.request.Request(url=sitemap_url, headers=headers)
# Close the HTTP response as soon as the document has been parsed.
with urllib.request.urlopen(http) as http_run:
    dom = xmldom.parse(http_run)
return_xml = dom.documentElement.getElementsByTagName("url")

# Open the output once in write mode: this truncates any previous content and
# avoids reopening the file for every single URL.
with open(output_path, 'w', encoding="utf-8") as out_file:
    for url_xml in return_xml:
        loc_nodes = url_xml.getElementsByTagName("loc")
        # Skip <url> entries that have no <loc> element or an empty one
        # instead of crashing with IndexError / AttributeError.
        if not loc_nodes or loc_nodes[0].firstChild is None:
            continue
        out_file.write(loc_nodes[0].firstChild.data + '\n')
效果图
|