# 1. Save the Baidu homepage to a local HTML file.
from urllib.request import urlopen

url = "http://www.baidu.com"
# Context manager closes the HTTP response deterministically.
with urlopen(url) as resp:
    with open("mybaidu.html", mode="w", encoding="utf-8") as f:
        # fix: original line had a stray "? " prefix and was missing the
        # closing parenthesis on f.write(...).
        # NOTE(review): assumes the server sends UTF-8 — confirm, or read
        # the charset from resp.headers before decoding.
        f.write(resp.read().decode("utf-8"))
# 2. Batch-download images linked from a web page.
import requests
import re
import time

# Minimal request headers so the server treats us like an ordinary client.
headers = {
    'User-Agent': 'asd',
    'Accept-Encoding': 'gzip, deflate',
    'Accept': '*/*',
    'Connection': 'keep-alive',
}
target = 'https://www.baidu.com'
resp = requests.get(target, headers=headers)
html = resp.text

# Capture the href of each anchor tag that also carries alt and title
# attributes. NOTE(review): the captured hrefs are assumed to be
# protocol-relative ("//host/path") — confirm against the target markup.
urls = re.findall(r'<a href="(.*?)" alt=".*?" title=".*?">', html)
print(urls)

# enumerate replaces the original manual "i = i + 1" counter; start=1 keeps
# the same 1-based file names.
for i, url in enumerate(urls, start=1):
    time.sleep(1)  # be polite: throttle to one request per second
    url = 'http:' + url  # prepend scheme to the protocol-relative href
    print(url)
    resp = requests.get(url, headers=headers)
    file_name = str(i) + '.jpg'
    with open(file_name, 'wb') as f:
        # fix: original line was truncated ("resp.conten" with no closing
        # paren) and was followed by a stray "|" artifact.
        f.write(resp.content)