话不多说,直接上代码:
import os
import re
import time

import requests
# Request headers sent with every HTTP call in this script. The user-agent
# value is a placeholder: replace it with your own browser's UA string.
headers = {"user-agent": "写自己的浏览器"}

# Fetch the detail page only. The five-digit part of the URL is a placeholder
# for the id of the post to scrape.
response = requests.get(
    url='https://www.vmgirls.com/这里五个数.html',
    headers=headers,
)
time.sleep(1)  # short pause before any further request is issued
def cunchu(data):
    """Download every image linked from a detail page's HTML into ``./img/``.

    Image links are extracted from ``data`` with a regular expression that
    matches ``<a href="..." alt="..." title="...">`` anchors. Each captured
    href is prefixed with ``https:``, fetched with the module-level
    ``headers``, and the response body is written to
    ``./img/<href[26:34]>.jpg``.

    Args:
        data: HTML text of the detail page.

    Returns:
        None. Image files are written as a side effect.
    """
    img_all = re.findall('<a href="(.*?)" alt=".*?" title=".*?">', data)
    if not img_all:
        # Nothing to download; avoid creating an empty output directory.
        return

    # open() does not create missing directories, so make sure the target
    # directory exists before the first write.
    os.makedirs('./img', exist_ok=True)

    for i in img_all:
        # Presumably the hrefs are protocol-relative ("//host/path"), hence
        # the added scheme -- TODO confirm against the live page.
        img_url = 'https:' + i
        print(img_url)
        time.sleep(1)  # pause between downloads
        img_res = requests.get(img_url, headers=headers)
        if not img_res.ok:
            # Do not store an HTTP error page under a .jpg name.
            print("download failed:", img_res.status_code, img_url)
            continue
        # NOTE(review): the file name is a fixed character slice of the href.
        # It depends on the exact URL layout, and two hrefs that share these
        # eight characters overwrite each other -- verify against real URLs.
        file_path = './img/' + i[26:34] + '.jpg'
        with open(file_path, "wb") as f:
            f.write(img_res.content)
# Entry flow: the first response is either the detail page itself (it already
# contains the image anchors) or an intermediate page carrying a script-driven
# redirect to the real detail page.
if re.search('<a href="(.*?)" alt=".*?" title=".*?">', response.text):
    # Image anchors are present: this is the detail page, no redirect.
    print("没有跳转")
    cunchu(response.text)
else:
    # Look for the redirect target assigned to ".href" in the page.
    jump = re.search('.href ="(.*?)"; </s', response.text)
    if jump is None:
        # Neither pattern matched; stop with a clear message instead of an
        # IndexError from indexing an empty match list.
        raise SystemExit(
            "no image links and no redirect target found in the response"
        )
    href = jump.group(1)
    print("有跳转")
    # The captured href is appended to the site root to form the real URL.
    url = 'https://www.vmgirls.com' + href
    time.sleep(0.7)  # brief pause before following the redirect
    data_res = requests.get(url, headers=headers)
    cunchu(data_res.text)
重点思路:请求详情页时,可能会先遇到一个跳转页;此时用 re 从跳转页中提取真正的详情页 URL,再对该 URL 发起一次 GET 请求即可。
?
?
|