# First crawler attempt, recorded here. (Author's note: wanted to know how to
# build the request URL with an f-string instead of str.format.)
import random
import time
from urllib.parse import quote

import requests
class BaiduSpider:
    """Fetch Baidu search-result pages for a keyword and save each page as a local HTML file."""

    def __init__(self):
        # Base search endpoint; the query string (wd = keyword, pn = result
        # offset) is appended with an f-string in crawl().
        self.url = "https://www.baidu.com/s"
        # Desktop-browser User-Agent so Baidu serves the normal HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"}

    def get_html(self, url):
        """Download *url* and return the decoded HTML text.

        The page charset is utf-8 (taken from the site's own charset
        declaration); undecodable bytes are ignored rather than raising.
        """
        # timeout keeps the script from hanging forever on a dead connection.
        response = requests.get(url=url, headers=self.headers, timeout=10)
        return response.content.decode("utf-8", "ignore")

    def parse_html(self):
        """Placeholder for HTML parsing — not implemented yet."""
        pass

    def save_html(self, filename, html):
        """Write *html* to *filename* as utf-8 (explicit encoding so Windows does not default to gbk)."""
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)

    def crawl(self):
        """Program entry point: prompt for a keyword and a page range, then fetch and save each page."""
        name = input("请输入查询明星:")
        start = int(input("请输入起始页:"))
        end = int(input("请输入结束页:"))
        for page in range(start, end + 1):
            # Baidu pages by result offset: page 1 -> pn=0, page 2 -> pn=10, ...
            pn = (page - 1) * 10
            # f-string URL building (instead of str.format); quote() makes the
            # user-typed keyword URL-safe.
            page_url = f"{self.url}?wd={quote(name)}&pn={pn}"
            html = self.get_html(url=page_url)
            filename = f"{name}_第{page}页.html"
            self.save_html(filename, html)
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.uniform(0, 2))
            print(f"第{page}页爬取成功啦")
if __name__ == "__main__":
    # Run the spider only when executed as a script, not on import.
    BaiduSpider().crawl()