import socket
import urllib.error
import urllib.parse
import urllib.request
# Send a GET request and print the page source decoded as UTF-8.
# The context manager closes the underlying connection once the body is read.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode("utf-8"))
# Send a POST request: passing `data` to urlopen turns the request into a POST.
# The form fields are URL-encoded and must be sent as bytes.
data = urllib.parse.urlencode({"hello": "world"}).encode("utf-8")
with urllib.request.urlopen("http://httpbin.org/post", data=data) as response:
    print(response.read().decode("utf-8"))
# Timeout handling: a deliberately tiny timeout exercises the failure path.
try:
    with urllib.request.urlopen("http://httpbin.org/get", timeout=0.01) as response:
        print(response.read().decode("utf-8"))
except (urllib.error.URLError, socket.timeout):
    # urlopen wraps a timeout hit while connecting/sending in URLError, but a
    # timeout while waiting for or reading the response is raised as a bare
    # socket.timeout, so both must be caught.
    # NOTE(review): URLError also covers non-timeout failures (e.g. DNS errors);
    # the message below is kept as-is for those cases too.
    print("time out")
# Inspect response metadata instead of the body: print the "Server" header.
# (`response.status` would give the HTTP status code in the same way.)
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.getheader("Server"))
# Impersonate a real browser: build a Request carrying a Chrome-like
# User-Agent header and POST a form to httpbin, which echoes the request back.
url = "http://httpbin.org/post"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.163 Safari/537.36"
    ),
}
data = urllib.parse.urlencode({"name": "jeffchenitm"}).encode("utf-8")
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
# Visit Douban for real. Without overriding the default User-Agent the request
# is recognised as a crawler and rejected with HTTP 418, so a browser-like
# User-Agent header is sent.
url = "http://www.douban.com"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.163 Safari/537.36"
    ),
}
req = urllib.request.Request(url=url, headers=headers)
with urllib.request.urlopen(req) as response:
    print(response.read().decode("utf-8"))
# Test site:
# http://httpbin.org
|