import urllib.request
import urllib.parse
url = 'https://www.datafountain.cn/competitions'
# urllib.request.Request(url, data=None, headers={}, origin_req_host=None,
#                        unverifiable=False, method=None)
#   data            -- must be bytes; encode a dict first with urllib.parse.urlencode()
#   headers         -- request headers to send
#   origin_req_host -- host name or IP address of the requesting side
#   method          -- HTTP method to use
request_headers = {
    'User-Agent': 'xxxxx',
}
req = urllib.request.Request(url, headers=request_headers)
# urllib.request.urlopen(url, data=None, [timeout,] *, cafile=None, capath=None,
#                        cadefault=False, context=None)
#   data    -- submitted as a form body; must be converted to bytes first
#   timeout -- seconds to wait before giving up on the response
#   e.g. response = urllib.request.urlopen(url, data=data)
response = urllib.request.urlopen(req, timeout=1)
# The body of the `if` was not indented in the original, which is a syntax
# error in Python -- fixed here.
if response.status == 200:
    print(response.read().decode('utf-8'))
    print(response.getheaders())
高级操作需要用到处理器工具 Handler。urllib.request 中的 BaseHandler 是其它所有 Handler 的父类。常用的处理器类有:
- HTTPDefaultErrorHandler:用于处理响应错误,错误都会抛出 HTTPError 类型的异常
- HTTPRedirectHandler:用于处理重定向
- HTTPCookieProcessor:用于处理 Cookies
- ProxyHandler:用于设置代理,默认代理为空
- HTTPPasswordMgr:用于管理密码,它维护了用户名和密码的对照表
- HTTPBasicAuthHandler:用于管理认证,如果打开一个链接时需要认证,可以用它来解决认证问题
# --- Simple HTTP Basic authentication ---
from urllib.request import HTTPPasswordMgrWithDefaultRealm, build_opener, HTTPBasicAuthHandler
from urllib.error import URLError

url = 'change'  # placeholder: replace with a URL that actually requires Basic auth
username = 'xxxxxx'
password = 'xxxxxx'

# Register the credentials for any realm of this URL, then build an opener
# whose auth handler replays them when the server answers 401.
p = HTTPPasswordMgrWithDefaultRealm()
p.add_password(None, url, username, password)
auth_handler = HTTPBasicAuthHandler(p)
opener = build_opener(auth_handler)
# The try/except bodies were not indented in the original (syntax error) -- fixed.
try:
    result = opener.open(url)
    html = result.read().decode('utf-8')
    print(html)
except URLError as e:
    print(e.reason)
# --- Simple proxy usage ---
from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener

url = 'https://www.baidu.com'
# NOTE: the original proxy URLs carried a trailing space, which makes the
# proxy address invalid -- removed here.
proxy_handler = ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743',
})
opener = build_opener(proxy_handler)
# The if/try bodies were not indented in the original (syntax error) -- fixed.
try:
    response = opener.open(url)
    if response.status == 200:
        print(response.read().decode('utf-8'))
    else:
        print('ByeBye')
except URLError as e:
    print(e.reason)
# --- Simple cookie capture and persistence ---
from urllib.request import HTTPCookieProcessor, build_opener
from http import cookiejar

# cookiejar manages Cookie objects: they can be read, added, stored, etc.
url = 'https://www.baidu.com'

# Collect the cookies set by the response and print them.
cookies = cookiejar.CookieJar()
handler = HTTPCookieProcessor(cookies)
opener = build_opener(handler)
opener.open(url)
# The for-loop body was not indented in the original (syntax error) -- fixed.
for item in cookies:
    print(item.name + ":" + item.value)

# Persist the cookies to disk in Mozilla/Netscape format.
cookies_save = cookiejar.MozillaCookieJar("cookies.txt")
handler_save = HTTPCookieProcessor(cookies_save)
opener_save = build_opener(handler_save)
opener_save.open(url)
cookies_save.save(ignore_discard=True, ignore_expires=True)
# ignore_discard: also save cookies that are marked to be discarded
# ignore_expires: also save cookies that have already expired
# --- Parsing links: urllib.parse ---
from urllib.parse import urlparse

# urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
# Every URL follows the pattern 'scheme://netloc/path;params?query#fragment'.
#   scheme          -- default protocol used when the URL does not name one
#   allow_fragments -- whether the fragment part is parsed (False ignores it)
url = 'http://www.baidu.com/index.html;user?id=5#comment'
result = urlparse(url)
print(result)

# urlunparse() rebuilds a URL from its six components.
from urllib.parse import urlunparse
data = ['http', 'www.baidu.com', '/index.html', 'user', 'id=5', 'comment']
print(urlunparse(data))

# urlsplit() is like urlparse() but keeps the params attached to the path.
from urllib.parse import urlsplit
res = urlsplit(url)
print(res)

# urlunsplit() rebuilds a URL from the five urlsplit() components.
from urllib.parse import urlunsplit
mark = ['http', 'www.baidu.com', '/index.html;user', 'id=5', 'comment']
print(urlunsplit(mark))

# urljoin(base_url, new_url):
# urljoin analyses the scheme, netloc and path of base_url. If one of those
# three parts is missing from new_url it is filled in from base_url; when a
# part exists in new_url, new_url's value wins.
#
# urlencode() serialises a dict into a query string, e.g.
#   params = {'name': 'germey', 'age': 22}
#   urlencode(params) == 'name=germey&age=22'
#
# parse_qs()/parse_qsl() do the reverse (deserialisation): they turn a query
# string back into a dict (or a list of key/value pairs).
#
# quote() percent-encodes non-ASCII text (e.g. Chinese characters) so it can
# be embedded in a URL without mojibake; unquote() decodes a percent-encoded
# URL component back into readable text.
# --- Robots protocol: tells crawlers which pages may be fetched and which may not ---
from urllib.robotparser import RobotFileParser
from urllib.parse import urlsplit, urlunsplit
from urllib.error import URLError

# Fixes to the original:
#   * can_fetch(useragent, url) takes TWO arguments -- the original passed one,
#     which raises TypeError;
#   * the parser must be pointed at the site's robots.txt via set_url() and
#     loaded with read() before it can answer;
#   * the file is named 'robots.txt', not 'robot.txt', and lives at the site
#     root, not appended to a full page URL.
rp = RobotFileParser()
parts = urlsplit(url)
robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
rp.set_url(robots_url)
try:
    rp.read()  # fetch and parse robots.txt (network I/O)
    print(rp.can_fetch('*', url))
except URLError as e:
    print(e.reason)
|