# 爬虫入门 — Web scraping basics
# 1. The requests module (requests模块)
import requests
from lxml import etree
# --- Basic GET request ----------------------------------------------------
# Fetch a Sogou page and inspect the main attributes of the Response object.
url_info = 'https://www.sogou.com/sogou'
headers_info = {
    # Browser-like User-Agent so the server does not reject the scripted request.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3878.400 QQBrowser/10.8.4518.400'
}
# timeout= added so a hung server cannot block the script indefinitely.
url_response_get = requests.get(url=url_info, headers=headers_info, timeout=10)
print(url_response_get.encoding)     # encoding guessed from the response headers
print(url_response_get.text)         # body decoded with that encoding
# (removed a duplicate print of .encoding that appeared twice in the original)
print(url_response_get.headers)      # response header mapping
print(url_response_get.status_code)  # HTTP status code (200 on success)
# --- GET with query parameters (AJAX endpoint) ----------------------------
# Douban movie chart: query parameters are passed via params= and are
# URL-encoded by requests instead of being hand-built into the URL.
url_info_get = 'https://movie.douban.com/j/chart/top_list'
params_info = {
    'type': '12',             # chart category id
    'interval_id': '100:90',  # rating interval
    'action': '',
    'start': '1',             # offset of the first record
    'limit': '10',            # number of records to return
}
# timeout= added so a hung server cannot block the script indefinitely.
url_response_ajax_get = requests.get(url=url_info_get, headers=headers_info,
                                     params=params_info, timeout=10)
# BUG FIX: the original fetched the response and never used it; show the
# JSON payload so the request actually demonstrates something.
print(url_response_ajax_get.json())
# --- POST request with form data ------------------------------------------
# KFC store-locator endpoint: keyword search, form-encoded via data=.
url_info_post = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
params_info = {
    'cname': '',
    'pid': '',
    'keyword': '深圳',     # search keyword (city name)
    'pageIndex': '1',      # page number
    'pageSize': '10',      # results per page
}
# timeout= added so a hung server cannot block the script indefinitely.
url_response_post = requests.post(url=url_info_post, data=params_info,
                                  headers=headers_info, timeout=10)
print(url_response_post)          # Response object repr (shows the status code)
print(url_response_post.text)     # body decoded to str
print(url_response_post.content)  # raw body bytes
# NOTE: .json() raises an exception if the body is not valid JSON.
print(url_response_post.json())
# --- BeautifulSoup parsing ------------------------------------------------
# Parse a local HTML file and demonstrate tag access, find/find_all and
# CSS-style select queries.
import bs4

# BUG FIX: the original opened the file and never closed it; `with`
# guarantees the handle is released even if parsing raises.
with open('../book/1.html', 'r', encoding='utf-8') as fp:
    bs_dx = bs4.BeautifulSoup(fp, 'lxml')
print(bs_dx.link)  # first <link> tag in the document
print('.获取标签下面属性值的内容:{}'.format(bs_dx.link['href']))
print('标签下面属性值的内容:{}'.format(bs_dx.link.text))
print('find获取标签信息:{}'.format(bs_dx.find('link')))
print('find获取指定属性的标签信息:{}'.format(bs_dx.find('link', href='static/css/anti.min.css?v=1')))
print('find获取所有的标签信息:{}'.format(bs_dx.find_all('link')))
print('find获取指定属性所有的标签信息:{}'.format(bs_dx.find_all('link', href='static/css/anti.min.css?v=1')))
print('select获取满足类选择器条件的标签:{}'.format(bs_dx.select('.logo')))
print('select获取满足层级选择器条件的标签:{}'.format(bs_dx.select('.logo>a>img')[0]))
print(bs_dx.text)  # all text content of the document, tags stripped
# --- lxml.etree HTML parsing ----------------------------------------------
# Visual separator between the BeautifulSoup demo and the lxml demo.
print(5555555555555555555555555555555555555555555)

# Build an explicit UTF-8 HTML parser, load a local file with it, then
# query the resulting tree with XPath for the document title text.
parser2 = etree.HTMLParser(encoding='utf-8')
file = r'F:\git_code\20211224\Reptile\Reptile基础\book\hongloumeng.html'
gsw_tree = etree.parse(file, parser=parser2)
print(gsw_tree)                          # _ElementTree object repr
print(gsw_tree.xpath('//title/text()'))  # list of <title> text nodes