# Method 1: requests + lxml (XPath)
import json
import requests
from lxml import etree
import xlwings as xw
import re
# URL of the page to scrape.
url = "https://hz.zu.anjuke.com/?from=navigation"
# Browser-like request headers (user agent plus the page itself as referer).
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'referer': url,
}
# Fetch the page body. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops on an HTTP error instead of
# parsing an error page as if it were the listing page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
body = response.text
# Parse the HTML and select every <div> whose class contains "zu-info".
html = etree.HTML(body, etree.HTMLParser())
gethtml = html.xpath('//div[contains(@class,"zu-info")]')
print(gethtml)
for item in gethtml:
    # The leading "." makes each path relative to the current element.
    # A bare "//" is evaluated from the document root, which would print
    # every match on the page once per iteration.
    print(item.xpath('.//h3/a/@href'))
    print(item.xpath('.//h3/a/b[contains(@class,"strongbox")]/text()'))
# Method 2: requests + BeautifulSoup
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import xlwings as xw
# URL of the page to scrape.
url = "https://hz.zu.anjuke.com/?from=navigation"
# The timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops on an HTTP error instead of parsing an error page.
res = requests.get(url, timeout=10)
res.raise_for_status()
res.encoding = 'utf-8'
# Parse the full HTML document.
html = BeautifulSoup(res.text, 'html.parser')
# Walk every element carrying the "zu-info" class and print three fields from
# it: the href of the first "h3 a", the text of the first ".strongbox"
# element, and the text of the first <a>.
for item in html.select('.zu-info'):
    # select_one() returns None when nothing matches, so an incomplete entry
    # prints None for that field instead of raising IndexError.
    link = item.select_one('h3 a')
    strongbox = item.select_one('.strongbox')
    first_anchor = item.select_one('a')
    print(link.get('href') if link is not None else None)
    print(strongbox.text if strongbox is not None else None)
    print(first_anchor.text if first_anchor is not None else None)
|