python 爬虫学习笔记
运行结果
request
# Basic GET demo: fetch the Baidu homepage and inspect the response object.
import requests

target = "https://baidu.com"
resp = requests.get(target)
resp.encoding = "utf8"  # force UTF-8 so the Chinese page text decodes correctly
print(resp.text[:150])
print(resp.url, resp.status_code)
print(resp.request.headers)  # headers we SENT
print(resp.headers)          # headers the server RETURNED
headers
# Demonstrates sending a custom User-Agent header with a GET request.
# NOTE(review): `headers` is assigned three times below to show UA variants
# (Android Chrome 98, desktop Chrome 86, Android Chrome 89); only the LAST
# assignment is actually sent with the request — the first two are dead code.
url = "https://baidu.com/s?"
headers = {"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"}
headers = {"user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
headers = {'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4381.7 Mobile Safari/537.36"}
r = requests.get(url, headers=headers)
在 GET 请求中加 cookie
# GET with query-string parameters plus a Cookie sent as a plain header.
query = {"wd": "python"}
mobile_headers = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36",
    "Cookie": "123",
}
r = requests.get(url, headers=mobile_headers, params=query)
print(r.url)  # shows the params encoded into the final URL
cookies的字典处理
# Convert a raw browser "Cookie" header string into a dict (usable as the
# `cookies=` argument of requests).
cookies = "anonymid=jy0ui55o-u6f6zd; depovince=GW; _r01_=1; JSESSIONID=abcMktGLRGjLtdhBk7OVw; " \
"ick_login=a9b557b8-8138-4e9d-8601-de7b2a633f80; _ga=GA1.2.1307141854.1562980962; " \
"_gid=GA1.2.201589596.1562980962; _c1=-100; first_login_flag=1; ln_uact=18323008898; " \
"ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; " \
"jebe_key=88f1340c-592c-4dd6-a738-128a76559f45%7Cad33b3c730fcdc8df220648f0893e840%7C1562981108370%7C1" \
"%7C1562981106763; jebe_key=88f1340c-592c-4dd6-a738-128a76559f45%7Cad33b3c730fcdc8df220648f0893e840" \
"%7C1562981108370%7C1%7C1562981106765; jebecookies=793eb32e-92c6-470d-b9d0-5f924c335d30|||||; " \
"_de=E77807CE44886E0134ABF27E72CFD74F; p=a00d65b1f779614cd242dc719e24c73e0; " \
"t=292ba8729a4151c1a357e176d8d91bff0; societyguester=292ba8729a4151c1a357e176d8d91bff0; id=969937120; " \
"xnsid=1700b2cc; ver=7.0; loginfrom=null; wp_" \
"fold=0 "
# split("=", 1) keeps values that themselves contain "=" intact, and strip()
# removes the stray spaces left by the "; " separators — the original
# split("=")[0/1] version produced keys like " depovince" (leading space)
# and would truncate any value containing "=".
dicts = {
    name.strip(): value.strip()
    for name, value in (pair.split("=", 1) for pair in cookies.split(";"))
}
print(dicts)
post请求
# POST form-encoded data to httpbin, which echoes the request back.
form_data = {'key1': 'value1', 'key2': 'value2'}
r = requests.post("http://httpbin.org/post", data=form_data)
print(r.text[:150])  # first part of the echoed JSON body
jsonpath使用
# jsonpath basics: "$" is the document root, "." walks one level down,
# ".." searches recursively at any depth.
from jsonpath import jsonpath

nested = {"1": {"2": {"3": "python"}}}
print(nested["1"]["2"]["3"])        # plain dict indexing for comparison
print(jsonpath(nested, "$.1.2.3"))  # explicit path from the root
print(jsonpath(nested, "$..3"))     # recursive descent to key "3"
# The classic Goessner bookstore sample document for jsonpath queries.
data = {
    "store": {
        "book": [
            {
                "category": "reference",
                "author": "Nigel Rees",
                "title": "Sayings of the Century",
                "price": 8.2,
            },
            {
                "category": "fiction",
                "author": "Evelyn Waugh",
                "title": "Sword of Honour",
                "price": 2.99,
            },
            {
                "category": "fiction",
                "author": "Herman Melville",
                "title": "Moby Dick",
                "isbn": "0-553-21311-3",
                "price": 8.09,
            },
            {
                "category": "fiction",
                "author": "J. R. R. Tolkien",
                "title": "The Lord of the Rings",
                "isbn": "0-395-19395-8",
                "price": 2.99,
            },
        ],
        "bicycle": {"color": "red", "price": 19.95},
    }
}
# "$..color" / "$..price" match those keys at ANY depth in the document.
print(jsonpath(data, "$..color"))
print(jsonpath(data, "$..price"))
jsonpath 例子
# jsonpath on live data: download lagou.com's city-label JSON and pull out
# every city "name" nested under the "A" and "B" initial-letter groups.
import json
url = "https://www.lagou.com/lbs/getAllCitySearchLabels.json"
h = {"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"}
r = requests.get(url, headers=h)
# r.content is bytes; json.loads accepts bytes and decodes them itself.
data = json.loads(r.content)
# "$..A..name": any "name" field found anywhere below a key "A".
print(jsonpath(data, "$..A..name"))
print(jsonpath(data, "$..B..name"))
xpath速览
lxml 使用
# Fetch a page and parse it into an lxml element tree ready for XPath queries.
from lxml import etree

url = "https://www.xiaoxiaoran.top"
h = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"}
r = requests.get(url, headers=h)
data = r.content.decode()   # decode raw bytes before handing them to lxml
html = etree.HTML(data)     # `html` is reused by the XPath examples below
lxml 例子
# XPath extraction demos against the page parsed above.
# Paragraph text of the article box:
print([i for i in html.xpath("//div[@id='box02_text']/p/text()")])
# Card title -> first descriptive paragraph:
print({i: j for i, j in zip(html.xpath("//div/ul/li/div/h1/text()"), html.xpath("//div/ul/li/div/p[1]/text()"))})
# Make relative links absolute.  The original used `if i.find("http")`, which
# only works by coincidence (find() returns 0 — falsy — for a leading match and
# -1 — truthy — when absent); startswith() states the intent directly and is
# equivalent for every input.
print(["https://xiaoxiaoran.top" + i if not i.startswith("http") else i for i in html.xpath("//div/ul/li/div/p/a/@href")])
print({i: "https://xiaoxiaoran.top" + j for i, j in zip(html.xpath("//div/div/ul/li/a/h2/text()"), html.xpath("//div/div/ul/li/a/@href"))})
selenium 使用
# selenium demo: list dog ads on 58.com, open the first one, scroll, quit.
from selenium import webdriver
import sys
import time

# Raw string so a Windows path segment starting with a letter that IS an
# escape (\t, \n, ...) can never be silently corrupted.
d = webdriver.Chrome(r"D:\Temp\chromedriver.exe")
d.get('https://yb.58.com/dog/?PGTID=0d100000-0094-c6a2-a99f-e8b0eca5c0bd&ClickID=2')
print(d.current_url)
print(d.title)
# Titles and prices live in parallel table columns; zip pairs them up
# (replaces the index-based range(len(...)) loop over ambiguously-named l/ll).
titles = d.find_elements_by_xpath('//*[@id="infolist"]/table/tbody/tr/td[2]/a')
prices = d.find_elements_by_xpath('//*[@id="infolist"]/table/tbody/tr/td[3]/span/b')
for title, price in zip(titles, prices):
    print(title.text + " " + price.text)
time.sleep(5)
# Open the first listing; on this site it spawns a new tab/window.
e = d.find_element_by_xpath('//*[@id="infolist"]/table/tbody/tr[1]/td[2]/a')
e.click()
print(d.current_url)
print(d.window_handles)
d.switch_to.window(d.window_handles[-1])  # focus the newly opened tab
# click() returns None, so the original `e = ...click()` assignment was dead.
d.find_element_by_xpath('/html/body/div[17]/div[2]/a').click()
d.execute_script("scrollTo(0,15000)")  # jump far down to trigger lazy loading
time.sleep(10)
d.quit()
selenium 例子
# selenium example: search JD.com for a keyword, sort by sales, scrape the
# first 3 result pages (price/name/comment-count/shop) and save to an .xls.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import sys
import time
import pyexcel

if __name__ == "__main__":
    # Search term defaults to 'iphone'; the first CLI argument overrides it.
    keyword = 'iphone'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    option = Options()
    driver = webdriver.Chrome(
        r'D:\Temp\chromedriver.exe', chrome_options=option)
    driver.get('http://www.jd.com')
    kw = driver.find_element_by_id('key')  # the search box
    kw.send_keys(keyword)
    kw.send_keys(Keys.RETURN)
    time.sleep(3)
    # Second tab of the sort bar = "sort by sales".
    sort_btn = driver.find_element_by_xpath('.//div[@class="f-sort"]/a[2]')
    sort_btn.click()
    i = 0              # running count of scraped products across pages
    has_next = True
    rows = []          # one dict per product, written out by pyexcel at the end
    page_count = 0
    while has_next:
        page_count += 1
        if page_count > 3:   # scrape at most 3 result pages
            break
        time.sleep(5)        # crude wait for the page to finish rendering
        cur_page = driver.find_element_by_xpath(
            '//div[@id="J_bottomPage"]//a[@class="curr"]').text
        print('current page is ---------------------> %s ' % cur_page)
        # Scroll past the goods list so lazily loaded prices/images appear.
        goods_list = driver.find_element_by_id('J_goodsList')
        y = goods_list.rect['y'] + goods_list.rect['height']
        driver.execute_script('window.scrollTo(0,%s)' % y)
        products = driver.find_elements_by_class_name('gl-item')
        for p in products:
            row = {}
            # The SKU id keys both the price node and the comment-count node.
            sku = p.get_attribute('data-sku')
            row['price'] = p.find_element_by_css_selector(
                'strong.J_%s' % sku).text
            row['name'] = p.find_element_by_css_selector(
                'div.p-name>a>em').text
            row['comment'] = p.find_element_by_id('J_comment_%s' % sku).text
            # Some items have no shop link; record a placeholder instead.
            # Catch only the missing-element error — the original bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.
            try:
                row['shop'] = p.find_element_by_css_selector(
                    'div.p-shop>span>a').text
            except NoSuchElementException:
                row['shop'] = '无'
            print(row)
            i += 1
            print('-------->', i)
            rows.append(row)
        # Advance unless the "next" button is missing or marked disabled.
        try:
            next_page = driver.find_element_by_css_selector('a.pn-next')
            if 'disabled' in next_page.get_attribute('class'):
                has_next = False
            else:
                next_page.click()
        except NoSuchElementException:
            has_next = False
    pyexcel.save_as(records=rows, dest_file_name='%s.xls' % keyword)
    driver.quit()
图像识别 tesseract
# OCR demo: run Tesseract over a local image file and print the recognized text.
from PIL import Image
import pytesseract
# Requires the tesseract binary to be installed and reachable by pytesseract.
print(pytesseract.image_to_string(Image.open(r'1.jpg')))
tesseract 1.jpg result
|