python 爬虫学习笔记
运行结果
request
# Basic GET demo: fetch the Baidu homepage and inspect the response object.
import requests

target = "https://baidu.com"
resp = requests.get(target)
resp.encoding = "utf8"  # force UTF-8 so the Chinese page text decodes correctly
print(resp.text[:150])
print(resp.url, resp.status_code)
print(resp.request.headers)  # headers we SENT
print(resp.headers)          # headers the server RETURNED
headers
# Demonstrates sending a custom User-Agent header with a GET request.
# NOTE(review): `headers` is assigned three times below to show UA variants
# (Android Chrome 98, desktop Chrome 86, Android Chrome 89); only the LAST
# assignment is actually sent with the request — the first two are dead code.
url = "https://baidu.com/s?"
headers = {"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"}
headers = {"user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'}
headers = {'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4381.7 Mobile Safari/537.36"}
r = requests.get(url, headers=headers)
在 GET 请求中加 cookie
# GET with query-string parameters plus a Cookie sent as a plain header.
query = {"wd": "python"}
mobile_headers = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36",
    "Cookie": "123",
}
r = requests.get(url, headers=mobile_headers, params=query)
print(r.url)  # shows the params encoded into the final URL
cookies的字典处理
# Convert a raw browser "Cookie" header string into a dict (usable as the
# `cookies=` argument of requests).
cookies = "anonymid=jy0ui55o-u6f6zd; depovince=GW; _r01_=1; JSESSIONID=abcMktGLRGjLtdhBk7OVw; " \
"ick_login=a9b557b8-8138-4e9d-8601-de7b2a633f80; _ga=GA1.2.1307141854.1562980962; " \
"_gid=GA1.2.201589596.1562980962; _c1=-100; first_login_flag=1; ln_uact=18323008898; " \
"ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; " \
"jebe_key=88f1340c-592c-4dd6-a738-128a76559f45%7Cad33b3c730fcdc8df220648f0893e840%7C1562981108370%7C1" \
"%7C1562981106763; jebe_key=88f1340c-592c-4dd6-a738-128a76559f45%7Cad33b3c730fcdc8df220648f0893e840" \
"%7C1562981108370%7C1%7C1562981106765; jebecookies=793eb32e-92c6-470d-b9d0-5f924c335d30|||||; " \
"_de=E77807CE44886E0134ABF27E72CFD74F; p=a00d65b1f779614cd242dc719e24c73e0; " \
"t=292ba8729a4151c1a357e176d8d91bff0; societyguester=292ba8729a4151c1a357e176d8d91bff0; id=969937120; " \
"xnsid=1700b2cc; ver=7.0; loginfrom=null; wp_" \
"fold=0 "
# split("=", 1) keeps values that themselves contain "=" intact, and strip()
# removes the stray spaces left by the "; " separators — the original
# split("=")[0/1] version produced keys like " depovince" (leading space)
# and would truncate any value containing "=".
dicts = {
    name.strip(): value.strip()
    for name, value in (pair.split("=", 1) for pair in cookies.split(";"))
}
print(dicts)
post请求
# POST form-encoded data to httpbin, which echoes the request back.
form_data = {'key1': 'value1', 'key2': 'value2'}
r = requests.post("http://httpbin.org/post", data=form_data)
print(r.text[:150])  # first part of the echoed JSON body
jsonpath使用
# jsonpath basics: "$" is the document root, "." walks one level down,
# ".." searches recursively at any depth.
from jsonpath import jsonpath

nested = {"1": {"2": {"3": "python"}}}
print(nested["1"]["2"]["3"])        # plain dict indexing for comparison
print(jsonpath(nested, "$.1.2.3"))  # explicit path from the root
print(jsonpath(nested, "$..3"))     # recursive descent to key "3"
# The classic Goessner bookstore sample document for jsonpath queries.
data = {
    "store": {
        "book": [
            {
                "category": "reference",
                "author": "Nigel Rees",
                "title": "Sayings of the Century",
                "price": 8.2,
            },
            {
                "category": "fiction",
                "author": "Evelyn Waugh",
                "title": "Sword of Honour",
                "price": 2.99,
            },
            {
                "category": "fiction",
                "author": "Herman Melville",
                "title": "Moby Dick",
                "isbn": "0-553-21311-3",
                "price": 8.09,
            },
            {
                "category": "fiction",
                "author": "J. R. R. Tolkien",
                "title": "The Lord of the Rings",
                "isbn": "0-395-19395-8",
                "price": 2.99,
            },
        ],
        "bicycle": {"color": "red", "price": 19.95},
    }
}
# "$..color" / "$..price" match those keys at ANY depth in the document.
print(jsonpath(data, "$..color"))
print(jsonpath(data, "$..price"))
jsonpath 例子
# jsonpath on live data: download lagou.com's city-label JSON and pull out
# every city "name" nested under the "A" and "B" initial-letter groups.
import json
url = "https://www.lagou.com/lbs/getAllCitySearchLabels.json"
h = {"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"}
r = requests.get(url, headers=h)
# r.content is bytes; json.loads accepts bytes and decodes them itself.
data = json.loads(r.content)
# "$..A..name": any "name" field found anywhere below a key "A".
print(jsonpath(data, "$..A..name"))
print(jsonpath(data, "$..B..name"))
xpath速览
lxml 使用
# Fetch a page and parse it into an lxml element tree ready for XPath queries.
from lxml import etree

url = "https://www.xiaoxiaoran.top"
h = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"}
r = requests.get(url, headers=h)
data = r.content.decode()   # decode raw bytes before handing them to lxml
html = etree.HTML(data)     # `html` is reused by the XPath examples below
lxml 例子
# XPath extraction demos against the page parsed above.
# Paragraph text of the article box:
print([i for i in html.xpath("//div[@id='box02_text']/p/text()")])
# Card title -> first descriptive paragraph:
print({i: j for i, j in zip(html.xpath("//div/ul/li/div/h1/text()"), html.xpath("//div/ul/li/div/p[1]/text()"))})
# Make relative links absolute.  The original used `if i.find("http")`, which
# only works by coincidence (find() returns 0 — falsy — for a leading match and
# -1 — truthy — when absent); startswith() states the intent directly and is
# equivalent for every input.
print(["https://xiaoxiaoran.top" + i if not i.startswith("http") else i for i in html.xpath("//div/ul/li/div/p/a/@href")])
print({i: "https://xiaoxiaoran.top" + j for i, j in zip(html.xpath("//div/div/ul/li/a/h2/text()"), html.xpath("//div/div/ul/li/a/@href"))})
selenium 使用
# selenium demo: list dog ads on 58.com, open the first one, scroll, quit.
from selenium import webdriver
import sys
import time

# Raw string so a Windows path segment starting with a letter that IS an
# escape (\t, \n, ...) can never be silently corrupted.
d = webdriver.Chrome(r"D:\Temp\chromedriver.exe")
d.get('https://yb.58.com/dog/?PGTID=0d100000-0094-c6a2-a99f-e8b0eca5c0bd&ClickID=2')
print(d.current_url)
print(d.title)
# Titles and prices live in parallel table columns; zip pairs them up
# (replaces the index-based range(len(...)) loop over ambiguously-named l/ll).
titles = d.find_elements_by_xpath('//*[@id="infolist"]/table/tbody/tr/td[2]/a')
prices = d.find_elements_by_xpath('//*[@id="infolist"]/table/tbody/tr/td[3]/span/b')
for title, price in zip(titles, prices):
    print(title.text + " " + price.text)
time.sleep(5)
# Open the first listing; on this site it spawns a new tab/window.
e = d.find_element_by_xpath('//*[@id="infolist"]/table/tbody/tr[1]/td[2]/a')
e.click()
print(d.current_url)
print(d.window_handles)
d.switch_to.window(d.window_handles[-1])  # focus the newly opened tab
# click() returns None, so the original `e = ...click()` assignment was dead.
d.find_element_by_xpath('/html/body/div[17]/div[2]/a').click()
d.execute_script("scrollTo(0,15000)")  # jump far down to trigger lazy loading
time.sleep(10)
d.quit()
selenium 例子
# selenium example: search JD.com for a keyword, sort by sales, scrape the
# first 3 result pages (price/name/comment-count/shop) and save to an .xls.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import sys
import time
import pyexcel

if __name__ == "__main__":
    # Search term defaults to 'iphone'; the first CLI argument overrides it.
    keyword = 'iphone'
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
    option = Options()
    driver = webdriver.Chrome(
        r'D:\Temp\chromedriver.exe', chrome_options=option)
    driver.get('http://www.jd.com')
    kw = driver.find_element_by_id('key')  # the search box
    kw.send_keys(keyword)
    kw.send_keys(Keys.RETURN)
    time.sleep(3)
    # Second tab of the sort bar = "sort by sales".
    sort_btn = driver.find_element_by_xpath('.//div[@class="f-sort"]/a[2]')
    sort_btn.click()
    i = 0              # running count of scraped products across pages
    has_next = True
    rows = []          # one dict per product, written out by pyexcel at the end
    page_count = 0
    while has_next:
        page_count += 1
        if page_count > 3:   # scrape at most 3 result pages
            break
        time.sleep(5)        # crude wait for the page to finish rendering
        cur_page = driver.find_element_by_xpath(
            '//div[@id="J_bottomPage"]//a[@class="curr"]').text
        print('current page is ---------------------> %s ' % cur_page)
        # Scroll past the goods list so lazily loaded prices/images appear.
        goods_list = driver.find_element_by_id('J_goodsList')
        y = goods_list.rect['y'] + goods_list.rect['height']
        driver.execute_script('window.scrollTo(0,%s)' % y)
        products = driver.find_elements_by_class_name('gl-item')
        for p in products:
            row = {}
            # The SKU id keys both the price node and the comment-count node.
            sku = p.get_attribute('data-sku')
            row['price'] = p.find_element_by_css_selector(
                'strong.J_%s' % sku).text
            row['name'] = p.find_element_by_css_selector(
                'div.p-name>a>em').text
            row['comment'] = p.find_element_by_id('J_comment_%s' % sku).text
            # Some items have no shop link; record a placeholder instead.
            # Catch only the missing-element error — the original bare
            # `except:` also swallowed KeyboardInterrupt/SystemExit.
            try:
                row['shop'] = p.find_element_by_css_selector(
                    'div.p-shop>span>a').text
            except NoSuchElementException:
                row['shop'] = '无'
            print(row)
            i += 1
            print('-------->', i)
            rows.append(row)
        # Advance unless the "next" button is missing or marked disabled.
        try:
            next_page = driver.find_element_by_css_selector('a.pn-next')
            if 'disabled' in next_page.get_attribute('class'):
                has_next = False
            else:
                next_page.click()
        except NoSuchElementException:
            has_next = False
    pyexcel.save_as(records=rows, dest_file_name='%s.xls' % keyword)
    driver.quit()
图像识别 tesseract
# OCR demo: run Tesseract over a local image file and print the recognized text.
from PIL import Image
import pytesseract
# Requires the tesseract binary to be installed and reachable by pytesseract.
print(pytesseract.image_to_string(Image.open(r'1.jpg')))
tesseract 1.jpg result
|