IT数码 购物 网址 头条 软件 日历 阅读 图书馆
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
图片批量下载器
↓批量下载图片,美女图库↓
图片自动播放器
↓图片自动播放器↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁
 
   -> Python知识库 -> python 爬虫学习笔记 -> 正文阅读

[Python知识库]python 爬虫学习笔记

python 爬虫学习笔记

运行结果

requests

import requests
# Minimal GET request: fetch a page and inspect the response object.
url = "https://baidu.com"
r = requests.get(url=url)

# Set the response encoding before reading r.text.
r.encoding = "utf8"
preview = r.text[0:150]
print(preview)
# Alternative: decode the raw bytes yourself with r.content.decode("utf-8").

# URL and status code reported by the response.
print(r.url, r.status_code)
# Headers attached to the outgoing request ...
sent_headers = r.request.headers
print(sent_headers)

# ... and headers attached to the response.
print(r.headers)

headers

# GET request with a custom User-Agent header.
url = "https://baidu.com/s?"

# Candidate User-Agent strings. They are kept as named constants instead of
# repeatedly overwriting `headers`, so it is obvious which one is sent.
MOBILE_CHROME_98_UA = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"
DESKTOP_CHROME_86_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
MOBILE_CHROME_89_UA = "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4381.7 Mobile Safari/537.36"

# Only this header dict is passed to the request.
headers = {'User-Agent': MOBILE_CHROME_89_UA}
r = requests.get(url, headers=headers)

在 GET 请求中添加 cookie

# GET request that sends query parameters together with a Cookie header.
data = {"wd": "python"}
h = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36",
    "Cookie": "123",
}
r = requests.get(url, params=data, headers=h)
# To dump the raw response body to a file instead:
# with open('1.txt',"wb") as f:
#     f.write(r.content)

# Show the URL of the response, which reflects the `params` passed above.
print(r.url)

cookies的字典处理

# Raw Cookie header value: "name=value" pairs separated by ";".
cookies = (
    "anonymid=jy0ui55o-u6f6zd; depovince=GW; _r01_=1; JSESSIONID=abcMktGLRGjLtdhBk7OVw; "
    "ick_login=a9b557b8-8138-4e9d-8601-de7b2a633f80; _ga=GA1.2.1307141854.1562980962; "
    "_gid=GA1.2.201589596.1562980962; _c1=-100; first_login_flag=1; ln_uact=18323008898; "
    "ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; "
    "jebe_key=88f1340c-592c-4dd6-a738-128a76559f45%7Cad33b3c730fcdc8df220648f0893e840%7C1562981108370%7C1"
    "%7C1562981106763; jebe_key=88f1340c-592c-4dd6-a738-128a76559f45%7Cad33b3c730fcdc8df220648f0893e840"
    "%7C1562981108370%7C1%7C1562981106765; jebecookies=793eb32e-92c6-470d-b9d0-5f924c335d30|||||; "
    "_de=E77807CE44886E0134ABF27E72CFD74F; p=a00d65b1f779614cd242dc719e24c73e0; "
    "t=292ba8729a4151c1a357e176d8d91bff0; societyguester=292ba8729a4151c1a357e176d8d91bff0; id=969937120; "
    "xnsid=1700b2cc; ver=7.0; loginfrom=null; wp_"
    "fold=0 "
)

# Convert the header value into a {name: value} dict.
dicts = {}
for pair in cookies.split(";"):
    # Strip the whitespace around each pair so keys do not start with a space
    # and the last value does not end with one.
    # partition() splits on the first "=" only, so a value that itself
    # contains "=" is kept whole.
    name, separator, value = pair.strip().partition("=")
    if not separator:
        # Empty fragment (e.g. after a trailing ";") or a token without "=".
        continue
    # A repeated name (jebe_key above) keeps its last value.
    dicts[name] = value
print(dicts)

post请求

# POST request carrying `payload` via the `data` argument.
payload = {
    'key1': 'value1',
    'key2': 'value2',
}
r = requests.post(url="http://httpbin.org/post", data=payload)

# Print only the first 150 characters of the response text.
body_preview = r.text[0:150]
print(body_preview)

jsonpath使用

from jsonpath import jsonpath
# Small nested dict: plain indexing first, then the same data queried via
# jsonpath with an explicit path and with a recursive-descent path.
data = {"1": {"2": {"3": "python"}}}
print(data["1"]["2"]["3"])
for expression in ("$.1.2.3", "$..3"):
    print(jsonpath(data, expression))

# Larger sample document: a store holding a list of books and one bicycle.
books = [
    {
        "category": "reference",
        "author": "Nigel Rees",
        "title": "Sayings of the Century",
        "price": 8.2,
    },
    {
        "category": "fiction",
        "author": "Evelyn Waugh",
        "title": "Sword of Honour",
        "price": 2.99,
    },
    {
        "category": "fiction",
        "author": "Herman Melville",
        "title": "Moby Dick",
        "isbn": "0-553-21311-3",
        "price": 8.09,
    },
    {
        "category": "fiction",
        "author": "J. R. R. Tolkien",
        "title": "The Lord of the Rings",
        "isbn": "0-395-19395-8",
        "price": 2.99,
    },
]
bicycle = {"color": "red", "price": 19.95}
data = {"store": {"book": books, "bicycle": bicycle}}

# Query the sample document for "color" and "price" at any depth.
for expression in ("$..color", "$..price"):
    print(jsonpath(data, expression))

jsonpath 例子

import json
# Download a JSON document and query it with jsonpath.
url = "https://www.lagou.com/lbs/getAllCitySearchLabels.json"
h = {
    "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36",
}
r = requests.get(url=url, headers=h)

# Parse the raw response bytes as JSON.
data = json.loads(r.content)

# Print every "name" found anywhere below an "A" key, then below a "B" key.
for expression in ("$..A..name", "$..B..name"):
    print(jsonpath(data, expression))

xpath速览

# //div[@id="content-left"]/div/@id
# //div[@id="content-left"]/div[last()-1]
# //div[span[2]>=10]
# //div[contains(@id,"r_tag_12345")]
# //*[contains(@id,"r_tag_12345")]
# //a[contains(text(),"下一页")]
# //*[@id="content-left"]/div
# //td/a|//h2/a
# //@href

lxml 使用

from lxml import etree
# Fetch a page and parse it into an lxml tree for XPath queries.
url = "https://www.xiaoxiaoran.top"
h = {"user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4750.0 Mobile Safari/537.36"}
r = requests.get(url=url, headers=h)

# Decode the response bytes with the default codec of bytes.decode().
data = r.content.decode()
# Build the element tree from the decoded markup.
html = etree.HTML(data)

lxml 例子

# Text of the <p> children of the div whose id is "box02_text".
print(list(html.xpath("//div[@id='box02_text']/p/text()")))

# Map each <h1> text to the text of the first <p> selected alongside it.
titles = html.xpath("//div/ul/li/div/h1/text()")
summaries = html.xpath("//div/ul/li/div/p[1]/text()")
print(dict(zip(titles, summaries)))

# Collect the link targets; anything not starting with "http" gets the site
# prefix prepended.
links = []
for href in html.xpath("//div/ul/li/div/p/a/@href"):
    if href.startswith("http"):
        links.append(href)
    else:
        links.append("https://xiaoxiaoran.top" + href)
print(links)

# Map each <h2> text to its link target, always prefixed with the site.
headings = html.xpath("//div/div/ul/li/a/h2/text()")
targets = html.xpath("//div/div/ul/li/a/@href")
print({heading: "https://xiaoxiaoran.top" + target
       for heading, target in zip(headings, targets)})

selenium 使用

# phantomjs  无界面浏览器
from selenium import webdriver
import sys
import time

# Walk-through of basic Selenium operations against a listing page.
# The path is a raw string: "\T" and "\c" are not valid escape sequences, so
# the non-raw form triggers an invalid-escape warning on recent Python.
d = webdriver.Chrome(r"D:\Temp\chromedriver.exe")
try:
    d.get('https://yb.58.com/dog/?PGTID=0d100000-0094-c6a2-a99f-e8b0eca5c0bd&ClickID=2')

    # print(d.page_source)
    print(d.current_url)
    print(d.title)

    # Typing text into an input and clicking a button:
    # d.find_element_by_xpath('//*[@id="kw"]').send_keys("肖萧然")
    # d.find_element_by_id("su").click()

    # Locating elements: the link in column 2 and the bold text in column 3
    # of every row of the "infolist" table.
    link_cells = d.find_elements_by_xpath('//*[@id="infolist"]/table/tbody/tr/td[2]/a')
    value_cells = d.find_elements_by_xpath('//*[@id="infolist"]/table/tbody/tr/td[3]/span/b')
    # zip() pairs the two lists and stops at the shorter one, so a row without
    # a column-3 value no longer raises IndexError.
    for link_cell, value_cell in zip(link_cells, value_cells):
        print(link_cell.text + " " + value_cell.text)

    time.sleep(5)
    e = d.find_element_by_xpath('//*[@id="infolist"]/table/tbody/tr[1]/td[2]/a')
    e.click()

    # Switching windows: show the handles, then switch to the last one.
    print(d.current_url)
    print(d.window_handles)

    d.switch_to.window(d.window_handles[-1])

    # click() returns nothing, so its result is not assigned.
    d.find_element_by_xpath('/html/body/div[17]/div[2]/a').click()

    # Scrolling
    d.execute_script("scrollTo(0,15000)")

    time.sleep(10)
finally:
    # Always shut the browser down, even if a step above failed.
    d.quit()

selenium 例子


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import sys
import time
import pyexcel

if __name__ == "__main__":
    # Script entry point: search jd.com for a keyword, scrape up to three
    # result pages and save the collected rows to "<keyword>.xls".
    #
    # Imported locally so this block brings the exception names it needs
    # into scope itself.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    # Keyword comes from the first command-line argument, default 'iphone'.
    keyword = sys.argv[1] if len(sys.argv) > 1 else 'iphone'
    max_pages = 3

    option = Options()
    # option.add_argument('--headless')
    driver = webdriver.Chrome(
        r'D:\Temp\chromedriver.exe', chrome_options=option)
    rows = []
    try:
        driver.get('http://www.jd.com')
        kw = driver.find_element_by_id('key')
        kw.send_keys(keyword)
        kw.send_keys(Keys.RETURN)
        time.sleep(3)
        # Click the second link in the sort bar.
        sort_btn = driver.find_element_by_xpath('.//div[@class="f-sort"]/a[2]')
        sort_btn.click()

        for _ in range(max_pages):
            time.sleep(5)
            cur_page = driver.find_element_by_xpath(
                '//div[@id="J_bottomPage"]//a[@class="curr"]').text
            print('current page is ---------------------> %s ' % cur_page)

            # Scroll to the bottom edge of the goods list before collecting
            # the items.
            goods_list = driver.find_element_by_id('J_goodsList')
            y = goods_list.rect['y'] + goods_list.rect['height']
            driver.execute_script('window.scrollTo(0,%s)' % y)

            products = driver.find_elements_by_class_name('gl-item')

            for p in products:
                row = {}
                sku = p.get_attribute('data-sku')
                row['price'] = p.find_element_by_css_selector(
                    'strong.J_%s' % sku).text
                row['name'] = p.find_element_by_css_selector(
                    'div.p-name>a>em').text
                row['comment'] = p.find_element_by_id('J_comment_%s' % sku).text

                # The shop link is optional; only a missing element falls
                # back to the placeholder.
                try:
                    row['shop'] = p.find_element_by_css_selector(
                        'div.p-shop>span>a').text
                except NoSuchElementException:
                    row['shop'] = '无'

                rows.append(row)
                print(row)
                # Running count of rows collected across all pages.
                print('-------->', len(rows))

            # Go to the next page; stop when the link is missing, disabled,
            # or cannot be clicked. Only Selenium errors are treated as "no
            # next page", so Ctrl-C and programming errors still surface.
            try:
                next_page = driver.find_element_by_css_selector('a.pn-next')
                if 'disabled' in (next_page.get_attribute('class') or ''):
                    break
                next_page.click()
            except WebDriverException:
                break

        pyexcel.save_as(records=rows, dest_file_name='%s.xls' % keyword)
    finally:
        # Close the browser even when scraping fails part-way.
        driver.quit()


图像识别 tesseract

from PIL import Image
import pytesseract

# Open the local image 1.jpg, pass it to pytesseract and print the result.
image = Image.open(r'1.jpg')
recognized_text = pytesseract.image_to_string(image)
print(recognized_text)

tesseract 1.jpg result


  Python知识库 最新文章
Python中String模块
【Python】 14-CVS文件操作
python的panda库读写文件
使用Nordic的nrf52840实现蓝牙DFU过程
【Python学习记录】numpy数组用法整理
Python学习笔记
python字符串和列表
python如何从txt文件中解析出有效的数据
Python编程从入门到实践自学/3.1-3.2
python变量
上一篇文章      下一篇文章      查看所有文章
加:2022-01-14 01:55:36  更:2022-01-14 01:58:07 
 
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

360图书馆 购物 三丰科技 阅读网 日历 万年历 2024年11日历 -2024/11/16 3:29:26-

图片自动播放器
↓图片自动播放器↓
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
图片批量下载器
↓批量下载图片,美女图库↓
  网站联系: qq:121756557 email:121756557@qq.com  IT数码