[Python知识库] Python爬取京东商品数据

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> Python爬取京东商品数据 -> 正文阅读

[Python知识库]Python爬取京东商品数据

一、前言

由于京东反爬技术较强，使用常规方法爬取其数据行不通，且使用逆向分析技术又具有一定难度，所以本文将直接使用selenium爬取京东商品数据。若不知道怎么安装和配置selenium，请点击查阅笔者之前的文章：Python自动化填写问卷星问卷
本文的爬取数据步骤如下：
（流程图）打开京东首页 → 在搜索框输入关键词并点击搜索 → 获取搜索结果总页数 → 按规律构造每一页的url并访问 → 下滑到页面底部加载全部商品 → 用lxml解析商品数据 → 写入Excel并保存。

二、完整代码

导入所需包，包括time、selenium、lxml和openpyxl。

import time
from urllib.parse import quote

from lxml import etree
from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

openpyxl新建工作簿并添加表头，包括商品的标题、价格等。

# Create the output workbook and write the header row (row 1) of its active
# sheet; every scraped product is later appended below this row.
wb = Workbook()
sheet = wb.active
sheet.append(['name', 'price', 'commit', 'shop', 'sku', 'icons', 'detail_url'])

selenium基本配置，其中driver_path为chromedriver谷歌浏览器驱动可执行文件的完整路径（注意是文件路径而不是文件夹路径），若把chromedriver放在项目文件夹下或已加入系统环境变量，则可以不传该参数。selenium主要配置了访问网站时不加载图片，这样既能加快访问速度又能节省流量；同时设置显式等待的最长时间（60秒），避免因页面元素尚未加载完成而导致程序报错。

# Full path of the chromedriver executable handed to Selenium.
driver_path = r"D:\python\chromedriver_win32\chromedriver.exe"

# Chrome options: turn off image loading so pages load faster.
options = webdriver.ChromeOptions()
options.add_experimental_option(
    name='prefs',
    value={'profile.managed_default_content_settings.images': 2},
)

# One shared browser session, plus an explicit wait of up to 60 seconds that
# the scraping functions use to block until page elements appear.
driver = webdriver.Chrome(options=options, executable_path=driver_path)
wait = WebDriverWait(driver, timeout=60)

定义search函数：等待搜索框和搜索按钮加载完成后输入关键词并点击搜索，再从结果页底部的分页栏中读取商品最大页数并返回，京东上的搜索结果一般最多为100页。

def search(keyword, max_retries=3):
    """Run a search for *keyword* and return the number of result pages.

    Waits for the search box (``#key``) and the search button of the page
    currently loaded in the module-level ``driver``, types the keyword,
    clicks the button, then reads the total page count from the pager at
    the bottom of the result list.

    Args:
        keyword: Text typed into the search box.
        max_retries: Number of extra attempts made after a wait times out.
            Negative values are treated as 0.

    Returns:
        The total page count shown in the pager, as an ``int``.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            # Re-locate the elements on every attempt so a retry never reuses
            # a stale handle from a previous page state.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button"))
            )
            # Clear first: on a retry the box may still hold the keyword typed
            # by the previous attempt, and send_keys appends to existing text.
            search_box.clear()
            search_box.send_keys(keyword)
            submit.click()
            # The pager only exists once the result list has been rendered.
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')
                )
            )
            total_page = driver.find_element(
                By.XPATH, '//*[@id="J_bottomPage"]/span[2]/em[1]/b'
            ).text
            return int(total_page)
        except TimeoutException:
            # WebDriverWait.until raises Selenium's TimeoutException (not the
            # builtin TimeoutError). Retry a bounded number of times, then let
            # the last failure propagate to the caller.
            if attempt == attempts:
                raise

获取商品具体数据，爬取逻辑是先使用selenium获取网页源代码，再用lxml解析，当然这里也可以直接使用selenium解析。爬取的数据字段有标题、价格、评论数、店铺名称、商品唯一id等，其中商品唯一id的sku字段非常重要，是后续爬取该商品评论的主要url参数。

def get_data(html):
    """Parse one search-result page and append one row per product to ``sheet``.

    Each ``<li>`` under ``ul.gl-warp.clearfix`` is treated as one product.
    Name, price, sku and detail link are required; a tile missing any of
    them is skipped. Comment count, shop name and icon labels are optional
    and default to an empty string.

    Args:
        html: Page source of a search-result page.

    Side effects:
        Prints every extracted row and appends it to the module-level
        worksheet ``sheet`` in the order
        ``[name, price, comment, shop, sku, icons, detail_url]``.
    """
    tree = etree.HTML(html)
    for li in tree.xpath('//ul[@class="gl-warp clearfix"]/li'):
        try:
            title = li.xpath('.//div[@class="p-name p-name-type-2"]//em/text()')[0].strip()  # product name
            price = li.xpath('.//div[@class="p-price"]//i/text()')[0].strip()  # price
            data_sku = li.xpath('.//div[@class="p-focus  "]/a/@data-sku')[0]  # unique product id
            detail_url = li.xpath('.//div[@class="p-name p-name-type-2"]/a/@href')[0]  # detail page link
        except IndexError:
            # A required node is absent from this tile. Skip just this tile
            # instead of aborting the page; re-parsing the same HTML could
            # only fail again and would duplicate rows already written.
            continue

        comment = li.xpath('.//div[@class="p-commit"]//a/text()')  # comment count
        shop_name = li.xpath('.//div[@class="p-shop"]//a/text()')  # shop name
        icons = li.xpath('.//div[@class="p-icons"]/i/text()')  # badge labels

        comment = comment[0] if comment else ''
        shop_name = shop_name[0] if shop_name else ''
        icons_joined = ','.join(icons)

        # Only protocol-relative links ("//host/...") need the scheme added;
        # prefixing an already absolute URL would corrupt it.
        if detail_url.startswith('//'):
            detail_url = 'https:' + detail_url

        item = [title, price, comment, shop_name, data_sku, icons_joined, detail_url]
        print(item)
        sheet.append(item)

设置main函数串联爬取过程。其中第一页和其他页的url参数有一定差异，需要特殊处理。用j控制页数，for循环里面根据url参数规律构造每一页的真实url。

def main():
    """Prompt for a keyword, scrape every result page, and save an Excel file.

    Opens the JD home page, runs the search to learn the total page count,
    then visits each result page by URL, scrolls to the bottom so the
    remaining products load, and passes the page source to ``get_data``.

    The workbook is saved even if scraping stops early with an error, so
    rows collected so far are not lost, and the browser is always closed.
    """
    url_main = 'https://www.jd.com/'
    keyword = input('请输入商品名称:')  # search keyword
    try:
        driver.get(url=url_main)
        page = search(keyword)
        # Percent-encode the keyword so spaces, '&', '#' or non-ASCII text
        # cannot break the query string.
        encoded_keyword = quote(keyword)
        try:
            # The `page` URL parameter takes odd values 1, 3, 5, ... for result
            # pages 1, 2, 3, ...; start at 1 so the first page is scraped too.
            for j, i in enumerate(range(1, page * 2, 2), start=1):
                print(f'正在爬取第{j}页')
                # The first page uses s=1; later pages use an offset of 50 per page.
                offset = 1 if j == 1 else (j - 1) * 50
                url = 'https://search.jd.com/Search?keyword={}&page={}&s={}&click=0'.format(
                    encoded_keyword, i, offset
                )
                driver.get(url)
                time.sleep(1)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll to bottom
                time.sleep(3)
                # Rely on the explicit wait only; Selenium's documentation warns
                # against combining implicit and explicit waits.
                wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, '//*[@id="J_goodsList"]/ul/li[last()]'))
                )
                get_data(driver.page_source)
                time.sleep(1)
        finally:
            # Save whatever has been collected, even after a mid-run failure.
            wb.save('京东双十一{}信息.xlsx'.format(keyword))
    finally:
        # Always release the browser and its chromedriver process.
        driver.quit()