以下程序取自网络并稍作修改,用于在JD商城上爬取商品的保质期。具体步骤如下: 1)逐行读取Excel文件,获取关键词; 2)以关键词在JD上搜索商品; 3)取搜索结果中的第1个商品,计算其标题与关键词的匹配程度; 4)若匹配度超过阈值,则打开该商品的详情页; 5)解析详情页,提取“保质期”关键字之后的字符串; 6)为减少写文件的次数,每处理20行才将检索到的保质期写入Excel文件一次。
补充: 1)本实现主要依靠Selenium工具,使用前需要先下载并配置好该工具,具体可以参考《Python 爬虫实战 — 抓取JD商品数据》。**Selenium是 Web 应用程序的测试工具,能够操控浏览器完成一系列步骤,模拟人为操作**,比如自动填写文本、在网页端查询快递单号等;目前支持 Java、Python、C#、Ruby 等多种语言。 2)本人是Python小白,下面的代码仅供参考。
"""
获取XX商城商品信息爬虫
Author: zhouzying
URL: https://www.zhouzying.cn
Date: 2018-10-15
"""
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import openpyxl
from openpyxl import Workbook
from openpyxl.reader.excel import load_workbook
import os
import time
filepath = "record.txt"

# Workbook that supplies the search keywords (column 3) and receives the
# shelf-life results (column 5).
WORKBOOK_PATH = 'goods.xlsx'
# 0-based offset of the first row to process; the worksheet row is offset + 1.
START_OFFSET = 920
# Number of consecutive rows to process starting at START_OFFSET.
ROW_COUNT = 36599
# The workbook is flushed to disk whenever the row offset is a multiple of this.
SAVE_INTERVAL = 20


def _fetch_shelf_life(driver, product):
    """Search the mall for *product* and return its shelf-life text.

    Loads the first search-result page, lets ``parser`` choose the detail-page
    URL of the first hit, then lets ``parser2`` extract the shelf life from
    that detail page.

    Returns:
        The shelf-life string, or ``''`` when ``parser`` rejects the first hit
        or ``parser2`` finds nothing.
    """
    from urllib.parse import quote

    # Percent-encode the keyword so characters such as '&', '#' or spaces in a
    # product name cannot corrupt the query string.
    url = ('https://search.jd.com/Search?keyword=' + quote(str(product))
           + "&enc=utf-8" + "&page=1")
    driver.get(url)
    urlNew = parser(driver.page_source, product)
    # parser signals "no acceptable match" with a blank string.
    if not (urlNew or '').strip():
        return ''
    driver.get(urlNew)
    return parser2(driver.page_source)


def main():
    """Crawl the shelf life of every product listed in the workbook.

    For each processed row the product name is read from column 3, looked up
    on the mall, and the shelf life that was found is written to column 5 of
    the same row. Rows whose product cell is empty or holds the header text
    '品名' are skipped. The workbook is saved periodically and once more when
    the run ends (normally or through an error/Ctrl-C), and the browser is
    always closed.
    """
    wb = load_workbook(WORKBOOK_PATH)
    # First worksheet of the workbook (non-deprecated openpyxl accessors).
    ws = wb[wb.sheetnames[0]]
    driver = webdriver.Chrome()
    succesNum = 0
    allProcessNum = 0
    try:
        for i in range(START_OFFSET, START_OFFSET + ROW_COUNT):
            print('excl第', i, '行')
            product = ws.cell(row=i + 1, column=3).value
            has_keyword = product is not None and str(product).strip() != ''
            if has_keyword and product != '品名':
                print("excl商品名称:", product)
                try:
                    shelfDataStr = _fetch_shelf_life(driver, product)
                except Exception as exc:
                    # One bad row must not stop the crawl; KeyboardInterrupt
                    # is deliberately NOT caught so the run can be aborted.
                    shelfDataStr = ''
                    print("保质期获取失败!", product, exc)
                else:
                    if shelfDataStr:
                        print('找到保质期:' + shelfDataStr)
                        ws.cell(row=i + 1, column=5).value = shelfDataStr
                        succesNum += 1
                        print("保质期获取完成!", product)
                    else:
                        print("保质期获取失败!", product)
                allProcessNum += 1
                print('累计处理:成功找到保质期/所有搜索数量=',
                      succesNum, '/', allProcessNum)
                print('------------------------------------------------------------------------')
            if i % SAVE_INTERVAL == 0:
                print('**写文件**')
                wb.save(filename=WORKBOOK_PATH)
                print('------------------------------------------------------------------------')
    finally:
        try:
            # Persist rows processed since the last periodic save.
            wb.save(filename=WORKBOOK_PATH)
        finally:
            driver.quit()
def parser(html, product, threshold=0.95):
    """Return the detail-page URL of the first search hit if it matches.

    The first ``div.gl-i-wrap`` result card in *html* is inspected. The match
    score is the fraction of characters of *product* that also occur anywhere
    in the card's title; the URL is returned only when that score is strictly
    greater than *threshold*.

    Args:
        html: Page source of a search-result page.
        product: Search keyword; non-string values are converted with ``str``.
        threshold: Minimum (exclusive) match score required to accept the hit.

    Returns:
        The detail-page URL, or the single-space string ``' '`` when the
        input is empty, no result card/link is present, or the score does not
        exceed the threshold.
    """
    keyword = '' if product is None else str(product)
    if not html or not keyword:
        # Nothing to compare; this also avoids dividing by zero below.
        return ' '

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    item = soup.find('div', class_='gl-i-wrap')
    name_div = item.find('div', class_='p-name') if item is not None else None
    link = name_div.find('a') if name_div is not None else None
    if link is None or not link.get('href'):
        # No usable result card on the page.
        return ' '

    title_tag = link.find('em')
    titleInMall = (title_tag if title_tag is not None else link).get_text()
    print('商城第1个商品名称: ', titleInMall)

    findInNum = sum(1 for itemChar in keyword if itemChar in titleInMall)
    ratio = findInNum / len(keyword)
    print('重合度 ', ratio)

    if ratio <= threshold:
        print(f'重合度小于阈值{threshold},该商品未找到保质期!')
        return ' '

    href = link['href']
    # Only add a scheme when the href does not already carry one; otherwise
    # an absolute href would end up with a doubled "https:" prefix.
    if href.startswith(('http://', 'https://')):
        urlNew = href
    else:
        urlNew = 'https:' + href
    print(f'重合度大于阈值{threshold},打开第1个商品URL:' + urlNew)
    return urlNew
def parser2(html):
    """Extract the shelf-life text from a product detail page.

    Scans every ``dl.clearfix`` element in *html* and, for the first one whose
    text contains '保质期', returns whatever follows that label with
    surrounding whitespace and a leading colon removed.

    Args:
        html: Page source of a product detail page.

    Returns:
        The shelf-life string, or ``''`` when *html* is empty or no element
        mentions '保质期'.
    """
    if not html:
        return ''

    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('dl', class_='clearfix'):
        # Split on the label itself instead of slicing a fixed number of
        # characters, which breaks when the text has leading whitespace.
        _, label, tail = item.get_text().partition('保质期')
        if label:
            return tail.strip().lstrip('::').strip()
    return ''
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()