爬虫代码spider.py:
import requests
from lxml import etree
from save_mongodb import MongoClient
class Spider(object):
    """Collect the links found on one start page and store them in MongoDB.

    ``url`` is the page that gets fetched, ``headers`` are sent with the
    request, and ``name`` is used both as the ``source`` field of every stored
    document and as the name of the collection the documents go into.
    """

    # Substrings that disqualify a link: script pseudo-links, anything
    # containing '@' (e.g. mail addresses) and image files.
    _SKIPPED_FRAGMENTS = ('javascript', '@', '.jpg', '.png')

    # Seconds to wait for the server. Without a timeout requests can block
    # forever on an unresponsive host.
    timeout = 10

    def __init__(self):
        self.url = 'http://www.xxx.com/'
        self.mongo = MongoClient()
        self.name = 'xxx'
        self.headers = {}

    def spider_url(self):
        """Fetch ``self.url`` and return the usable link targets on the page.

        :return: list of absolute URLs, in document order; empty when the
            response body could not be parsed into a document.
        :raises requests.RequestException: on connection failure or timeout.
        """
        response = requests.get(
            url=self.url, headers=self.headers, timeout=self.timeout)
        html = etree.HTML(response.text)
        if html is None:
            # The parser may hand back None when the body has nothing
            # parsable in it; there are no links to report in that case.
            return []
        return self._normalize_links(html.xpath('//a/@href'))

    def _normalize_links(self, hrefs):
        """Drop unwanted ``href`` values and make the remaining ones absolute.

        :param hrefs: iterable of raw ``href`` attribute strings.
        :return: new list with empty values and values containing any of
            ``_SKIPPED_FRAGMENTS`` removed. Values that already start with
            ``http://`` or ``https://`` are kept untouched; everything else
            (``/path``, ``//host/path``, ``page.html``) is resolved against
            ``self.url``.
        """
        from urllib.parse import urljoin

        links = []
        for href in hrefs:
            if href == '' or any(part in href for part in self._SKIPPED_FRAGMENTS):
                continue
            if not href.startswith(('http://', 'https://')):
                # Blindly prefixing 'http:' only works for protocol-relative
                # links; urljoin also handles root- and page-relative ones.
                href = urljoin(self.url, href)
            links.append(href)
        return links

    def save_url(self):
        """Store every link returned by ``spider_url`` and print a summary.

        Each link is written as ``{'url': ..., 'source': self.name}`` to the
        collection named ``self.name``.
        """
        url_list = self.spider_url()
        for url in url_list:
            self.mongo.add_document({'url': url, 'source': self.name}, self.name)
        # Message reads: "<name> stored, number of urls stored: <count>".
        print(self.name + '入库完毕,入库url个数:' + str(len(url_list)))
if __name__ == '__main__':
    # Script entry point: run a single crawl-and-store pass.
    Spider().save_url()
数据库应用代码save_mongodb.py:
import datetime
import pymongo
from config import MONGO_DB_HOST, MONGO_DB, password, username
class MongoClient(object):
    """Small wrapper around a pymongo connection bound to one database.

    The methods address collections by name; that name is passed in the
    ``date`` parameter (the spider passes its source name there).
    """

    def __init__(self, mongo_uri=MONGO_DB_HOST, port=27017, mongo_db=MONGO_DB):
        """Open the connection and select the working database.

        :param mongo_uri: host name or MongoDB URI handed to pymongo.
        :param port: server port; forwarded to pymongo (it used to be stored
            on the instance only and silently ignored).
        :param mongo_db: name of the database all methods operate on.
        """
        self.mongo_uri = mongo_uri
        self.port = port
        self.mongo_db = mongo_db
        self.client = pymongo.MongoClient(
            self.mongo_uri,
            port=self.port,
            connect=True,
            unicode_decode_error_handler='ignore',
        )
        self.db = self.client[self.mongo_db]

    def add_document(self, item, date):
        """Insert the document ``item`` into the collection named ``date``."""
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in
        # 4.0; insert_one() is the single-document replacement.
        self.db[date].insert_one(item)

    def find_document(self, id, date):
        """Return the document whose ``id`` field equals ``id``.

        :param id: value to match against the ``id`` field.
        :param date: name of the collection to search.
        :return: the matching document, or ``None`` when there is no match
            (which includes the case of a collection that does not exist).
        """
        # find_one() on a missing collection already yields None, so listing
        # all collection names first is an unnecessary extra round trip.
        return self.db[date].find_one({'id': id})

    def find_same_url(self, url, source):
        """Report whether ``url`` is still unknown for ``source``.

        Every collection of the database is searched for a document with the
        given ``url`` and ``source``.

        :return: ``False`` as soon as such a document is found, ``True`` when
            no collection contains one. Note the inverted sense relative to
            the method name.
        """
        query = {'url': url, 'source': source}
        for col in self.db.list_collection_names():
            if self.db[col].find_one(query) is not None:
                return False
        return True

    def find_max_id(self, date):
        """Return the largest ``id`` value stored in the collection ``date``.

        :return: the ``id`` of the document that sorts first by descending
            ``id``; ``None`` when the collection is missing or empty, or when
            that document has no ``id`` field.
        """
        # find_one(sort=...) returns None instead of raising StopIteration
        # the way cursor.next() does on an empty result.
        newest = self.db[date].find_one(sort=[('id', -1)])
        if newest is None:
            return None
        return newest.get('id')

    def close_client(self):
        """Close the underlying pymongo connection."""
        self.client.close()
if __name__ == '__main__':
    def getToday():
        """
        Return today's date.
        :return: the current local date as a ``datetime.date``
        """
        return datetime.date.today()

    # Manual smoke test: query the highest id in the collection named after
    # today's date (YYYY-MM-DD). The result is not used.
    mc = MongoClient()
    try:
        mc.find_max_id(getToday().strftime('%Y-%m-%d'))
    finally:
        # Release the connection even if the query raises.
        mc.close_client()
配置文件config.py:
import os
from urllib import parse

# Every setting can be overridden through an environment variable of the same
# purpose; the fallback is the value that used to be hard-coded here, so an
# unconfigured environment behaves exactly as before.

# Credentials are percent-encoded with quote_plus -- presumably so they can be
# embedded in a mongodb:// URI; the visible code does not build one (TODO
# confirm against the callers).
username = parse.quote_plus(os.environ.get('MONGO_USERNAME', 'xxx'))
password = parse.quote_plus(os.environ.get('MONGO_PASSWORD', 'xxx'))

# Host (or URI) of the MongoDB server and the database the crawler writes to.
MONGO_DB_HOST = os.environ.get('MONGO_DB_HOST', 'localhost')
MONGO_DB = os.environ.get('MONGO_DB', 'benign_url')
|