Web Crawlers (Part 1)
Keywords: web crawler, data analysis, data storage
一、 Overall steps of a web crawler
- Step 1: Identify the data source and the exact URL of the target page.
- Step 2: Observe the pattern in the URL addresses (see the short sketch after this list).
- Step 3: Write a regular expression, an XPath expression, or a similar matcher to extract the target information.
- Step 4: Write the actual crawler program.
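For example, the Douban Book Top 250 list used below is split over 10 pages of 25 books each, so its start query parameter takes the values 0, 25, 50, ..., 225. A minimal sketch of generating the page URLs from that pattern:
base_url = "https://book.douban.com/top250?start={}"
# 10 pages, 25 books per page: start = 0, 25, ..., 225
page_urls = [base_url.format((page - 1) * 25) for page in range(1, 11)]
print(page_urls[:3])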
二、Example: the Douban Book Top 250 (using regular expressions)
import re
import time
import random
import requests
class DoubanBookSpider:
    def __init__(self):
        # Page URL template; "start" controls pagination (0, 25, 50, ...)
        self.url = "https://book.douban.com/top250?start={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/80.0.3987.116 Safari/537.36"
        }

    def get_html(self, url):
        # Fetch a page and return its HTML text
        return requests.get(url=url, headers=self.headers).text

    def parse_html(self, html):
        # One regex captures: title, publication info, score, rating count, short description
        regex = '<td valign="top">.*?title="(.*?)".*?<p class="pl">(.*?)</p>.*?<span class="rating_nums">(.*?)</span>.*?<span class="pl">(.*?)</span>.*?<span class="inq">(.*?)</span>'
        book_list = re.findall(regex, html, re.S)
        for book in book_list:
            item = {}
            item["book_name"] = book[0]
            item["book_info"] = book[1]
            item["book_score"] = book[2]
            # Strip the surrounding parentheses from the rating-count text
            item["book_number"] = book[3][1:-1].strip()
            item["book_desc"] = book[4]
            print(item)

    def save_html(self):
        # Placeholder: storage is covered in the next section
        pass

    def crawl(self):
        for page in range(1, 11):
            page_url = self.url.format((page - 1) * 25)
            html = self.get_html(page_url)
            self.parse_html(html)
            # Sleep 1 or 2 seconds between pages to avoid hammering the site
            time.sleep(random.randint(1, 2))

if __name__ == '__main__':
    spider = DoubanBookSpider()
    spider.crawl()
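Step 3 also mentions XPath as an alternative to regular expressions. Below is a minimal, hedged sketch of a parse_html variant built on lxml; the XPath expressions are assumptions derived from the same class names and attributes the regex above matches, so they may need adjusting against the live page.
from lxml import etree

def parse_html_xpath(html):
    # Assumed XPath paths, mirroring the class names/attributes used by the regex
    root = etree.HTML(html)
    titles = root.xpath('//td[@valign="top"]//a/@title')
    infos = root.xpath('//p[@class="pl"]/text()')
    scores = root.xpath('//span[@class="rating_nums"]/text()')
    descs = root.xpath('//span[@class="inq"]/text()')
    # Note: a book without a short description would shift this zip alignment
    for name, info, score, desc in zip(titles, infos, scores, descs):
        print({"book_name": name, "book_info": info,
               "book_score": score, "book_desc": desc})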
三、Ways to store the crawled data
1、MySQL storage
In the __init__ method, create the database connection and cursor objects.
import re
import time
import random
import requests
import pymysql
class DoubanBookSpider:
    def __init__(self):
        self.url = "https://book.douban.com/top250?start={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/80.0.3987.116 Safari/537.36"
        }
        # Database connection and cursor, created once for the whole crawl
        self.db = pymysql.connect(host='localhost', user='root', password='123456',
                                  database='bookdb', charset='utf8')
        self.cur = self.db.cursor()
        # Parameterized INSERT; the booktab table must already exist with five columns
        self.ins = 'insert into booktab values(%s,%s,%s,%s,%s)'
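The INSERT statement assumes that the bookdb database and a five-column booktab table already exist. A one-off setup sketch is shown below; the column names and types are assumptions for illustration only, not taken from the original post.
import pymysql

# One-off setup, run once outside the spider (assumed schema, adjust as needed)
conn = pymysql.connect(host='localhost', user='root', password='123456', charset='utf8')
cur = conn.cursor()
cur.execute('create database if not exists bookdb character set utf8')
cur.execute(
    'create table if not exists bookdb.booktab('
    'book_title varchar(100), book_info varchar(200), book_score varchar(10), '
    'book_number varchar(20), book_desc varchar(300))'
)
conn.commit()
cur.close()
conn.close()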
The get_html request method stays the same; during regex parsing, each record is additionally written into the database.
    def parse_html(self, html):
        regex = '<td valign="top">.*?title="(.*?)".*?<p class="pl">(.*?)</p>.*?<span class="rating_nums">(.*?)</span>.*?<span class="pl">(.*?)</span>.*?<span class="inq">(.*?)</span>'
        book_list = re.findall(regex, html, re.S)
        for book in book_list:
            item = {}
            item["book_title"] = book[0]
            item["book_info"] = book[1]
            item["book_score"] = book[2]
            item["book_number"] = book[3][1:-1].strip()
            item["book_desc"] = book[4]
            print(item)
            # The list order must match the column order of the INSERT statement
            li = [
                item["book_title"],
                item["book_info"],
                item["book_score"],
                item["book_number"],
                item["book_desc"],
            ]
            self.cur.execute(self.ins, li)
            self.db.commit()
Finally, close the cursor and the database connection once the crawl loop finishes.
    def crawl(self):
        for page in range(1, 11):
            page_url = self.url.format((page - 1) * 25)
            html = self.get_html(page_url)
            self.parse_html(html)
            time.sleep(random.randint(1, 2))
        # Release database resources once all pages have been processed
        self.cur.close()
        self.db.close()
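Committing after every single row is fine for 250 records, but for larger crawls pymysql's executemany can insert a batch of rows in one call. A hedged, self-contained sketch (the row values are placeholders):
import pymysql

# Batch-insert sketch: one executemany call instead of one execute per row
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='bookdb', charset='utf8')
cur = db.cursor()
ins = 'insert into booktab values(%s,%s,%s,%s,%s)'
rows = [
    ['placeholder title 1', 'placeholder info', '9.0', '1000', 'placeholder desc'],
    ['placeholder title 2', 'placeholder info', '8.5', '2000', 'placeholder desc'],
]
cur.executemany(ins, rows)
db.commit()
cur.close()
db.close()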
2、MongoDB storage (relatively simple)
In the __init__ method, create handles for the MongoDB database and collection.
import re
import time
import random
import requests
import pymongo
class DoubanBookSpider:
    def __init__(self):
        self.url = "https://book.douban.com/top250?start={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/80.0.3987.116 Safari/537.36"
        }
        # The database and collection are created lazily on first insert
        self.conn = pymongo.MongoClient(host="localhost", port=27017)
        self.db = self.conn["bookdb"]
        self.myset = self.db["bookset"]
Note that MongoDB stores key-value documents in a collection, so the data to be inserted must be a dictionary; the item dict built during parsing can be passed to insert_one as-is.
    def parse_html(self, html):
        regex = '<td valign="top">.*?title="(.*?)".*?<p class="pl">(.*?)</p>.*?<span class="rating_nums">(.*?)</span>.*?<span class="pl">(.*?)</span>.*?<span class="inq">(.*?)</span>'
        book_list = re.findall(regex, html, re.S)
        for book in book_list:
            item = {}
            item["book_name"] = book[0]
            item["book_info"] = book[1]
            item["book_score"] = book[2]
            item["book_number"] = book[3][1:-1].strip()
            item["book_desc"] = book[4]
            print(item)
            # The dict is stored directly as a MongoDB document
            self.myset.insert_one(item)
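Be aware that insert_one also adds an _id field to the dict it receives. To check what was stored, a short pymongo query sketch (database, collection, and field names as used above):
import pymongo

conn = pymongo.MongoClient(host="localhost", port=27017)
myset = conn["bookdb"]["bookset"]
print(myset.count_documents({}))               # total number of stored books
for doc in myset.find().limit(3):              # peek at the first few documents
    print(doc["book_name"], doc["book_score"])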
3、Saving to a CSV file
Open a CSV file and create a csv writer in the __init__ method.
import re
import time
import random
import requests
import csv
class DoubanBookSpider:
    def __init__(self):
        self.url = "https://book.douban.com/top250?start={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/80.0.3987.116 Safari/537.36"
        }
        # newline="" prevents blank lines between rows on Windows
        self.f = open("douban_book.csv", "w", newline="", encoding="utf-8")
        self.write = csv.writer(self.f)
Write each parsed record into the CSV file.
    def parse_html(self, html):
        regex = '<td valign="top">.*?title="(.*?)".*?<p class="pl">(.*?)</p>.*?<span class="rating_nums">(.*?)</span>.*?<span class="pl">(.*?)</span>.*?<span class="inq">(.*?)</span>'
        book_list = re.findall(regex, html, re.S)
        for book in book_list:
            item = {}
            item["book_name"] = book[0]
            item["book_info"] = book[1]
            item["book_score"] = book[2]
            item["book_number"] = book[3][1:-1].strip()
            item["book_desc"] = book[4]
            print(item)
            # One CSV row per book, in a fixed column order
            li = [
                item["book_name"],
                item["book_info"],
                item["book_score"],
                item["book_number"],
                item["book_desc"],
            ]
            self.write.writerow(li)
Finally, close the CSV file after the crawl loop finishes.
    def crawl(self):
        for page in range(1, 11):
            page_url = self.url.format((page - 1) * 25)
            html = self.get_html(page_url)
            self.parse_html(html)
            time.sleep(random.randint(1, 2))
        # Close the file so buffered rows are flushed to disk
        self.f.close()
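As a hedged alternative, csv.DictWriter can write the item dict directly and also emit a header row; the field names below match the item keys used above, and the sample row is a placeholder.
import csv

fieldnames = ["book_name", "book_info", "book_score", "book_number", "book_desc"]
with open("douban_book_dict.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()                        # header row with the column names
    writer.writerow({"book_name": "placeholder title", "book_info": "placeholder info",
                     "book_score": "9.0", "book_number": "1000", "book_desc": "placeholder"})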
四、Summary
MySQL, Redis, and MongoDB can all be used to store crawled data. MySQL and MongoDB store data on disk, while Redis keeps it in memory. Compared with MySQL, MongoDB is a non-relational database with a very simple data model: every value is a JSON-style document, which makes it more convenient and faster to work with. Character encoding also matters when processing the fetched pages. We have assumed UTF-8 as the default so far, but some sites use GB2312, so the earlier approach
html = requests.get(url=url, headers=self.headers).text
may return garbled text. One fix is to decode the raw bytes explicitly:
html = requests.get(url=url, headers=self.headers).content.decode("gb2312", "ignore")
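Alternatively, requests can guess the charset from the response body itself; a small sketch using the library's apparent_encoding attribute:
import requests

resp = requests.get("https://book.douban.com/top250",
                    headers={"User-Agent": "Mozilla/5.0"})
resp.encoding = resp.apparent_encoding   # let requests guess the charset (e.g. gb2312)
html = resp.text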
五、Tomorrow's lesson plan
The examples above all crawl only a single level of pages. Tomorrow we will learn how to crawl deeper, multi-level data, and then store it using the storage methods described above.