docker方式运行MongoDB
docker run -itd --name mongo -p 27017:27017 mongo
docker exec -it mongo mongo
MongoDB数据库基础知识
MongoDB为非关系型数据库,数据以键值对方式存储
MongoDB基于磁盘存储
MongoDB数据类型单一,值为JSON文档,而Redis基于内存
MongoDB:库->集合->文档
MySQL: 库->表->表记录
MongoDB基础语法
show dbs
use 库名
show collections
db.集合名.find().pretty()
db.集合名.count()
db.集合名.drop()
db.dropDatabase()
pymongo模块使用流程
sudo pip install pymongo
import pymongo

# Connect to the MongoDB server.
conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
# Select the database and the collection to work with.
db = conn['maoyandb']
myset = db['maoyanset']
# insert_one() takes a single document (a dict).
myset.insert_one({'name': '泰坦尼克号', 'star': 'T', 'time': '1990-01-01'})
# insert_many() takes a list of documents; passing a bare dict raises TypeError.
myset.insert_many([{'name': '泰坦尼克号', 'star': 'T', 'time': '1990-01-01'}])
注意:MongoDB无需提前建库建表,直接操作即可,会自动建库建表
代码示例
"""
猫眼电影首页抓取
"""
from urllib import request
import random
import time
import re
import pymongo
class MaoyanSpider_Home_Page:
    """Scrape movie names from the Maoyan home page and store them in MongoDB.

    The pipeline is ``run`` -> ``get_html`` -> ``parse_html`` -> ``save_html``;
    each stage calls the next one directly.
    """

    def __init__(self):
        """Set the target URL, the request headers and the MongoDB collection."""
        self.url = 'https://www.maoyan.com/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
            'Cookie': 'uuid_n_v=v1; uuid=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; _csrf=ebdd97cd428809914f8919dcfe0c1031f72c3caf9b01aa03d9c831fae4dffd7f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1644740619; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1644741203; _lxsdk_cuid=17ef22e2f1fc8-0bb075b44e2f43-4c3e237c-144000-17ef22e2f1f27; _lxsdk_s=17ef22e2f20-58d-04b-5bb%7C%7C5; _lxsdk=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; __mta=150189705.1644740620254.1644740620254.1644741203077.2'
        }
        # Database and collection are created lazily by MongoDB on first insert.
        self.conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['maoyanset']

    def get_html(self):
        """Download ``self.url`` and pass the decoded HTML to ``parse_html``."""
        req = request.Request(url=self.url, headers=self.headers)
        # Use the response as a context manager so the connection is always
        # closed, even if reading or decoding fails.
        with request.urlopen(req) as res:
            html = res.read().decode()
        self.parse_html(html)

    def parse_html(self, html):
        """Extract the text of every ``ranking-movie-name`` span and save it.

        Args:
            html: Page source as a string.
        """
        pattern = re.compile(r'<span class="ranking-movie-name">(.*?)</span>', re.S)
        self.save_html(pattern.findall(html))

    def save_html(self, r_list):
        """Print each movie name and insert it as ``{'name': ...}``.

        Args:
            r_list: Iterable of movie-name strings.
        """
        for film in r_list:
            item = {'name': film}
            print(item)
            self.myset.insert_one(item)

    def run(self):
        """Fetch the page once, then pause for a random 1-2 seconds."""
        self.get_html()
        time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    # Script entry point: build the home-page spider and start the crawl.
    MaoyanSpider_Home_Page().run()
代码示例2 说明:因为top100榜单猫眼电影更新了反爬虫机制,即使加入cookie也无法爬取,所以改变了爬取信息,能掌握存储到MongoDB的方式就好
"""
猫眼电影附近影院抓取
"""
from urllib import request
import random
import time
import re
import pymongo
class MaoyanSpiderFilmAddress:
    """Scrape cinema names and addresses from Maoyan and store them in MongoDB.

    ``self.i`` counts the documents inserted so far.
    """

    def __init__(self):
        """Set the paged URL template, request headers, collection and counter."""
        self.url = 'https://www.maoyan.com/cinemas?offset={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
            'Cookie': 'uuid_n_v=v1; uuid=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; _csrf=ebdd97cd428809914f8919dcfe0c1031f72c3caf9b01aa03d9c831fae4dffd7f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1644740619; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1644746835; _lxsdk_cuid=17ef22e2f1fc8-0bb075b44e2f43-4c3e237c-144000-17ef22e2f1f27; _lxsdk=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; __mta=150189705.1644740620254.1644746829263.1644746835412.32; _lxsdk_s=17ef2697afa-901-639-438%7C%7C62'
        }
        self.conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['maoyanset']
        # Number of cinema documents inserted so far.
        self.i = 0

    def get_html(self, url):
        """Download ``url`` and pass the decoded HTML to ``parse_html``.

        Args:
            url: Fully formatted page URL.
        """
        req = request.Request(url=url, headers=self.headers)
        # Use the response as a context manager so the connection is always
        # closed, even if reading or decoding fails.
        with request.urlopen(req) as res:
            html = res.read().decode()
        self.parse_html(html)

    def parse_html(self, html):
        """Extract ``(name, address)`` pairs from the page and save them.

        Args:
            html: Page source as a string.
        """
        pattern = re.compile(
            r'<div class="cinema-info">.*?<a href=.*?>(.*?)</a>.*?<p class="cinema-address">(.*?)</p>',
            re.S,
        )
        self.save_html(pattern.findall(html))

    def save_html(self, r_list):
        """Print and insert one document per cinema, incrementing ``self.i``.

        Args:
            r_list: Iterable of ``(name, address)`` tuples.
        """
        for name, address in r_list:
            item = {'name': name, 'address': address}
            print(item)
            self.myset.insert_one(item)
            self.i += 1

    def run(self):
        """Crawl 19 pages, pausing a random 1-5 seconds after each one."""
        for page in range(19):
            # The offset advances by 12 for every page.
            self.get_html(url=self.url.format(page * 12))
            time.sleep(random.randint(1, 5))
if __name__ == '__main__':
    # Script entry point: crawl every page, then report how many cinemas were stored.
    cinema_spider = MaoyanSpiderFilmAddress()
    cinema_spider.run()
    print('附近影院数量:', cinema_spider.i)
结果展示
{
"_id" : ObjectId("6208df549494e00eef326a61"),
"name" : "保利国际影城(杭州西溪天堂店)",
"address" : "地址:西湖区紫金港路21号西溪天堂商业街地下一层(喜来登国际会议中心旁)"
}
{
"_id" : ObjectId("6208df549494e00eef326a62"),
"name" : "千红时代影城",
"address" : "地址:拱墅区丰庆路710号(世纪联华超市4楼)"
}
{
"_id" : ObjectId("6208df549494e00eef326a63"),
"name" : "华纳影城",
"address" : "地址:萧山区临浦镇萧山建材商贸城25幢4楼(新起点ktv)"
}
{
"_id" : ObjectId("6208df549494e00eef326a64"),
"name" : "嘉博杜比巨幕影城",
"address" : "地址:下城区石桥路274号西狗茂南区2楼"
}
{
"_id" : ObjectId("6208df549494e00eef326a65"),
"name" : "大地影院(杭州临安宝龙店)",
"address" : "地址:临安区临安市锦北街道农林大路899号宝龙广场3层M-F3-025室"
}
{
"_id" : ObjectId("6208df549494e00eef326a66"),
"name" : "天玖国际影城(浙商国际中心店)",
"address" : "地址:上城区笕桥街道机场路355号浙商国际中心2幢4楼"
}
{
"_id" : ObjectId("6208df549494e00eef326a67"),
"name" : "太平洋影城(杭州下沙店)",
"address" : "地址:钱塘区下沙街道天城东路955号郡原蓝湖国际4幢3楼"
}
{
"_id" : ObjectId("6208df549494e00eef326a68"),
"name" : "太平洋影城(滨江店)",
"address" : "地址:滨江区江陵路2028号星耀城3幢3楼"
}
Type "it" for more
>
代码示例3
"""
猫眼电影经典影片 按照热门排序
"""
"""
<div class="movie-hover-info">
<div class="movie-hover-title" title="奇迹·笨小孩">
<span class="name ">奇迹·笨小孩</span>
<span class="score channel-detail-orange"><i class="integer">9.</i><i class="fraction">5</i></span>
</div>
<div class="movie-hover-title" title="奇迹·笨小孩">
<span class="hover-tag">类型:</span>
剧情
</div>
<div class="movie-hover-title" title="奇迹·笨小孩">
<span class="hover-tag">主演:</span>
易烊千玺/田雨/陈哈琳
</div>
<div class="movie-hover-title movie-hover-brief" title="奇迹·笨小孩">
<span class="hover-tag">上映时间:</span>
2022-02-01 08:00
</div>
</div>
<div class="movie-hover-info">.*?<span class="name ">(.*?)</span>.*?<span class="score channel-detail-orange"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></span>.*?<span class="hover-tag">类型:</span>(.*?)</div>.*?<span class="hover-tag">主演:</span>(.*?)</div>.*?<span class="hover-tag">上映时间:</span>(.*?)</div>.*?</div>
"""
from urllib import request
import random
import time
import re
import pymongo
class MaoyanSpider_Classic_Film:
    """Scrape Maoyan film listings and store them in MongoDB.

    Each stored document has the keys ``name``, ``score``, ``type``, ``star``
    and ``time``. ``self.i`` counts the documents inserted so far.
    """

    def __init__(self):
        """Set the paged URL template, request headers, counter and collection."""
        self.url = 'https://www.maoyan.com/films?showType=3&sortId=1&offset={}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
            'Cookie': 'uuid_n_v=v1; uuid=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; _csrf=ebdd97cd428809914f8919dcfe0c1031f72c3caf9b01aa03d9c831fae4dffd7f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1644740619; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1644765969; _lxsdk_cuid=17ef22e2f1fc8-0bb075b44e2f43-4c3e237c-144000-17ef22e2f1f27; _lxsdk=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; __mta=150189705.1644740620254.1644765647349.1644765969306.80; _lxsdk_s=17ef39966be-c29-c61-4a0%7C%7C43'
        }
        # Number of film documents inserted so far.
        self.i = 0
        self.conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
        self.db = self.conn['maoyandb']
        self.myset = self.db['maoyanset']

    def get_html(self, url):
        """Download ``url`` and pass the decoded HTML to ``parse_html``.

        Args:
            url: Fully formatted page URL.
        """
        req = request.Request(url=url, headers=self.headers)
        # Use the response as a context manager so the connection is always
        # closed, even if reading or decoding fails.
        with request.urlopen(req) as res:
            html = res.read().decode()
        self.parse_html(html)

    def parse_html(self, html):
        """Extract the film fields from each ``movie-hover-info`` block and save them.

        Each match is a 6-tuple: name, integer part of the score, fractional
        part of the score, genre, cast and release time.

        Args:
            html: Page source as a string.
        """
        regex = r'<div class="movie-hover-info">.*?<span class="name ">(.*?)</span>.*?<span class="score channel-detail-orange"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></span>.*?<span class="hover-tag">类型:</span>(.*?)</div>.*?<span class="hover-tag">主演:</span>(.*?)</div>.*?<span class="hover-tag">上映时间:</span>(.*?)</div>.*?</div>'
        pattern = re.compile(regex, re.S)
        self.save_html(pattern.findall(html))

    def save_html(self, r_list):
        """Clean each match, print it, insert it and increment ``self.i``.

        Args:
            r_list: Iterable of 6-tuples as produced by ``parse_html``.
        """
        for name, integer, fraction, genre, cast, release in r_list:
            item = {
                'name': name.strip(),
                # The score is split across two tags, e.g. "9." + "5".
                'score': integer.strip() + fraction.strip(),
                'type': genre.strip(),
                'star': cast.strip(),
                'time': release.strip(),
            }
            print(item)
            self.myset.insert_one(item)
            self.i += 1

    def run(self):
        """Crawl the configured offsets, pausing a random 1-2 seconds after each."""
        # NOTE(review): this requests offsets 0, 300, ..., 2700 (step 10 times 30).
        # If consecutive pages were intended the step looks too large -- confirm
        # against the site's paging before changing it.
        for page in range(0, 91, 10):
            self.get_html(url=self.url.format(page * 30))
            time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    # Script entry point: crawl the film pages, then report how many films were stored.
    film_spider = MaoyanSpider_Classic_Film()
    film_spider.run()
    print('电影数量:', film_spider.i)
|