python爬取微信公众号图片并生成word文档
微信公众号中有些推文是以图片的形式推送的,若想提取这些图片,需要先逐张下载下来。图片数量一多,就要花费更多的时间,效率很低。因此写下此代码,可以将推文中的图片批量提取并插入word中生成文档,以提高提取效率。
一、代码
import requests
from bs4 import BeautifulSoup
import datetime
import os
from docx import Document
from docx.shared import Cm
def get_picture(url):
    """Download the images referenced by a web page into a timestamped folder.

    The page at ``url`` is fetched and parsed, and every ``<img>`` tag that
    carries a ``data-src`` attribute is downloaded. Files are written to a
    folder named after the current time (``YYYYmmddHHMM``) inside the current
    working directory, and are named ``<n>.<data-type>`` where ``n`` counts
    successfully saved images starting at 0.

    Args:
        url: Address of the article page to scan for images.

    Returns:
        None. Progress and the final count are printed to stdout.

    Note:
        Two runs within the same minute share one folder, so the second run
        overwrites files with the same index and extension.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
    }

    # One output folder per minute, created relative to the working directory.
    path = datetime.datetime.now().strftime('%Y%m%d%H%M')
    if os.path.exists(path):
        print("属于这个时间点的文件夹已经创建好")
    else:
        os.mkdir(path)
        print("创建成功!!!!正在保存图片")
    # os.path.join instead of hard-coded '\\' so the files really end up
    # inside the folder on every platform, not only on Windows.
    dirname = os.path.join(os.getcwd(), path)

    req = requests.get(url=url, headers=headers).content.decode()
    soup = BeautifulSoup(req, 'lxml')

    a = 0  # number of images saved so far; also the next file index
    for img in soup.select('img'):
        src = img.get("data-src")
        if src is None:
            # Tag has no data-src attribute: nothing to download.
            continue
        img_type = img.get("data-type")
        print(f'链接:{src}类型为:{img_type}')
        try:
            # Download first, so a failed request does not leave an empty
            # file behind.
            data = requests.get(url=src, headers=headers).content
            with open(os.path.join(dirname, f'{a}.{img_type}'), 'wb') as f:
                f.write(data)
        except (requests.RequestException, OSError):
            # Best effort: an unusable link or unwritable file is skipped
            # without consuming an index.
            print("该链接为空自动跳过!")
            continue
        a += 1
    print(f"此次一共成功保存图片{a}张")
def Generate_word(image_dir=r'G:\python_demo\python爬取微信公众号图片\202107102104',
                  output=r'./二年级语文(上册).docx',
                  start=1,
                  stop=128):
    """Build a Word document containing one numbered PNG image after another.

    The document's first section gets zero margins on all four sides, and
    every picture is inserted at 21.88 cm x 29 cm.

    Args:
        image_dir: Folder holding the images, named ``<index>.png``.
            Defaults to the folder used by the original script.
        output: Path of the ``.docx`` file to write.
        start: Index of the first image to insert (inclusive).
        stop: Index at which to stop (exclusive); the defaults insert
            ``1.png`` through ``127.png``.

    Returns:
        None. The document is saved to ``output``.

    Note:
        ``get_picture`` numbers its files from 0 and names them after each
        image's ``data-type``; pass ``start=0`` to include ``0.png``.
        A missing image file makes ``add_picture`` raise.
    """
    Doc = Document()

    # Remove all page margins so each picture can fill the page.
    sec = Doc.sections[0]
    sec.left_margin = Cm(0)
    sec.right_margin = Cm(0)
    sec.top_margin = Cm(0)
    sec.bottom_margin = Cm(0)

    for i in range(start, stop):
        picture = os.path.join(image_dir, '{}.png'.format(i))
        Doc.add_picture(picture, width=Cm(21.88), height=Cm(29))

    Doc.save(output)
def _main():
    """Ask for an article URL, download its images, then build the document."""
    # Step 1: fetch every image of the article into a timestamped folder.
    article_url = input("请输入url:")
    get_picture(article_url)

    # Step 2: assemble the downloaded images into a Word document.
    print("Saving......")
    Generate_word()
    print("文档创建成功!")


if __name__ == '__main__':
    _main()

二、实例
1、复制公众号推文链接(示例推文:《2020部编版二年级语文上册电子课本【提前预习】》)。
2、运行程序,在交互窗口中输入(粘贴)推文链接并回车,程序会自动下载图片并生成word文档。
|