Python从Word/PPT/PDF中抽取图片
PS:1. 这些代码也是从网上各个帖子中学习而来,代码的格式以及内容中有粘贴自网上其他大神的部分,如有侵权请告知删除。2. 本次设计意在用 pyinstaller 生成控制台交互程序,但是打包后运行时总是提示“无法使用 pix.save”,而在开发环境(IDE)中直接运行是正常的;如果有大神知道怎么回事,诚心请教。3. 如果只是从单个 PPT 文件或者 Word 文件中提取图片,不必使用该程序:只需要把文件后缀名改为 zip(仅适用于 .docx/.pptx),打开压缩包后,里面会有一个 media 文件夹单独存放文件内的所有图片,拷出即可。
效果图: 完整代码:
import os, re
import docx
import pptx
import fitz
def Scan_Folder_ForWord(Folder_Path):
    """Scan ``Folder_Path`` recursively and extract images from every Word file.

    Every file whose extension is ``.doc`` or ``.docx`` (compared
    case-insensitively) is passed to ``get_pictures_ForWord``. The images of
    ``<name>.docx`` are written to ``<Folder_Path>/WordResult/<name>``.

    Args:
        Folder_Path: Root directory to walk; also the parent of ``WordResult``.
    """
    for FolderName, _SubFolders, FileNames in os.walk(Folder_Path):
        for FileName in FileNames:
            # splitext only reports a real extension, so an extension-less
            # file that happens to be called "doc" is no longer matched.
            stem, extension = os.path.splitext(FileName)
            if extension.lower() not in (".doc", ".docx"):
                continue
            # "~$..." files are the owner/lock files Office creates next to an
            # open document; they are not parseable documents.
            if FileName.startswith("~$"):
                continue
            # NOTE(review): python-docx documents support for .docx only; a
            # legacy binary .doc will presumably make the extractor raise --
            # confirm whether .doc should still be matched here.
            wordPath = os.path.join(FolderName, FileName)
            PicFolder = os.path.join(Folder_Path, "WordResult", stem)
            get_pictures_ForWord(wordPath, PicFolder)
def Scan_Folder_ForPPT(Folder_Path):
    """Scan ``Folder_Path`` recursively and extract images from every PPT file.

    Every file whose extension is ``.ppt`` or ``.pptx`` (compared
    case-insensitively) is passed to ``get_pictures_ForPPT``. The images of
    ``<name>.pptx`` are written to ``<Folder_Path>/PPTResult/<name>``.

    Args:
        Folder_Path: Root directory to walk; also the parent of ``PPTResult``.
    """
    for FolderName, _SubFolders, FileNames in os.walk(Folder_Path):
        for FileName in FileNames:
            # splitext only reports a real extension, so an extension-less
            # file that happens to be called "ppt" is no longer matched.
            stem, extension = os.path.splitext(FileName)
            if extension.lower() not in (".ppt", ".pptx"):
                continue
            # "~$..." files are the owner/lock files Office creates next to an
            # open presentation; they are not parseable presentations.
            if FileName.startswith("~$"):
                continue
            # NOTE(review): python-pptx documents support for .pptx only; a
            # legacy binary .ppt will presumably make the extractor raise --
            # confirm whether .ppt should still be matched here.
            pptPath = os.path.join(FolderName, FileName)
            PicFolder = os.path.join(Folder_Path, "PPTResult", stem)
            get_pictures_ForPPT(pptPath, PicFolder)
def Scan_Folder_ForPDF(Folder_Path):
    """Scan ``Folder_Path`` recursively and extract images from every PDF file.

    Every file whose extension is ``.pdf`` (compared case-insensitively) is
    passed to ``get_pictures_ForPDF``. The images of ``<name>.pdf`` are
    written to ``<Folder_Path>/PDFResult/<name>``.

    Args:
        Folder_Path: Root directory to walk; also the parent of ``PDFResult``.
    """
    for FolderName, _SubFolders, FileNames in os.walk(Folder_Path):
        for FileName in FileNames:
            # splitext only reports a real extension, so an extension-less
            # file that happens to be called "pdf" is no longer matched.
            stem, extension = os.path.splitext(FileName)
            if extension.lower() != ".pdf":
                continue
            pdfPath = os.path.join(FolderName, FileName)
            PicFolder = os.path.join(Folder_Path, "PDFResult", stem)
            get_pictures_ForPDF(pdfPath, PicFolder)
def get_pictures_ForWord(word_path, result_path):
    """Extract the embedded images of one Word document into ``result_path``.

    The relationships of the main document part are inspected; every internal
    relationship whose target reference contains ``"image"`` (for example
    ``media/image8.jpeg``) is written to
    ``<result_path>/<document name>_<image file name>``. ``result_path`` is
    created on demand, so documents without images leave no empty folder.

    Args:
        word_path: Path of the document to open with ``docx.Document``.
        result_path: Directory that receives the extracted image files.
    """
    doc = docx.Document(word_path)
    # Document name without directory or extension. os.path.basename uses the
    # platform's own separator rules, replacing the manual split on "\\" / "/"
    # which picked the wrong separator on POSIX.
    word_name = os.path.basename(os.path.splitext(word_path)[0])

    # rels maps relationship ids ("rId13", ...) to relationship objects.
    for rel in doc.part.rels.values():
        # External relationships (e.g. a hyperlink whose URL contains
        # "image") have no target part to read a blob from.
        if getattr(rel, "is_external", False):
            continue
        if "image" not in rel.target_ref:
            continue
        # Keep only the last path component so references such as
        # "../media/image1.png" or a bare "image1.png" both work.
        img_name = rel.target_ref.replace("\\", "/").rsplit("/", 1)[-1]
        img_savename = f'{word_name}_{img_name}'
        os.makedirs(result_path, exist_ok=True)
        with open(os.path.join(result_path, img_savename), "wb") as f:
            f.write(rel.target_part.blob)
def get_pictures_ForPPT(PPT_path, result_path):
    """Extract the pictures of one presentation into ``result_path``.

    Every shape on every slide is probed for an ``image`` attribute. Shapes
    that expose one with an ``image/...`` content type are written to
    ``<result_path>/<presentation name>_<n>_<image filename>``, where ``n`` is
    a running counter starting at 1. ``result_path`` is created on demand.

    Args:
        PPT_path: Path of the presentation to open with ``pptx.Presentation``.
        result_path: Directory that receives the extracted image files.
    """
    ppt = pptx.Presentation(PPT_path)
    # Presentation name without directory or extension. os.path.basename uses
    # the platform's own separator rules, replacing the manual split on
    # "\\" / "/" which picked the wrong separator on POSIX.
    ppt_name = os.path.basename(os.path.splitext(PPT_path)[0])
    i = 1
    for slide in ppt.slides:
        for shape in slide.shapes:
            # Only the metadata lookup is best-effort: shapes that are not
            # pictures, or whose image cannot be inspected, are skipped.
            # "except Exception" (rather than a bare except) lets
            # KeyboardInterrupt/SystemExit through, and keeping the file write
            # outside the try means disk errors are no longer hidden.
            try:
                image = shape.image
                content_type = image.content_type  # e.g. image/png, image/jpeg
                filename = image.filename
            except Exception:
                continue
            if "image" not in content_type:
                continue
            img_name = f'{i}_{filename}'
            i = i + 1
            img_savename = f'{ppt_name}_{img_name}'
            os.makedirs(result_path, exist_ok=True)
            with open(os.path.join(result_path, img_savename), "wb") as f:
                f.write(image.blob)
def get_pictures_ForPDF(PDF_path, result_path):
    """Extract the image objects of one PDF into ``result_path`` as PNG files.

    Every cross-reference entry of the document is read as text; entries whose
    definition contains ``/Subtype /Image`` are loaded as a pixmap and saved
    as ``<result_path>/<pdf name>_img<n>.png``, where ``n`` is a running
    counter starting at 1. ``result_path`` is created on demand.

    Args:
        PDF_path: Path of the PDF to open with ``fitz.open``.
        result_path: Directory that receives the extracted PNG files.
    """
    # Compiled once; matches "/Subtype" only when followed by "/Image".
    image_marker = re.compile(r"/Subtype(?= */Image)")
    # PDF name without directory or extension. os.path.basename uses the
    # platform's own separator rules, replacing the manual split on
    # "\\" / "/" which picked the wrong separator on POSIX.
    pdf_name = os.path.basename(os.path.splitext(PDF_path)[0])
    count = 1
    # The context manager closes the document even if an image fails.
    with fitz.open(PDF_path) as pdf:
        # xref numbers start at 1; entry 0 is reserved.
        for xref in range(1, pdf.xref_length()):
            # Object definition as text, e.g. "<< /A 57571 0 R /K 90 ... >>".
            if not image_marker.search(pdf.xref_object(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)
            # To skip small decorative images, filter here, for example:
            #     if pix.size < 100000: continue
            # Per the PyMuPDF image-extraction recipe, pixmaps with four or
            # more colour components (CMYK) must be converted to RGB before
            # they can be written as PNG.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            img_savename = f'{pdf_name}_img{count}.png'
            os.makedirs(result_path, exist_ok=True)
            pix.save(os.path.join(result_path, img_savename))
            count += 1
            pix = None  # release the pixmap's buffer before the next image
# Interactive entry point: show the menu, run the selected scanner on the
# current working directory, then wait so the console window stays open.
_MENU_TEXT = (
    "=========================Instruction=========================\n"
    "1 : Put this tool under the folder which the Word/PPT/PDF files exist\n"
    "2 : Input the Function Number then click 'Enter'\n"
    "=============================================================\n"
    "Please Select Function Number\n"
    "1: Get Pictures Form Word\n"
    "2: Get Pictures From PPT\n"
    "3: Get Pictures From PDF\n"
)
num = input(_MENU_TEXT)

# Map each menu number to the folder scanner it triggers; any other input
# selects nothing and falls straight through to the closing prompt.
_SCANNERS = {
    '1': Scan_Folder_ForWord,
    '2': Scan_Folder_ForPPT,
    '3': Scan_Folder_ForPDF,
}
_selected_scanner = _SCANNERS.get(num)
if _selected_scanner is not None:
    _selected_scanner(os.getcwd())

input("\nPress any key to close the window")
有兴趣的伙伴在拷贝代码的时候,建议添加打印信息,以便更好地理解程序。