1.问题描述
应项目需求,需要获取PDF扫描文件中的内容,但找遍全网,能实现这一功能的产品都需要充值会员。苦于囊中羞涩,只好自己编写代码来实现。例如,将PDF中的表格图片(图-1)识别后生成Excel表格(图-2)。
图-1
图-2
2.实现流程
整个步骤为:读取PDF文件->生成图片->OCR获取图片内容->写入Excel
Created with Raphaël 2.2.0
开始
读取PDF
生成图片
OCR获取图片内容
写入Excel
结束
3.功能代码
3.1 pdf转为图片
import fitz
from aip import AipOcr
import time
import docx
from docx.oxml.ns import qn
# Your Baidu AIP credentials: APP ID, API key (AK) and secret key (SK).
# The values below are placeholders and must be replaced before running.
APP_ID = 'xxxxxx'
API_KEY = 'xxxxxxxx'
SECRET_KEY = 'xxxxxxxxxxxxxxxxxxxxxxx'

# Module-level OCR client; ReadDetail_docx() reads this global.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
def pdf_image(pdfPath, imgPath, zoom_x=10, zoom_y=10, rotation_angle=0):
    """Render every page of a PDF to a PNG file.

    Page ``pg`` (0-based) is written to
    ``imgPath + name + "_" + str(pg) + ".png"``.

    Args:
        pdfPath: Path of the PDF file to read.
        imgPath: Location the images are saved to. It is concatenated with
            the file name as-is, so a directory must end with a path
            separator.
        zoom_x: Scale factor along the x axis.
        zoom_y: Scale factor along the y axis. ``zoom_x`` and ``zoom_y``
            normally take the same value; the larger the value, the higher
            the image resolution.
        rotation_angle: Rotation angle applied to every page.

    Returns:
        A ``(name, num)`` tuple holding the PDF's file name (without
        directory or ``.pdf`` suffix) and its page count, for use by the
        next processing step.
    """
    # Function-scope import: this script has no path module imported at the
    # top of the file.
    import ntpath

    # ntpath treats both "\\" and "/" as separators on every platform, so a
    # forward-slash path no longer leaks its directory into the image name.
    base = ntpath.basename(pdfPath)
    stem, ext = ntpath.splitext(base)
    # Only strip a trailing ".pdf" (any letter case); other names are kept.
    name = stem if ext.lower() == ".pdf" else base

    pdf = fitz.open(pdfPath)
    try:
        # NOTE(review): pageCount / preRotate / getPixmap / writePNG are the
        # legacy camelCase PyMuPDF names; newer releases appear to provide
        # snake_case replacements - confirm against the installed version.
        num = pdf.pageCount
        # The transform is identical for every page, so build it once.
        trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotation_angle)
        for pg in range(num):
            pm = pdf[pg].getPixmap(matrix=trans, alpha=False)
            pm.writePNG(imgPath + name + "_" + str(pg) + ".png")
    finally:
        # Release the document even when rendering a page fails.
        pdf.close()
    return name, num
def ReadDetail_docx(imgPath, name, num):
    """OCR the page images of a PDF and save the text as a .docx file.

    Reads ``imgPath + name + "_" + str(n) + ".png"`` for ``n`` in
    ``0 .. num - 1``, sends each image to the module-level ``client`` via
    ``basicAccurate`` and appends every recognised entry as one paragraph.
    The document is saved as ``imgPath + name + ".docx"``, i.e. next to the
    images.

    Args:
        imgPath: Location of the images; concatenated with the file name
            as-is, so a directory must end with a path separator.
        name: PDF name without suffix, as returned by ``pdf_image``.
        num: Number of pages, as returned by ``pdf_image``.
    """
    doc = docx.Document()
    # Use the same typeface for Latin and East Asian text in the Normal style.
    doc.styles["Normal"].font.name = u"宋体"
    doc.styles["Normal"]._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    for n in range(num):
        # Close the image file as soon as its bytes have been read.
        with open(imgPath + name + "_" + str(n) + ".png", 'rb') as img_file:
            img = img_file.read()
        # Short pause between requests (presumably to stay under the
        # service's request-rate limit).
        time.sleep(0.1)
        message = client.basicAccurate(img)
        content = message.get('words_result')
        if content is None:
            # The response carried no recognition result. Report it and go
            # on, so the pages already recognised are still saved.
            print("OCR returned no words_result for page %d: %r" % (n, message))
            continue
        for entry in content:
            doc.add_paragraph(entry.get('words'))

    doc.save(imgPath + name + '.docx')
def pdf_to_docx(pdfPath, imgPath, zoom_x=10, zoom_y=10, rotation_angle=0):
    """Convert a PDF into per-page PNG images, printing progress messages.

    All arguments are forwarded unchanged to ``pdf_image``.

    NOTE(review): despite its name, this function only produces the images;
    it never calls ``ReadDetail_docx`` and the values returned by
    ``pdf_image`` are not used. Confirm whether the .docx step was left out
    on purpose.
    """
    print("正在将pdf文件转换为图片...")
    _pdf_name, _page_total = pdf_image(
        pdfPath,
        imgPath,
        zoom_x,
        zoom_y,
        rotation_angle,
    )
    print("转换成功!")
# Example run: render the scanned PDF to one PNG per page.
# The raw string ends in two backslashes, so the image names are appended
# directly after a trailing separator.
# NOTE(review): this directory is spelled "imges" while the table-recognition
# script reads from "imgs" - confirm which one is intended.
img_path = r"G:\imges\\"
pdf_path = "JRT 0197-2020金融数据安全 数据安全分级指南.pdf"
pdf_to_docx(pdf_path, img_path)
3.2 表格图片文字识别到excel
import pandas as pd
import numpy as np
import re
from aip import AipOcr
import time
import requests
import os
# Directory scanned by get_image() when it is called without an argument.
image_path = ''


def get_image(path=None):
    """Return the paths of all image files below a directory tree.

    Only files with a .png, .jpg, .jpeg or .bmp suffix (any letter case) are
    returned. This keeps the .xlsx files that Image_Excel writes next to the
    images from being picked up as input on a later run.

    Args:
        path: Directory to walk recursively. When omitted, the module-level
            ``image_path`` is used.

    Returns:
        A list of file paths, each joined with the directory it was found
        in. The list is empty when the directory does not exist or holds no
        matching files.
    """
    root_dir = image_path if path is None else path
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')
    images = []
    for root, _dirs, files in os.walk(root_dir):
        images.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(image_suffixes)
        )
    return images
def Image_Excel(APP_ID, API_KEY, SECRET_KEY):
    """Run table recognition on every image and save each result as .xlsx.

    For each path returned by ``get_image()`` the image is submitted with
    ``tableRecognitionAsync``, the task is polled every two seconds until
    its ``ret_msg`` equals '已完成', and the spreadsheet at the returned URL
    is downloaded next to the image with the suffix replaced by ``.xlsx``.

    Args:
        APP_ID: Baidu AIP application id.
        API_KEY: Baidu AIP API key.
        SECRET_KEY: Baidu AIP secret key.

    Raises:
        requests.HTTPError: If downloading the result returns an HTTP error
            status.
    """
    client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
    for image in get_image():
        # Close the image file as soon as its bytes have been read.
        with open(image, 'rb') as img_file:
            img_read = img_file.read()

        table = client.tableRecognitionAsync(img_read)
        request_id = table['result'][0]['request_id']

        # NOTE(review): this loop has no upper bound; a task that never
        # reports '已完成' blocks forever.
        result = client.getTableRecognitionResult(request_id)
        while result['result']['ret_msg'] != '已完成':
            time.sleep(2)
            result = client.getTableRecognitionResult(request_id)

        download_url = result['result']['result_data']
        print(download_url)

        # Bound the download and refuse to store an HTTP error page as a
        # spreadsheet.
        excel_data = requests.get(download_url, timeout=60)
        excel_data.raise_for_status()

        # splitext removes only the final suffix, so dots elsewhere in the
        # path (e.g. "./imgs/a.png") no longer truncate the output name.
        xlsx_name = os.path.splitext(image)[0] + ".xlsx"
        with open(xlsx_name, 'wb') as xlsx:
            xlsx.write(excel_data.content)
if __name__ == '__main__':
    # Baidu AIP credentials; placeholders that must be replaced before use.
    APP_ID = 'xxxxxxxx'
    API_KEY = 'xxxxxxx'
    SECRET_KEY = 'xxxxxxxxxxxxxxxxxxxxxx'

    # Directory that get_image() walks; assigned at module level so the
    # function sees it as a global.
    image_path = r"G:\imgs\\"

    Image_Excel(APP_ID, API_KEY, SECRET_KEY)
4. 案例说明
我这里以《JRT 0197-2020金融数据安全 数据安全分级指南.pdf》扫描文件为例,将其中的表格数据识别后写入Excel文件。
|