需求:将doc文档的表格里的部分数据整理到excel。
步骤:
step1:将doc格式转为docx
step2:读取docx数据并去重
step3:将批量读取到的数据存储为excel
requirements:
pywin32=225
python-docx
pandas
工程目录:
import
# -*- coding: utf-8 -*-
import os
import re
import time
from win32com import client
from docx import Document
import pandas as pd
main
def main():
data_list = []
folder_path = r'C:\Users\LRL09\Desktop\python\source'
for index, origin_file in get_origin_file(folder_path):
target_file = origin_file.replace('.doc', '.docx').replace('source', 'temp')
target_path = '\\'.join(target_file.split('\\')[:-1])
if not os.path.exists(target_path):
os.makedirs(target_path)
print(origin_file)
doc2docx(origin_file, target_file)
data = get_data_from(target_file)
data['序号'] = index
data_list.append(data)
df = pd.DataFrame(data_list, columns=columns)
df.to_excel(f'{folder}.xlsx', index=False)
get_origin_file
def get_origin_file(folder_path):
file_list = []
for root, dirs, files in os.walk(folder_path):
for name in files:
if '~' in name:
continue
if name.endswith('.doc') or name.endswith('.docx'):
origin_file = os.path.join(root, name)
try:
index = re.findall(r'2021- *(\d+)', origin_file)[0]
except IndexError:
index = re.findall(r'\\(\d+)', origin_file)[-1]
file_list.append((index, origin_file))
file_list = sorted(file_list, key=lambda _: int(_[0]))
for (index, origin_file), i in zip(file_list, range(1, 1000)):
assert index == str(i)
return file_list
doc2docx
def doc2docx(origin_file, target_file):
word = client.gencache.EnsureDispatch('kwps.application')
doc = word.Documents.Open(origin_file)
doc.SaveAs2(target_file, 12)
doc.Close()
word.Quit()
time.sleep(1)
get_data_from
def get_data_from(docx):
# 获取docx数据并去重,因为Document读取docx后会出现很多重复
doc = Document(docx)
tb = doc.tables[0]
data_table = []
for i, row in enumerate(tb.rows):
data_row = []
temp = ''
for j, cell in enumerate(row.cells):
if temp == cell.text:
continue
else:
temp = cell.text
data_row.append(temp)
# print(data_row)
data_table.append(data_row)
# 获取目标数据
leader_name = data_table[1][1]
department = data_table[2][1]
project_type = data_table[4][1]
approve_money = data_table[6][1]
actual_money = data_table[6][3]
remained_money = data_table[6][5]
advise = data_table[13][1]
# 构造字典
data = {
'leader_name': leader_name,
'department': department,
'project_type': project_type,
'approve_money': approve_money,
'actual_money': actual_money,
'remained_money': remained_money,
'advise': advise,
}
return data
|