Preface
This came about because my graduate advisor needed to batch-process corporate social responsibility reports and extract the counts of a set of shared keywords; the script handles most batch keyword-counting tasks of this kind.
The code runs, though not necessarily at optimal efficiency (my setup manages about 2.5 s per report). The libraries that ship with Anaconda are enough; nothing extra needs to be installed.
Function modules
The complete listing is in the Full code section; this part walks through the approach and the corresponding function modules.
Batch-renaming the files
The file names are in Chinese and have no bearing on the final result, so the files are batch-renamed to numbers. Note: if this is not the first run (i.e., the renaming is already done), just comment this function out in the main function.
def rename():
    path = r'C:\Users\ASUS\Desktop\IMUFE\E.Python\text analysis\dealPdf'
    filelist = os.listdir(path)
    for i, files in enumerate(filelist):
        Olddir = os.path.join(path, files)
        if os.path.isdir(Olddir):  # skip sub-directories
            continue
        Newdir = os.path.join(path, str(i + 1) + '.pdf')
        os.rename(Olddir, Newdir)
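os.listdir returns names in arbitrary order, so the numbers handed out above are not tied to any particular ordering of the original files. If reproducible numbering matters, it should be enough to sort the listing first; a minimal sketch (the lexicographic key is an assumption, pick whatever suits the names):

filelist = sorted(os.listdir(path))  # deterministic order across runs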
Converting PDF to txt
A PDF cannot be text-analyzed directly, so its text has to be converted into a txt file (text inside images in the PDF cannot be extracted).
def pdf_to_txt(dealPdf, index):
    logging.getLogger().setLevel(logging.ERROR)  # silence pdfminer's verbose layout warnings
    rsrcmgr = PDFResourceManager()  # the interpreter and the device must share one resource manager
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    doc = PDFDocument()
    with open(dealPdf, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        txt_filename = 'dealTxt\\' + str(index) + '.txt'
        with open(txt_filename, 'w', encoding="utf-8") as fw:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):  # keep only horizontal text boxes
                        fw.write(x.get_text())
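Newer environments usually ship pdfminer.six, where the old PDFDocument/set_parser/initialize API used above no longer exists. Its high-level helper does the same extraction in a few lines; a minimal sketch, assuming pdfminer.six is installed (pdf_to_txt_six is a hypothetical name):

from pdfminer.high_level import extract_text

def pdf_to_txt_six(dealPdf, index):
    text = extract_text(dealPdf)  # layout analysis is handled internally
    with open('dealTxt\\' + str(index) + '.txt', 'w', encoding='utf-8') as fw:
        fw.write(text)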
Removing newlines from the txt
The txt exported from a PDF wraps lines with newline characters; to keep words from being split at the wraps, all newlines are removed.
def delete_huanhangfu(dealTxt, index):
    outPutTxt = 'outPutTxt\\' + str(index) + '.txt'
    with open(dealTxt, 'r', encoding="utf-8") as f:
        # strip the trailing newline of every line and join the pieces back together
        outPutString = ''.join(line.rstrip('\n') for line in f)
    with open(outPutTxt, 'w', encoding="utf-8") as fw:
        fw.write(outPutString)
Adding custom words
Customize these to your needs; the wordsByMyself referenced here is a global variable.
def word_by_myself():
    for word in wordsByMyself:
        jieba.add_word(word)  # keep custom phrases from being split by the segmenter
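With a longer word list, jieba can also read the custom entries from a dictionary file instead of looping over add_word. A minimal sketch, assuming a userdict.txt (hypothetical file) with one entry per line:

jieba.load_userdict('userdict.txt')  # each line: word [frequency] [POS tag], the last two optional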
Word segmentation and frequency counting
jieba does the segmentation; a general stop-word list is read to drop stop words (this step can actually be skipped, as it barely affects the final result). Words and their occurrence counts are gathered into key-value pairs, and the occurrence count of each keyword is printed.
def cut_and_count(outPutTxt):
    with open(outPutTxt, encoding='utf-8') as f:
        text = f.read()
    words = jieba.lcut(text)
    with open('stopwords.txt', encoding='utf-8') as f:
        stopwords = {line.rstrip() for line in f}
    counts = {}
    for word in words:
        if len(word) == 1 or word in stopwords:  # skip single characters and stop words
            continue
        counts[word] = counts.get(word, 0) + 1
    for word in wordsByMyself:
        print(word + ':' + str(counts.get(word, 0)))
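collections.Counter from the standard library expresses the same counting step more compactly; a minimal sketch using the words and stopwords defined above:

from collections import Counter

counts = Counter(w for w in words if len(w) > 1 and w not in stopwords)
for word in wordsByMyself:
    print(f'{word}:{counts[word]}')  # Counter yields 0 for missing keys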
Main function
A for loop drives the batch run.
if __name__ == "__main__":
    # rename()  # uncomment on the first run; comment out again once the files are numbered
    word_by_myself()  # registering the custom words once is enough
    for i in range(1, fileNum + 1):
        pdf_to_txt('dealPdf\\' + str(i) + '.pdf', i)
        delete_huanhangfu('dealTxt\\' + str(i) + '.txt', i)
        print(f'----------result {i}----------')
        cut_and_count('outPutTxt\\' + str(i) + '.txt')
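fileNum is hard-coded; it can instead be derived from the folder so the loop always matches the data. A minimal sketch, assuming the numbered reports are the only .pdf files in dealPdf:

import glob

fileNum = len(glob.glob('dealPdf\\*.pdf'))  # one report per .pdf file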
Local file structure
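Reconstructed from the paths used in the code, the script expects this layout in its working directory:

text analysis\
    dealPdf\         the source PDFs, renamed 1.pdf ... 17.pdf
    dealTxt\         raw text extracted from the PDFs
    outPutTxt\       the extracted text with newlines removed
    stopwords.txt    the stop-word list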
Full code
import jieba
import jieba.analyse
import logging
import os
from pdfminer.pdfparser import PDFParser, PDFDocument, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams

wordsByMyself = ['社会责任', '义务', '上市', '公司']  # keywords to count; edit as needed
fileNum = 17  # number of PDF reports to process
def rename():
    path = r'C:\Users\ASUS\Desktop\IMUFE\E.Python\text analysis\dealPdf'
    filelist = os.listdir(path)
    for i, files in enumerate(filelist):
        Olddir = os.path.join(path, files)
        if os.path.isdir(Olddir):  # skip sub-directories
            continue
        Newdir = os.path.join(path, str(i + 1) + '.pdf')
        os.rename(Olddir, Newdir)
def pdf_to_txt(dealPdf, index):
    logging.getLogger().setLevel(logging.ERROR)  # silence pdfminer's verbose layout warnings
    rsrcmgr = PDFResourceManager()  # the interpreter and the device must share one resource manager
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    doc = PDFDocument()
    with open(dealPdf, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        txt_filename = 'dealTxt\\' + str(index) + '.txt'
        with open(txt_filename, 'w', encoding="utf-8") as fw:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):  # keep only horizontal text boxes
                        fw.write(x.get_text())
def delete_huanhangfu(dealTxt, index):
    outPutTxt = 'outPutTxt\\' + str(index) + '.txt'
    with open(dealTxt, 'r', encoding="utf-8") as f:
        # strip the trailing newline of every line and join the pieces back together
        outPutString = ''.join(line.rstrip('\n') for line in f)
    with open(outPutTxt, 'w', encoding="utf-8") as fw:
        fw.write(outPutString)
def word_by_myself():
    for word in wordsByMyself:
        jieba.add_word(word)  # keep custom phrases from being split by the segmenter
def cut_and_count(outPutTxt):
    with open(outPutTxt, encoding='utf-8') as f:
        text = f.read()
    words = jieba.lcut(text)
    with open('stopwords.txt', encoding='utf-8') as f:
        stopwords = {line.rstrip() for line in f}
    counts = {}
    for word in words:
        if len(word) == 1 or word in stopwords:  # skip single characters and stop words
            continue
        counts[word] = counts.get(word, 0) + 1
    for word in wordsByMyself:
        print(word + ':' + str(counts.get(word, 0)))
if __name__ == "__main__":
    # rename()  # uncomment on the first run; comment out again once the files are numbered
    word_by_myself()  # registering the custom words once is enough
    for i in range(1, fileNum + 1):
        pdf_to_txt('dealPdf\\' + str(i) + '.pdf', i)
        delete_huanhangfu('dealTxt\\' + str(i) + '.txt', i)
        print(f'----------result {i}----------')
        cut_and_count('outPutTxt\\' + str(i) + '.txt')
Result preview
For each report the script prints a header such as ----------result 1---------- followed by one keyword:count line per custom word.