安装依赖包(pdfminer.six)
pip install pdfminer.six
读取单个 PDF 文件的文本
import pdfminer.high_level
# Extract all text from a single PDF. The path is a raw string so the Windows
# backslashes are never interpreted as escape sequences ('\p' and '\M' are
# invalid escapes that newer Python versions warn about).
text = pdfminer.high_level.extract_text(r'D:\pypdfTest\MFT_SSCM_DT_HQ_00001_采购计划数据传输服务(MFT)-V0.12.pdf')
# Split the extracted text into chunks at every occurrence of the marker string.
texts = text.split('服务提供方')
print(text)
批量读取目录下所有 PDF 文件的文本
import os
import pdfminer.high_level
def get_file_names(directory: str) -> list:
    """Return the names of the regular files directly inside *directory*.

    Sub-directories are excluded and the listing is not recursive. Only bare
    file names are returned, not full paths. The result is sorted because
    ``os.listdir`` yields entries in arbitrary order; sorting makes repeated
    runs process files in the same sequence.

    Args:
        directory: Path of the directory to list.

    Returns:
        Sorted list of file names found in *directory*.

    Raises:
        OSError: If *directory* cannot be listed (propagated from
            ``os.listdir``), e.g. when it does not exist.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )
# Directory whose files are processed in batch. Raw string: keeps the Windows
# backslash from being parsed as the start of an escape sequence.
directory = r'D:\pypdfTest'
file_names = get_file_names(directory)
for file_name in file_names:
    # Only PDFs can be parsed by pdfminer; skip anything else in the folder
    # instead of letting extract_text fail on it.
    if not file_name.lower().endswith('.pdf'):
        continue
    # os.path.join picks the right separator instead of hard-coding "/".
    text = pdfminer.high_level.extract_text(os.path.join(directory, file_name))
    print(text)