import fitz # PyMuPDF
"""
由于DeepL限制文件大小为20M,某些特别大的PDF无法被处理
1、将PDF转换为TXT
"""
class Totxt:
    """Extract the body text of a PDF into a UTF-8 text file.

    Text blocks whose first span sits inside the top or bottom page margin
    are treated as page headers/footers and skipped. Within the remaining
    blocks only spans whose ``flags`` equal ``BODY_TEXT_FLAGS`` are written;
    every other span is reported on stdout and dropped.
    """

    # Span ``flags`` value accepted as body text. In PyMuPDF's span flag bit
    # field, 4 is the "serifed" bit on its own (no italic/bold/monospace).
    # NOTE(review): this is a document-specific heuristic -- bold or italic
    # body text is rejected by it; adjust per document if needed.
    BODY_TEXT_FLAGS = 4

    def __init__(self, pdf_source_path, txt_file_path):
        """Remember the input and output locations.

        Args:
            pdf_source_path: Path of the PDF file to read.
            txt_file_path: Path of the text file to create (overwritten).
        """
        self.pdf_file = pdf_source_path
        self.txt_file = txt_file_path

    def is_header_or_footer(self, block, page_height, margin=15, margin2=20):
        """Return True when *block* lies in the header or footer band.

        The decision uses the y coordinate of the ``origin`` of the first
        span of the first line of the block.

        Args:
            block: A text block from ``page.get_text("dict")["blocks"]``.
            page_height: Height of the page the block belongs to.
            margin: Height of the header band measured from the page top.
            margin2: Height of the footer band measured from the page bottom.

        Returns:
            True if the block starts above ``margin`` or below
            ``page_height - margin2``. A block without any line or whose
            first line has no span carries no position and yields False.
        """
        lines = block.get('lines')
        if not lines:
            return False
        first_spans = lines[0].get('spans')
        if not first_spans:
            return False
        y = first_spans[0]['origin'][1]
        return y < margin or y > page_height - margin2

    def to_txt(self):
        """Convert the PDF to text and write it to ``self.txt_file``.

        A line break is emitted after every block whose last written span
        ends with the full stop '。', and once more at the end of a page
        whose last written span ends with '。'.
        """
        # The PDF is opened first so a missing/corrupt input does not
        # truncate an existing output file; both handles are closed by the
        # context managers even when extraction fails midway.
        with fitz.open(self.pdf_file) as pdf_document:
            with open(self.txt_file, 'w', encoding='utf-8') as output_file:
                for page in pdf_document:
                    self._write_page(page, output_file)
        print(f"PDF内容已成功保存到 {self.txt_file}")

    def _write_page(self, page, output_file):
        """Write the body text of one page, then the page separator if due."""
        page_height = page.rect.height
        page_last_text = ''
        for block in page.get_text("dict")["blocks"]:
            # Blocks without "lines" (e.g. images) carry no text.
            if "lines" not in block:
                continue
            if self.is_header_or_footer(block, page_height):
                continue
            # Keep the previous value when this block wrote nothing.
            page_last_text = (
                self._write_block(block, output_file) or page_last_text
            )
        # Separate pages with a line break only when the page ended on a
        # complete sentence; otherwise the sentence continues on the next page.
        if page_last_text.endswith('。'):
            output_file.write('\n')

    def _write_block(self, block, output_file):
        """Write the body-text spans of one block; return the last text written.

        Returns an empty string when nothing was written for this block.
        """
        last_text = ''
        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"].strip()
                if span['flags'] != self.BODY_TEXT_FLAGS:
                    # Not recognised as body text: report it and skip it.
                    print("乱码", text)
                    continue
                if text:
                    output_file.write(text)
                    last_text = text
        # End the paragraph when the block finishes a sentence.
        if last_text.endswith('。'):
            output_file.write('\n')
        return last_text
For further actions, you may consider blocking this person and/or reporting abuse
Top comments (0)