DEV Community

drake
drake

Posted on

将PDF格式的电子书转换为TXT

import fitz  # PyMuPDF

"""
由于DeepL限制文件大小为20M,某些特别大的PDF无法被处理
1、将PDF转换为TXT

"""

class Totxt:
    """Extract the body text of a PDF into a UTF-8 text file using PyMuPDF.

    Text blocks whose first span sits in the top or bottom margin of the page
    are treated as headers/footers and skipped. Within the remaining blocks
    only spans whose ``flags`` value equals ``BODY_TEXT_FLAGS`` are written;
    every other span is reported on stdout as garbled text.
    """

    # Span ``flags`` value that ``to_txt`` accepts as body text.
    BODY_TEXT_FLAGS = 4

    def __init__(self, pdf_source_path, txt_file_path):
        """Remember the input and output locations.

        Args:
            pdf_source_path: Path of the PDF file to read.
            txt_file_path: Path of the text file to create or overwrite.
        """
        # Path of the source PDF.
        self.pdf_file = pdf_source_path
        # Path of the TXT file that will be written.
        self.txt_file = txt_file_path

    def is_header_or_footer(self, block, page_height, margin=15, margin2=20):
        """Return True when a text block lies in the header or footer band.

        The decision is based on the vertical origin of the first span of the
        block's first line.

        Args:
            block: A text block dict containing a ``'lines'`` list.
            page_height: Height of the page the block belongs to.
            margin: Height of the header band measured from the top.
            margin2: Height of the footer band measured from the bottom.

        Returns:
            True if the block starts above ``margin`` or below
            ``page_height - margin2``. A block without any line or span has
            no position to test and is reported as False.
        """
        lines = block['lines']
        if not lines or not lines[0]['spans']:
            # Nothing to measure; avoid an IndexError on empty blocks.
            return False
        y = lines[0]['spans'][0]['origin'][1]
        return y < margin or y > page_height - margin2

    @staticmethod
    def _ends_with_break_marker(text):
        """Return True when ``text`` ends with the line-break marker.

        NOTE(review): the marker is the empty string, and a single character
        never equals it, so this is always False and no newline is ever
        emitted. The intended marker character appears to have been lost;
        confirm it with the author before changing this.
        """
        return '' == text[-1]

    def to_txt(self):
        """Write the body text of every page of the PDF to the TXT file.

        The PDF document and the output file are both closed on exit, even
        when extraction fails part-way through.
        """
        # The PDF is opened first so a missing or unreadable PDF does not
        # truncate an existing output file.
        with fitz.open(self.pdf_file) as pdf_document, \
                open(self.txt_file, 'w', encoding='utf-8') as output_file:
            for page in pdf_document:
                page_height = page.rect.height
                last_text_on_page = ''

                for block in page.get_text("dict")["blocks"]:
                    # Blocks without "lines" carry no text to extract.
                    if "lines" not in block:
                        continue
                    if self.is_header_or_footer(block, page_height):
                        continue

                    written = []
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            if span['flags'] != self.BODY_TEXT_FLAGS:
                                # Not body text: report it instead of writing.
                                print("乱码", text)
                                continue
                            if text:
                                output_file.write(text)
                                written.append(text)
                                last_text_on_page = text

                    # Separate blocks when the last fragment ends a line.
                    if written and self._ends_with_break_marker(written[-1]):
                        output_file.write('\n')

                # Separate pages when the last fragment ends a line.
                if last_text_on_page and self._ends_with_break_marker(
                        last_text_on_page):
                    output_file.write('\n')

        print(f"PDF内容已成功保存到 {self.txt_file}")

Enter fullscreen mode Exit fullscreen mode

Top comments (0)