DEV Community

artydev
artydev

Posted on

Extractive summarization in Python with Sumy

# %%
# %pip install pymupdf
# %pip install sumy
# %pip install frontend
# %pip install tools

# %%
import pymupdf  # PyMuPDF
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# %%
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A ``(text, page_count)`` tuple: the text of all pages concatenated
        in page order, and the number of pages in the document.
    """
    # The context manager closes the document even when a page fails to
    # load, so the underlying file handle is never left open.
    with pymupdf.open(pdf_path) as doc:
        # Read the page count while the document is still open.
        page_count = len(doc)
        # Joining once avoids rebuilding the string for every page.
        text = "".join(page.get_text() for page in doc)
    return (text, page_count)

# %%
def summarize_text(text, num_sentences=10, language="french"):
    """Build an extractive summary of *text* with sumy's LSA summarizer.

    Args:
        text: Plain text to summarize.
        num_sentences: Number of sentences requested from the summarizer.
        language: Language name handed to sumy's ``Tokenizer``. Defaults to
            ``"french"`` so existing callers keep the previous behaviour.

    Returns:
        The sentences selected by ``LsaSummarizer`` for the parsed document.
    """
    tokenizer = Tokenizer(language)
    parser = PlaintextParser.from_string(text, tokenizer)
    summarizer = LsaSummarizer()
    return summarizer(parser.document, num_sentences)


# %%
# Path of the PDF document to summarize.
pdf_path = "sy.pdf"


# %%
# Pull the raw text out of the PDF and report how many pages were read.
text, page_count = extract_text_from_pdf(pdf_path)
print(page_count)
# Ask for a 30-sentence summary of the extracted text.
summary = summarize_text(text, 30)

# %%
# Show the raw summary object as returned by the summarizer.
print(summary)

# %%
# Print the summary one sentence per line.
# NOTE(review): the [1:] slice skips the first selected sentence -- confirm
# this is intentional.
remaining_sentences = summary[1:]
for selected in remaining_sentences:
    print(selected)

Enter fullscreen mode Exit fullscreen mode

Top comments (0)