DEV Community

artydev
artydev

Posted on

Extractive summarization of PDF files with Spacy

Here is a simple script that extracts the text of a PDF file and builds an extractive summary of it with spaCy:

In your virtual environment, run:

pip install PyPDF2
pip install spacy
python -m spacy download fr_core_news_sm
Enter fullscreen mode Exit fullscreen mode
import PyPDF2
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
from heapq import nlargest
import re

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # The context manager closes the file even when parsing or text
    # extraction raises, which the manual open()/close() pair did not.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        # Extraction must happen while the file is still open, so the join
        # is done inside the `with` block. `or ""` is a defensive guard in
        # case a page yields no text object at all.
        return "".join(
            page_obj.extract_text() or "" for page_obj in pdf_reader.pages
        )

def summarize(text, ratio=0.0013):
    """Build an extractive summary of ``text`` with spaCy.

    Each word is scored by its count divided by the count of the most
    frequent word. Each sentence is scored by the sum of the scores of its
    words, and the highest-scoring sentences are joined with single spaces.

    Args:
        text: The text to summarize.
        ratio: Fraction of the scored sentences to keep. Any positive ratio
            keeps at least one sentence, so short documents do not produce
            an empty summary.

    Returns:
        The selected sentences, highest score first, joined by single
        spaces. An empty string when the text contains no scoreable word.
    """
    nlp = spacy.load('fr_core_news_sm')
    doc = nlp(text)

    # Count lower-cased words so the keys match the lower-cased lookups done
    # when scoring sentences. Whitespace tokens are skipped as well: they are
    # neither stop words nor punctuation and would otherwise be counted.
    word_counts = Counter(
        token.text.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )
    if not word_counts:
        # Nothing to score (empty text or only stop words/punctuation);
        # max() below would raise on an empty sequence.
        return ''

    # Normalize the counts to the (0, 1] range.
    max_frequency = max(word_counts.values())
    word_frequencies = {
        word: count / max_frequency for word, count in word_counts.items()
    }

    # Only sentences containing at least one scored word get an entry.
    sentence_scores = {}
    for sent in doc.sents:
        weights = [
            word_frequencies[token.text.lower()]
            for token in sent
            if token.text.lower() in word_frequencies
        ]
        if weights:
            sentence_scores[sent] = sum(weights)

    select_length = int(len(sentence_scores) * ratio)
    if ratio > 0:
        # int() truncates towards zero, so a small ratio on a short document
        # would select no sentence at all; keep at least one.
        select_length = max(1, select_length)

    summary_sentences = nlargest(
        select_length, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(str(sentence) for sentence in summary_sentences)

# Location of the PDF document to summarize
file_path = 'sy.pdf'

# Pull the raw text out of the document
pdf_text = extract_text_from_pdf(file_path)

# Summarize the text and print the result under a heading
summary = summarize(pdf_text)
print("Summary:", summary, sep="\n")

Enter fullscreen mode Exit fullscreen mode

Top comments (0)