|
from PyPDF2 import PdfReader |
|
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
|
from gtts import gTTS |
|
import os |
|
|
|
|
|
# Hugging Face checkpoint used for summarization. The model and tokenizer are
# loaded once, at import time, and are read as module-level globals by
# summarize_pdf_abstract.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

# Sequence-to-sequence model that generates the summary token ids.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Tokenizer for the same checkpoint: encodes the input text and decodes the
# generated ids back into a string.
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
|
def summarize_and_speak_pdf_abstract(
    pdf_path, *, language="en", output_path="summary.mp3"
):
    """Summarize a PDF and save the summary as a spoken-audio file.

    The summary text is obtained from ``summarize_pdf_abstract``, converted
    to speech with gTTS, and written to ``output_path``. A confirmation
    line naming the file is printed afterwards.

    Args:
        pdf_path: Path to the PDF file.
        language: Language code passed to gTTS as ``lang``. Defaults to
            ``"en"``.
        output_path: Path the audio file is written to. Defaults to
            ``"summary.mp3"`` (relative to the current working directory).

    Returns:
        The path of the audio file that was written.

    Raises:
        ValueError: If the generated summary is empty, so there is nothing
            to speak.
    """
    summary = summarize_pdf_abstract(pdf_path)

    # Fail early with a clear message instead of handing empty text to the
    # text-to-speech service.
    if not summary or not summary.strip():
        raise ValueError(f"No summary text was produced for {pdf_path!r}")

    tts = gTTS(text=summary, lang=language)
    tts.save(output_path)

    print(f"Audio file created: {output_path}")
    return output_path
|
|
|
|
|
|
|
|
|
|
|
|
|
def summarize_pdf_abstract(pdf_path):
    """Summarize the first page of a PDF that mentions an abstract.

    Pages are scanned in order; the full extracted text of the first page
    containing the literal string "Abstract" or "Introduction" is used as
    the model input. That text is tokenized, passed to the module-level
    ``model`` for generation, and the first generated sequence is decoded
    back to a string.

    NOTE(review): the whole page text is summarized, not an isolated
    abstract section, and the summary length is whatever ``model.generate``
    produces with its default settings -- it is not constrained to a single
    sentence here.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A string containing the generated summary.

    Raises:
        ValueError: If no page contains "Abstract" or "Introduction".
    """
    abstract_text = ""

    # Open the file in a context manager so the handle is closed even if
    # parsing or text extraction raises.
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # Extract once per page and reuse the result for both the
            # keyword check and the assignment.
            page_text = page.extract_text()
            if "Abstract" in page_text or "Introduction" in page_text:
                abstract_text = page_text
                break

    # Without a matching page there is nothing meaningful to summarize;
    # running the model on an empty string would only yield noise.
    if not abstract_text:
        raise ValueError(
            f"No page containing 'Abstract' or 'Introduction' found in {pdf_path!r}"
        )

    inputs = tokenizer(abstract_text, return_tensors="pt")
    outputs = model.generate(**inputs)

    # Only the first generated sequence is used.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)