from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
import os

# Load the summarization model and tokenizer (downloaded from the Hugging Face Hub on first use)
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def summarize_and_speak_pdf_abstract(pdf_path):
    """
    Reads a PDF file, extracts the abstract, produces a short summary,
    and saves an audio file of the summary.

    Args:
        pdf_path: Path to the PDF file.
    """
    # Summarize the abstract
    summary = summarize_pdf_abstract(pdf_path)

    # Define language and audio format
    language = "en"  # Change this to your desired language
    audio_format = "mp3"

    # Create the text-to-speech object
    tts = gTTS(text=summary, lang=language)

    # Generate the audio file
    audio_file_name = f"summary.{audio_format}"
    tts.save(audio_file_name)
    print(f"Audio file created: {audio_file_name}")

    # Play the audio file (optional)
    # os.system(f"play {audio_file_name}")


# Define the function to summarize the abstract
def summarize_pdf_abstract(pdf_path):
    """
    Reads a PDF file, extracts the abstract, and produces a short summary.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A string containing the summary of the abstract.
    """
    # Read the PDF file
    reader = PdfReader(pdf_path)

    # Find the first page that mentions "Abstract" or "Introduction"
    # and use that page's full text as the abstract candidate
    abstract_text = ""
    for page in reader.pages:
        page_text = page.extract_text() or ""
        if "Abstract" in page_text or "Introduction" in page_text:
            abstract_text = page_text
            break

    # Guard against PDFs where no matching page was found
    if not abstract_text:
        raise ValueError(f"No abstract or introduction found in {pdf_path}")

    # Encode the abstract text, truncating to the model's maximum input length
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)

    # Generate the summary
    outputs = model.generate(**inputs)

    # Decode the summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary
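

# Example usage: a minimal sketch of how the script might be run directly.
# "paper.pdf" is a hypothetical file name, not part of the original script;
# replace it with the path to a real PDF on your machine.
if __name__ == "__main__":
    sample_pdf = "paper.pdf"  # hypothetical placeholder path
    summarize_and_speak_pdf_abstract(sample_pdf)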