File size: 2,297 Bytes
5f7e3d8 c363ecf c98d7c6 5f7e3d8 c98d7c6 c363ecf 5f7e3d8 c98d7c6 c363ecf c98d7c6 b2410fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
import os
# Summarization checkpoint shared by every function in this module.
# The tokenizer and the seq2seq model are both loaded from it once, at
# module level, so repeated calls reuse the same objects.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def summarize_and_speak_pdf_abstract(
    pdf_path, *, language="en", audio_file_name="summary.mp3"
):
    """
    Summarize a PDF via ``summarize_pdf_abstract`` and save the summary as speech.

    Args:
        pdf_path: Path to the PDF file.
        language: Language code passed to gTTS as ``lang``. It only selects
            the voice used to read the text; the summary itself is passed
            through unchanged.
        audio_file_name: Path the audio is written to. gTTS writes MP3
            data, so an ``.mp3`` extension is the sensible choice.

    Returns:
        None. The audio file is written to ``audio_file_name`` and a
        confirmation line is printed.
    """
    # Summarize the abstract
    summary = summarize_pdf_abstract(pdf_path)

    # Create the text-to-speech object and write the audio file
    tts = gTTS(text=summary, lang=language)
    tts.save(audio_file_name)
    print(f"Audio file created: {audio_file_name}")
# Define the function to summarize the abstract
def summarize_pdf_abstract(pdf_path):
    """
    Summarize the first page of a PDF that mentions "Abstract" or "Introduction".

    The full text of the first matching page is passed to the module-level
    ``tokenizer`` and ``model``; the abstract paragraph itself is not
    isolated from the rest of that page.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The decoded model output for that page, with special tokens removed.

    Raises:
        ValueError: If no page contains "Abstract" or "Introduction".
    """
    keywords = ("Abstract", "Introduction")
    abstract_text = ""

    # The reader pulls page content from the stream while pages are being
    # extracted, so the file stays open for the whole loop and is closed
    # as soon as the loop is done.
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # Extract once per page; text extraction is the expensive step.
            page_text = page.extract_text()
            if any(keyword in page_text for keyword in keywords):
                abstract_text = page_text
                break

    if not abstract_text:
        # Summarizing an empty string would only return model noise.
        raise ValueError(
            f"No page containing 'Abstract' or 'Introduction' found in {pdf_path!r}"
        )

    # Encode the page text
    inputs = tokenizer(abstract_text, return_tensors="pt")
    # Generate the summary
    outputs = model.generate(**inputs)
    # Decode the summary
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Example usage. The path stays a module-level name, but the pipeline only
# runs when this file is executed as a script, so importing the module does
# not trigger PDF parsing, model inference, or a text-to-speech request.
pdf_path = "/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"
if __name__ == "__main__":
    summarize_and_speak_pdf_abstract(pdf_path)
|