from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from gtts import gTTS
import os

# Download the model and tokenizer
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def summarize_and_speak_pdf_abstract(pdf_path):
    """
    Reads a PDF file, extracts the abstract, summarizes it in one sentence,
    and generates an audio file of the summary.

    Args:
        pdf_path: Path to the PDF file.
    """
    # Summarize the abstract
    summary = summarize_pdf_abstract(pdf_path)

    # Define language and audio format
    language = "en"  # Change this to your desired language
    audio_format = "mp3"

    # Create the text-to-speech object
    tts = gTTS(text=summary, lang=language)

    # Generate the audio file
    audio_file_name = f"summary.{audio_format}"
    tts.save(audio_file_name)
    print(f"Audio file created: {audio_file_name}")

    # Play the audio file (optional)
    # os.system(f"play {audio_file_name}")


# Define the function to summarize the abstract
def summarize_pdf_abstract(pdf_path):
    """
    Reads a PDF file, extracts the abstract, and summarizes it in one sentence.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A string containing the one-sentence summary of the abstract.
    """
    # Read the PDF file (PdfReader accepts a file path directly)
    reader = PdfReader(pdf_path)

    # Find the first page that mentions "Abstract" or "Introduction"
    # and keep that page's full text as the abstract candidate
    abstract_text = ""
    for page in reader.pages:
        page_text = page.extract_text() or ""
        if "Abstract" in page_text or "Introduction" in page_text:
            abstract_text = page_text
            break

    # Encode the abstract text, truncating to the model's maximum input length
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)

    # Generate the summary
    outputs = model.generate(**inputs)

    # Decode the summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return summary
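

# Example usage: a minimal sketch of how the entry point above might be called.
# "paper.pdf" is a placeholder path, not a file shipped with this snippet;
# replace it with the path to an actual PDF on disk.
if __name__ == "__main__":
    summarize_and_speak_pdf_abstract("paper.pdf")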