Spaces:

mrsk1883
/

testing

Sleeping

App Files Files Community

testing / app.py

mrsk1883

Create app.py

f11abda over 1 year ago

raw

history blame

2.15 kB

	from PyPDF2 import PdfReader
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
	from gtts import gTTS
	import os

	# Load the pretrained seq2seq summarization checkpoint and its matching
	# tokenizer once, at module import time, so summarize_pdf_abstract can reuse
	# the same objects on every call instead of reloading them.
	# NOTE(review): the checkpoint name suggests an LED model tuned for arXiv
	# summarization with a 16384-token input window - confirm against the model card.
	# NOTE(review): from_pretrained presumably downloads the weights on first use
	# and reads them from a local cache afterwards - confirm for this deployment.
	model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
	model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
	tokenizer = AutoTokenizer.from_pretrained(model_name)


	def summarize_and_speak_pdf_abstract(
	    pdf_path, *, language="en", audio_file_name="summary.mp3"
	):
	    """Summarize a PDF's abstract and save the summary as spoken audio.

	    The summary is produced by ``summarize_pdf_abstract`` and converted to
	    speech with gTTS. Playing the resulting file is left to the caller.

	    Args:
	        pdf_path: Path to the PDF file.
	        language: Language code passed to gTTS as ``lang``. Defaults to "en".
	        audio_file_name: Path the audio is saved to. Defaults to
	            "summary.mp3" in the current working directory.

	    Returns:
	        The path of the audio file that was written.

	    Raises:
	        ValueError: If the summary is empty or whitespace-only, since there
	            would be nothing to speak.
	    """
	    summary = summarize_pdf_abstract(pdf_path)

	    # gTTS fails with a bare AssertionError on empty text; fail earlier with
	    # a message that names the offending PDF instead.
	    if not summary or not summary.strip():
	        raise ValueError(
	            f"No summary text could be produced for {pdf_path!r}; nothing to speak."
	        )

	    tts = gTTS(text=summary, lang=language)
	    tts.save(audio_file_name)

	    print(f"Audio file created: {audio_file_name}")
	    return audio_file_name


	def summarize_pdf_abstract(pdf_path):
	    """Summarize the first PDF page that mentions an abstract or introduction.

	    Pages are scanned in order. The full text of the first page containing
	    the word "Abstract" or "Introduction" is encoded with the module-level
	    ``tokenizer``, passed to the module-level ``model``, and the decoded
	    output is returned.

	    NOTE: generation uses the model's default settings; nothing here
	    enforces a single-sentence summary.

	    Args:
	        pdf_path: Path to the PDF file.

	    Returns:
	        The generated summary string, or an empty string when no page
	        contains either keyword.
	    """
	    keywords = ("Abstract", "Introduction")
	    abstract_text = ""

	    # Keep the file open only while pages are being read, and make sure it
	    # is closed even if text extraction raises.
	    with open(pdf_path, "rb") as pdf_file:
	        for page in PdfReader(pdf_file).pages:
	            # Extract once per page rather than once per keyword check.
	            page_text = page.extract_text()
	            if any(keyword in page_text for keyword in keywords):
	                # The whole page is used, not only the text after the keyword.
	                abstract_text = page_text
	                break

	    if not abstract_text:
	        # Nothing matched: skip the model instead of generating from empty input.
	        return ""

	    inputs = tokenizer(abstract_text, return_tensors="pt")
	    outputs = model.generate(**inputs)
	    return tokenizer.decode(outputs[0], skip_special_tokens=True)