File size: 1,782 Bytes
cba2ea4
5f7e3d8
c363ecf
 
cba2ea4
 
 
5f7e3d8
c363ecf
 
 
5f7e3d8
cba2ea4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c98d7c6
 
cba2ea4
 
 
c98d7c6
 
 
cba2ea4
 
 
 
 
 
 
 
 
c98d7c6
cba2ea4
 
c98d7c6
cba2ea4
 
c98d7c6
cba2ea4
 
c98d7c6
cba2ea4
 
 
 
 
b2410fa
cba2ea4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import base64
import re
import tempfile
from io import BytesIO

import gradio as gr
from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hugging Face Hub checkpoint identifier shared by the model and the tokenizer below.
# NOTE(review): the name suggests an LED model fine-tuned for arXiv
# summarization — confirm against the model card.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
# Both objects are created once, at import time, and are reused by
# summarize_pdf_abstract on every call.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sentence boundary: a whitespace character that directly follows ".", "?" or
# "!", unless the text before it looks like a dotted abbreviation ("e.g.",
# "i.e.") or a short capitalised title ("Dr.", "Mr.").
_SENTENCE_BOUNDARY = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s')


def extract_first_sentence(text):
  """
  Extracts the first sentence from the given text.

  Args:
    text (str): Text to scan.

  Returns:
    str: Everything up to and including the first sentence terminator
    (".", "?" or "!") that is followed by whitespace. If no such boundary
    exists (including for an empty string), the text is returned unchanged.
  """
  # split() always yields at least one element, so indexing [0] is safe;
  # maxsplit=1 stops scanning as soon as the first boundary is found.
  return _SENTENCE_BOUNDARY.split(text, maxsplit=1)[0]

def summarize_pdf_abstract(pdf_file):
  """
  Reads a PDF file, extracts the abstract, summarizes it as the first sentence, and generates audio.

  The first page whose text mentions "abstract" or "introduction" (in any
  letter case) is fed to the summarization model; the first sentence of the
  generated summary is returned and also converted to speech.

  Args:
    pdf_file: The value supplied by the gr.File input. Either a path string
      or an object whose ``name`` attribute is the path of the uploaded file.

  Returns:
    tuple: ``(summary_sentence, audio_path)`` where ``audio_path`` is the path
    of a temporary MP3 file containing the spoken summary, suitable for a
    gr.Audio output.

  Raises:
    gr.Error: If no file was uploaded, no suitable page was found, the model
      produced no text, or any step of the pipeline failed. The original
      exception is attached as the cause.
  """
  if pdf_file is None:
    raise gr.Error("Please upload a PDF file.")

  try:
    # Depending on the Gradio version, gr.File hands over a path string or a
    # temp-file wrapper whose .name is the real upload path; normalise to a path.
    pdf_source = getattr(pdf_file, "name", pdf_file)
    reader = PdfReader(pdf_source)

    abstract_text = ""
    for page in reader.pages:
      # Text extraction is expensive, so do it once per page.
      page_text = page.extract_text() or ""
      lowered = page_text.lower()
      if "abstract" in lowered or "introduction" in lowered:
        abstract_text = page_text
        break

    if not abstract_text.strip():
      raise ValueError("No page containing an abstract or introduction was found in the PDF.")

    # truncation=True keeps the input within the tokenizer's maximum length.
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs)
    # Drop special tokens so they are neither displayed nor read aloud.
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

    # Extract only the first sentence
    summary_sentence = extract_first_sentence(summary)
    if not summary_sentence:
      raise ValueError("The model produced an empty summary.")

    # Generate audio in memory first so a failed synthesis leaves no stray file.
    speech = gTTS(text=summary_sentence, lang="en")
    speech_bytes = BytesIO()
    speech.write_to_fp(speech_bytes)

    # gr.Audio treats a string as a file path, so persist the MP3 and return
    # its path rather than a base64 string.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
      audio_file.write(speech_bytes.getvalue())

    # Return individual output values
    return summary_sentence, audio_file.name

  except Exception as e:
    # gr.Error (an Exception subclass) surfaces the message in the UI;
    # chaining keeps the original traceback for debugging.
    raise gr.Error(str(e)) from e

# Gradio UI: a single PDF upload is passed to summarize_pdf_abstract, and its
# two return values are routed, in order, to the "Summary" text box and the
# audio component.
interface = gr.Interface(
  fn=summarize_pdf_abstract,
  inputs=[gr.File(label="Upload PDF")],
  outputs=[gr.Textbox(label="Summary"), gr.Audio()],
)

# Runs at import time (there is no __main__ guard). share=True asks Gradio to
# create a public share link in addition to the local server.
interface.launch(share=True)