mrsk1883 committed on
Commit
c98d7c6
·
1 Parent(s): b2410fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -19
app.py CHANGED
@@ -1,29 +1,82 @@
1
- # app/main.py
2
  from PyPDF2 import PdfReader
3
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
4
  from gtts import gTTS
5
- import gradio as gr
6
 
7
- # Load the pre-trained model and tokenizer
8
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
9
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
10
  tokenizer = AutoTokenizer.from_pretrained(model_name)
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def summarize_pdf_abstract(pdf_path):
13
- # Implement the function to summarize PDF abstracts (similar to your previous code)
14
- # ...
15
-
16
- # Gradio Interface
17
- iface = gr.Interface(
18
- fn=summarize_pdf_abstract,
19
- inputs=gr.File(type="file", label="Upload a PDF file"),
20
- outputs="text",
21
- live=True,
22
- interpretation="default",
23
- title="PDF Abstract Summarizer",
24
- description="This app accepts PDFs with abstracts and generates a summary.",
25
- )
26
-
27
- # Launch the Gradio interface
28
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
 
 
1
  from PyPDF2 import PdfReader
2
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
  from gtts import gTTS
4
+ import os
5
 
6
+ # Download the model and tokenizer
7
  model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
8
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
9
  tokenizer = AutoTokenizer.from_pretrained(model_name)
10
 
11
+
12
def summarize_and_speak_pdf_abstract(
    pdf_path, *, language="en", audio_file_name="summary.mp3"
):
    """Summarize a PDF's abstract and save the summary as spoken audio.

    The summary text is produced by ``summarize_pdf_abstract`` and handed to
    gTTS, which writes the speech to ``audio_file_name``.

    Args:
        pdf_path: Path to the PDF file.
        language: Language code passed to gTTS as ``lang``. Defaults to
            ``"en"``.
        audio_file_name: Path of the audio file to write. Defaults to
            ``"summary.mp3"``.

    Returns:
        The path of the audio file that was written.

    Raises:
        ValueError: If the generated summary is empty, because there is
            nothing to convert to speech.
    """
    summary = summarize_pdf_abstract(pdf_path)

    # Fail with a clear message instead of letting the text-to-speech call
    # receive blank input.
    if not summary or not summary.strip():
        raise ValueError(f"No summary could be generated for {pdf_path!r}")

    # Convert the summary to speech and write it to disk.
    tts = gTTS(text=summary, lang=language)
    tts.save(audio_file_name)

    print(f"Audio file created: {audio_file_name}")

    # Hand the location back so callers can play or serve the file.
    return audio_file_name
38
+
39
+
40
# Summarize the abstract/introduction page of a PDF with the module-level model
def summarize_pdf_abstract(pdf_path):
    """Summarize the first page of a PDF that mentions an abstract.

    Pages are scanned in order. The full text of the first page containing
    "Abstract" or "Introduction" is tokenized, passed to the module-level
    ``model`` with its default generation settings, and decoded with the
    module-level ``tokenizer``.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A string containing the generated summary.

    Raises:
        ValueError: If no page contains "Abstract" or "Introduction".
    """
    abstract_text = ""

    # Read pages inside the ``with`` block so the file handle is always
    # closed, even if text extraction raises.
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # Extract once per page; treat a missing result as empty text.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                # The whole page is used as the text to summarize.
                abstract_text = page_text
                break

    # Without a matching page there is nothing meaningful to summarize.
    if not abstract_text:
        raise ValueError(
            f"No page containing 'Abstract' or 'Introduction' found in {pdf_path!r}"
        )

    # Encode the text, truncating to the tokenizer's configured maximum
    # length so an unusually long page cannot exceed the model's input limit.
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)

    # Generate the summary
    outputs = model.generate(**inputs)

    # Decode the summary
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
77
+
78
+
79
# Example usage: runs only when this file is executed as a script, so that
# importing the module does not trigger summarization and audio generation.
if __name__ == "__main__":
    pdf_path = "/content/Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"
    summarize_and_speak_pdf_abstract(pdf_path)
82