mrsk1883 committed on
Commit
17cf0e6
·
1 Parent(s): 3e0f5bd

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -76
app.py DELETED
@@ -1,76 +0,0 @@
1
import os

from gtts import gTTS
from PyPDF2 import PdfReader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hugging Face Hub identifier of the summarization checkpoint used below.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"

# NOTE: both objects are created at import time, so importing this module
# triggers the model/tokenizer download (or a load from the local cache).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
10
-
11
-
12
def summarize_and_speak_pdf_abstract(
    pdf_path, language="en", audio_file_name="summary.mp3"
):
    """Summarize the abstract of a PDF and save the summary as spoken audio.

    The summary is produced by ``summarize_pdf_abstract`` and then converted
    to speech with gTTS. The name of the written file is printed on success.

    Args:
        pdf_path: Path to the PDF file.
        language: Language code handed to gTTS as ``lang``. Defaults to
            ``"en"``.
        audio_file_name: Destination path for the generated audio. Defaults
            to ``"summary.mp3"``.

    Raises:
        ValueError: If no summary text was produced, so there is nothing to
            convert to speech.
    """
    summary = summarize_pdf_abstract(pdf_path)

    # gTTS rejects empty text with a bare assertion; fail earlier with a
    # message that names the offending input instead.
    if not summary or not summary.strip():
        raise ValueError(f"No summary text could be produced for {pdf_path!r}")

    tts = gTTS(text=summary, lang=language)
    tts.save(audio_file_name)

    print(f"Audio file created: {audio_file_name}")
38
-
39
-
40
- # Define the function to summarize the abstract
41
def summarize_pdf_abstract(pdf_path):
    """Summarize the first PDF page that mentions an abstract or introduction.

    Pages are scanned in order; the full text of the first page containing
    the word "Abstract" or "Introduction" is passed to the module-level
    summarization model. The length of the summary is whatever the model's
    default generation settings produce.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The generated summary as a string, or an empty string when no page
        contains either keyword.
    """
    abstract_text = ""

    # Keep the file open only while pages are being read, and make sure it
    # is closed even if text extraction fails.
    with open(pdf_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # Extract once per page; extraction is the expensive step.
            page_text = page.extract_text() or ""
            if "Abstract" in page_text or "Introduction" in page_text:
                # The whole page is used, not just the text after the keyword.
                abstract_text = page_text
                break

    # Nothing to summarize: avoid running the model on an empty prompt.
    if not abstract_text:
        return ""

    # Truncate so an unusually long page cannot exceed the model's input limit.
    inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)