gianb committed on
Commit
6439089
·
1 Parent(s): 8f760c8

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -97
app.py DELETED
@@ -1,97 +0,0 @@
1
- import warnings
2
- import pdfplumber
3
- import torch
4
- from transformers import pipeline, AutoProcessor, AutoModel
5
- import numpy as np
6
- import gradio as gr
7
- from io import BytesIO
8
- from scipy.io.wavfile import write as write_wav
9
- warnings.filterwarnings("ignore")
10
-
11
-
12
- # Here is the code
13
def extract_abstract(uploaded_file):
    """Return the abstract section of a PDF supplied as raw bytes.

    Pages are scanned in order. On the first page whose text contains the
    word "abstract" (case-insensitive), the text from that word up to the
    next occurrence of "introduction" is returned. When no "introduction"
    follows on that page, the rest of the page text is returned instead.

    Args:
        uploaded_file: Raw bytes of the PDF document.

    Returns:
        The extracted abstract text, or an empty string when nothing was
        uploaded or no page contains the word "abstract".
    """
    if not uploaded_file:
        # Nothing to parse; the caller treats "" as "no abstract found".
        return ""

    abstract = ""
    with pdfplumber.open(BytesIO(uploaded_file)) as pdf:
        for page in pdf.pages:
            # Tolerances of 1 keep the spaces between words and lines.
            # `or ""` guards against a falsy result for pages without text.
            text = page.extract_text(x_tolerance=1, y_tolerance=1) or ""
            lowered = text.lower()

            start_index = lowered.find("abstract")
            if start_index == -1:
                continue

            # Search for the end marker only *after* the start marker, so an
            # earlier "introduction" cannot produce an empty slice.
            end_index = lowered.find("introduction", start_index)
            if end_index == -1:
                # str.find returns -1 when absent; slicing with -1 would
                # silently drop the last character, so take the whole rest.
                end_index = len(text)

            abstract = text[start_index:end_index]
            break

    print(abstract)
    return abstract
29
-
30
def process_summary(summary):
    """Reduce a summarization result to a single sentence.

    Args:
        summary: Sequence whose first element is a mapping holding the
            generated text under the "summary_text" key.

    Returns:
        The text before the first period, stripped of surrounding
        whitespace, terminated with a period, and with every "-" removed.
    """
    summary_text = summary[0]["summary_text"]

    # partition always yields a head (the whole string when there is no
    # period), so no emptiness check is needed and the function can never
    # fall through to an implicit None.
    first_sentence, _, _ = summary_text.partition(".")
    processed_summary = first_sentence.strip() + "."

    # Replace "-" with an empty string.
    return processed_summary.replace("-", "")
40
-
41
- # Function for summarization and audio conversion
42
def summarize_and_convert_to_audio(pdf_file):
    """Summarize a PDF's abstract in one sentence and synthesize it as speech.

    Args:
        pdf_file: Raw bytes of the uploaded PDF.

    Returns:
        A 2-tuple ``(audio, message)``. On success ``audio`` is the
        WAV-encoded speech as bytes and ``message`` is the one-sentence
        summary. When no abstract can be extracted, ``audio`` is ``None``
        and ``message`` explains the problem.
    """
    # Extract the abstract first: it is cheap, and it avoids loading the
    # models at all when there is nothing to summarize.
    abstract_text = extract_abstract(pdf_file)

    if not abstract_text:
        # Return one value per declared output (audio, message); a lone
        # string does not match the two-output interface.
        return None, "No 'abstract' section found in the uploaded PDF. Please upload a different PDF."

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    # Move models and related tensors to CUDA device if available
    processor = AutoProcessor.from_pretrained("suno/bark-small")
    model = AutoModel.from_pretrained("suno/bark-small").to(device)

    # Summarize the abstract
    summarization_pipeline = pipeline(task='summarization', model='knkarthick/MEETING_SUMMARY', min_length=15, max_length=30)
    # truncation=True keeps over-long extracted text within the model's
    # input limit instead of failing inside the pipeline.
    summarized_text = summarization_pipeline(abstract_text, truncation=True)
    one_sentence_summary = process_summary(summarized_text)

    print(one_sentence_summary)

    # Text-to-audio conversion
    inputs = processor(
        text=[one_sentence_summary],
        return_tensors="pt",
    )
    inputs = inputs.to(device)

    speech_values = model.generate(**inputs, do_sample=True)
    sampling_rate = model.generation_config.sample_rate

    # Convert speech values to audio data
    audio_data = speech_values.cpu().numpy().squeeze()

    # Convert audio data to bytes
    with BytesIO() as buffer:
        write_wav(buffer, sampling_rate, audio_data.astype(np.float32))
        audio_bytes = buffer.getvalue()

    return audio_bytes, one_sentence_summary
82
-
83
-
84
# Create a Gradio interface: one PDF upload in, synthesized audio plus a
# text message (the summary or an error notice) out.
iface = gr.Interface(
    fn=summarize_and_convert_to_audio,
    # Accept only PDF files. File extensions need the leading dot; a bare
    # "pdf" is not treated as an extension by the upload filter.
    inputs=gr.UploadButton(label="Upload PDF", type="binary", file_types=[".pdf"]),
    outputs=[gr.Audio(label="Audio"), gr.Textbox(label="Message")],
    title="PDF Abstract Summarizer",
    description="""
This application is supposed to summarize the 'abstract' section of a PDF file and convert the summarization into a speech.
Please make sure you upload a PDF file with the 'abstract' section for application to work.
Note: If you get an error while processing the file please refresh your browser and try again.
""",
)

iface.launch()