yunusajib committed
Commit 2606de1 · verified · 1 Parent(s): 2112708

Upload app.py

Files changed (1)
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
+ import gradio as gr
+ import torch
+ import os
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ from pyannote.audio import Pipeline as DiarizationPipeline
+ import whisper
+ import tempfile
+ from pydub import AudioSegment
+
+ # Load the Whisper ASR model once at startup
+ whisper_model = whisper.load_model("base")  # Use "small" or "medium" if needed
+
+ # Load the T5-small summarization model and tokenizer once at startup
+ summarizer_tokenizer = AutoTokenizer.from_pretrained("t5-small")
+ summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+
+ def convert_to_wav(input_path, output_path):
+     # pydub auto-detects the input format; requires ffmpeg on the system
+     audio = AudioSegment.from_file(input_path)
+     audio.export(output_path, format="wav")
+
+ def transcribe_audio(audio_path):
+     result = whisper_model.transcribe(audio_path, fp16=torch.cuda.is_available())
+     return result["text"]
+
+ def diarize_audio(audio_path, hf_token):
+     diarization_pipeline = DiarizationPipeline.from_pretrained(
+         "pyannote/speaker-diarization", use_auth_token=hf_token
+     )
+     return diarization_pipeline(audio_path)
+
+ def combine_diarized_transcript(diarization, full_text):
+     # Basic speaker labeling using diarization and the full text.
+     # Note: this is a simplified alignment using time chunks only.
+     chunks = []
+     for turn, _, speaker in diarization.itertracks(yield_label=True):
+         chunks.append(f"{speaker}: [from {turn.start:.1f}s to {turn.end:.1f}s]")
+     # Combine for display/demo
+     return "\n".join(chunks) + "\n" + full_text
+
+ def summarize_text(text):
+     # T5 expects a task prefix before the input text
+     prefix = "summarize: " + text.strip()
+     inputs = summarizer_tokenizer.encode(prefix, return_tensors="pt", max_length=512, truncation=True)
+     summary_ids = summarizer_model.generate(inputs, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
+     return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+ def process_pipeline(audio_file, hf_token):
+     if not hf_token:
+         return "", "", "Error: Hugging Face token is required."
+
+     if not audio_file or not os.path.exists(audio_file) or os.path.getsize(audio_file) == 0:
+         return "", "", "Error: Uploaded file is missing or empty."
+
+     # Step 1: Convert to WAV if needed
+     tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+     tmp_path = tmp_wav.name
+     tmp_wav.close()
+     try:
+         convert_to_wav(audio_file, tmp_path)
+     except Exception as e:
+         return "", "", f"Audio conversion failed: {e}"
+
+     # Step 2: Transcription (Whisper)
+     try:
+         transcript = transcribe_audio(tmp_path)
+     except Exception as e:
+         return "", "", f"Transcription failed: {e}"
+
+     # Step 3: Speaker diarization (PyAnnote)
+     try:
+         diarization = diarize_audio(tmp_path, hf_token)
+         labeled_transcript = combine_diarized_transcript(diarization, transcript)
+     except Exception as e:
+         return transcript, "", f"Diarization failed: {e}"
+
+     # Step 4: Summarization (T5-small)
+     try:
+         summary = summarize_text(transcript)
+     except Exception as e:
+         return transcript, labeled_transcript, f"Summarization failed: {e}"
+
+     return transcript, labeled_transcript, summary
+
+ description = """
+ ### 🩺 GP Consultation Summarizer (Demo App)
+
+ This app:
+ 1. Transcribes short consultation audio using Whisper
+ 2. Identifies who spoke when using PyAnnote speaker diarization
+ 3. Combines both into a labeled transcript
+ 4. Generates a short summary using T5-small
+
+ ⚠️ **Note:** Best for short consultations (under 5–6 minutes).
+ ⚠️ You must provide your own Hugging Face token (required for diarization).
+ """
+
+ app = gr.Interface(
+     fn=process_pipeline,
+     inputs=[
+         gr.Audio(type="filepath", label="Upload Consultation Audio (.wav)"),
+         gr.Textbox(label="Your Hugging Face Token", type="password")
+     ],
+     outputs=[
+         gr.Textbox(label="Raw Transcript"),
+         gr.Textbox(label="Labeled Transcript (with Speaker Info)"),
+         gr.Textbox(label="Summary")
+     ],
+     title="GP Consultation Summarizer",
+     description=description,
+     allow_flagging="never"
+ )
+
+ if __name__ == "__main__":
+     app.launch(share=True)