Build error
Upload app.py
app.py
ADDED
import gradio as gr
import torch
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pyannote.audio import Pipeline as DiarizationPipeline
import whisper
import tempfile
from pydub import AudioSegment

# Load the Whisper ASR model once at startup
whisper_model = whisper.load_model("base")  # Use "small" or "medium" if needed

# Load the T5-small summarizer (tokenizer + model) once at startup
summarizer_tokenizer = AutoTokenizer.from_pretrained("t5-small")
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

def convert_to_wav(input_path, output_path):
    # pydub decodes arbitrary audio formats via ffmpeg and re-exports as WAV
    audio = AudioSegment.from_file(input_path)
    audio.export(output_path, format="wav")

def transcribe_audio(audio_path):
    # Whisper only supports fp16 on GPU; fall back to fp32 on CPU
    result = whisper_model.transcribe(audio_path, fp16=torch.cuda.is_available())
    return result["text"]

def diarize_audio(audio_path, hf_token):
    # pyannote/speaker-diarization is a gated model: the token's account must
    # have accepted the model's terms on the Hugging Face Hub
    diarization_pipeline = DiarizationPipeline.from_pretrained(
        "pyannote/speaker-diarization", use_auth_token=hf_token
    )
    return diarization_pipeline(audio_path)

def combine_diarized_transcript(diarization, full_text):
    # Simplified alignment for the demo: list who spoke when, then append the
    # full transcript. Proper segment-level alignment is sketched below.
    chunks = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        chunks.append(f"{speaker}: [from {turn.start:.1f}s to {turn.end:.1f}s]")
    return "\n".join(chunks) + "\n" + full_text

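# A minimal sketch of proper segment-level alignment (a hypothetical helper,
# not wired into the app): assign each Whisper segment to the diarization turn
# it overlaps most. Assumes whisper_model.transcribe() is called so that
# result["segments"] carries per-segment "start"/"end"/"text" keys, as
# openai-whisper provides.
def align_segments(diarization, segments):
    turns = [(turn.start, turn.end, speaker)
             for turn, _, speaker in diarization.itertracks(yield_label=True)]
    lines = []
    for seg in segments:
        # Pick the turn with the largest temporal overlap with this segment
        best = max(
            turns,
            key=lambda t: max(0.0, min(t[1], seg["end"]) - max(t[0], seg["start"])),
            default=None,
        )
        speaker = best[2] if best else "UNKNOWN"
        lines.append(f"{speaker}: {seg['text'].strip()}")
    return "\n".join(lines)
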
def summarize_text(text):
    # T5 expects a task prefix; input is truncated to 512 tokens
    prefix = "summarize: " + text.strip()
    inputs = summarizer_tokenizer.encode(prefix, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = summarizer_model.generate(
        inputs, max_length=100, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True
    )
    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

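# A minimal sketch of one workaround for the 512-token truncation above
# (a hypothetical helper, not wired into the app): summarize the transcript in
# fixed-size word chunks and concatenate the partial summaries.
def summarize_long_text(text, chunk_words=350):
    words = text.split()
    chunks = [" ".join(words[i:i + chunk_words])
              for i in range(0, len(words), chunk_words)]
    return " ".join(summarize_text(chunk) for chunk in chunks)
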
def process_pipeline(audio_file, hf_token):
    if not hf_token:
        return "", "", "Error: Hugging Face token is required."

    if audio_file is None or not os.path.exists(audio_file) or os.path.getsize(audio_file) == 0:
        return "", "", "Error: Uploaded file is missing or empty."

    # Step 1: Convert the upload to WAV so Whisper and pyannote get a known format
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp_wav.name
    tmp_wav.close()
    try:
        convert_to_wav(audio_file, tmp_path)
    except Exception as e:
        return "", "", f"Audio conversion failed: {e}"

    # Step 2: Transcription with the preloaded Whisper model
    try:
        transcript = transcribe_audio(tmp_path)
    except Exception as e:
        return "", "", f"Transcription failed: {e}"

    # Step 3: Speaker diarization and speaker-labeled transcript
    try:
        diarization = diarize_audio(tmp_path, hf_token)
        labeled_transcript = combine_diarized_transcript(diarization, transcript)
    except Exception as e:
        return transcript, "", f"Diarization failed: {e}"

    # Step 4: Summarization with T5-small
    try:
        summary = summarize_text(transcript)
    except Exception as e:
        return transcript, labeled_transcript, f"Summarization failed: {e}"

    # Order matches the three output textboxes: raw, labeled, summary
    return transcript, labeled_transcript, summary

description = """
### 🩺 GP Consultation Summarizer (Demo App)

This app:
1. Transcribes short consultation audio using Whisper
2. Identifies who spoke when using pyannote speaker diarization
3. Combines both into a speaker-labeled transcript
4. Generates a short summary using T5-small

⚠️ **Note:** Best for short consultations (under 5–6 minutes); very long transcripts are truncated by the summarizer.
⚠️ You must provide your own Hugging Face token (required for the gated diarization model).
"""

app = gr.Interface(
    fn=process_pipeline,
    inputs=[
        gr.Audio(type="filepath", label="Upload Consultation Audio"),
        gr.Textbox(label="Your Hugging Face Token", type="password")
    ],
    outputs=[
        gr.Textbox(label="Raw Transcript"),
        gr.Textbox(label="Labeled Transcript (with Speaker Info)"),
        gr.Textbox(label="Summary")
    ],
    title="GP Consultation Summarizer",
    description=description,
    allow_flagging="never"
)

if __name__ == "__main__":
    app.launch()  # share=True is unnecessary on Spaces, which hosts the app itself
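The Space's status above is "Build error". A likely cause for a stack like this is missing build dependencies; the following requirements.txt and packages.txt are a sketch under that assumption (the usual PyPI and apt package names, unpinned), not a confirmed fix.

requirements.txt:
gradio
torch
transformers
openai-whisper
pyannote.audio
pydub
sentencepiece

packages.txt (system packages installed via apt; pydub and Whisper both need ffmpeg):
ffmpeg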