Upload folder using huggingface_hub
README.md
CHANGED
@@ -1,12 +1,12 @@
+
 ---
-title:
-emoji:
-colorFrom:
+title: dialogue_diarization_demo_main
+emoji: 🔥
+colorFrom: indigo
 colorTo: indigo
 sdk: gradio
 sdk_version: 5.40.0
-app_file:
+app_file: run.py
 pinned: false
+hf_oauth: true
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
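Note on the configuration above: app_file: run.py points the Space at the script added below, and hf_oauth: true turns on Hugging Face OAuth for the Space. The committed run.py does not consume the OAuth token; if the goal is to let visitors sign in so the gated pyannote model can be loaded with their credentials, a sketch along the following lines would be one option. It assumes Gradio's Spaces OAuth behavior (gr.LoginButton plus injection of a gr.OAuthToken argument) and is not part of this commit.

# Hypothetical OAuth-aware loader (not part of this commit). On a Space with
# hf_oauth: true, Gradio injects a gr.OAuthToken into event handlers that declare
# an argument of that type; outside Spaces the argument simply stays None.
import gradio as gr
from pyannote.audio import Pipeline


def load_pipeline(oauth_token: gr.OAuthToken | None = None):
    # Fall back to use_auth_token=True so a locally configured HF token still works.
    token = oauth_token.token if oauth_token is not None else True
    return Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1", use_auth_token=token
    )


with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Textbox(label="Pipeline status")

    def warm_up(oauth_token: gr.OAuthToken | None = None):
        load_pipeline(oauth_token)
        return "Diarization pipeline loaded."

    gr.Button("Load models").click(warm_up, outputs=status)

if __name__ == "__main__":
    demo.launch()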
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+gradio-client @ git+https://github.com/gradio-app/gradio@9828952dd0569d86ae15ec4fbf27331c1539daab#subdirectory=client/python
+https://gradio-pypi-previews.s3.amazonaws.com/9828952dd0569d86ae15ec4fbf27331c1539daab/gradio-5.40.0-py3-none-any.whl
+gradio
+torch
+torchaudio
+pyannote.audio
+openai-whisper
+librosa
+numpy
+transformers
+speechbrain
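The first two requirement entries pin a prerelease Gradio client and wheel built from a specific commit; presumably that build is what ships the gr.Dialogue component used by run.py. A small start-up check like the one below could make a missing component fail loudly. This is a hypothetical helper, not part of the commit.

# Hypothetical sanity check (not part of this commit): verify the installed
# gradio build actually exposes gr.Dialogue before the demo tries to use it.
import gradio as gr

if not hasattr(gr, "Dialogue"):
    raise RuntimeError(
        "This demo needs a gradio build that includes gr.Dialogue; "
        "install the pinned wheel from requirements.txt."
    )
print(f"gradio {gr.__version__} provides gr.Dialogue")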
run.ipynb
ADDED
@@ -0,0 +1 @@
+{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: dialogue_diarization_demo"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio gradio torch torchaudio pyannote.audio openai-whisper librosa numpy transformers speechbrain "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# type: ignore\n", "import gradio as gr\n", "from pyannote.audio import Pipeline\n", "import whisper\n", "\n", "diarization_pipeline = None\n", "whisper_model = None\n", "\n", "\n", "def load_models():\n", "    global diarization_pipeline, whisper_model  # noqa: PLW0603\n", "\n", "    if diarization_pipeline is None:\n", "        diarization_pipeline = Pipeline.from_pretrained(\n", "            \"pyannote/speaker-diarization-3.1\", use_auth_token=True\n", "        )\n", "\n", "    if whisper_model is None:\n", "        whisper_model = whisper.load_model(\"base\")\n", "\n", "\n", "def real_diarization(audio_file_path: str) -> list[dict[str, str]]:\n", "    try:\n", "        load_models()\n", "\n", "        if diarization_pipeline is None or whisper_model is None:\n", "            raise Exception(\"Failed to load models\")\n", "\n", "        diarization = diarization_pipeline(audio_file_path)\n", "\n", "        transcription = whisper_model.transcribe(audio_file_path)\n", "        segments = transcription[\"segments\"]\n", "\n", "        dialogue_segments = []\n", "        speaker_mapping = {}\n", "        speaker_counter = 1\n", "\n", "        for segment in segments:\n", "            start_time = segment[\"start\"]\n", "            end_time = segment[\"end\"]\n", "            text = segment[\"text\"].strip()\n", "\n", "            speaker = \"Speaker 1\"\n", "            for turn, _, speaker_label in diarization.itertracks(yield_label=True):\n", "                if (\n", "                    turn.start <= start_time <= turn.end\n", "                    or turn.start <= end_time <= turn.end\n", "                ):\n", "                    if speaker_label not in speaker_mapping:\n", "                        speaker_mapping[speaker_label] = f\"Speaker {speaker_counter}\"\n", "                        speaker_counter += 1\n", "                    speaker = speaker_mapping[speaker_label]\n", "                    break\n", "\n", "            if text:\n", "                dialogue_segments.append({\"speaker\": speaker, \"text\": text})\n", "\n", "        return dialogue_segments\n", "\n", "    except Exception as e:\n", "        print(f\"Error in diarization: {str(e)}\")\n", "        return []\n", "\n", "\n", "def process_audio(audio_file):\n", "    if audio_file is None:\n", "        gr.Warning(\"Please upload an audio file first.\")\n", "        return []\n", "\n", "    try:\n", "        dialogue_segments = real_diarization(audio_file)\n", "        return dialogue_segments\n", "    except Exception as e:\n", "        gr.Error(f\"Error processing audio: {str(e)}\")\n", "        return []\n", "\n", "\n", "speakers = [\n", "    \"Speaker 1\",\n", "    \"Speaker 2\",\n", "    \"Speaker 3\",\n", "    \"Speaker 4\",\n", "    \"Speaker 5\",\n", "    \"Speaker 6\",\n", "]\n", "tags = [\n", "    \"(pause)\",\n", "    \"(background noise)\",\n", "    \"(unclear)\",\n", "    \"(overlap)\",\n", "    \"(phone ringing)\",\n", "    \"(door closing)\",\n", "    \"(music)\",\n", "    \"(applause)\",\n", "    \"(laughter)\",\n", "]\n", "\n", "\n", "def format_speaker(speaker, text):\n", "    return f\"{speaker}: {text}\"\n", "\n", "\n", "with gr.Blocks(title=\"Audio Diarization Demo\") as demo:\n", "    with gr.Row():\n", "        with gr.Column(scale=1):\n", "            audio_input = gr.Audio(\n", "                label=\"Upload Audio File\",\n", "                type=\"filepath\",\n", "                sources=[\"upload\", \"microphone\"],\n", "            )\n", "\n", "            process_btn = gr.Button(\"\ud83d\udd0d Analyze Speakers\", variant=\"primary\", size=\"lg\")\n", "\n", "        with gr.Column(scale=2):\n", "            dialogue_output = gr.Dialogue(\n", "                speakers=speakers,\n", "                tags=tags,\n", "                formatter=format_speaker,\n", "                label=\"AI-generated speaker-separated conversation\",\n", "                value=[],\n", "            )\n", "\n", "    process_btn.click(fn=process_audio, inputs=[audio_input], outputs=[dialogue_output])\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
run.py
ADDED
@@ -0,0 +1,126 @@
+# type: ignore
+import gradio as gr
+from pyannote.audio import Pipeline
+import whisper
+
+diarization_pipeline = None
+whisper_model = None
+
+
+def load_models():
+    global diarization_pipeline, whisper_model  # noqa: PLW0603
+
+    if diarization_pipeline is None:
+        diarization_pipeline = Pipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1", use_auth_token=True
+        )
+
+    if whisper_model is None:
+        whisper_model = whisper.load_model("base")
+
+
+def real_diarization(audio_file_path: str) -> list[dict[str, str]]:
+    try:
+        load_models()
+
+        if diarization_pipeline is None or whisper_model is None:
+            raise Exception("Failed to load models")
+
+        diarization = diarization_pipeline(audio_file_path)
+
+        transcription = whisper_model.transcribe(audio_file_path)
+        segments = transcription["segments"]
+
+        dialogue_segments = []
+        speaker_mapping = {}
+        speaker_counter = 1
+
+        for segment in segments:
+            start_time = segment["start"]
+            end_time = segment["end"]
+            text = segment["text"].strip()
+
+            speaker = "Speaker 1"
+            for turn, _, speaker_label in diarization.itertracks(yield_label=True):
+                if (
+                    turn.start <= start_time <= turn.end
+                    or turn.start <= end_time <= turn.end
+                ):
+                    if speaker_label not in speaker_mapping:
+                        speaker_mapping[speaker_label] = f"Speaker {speaker_counter}"
+                        speaker_counter += 1
+                    speaker = speaker_mapping[speaker_label]
+                    break
+
+            if text:
+                dialogue_segments.append({"speaker": speaker, "text": text})
+
+        return dialogue_segments
+
+    except Exception as e:
+        print(f"Error in diarization: {str(e)}")
+        return []
+
+
+def process_audio(audio_file):
+    if audio_file is None:
+        gr.Warning("Please upload an audio file first.")
+        return []
+
+    try:
+        dialogue_segments = real_diarization(audio_file)
+        return dialogue_segments
+    except Exception as e:
+        gr.Error(f"Error processing audio: {str(e)}")
+        return []
+
+
+speakers = [
+    "Speaker 1",
+    "Speaker 2",
+    "Speaker 3",
+    "Speaker 4",
+    "Speaker 5",
+    "Speaker 6",
+]
+tags = [
+    "(pause)",
+    "(background noise)",
+    "(unclear)",
+    "(overlap)",
+    "(phone ringing)",
+    "(door closing)",
+    "(music)",
+    "(applause)",
+    "(laughter)",
+]
+
+
+def format_speaker(speaker, text):
+    return f"{speaker}: {text}"
+
+
+with gr.Blocks(title="Audio Diarization Demo") as demo:
+    with gr.Row():
+        with gr.Column(scale=1):
+            audio_input = gr.Audio(
+                label="Upload Audio File",
+                type="filepath",
+                sources=["upload", "microphone"],
+            )
+
+            process_btn = gr.Button("🔍 Analyze Speakers", variant="primary", size="lg")
+
+        with gr.Column(scale=2):
+            dialogue_output = gr.Dialogue(
+                speakers=speakers,
+                tags=tags,
+                formatter=format_speaker,
+                label="AI-generated speaker-separated conversation",
+                value=[],
+            )
+
+    process_btn.click(fn=process_audio, inputs=[audio_input], outputs=[dialogue_output])
+
+if __name__ == "__main__":
+    demo.launch()
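A note on the speaker-matching step in real_diarization: each Whisper segment is assigned to the first diarization turn that contains the segment's start or end time, which can mislabel segments that straddle a speaker change. A common refinement is to pick the turn with the largest temporal overlap. The sketch below is a hypothetical variant, not part of this commit; it reuses diarization.itertracks(yield_label=True) from the code above.

# Hypothetical variant of the speaker-matching step (not part of this commit):
# choose the diarization turn with the largest temporal overlap with the Whisper
# segment instead of the first turn containing its start or end.
def assign_speaker(diarization, start_time: float, end_time: float,
                   speaker_mapping: dict, default: str = "Speaker 1") -> str:
    best_label, best_overlap = None, 0.0
    for turn, _, speaker_label in diarization.itertracks(yield_label=True):
        # Length of the intersection between the segment and this speaker turn.
        overlap = min(end_time, turn.end) - max(start_time, turn.start)
        if overlap > best_overlap:
            best_label, best_overlap = speaker_label, overlap
    if best_label is None:
        return default
    if best_label not in speaker_mapping:
        speaker_mapping[best_label] = f"Speaker {len(speaker_mapping) + 1}"
    return speaker_mapping[best_label]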