freddyaboulton (HF Staff) committed
Commit 3bb71cf · verified · 1 Parent(s): 535bd1a

Upload folder using huggingface_hub

Files changed (4):
  1. README.md +6 -6
  2. requirements.txt +11 -0
  3. run.ipynb +1 -0
  4. run.py +126 -0
README.md CHANGED
@@ -1,12 +1,12 @@
 
+
  ---
- title: Dialogue Diarization Demo Main
- emoji: 🐠
- colorFrom: blue
+ title: dialogue_diarization_demo_main
+ emoji: 🔥
+ colorFrom: indigo
  colorTo: indigo
  sdk: gradio
  sdk_version: 5.40.0
- app_file: app.py
+ app_file: run.py
  pinned: false
+ hf_oauth: true
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
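Note: the new front matter turns on `hf_oauth: true`, which adds a "Sign in with Hugging Face" flow to the Space. The committed run.py does not consume that login (it loads the gated pyannote pipeline with `use_auth_token=True` instead), so the snippet below is only a minimal sketch of how an app could read the OAuth session with Gradio's `gr.LoginButton` / `gr.OAuthProfile` helpers; it is not part of this commit.

```python
# Minimal sketch (not part of this commit): reading the hf_oauth login in a
# Gradio app via gr.LoginButton and a gr.OAuthProfile-typed parameter.
import gradio as gr


def whoami(profile: gr.OAuthProfile | None) -> str:
    # Gradio injects the signed-in user's profile for parameters annotated
    # with gr.OAuthProfile; the value is None when nobody is signed in.
    if profile is None:
        return "Not signed in."
    return f"Signed in as {profile.username}"


with gr.Blocks() as oauth_demo:
    gr.LoginButton()
    status = gr.Markdown()
    oauth_demo.load(whoami, inputs=None, outputs=status)
```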
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio-client @ git+https://github.com/gradio-app/gradio@9828952dd0569d86ae15ec4fbf27331c1539daab#subdirectory=client/python
+ https://gradio-pypi-previews.s3.amazonaws.com/9828952dd0569d86ae15ec4fbf27331c1539daab/gradio-5.40.0-py3-none-any.whl
+ gradio
+ torch
+ torchaudio
+ pyannote.audio
+ openai-whisper
+ librosa
+ numpy
+ transformers
+ speechbrain
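Note: `pyannote/speaker-diarization-3.1` is a gated checkpoint, so the `Pipeline.from_pretrained(..., use_auth_token=True)` call in run.py needs a Hugging Face token available at runtime (for example from `huggingface-cli login` or a Space secret). A minimal sketch, assuming the token is exposed through an `HF_TOKEN` environment variable (that variable name is an assumption, not something this commit sets up):

```python
# Minimal sketch, assuming HF_TOKEN is set in the environment (e.g. as a
# Space secret); not part of the committed files.
import os

from huggingface_hub import login

token = os.environ.get("HF_TOKEN")
if token:
    # Caches the token locally so use_auth_token=True can find it.
    login(token=token)
```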
run.ipynb ADDED
@@ -0,0 +1 @@
+ {"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: dialogue_diarization_demo"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio gradio torch torchaudio pyannote.audio openai-whisper librosa numpy transformers speechbrain "]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# type: ignore\n", "import gradio as gr\n", "from pyannote.audio import Pipeline\n", "import whisper\n", "\n", "diarization_pipeline = None\n", "whisper_model = None\n", "\n", "\n", "def load_models():\n", " global diarization_pipeline, whisper_model # noqa: PLW0603\n", "\n", " if diarization_pipeline is None:\n", " diarization_pipeline = Pipeline.from_pretrained(\n", " \"pyannote/speaker-diarization-3.1\", use_auth_token=True\n", " )\n", "\n", " if whisper_model is None:\n", " whisper_model = whisper.load_model(\"base\")\n", "\n", "\n", "def real_diarization(audio_file_path: str) -> list[dict[str, str]]:\n", " try:\n", " load_models()\n", "\n", " if diarization_pipeline is None or whisper_model is None:\n", " raise Exception(\"Failed to load models\")\n", "\n", " diarization = diarization_pipeline(audio_file_path)\n", "\n", " transcription = whisper_model.transcribe(audio_file_path)\n", " segments = transcription[\"segments\"]\n", "\n", " dialogue_segments = []\n", " speaker_mapping = {}\n", " speaker_counter = 1\n", "\n", " for segment in segments:\n", " start_time = segment[\"start\"]\n", " end_time = segment[\"end\"]\n", " text = segment[\"text\"].strip()\n", "\n", " speaker = \"Speaker 1\"\n", " for turn, _, speaker_label in diarization.itertracks(yield_label=True):\n", " if (\n", " turn.start <= start_time <= turn.end\n", " or turn.start <= end_time <= turn.end\n", " ):\n", " if speaker_label not in speaker_mapping:\n", " speaker_mapping[speaker_label] = f\"Speaker {speaker_counter}\"\n", " speaker_counter += 1\n", " speaker = speaker_mapping[speaker_label]\n", " break\n", "\n", " if text:\n", " dialogue_segments.append({\"speaker\": speaker, \"text\": text})\n", "\n", " return dialogue_segments\n", "\n", " except Exception as e:\n", " print(f\"Error in diarization: {str(e)}\")\n", " return []\n", "\n", "\n", "def process_audio(audio_file):\n", " if audio_file is None:\n", " gr.Warning(\"Please upload an audio file first.\")\n", " return []\n", "\n", " try:\n", " dialogue_segments = real_diarization(audio_file)\n", " return dialogue_segments\n", " except Exception as e:\n", " gr.Error(f\"Error processing audio: {str(e)}\")\n", " return []\n", "\n", "\n", "speakers = [\n", " \"Speaker 1\",\n", " \"Speaker 2\",\n", " \"Speaker 3\",\n", " \"Speaker 4\",\n", " \"Speaker 5\",\n", " \"Speaker 6\",\n", "]\n", "tags = [\n", " \"(pause)\",\n", " \"(background noise)\",\n", " \"(unclear)\",\n", " \"(overlap)\",\n", " \"(phone ringing)\",\n", " \"(door closing)\",\n", " \"(music)\",\n", " \"(applause)\",\n", " \"(laughter)\",\n", "]\n", "\n", "\n", "def format_speaker(speaker, text):\n", " return f\"{speaker}: {text}\"\n", "\n", "\n", "with gr.Blocks(title=\"Audio Diarization Demo\") as demo:\n", " with gr.Row():\n", " with gr.Column(scale=1):\n", " audio_input = gr.Audio(\n", " label=\"Upload Audio File\",\n", " type=\"filepath\",\n", " sources=[\"upload\", \"microphone\"],\n", " )\n", "\n", " process_btn = gr.Button(\"\ud83d\udd0d Analyze 
Speakers\", variant=\"primary\", size=\"lg\")\n", "\n", " with gr.Column(scale=2):\n", " dialogue_output = gr.Dialogue(\n", " speakers=speakers,\n", " tags=tags,\n", " formatter=format_speaker,\n", " label=\"AI-generated speaker-separated conversation\",\n", " value=[],\n", " )\n", "\n", " process_btn.click(fn=process_audio, inputs=[audio_input], outputs=[dialogue_output])\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
run.py ADDED
@@ -0,0 +1,126 @@
+ # type: ignore
+ import gradio as gr
+ from pyannote.audio import Pipeline
+ import whisper
+
+ diarization_pipeline = None
+ whisper_model = None
+
+
+ def load_models():
+     global diarization_pipeline, whisper_model  # noqa: PLW0603
+
+     if diarization_pipeline is None:
+         diarization_pipeline = Pipeline.from_pretrained(
+             "pyannote/speaker-diarization-3.1", use_auth_token=True
+         )
+
+     if whisper_model is None:
+         whisper_model = whisper.load_model("base")
+
+
+ def real_diarization(audio_file_path: str) -> list[dict[str, str]]:
+     try:
+         load_models()
+
+         if diarization_pipeline is None or whisper_model is None:
+             raise Exception("Failed to load models")
+
+         diarization = diarization_pipeline(audio_file_path)
+
+         transcription = whisper_model.transcribe(audio_file_path)
+         segments = transcription["segments"]
+
+         dialogue_segments = []
+         speaker_mapping = {}
+         speaker_counter = 1
+
+         for segment in segments:
+             start_time = segment["start"]
+             end_time = segment["end"]
+             text = segment["text"].strip()
+
+             speaker = "Speaker 1"
+             for turn, _, speaker_label in diarization.itertracks(yield_label=True):
+                 if (
+                     turn.start <= start_time <= turn.end
+                     or turn.start <= end_time <= turn.end
+                 ):
+                     if speaker_label not in speaker_mapping:
+                         speaker_mapping[speaker_label] = f"Speaker {speaker_counter}"
+                         speaker_counter += 1
+                     speaker = speaker_mapping[speaker_label]
+                     break
+
+             if text:
+                 dialogue_segments.append({"speaker": speaker, "text": text})
+
+         return dialogue_segments
+
+     except Exception as e:
+         print(f"Error in diarization: {str(e)}")
+         return []
+
+
+ def process_audio(audio_file):
+     if audio_file is None:
+         gr.Warning("Please upload an audio file first.")
+         return []
+
+     try:
+         dialogue_segments = real_diarization(audio_file)
+         return dialogue_segments
+     except Exception as e:
+         gr.Error(f"Error processing audio: {str(e)}")
+         return []
+
+
+ speakers = [
+     "Speaker 1",
+     "Speaker 2",
+     "Speaker 3",
+     "Speaker 4",
+     "Speaker 5",
+     "Speaker 6",
+ ]
+ tags = [
+     "(pause)",
+     "(background noise)",
+     "(unclear)",
+     "(overlap)",
+     "(phone ringing)",
+     "(door closing)",
+     "(music)",
+     "(applause)",
+     "(laughter)",
+ ]
+
+
+ def format_speaker(speaker, text):
+     return f"{speaker}: {text}"
+
+
+ with gr.Blocks(title="Audio Diarization Demo") as demo:
+     with gr.Row():
+         with gr.Column(scale=1):
+             audio_input = gr.Audio(
+                 label="Upload Audio File",
+                 type="filepath",
+                 sources=["upload", "microphone"],
+             )
+
+             process_btn = gr.Button("🔍 Analyze Speakers", variant="primary", size="lg")
+
+         with gr.Column(scale=2):
+             dialogue_output = gr.Dialogue(
+                 speakers=speakers,
+                 tags=tags,
+                 formatter=format_speaker,
+                 label="AI-generated speaker-separated conversation",
+                 value=[],
+             )
+
+     process_btn.click(fn=process_audio, inputs=[audio_input], outputs=[dialogue_output])
+
+ if __name__ == "__main__":
+     demo.launch()
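Note: since `gradio-client` is pinned in requirements.txt, the demo can also be queried programmatically once it is running. A hedged sketch, assuming a locally launched copy at http://127.0.0.1:7860 and Gradio's default endpoint naming (`/process_audio`, derived from the function name); both the URL and the endpoint name are assumptions, not defined by this commit.

```python
# Hedged sketch: calling the running demo with gradio_client.
# The URL, the /process_audio endpoint name, and meeting.wav are assumptions.
from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860")
segments = client.predict(
    handle_file("meeting.wav"),  # hypothetical local audio file
    api_name="/process_audio",
)
# Expected to mirror process_audio's return value: speaker/text segments
# (the exact client-side representation may differ).
print(segments)
```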