import streamlit as st import tempfile from pydub import AudioSegment from pyannote.audio import Pipeline from faster_whisper import WhisperModel from docx import Document from io import BytesIO import colorsys # ページ設定 st.set_page_config(page_title="話者分離付き文字起こし", layout="centered") st.title("🧠 話者分離付き文字起こしアプリ") # Whisperモデル選択 model_size = st.selectbox("Whisperモデルを選択", ["tiny", "base", "small", "medium", "large-v2"], index=2) # Hugging Face トークン入力 token_input = st.text_input("🔐 Hugging Face アクセストークンを入力", type="password") # 音声アップロード uploaded_file = st.file_uploader("🎵 音声ファイルをアップロード(mp3, wav, m4a)", type=["mp3", "wav", "m4a"]) # カラーパレット生成 def generate_color_palette(n): colors = [] for i in range(n): hue = i / n lightness = 0.85 saturation = 0.6 rgb = colorsys.hls_to_rgb(hue, lightness, saturation) hex_color = '#%02x%02x%02x' % tuple(int(c * 255) for c in rgb) colors.append(hex_color) return colors # 処理スタート if uploaded_file and token_input: st.audio(uploaded_file) if st.button("▶️ 文字起こしスタート"): status = st.info("準備中…") progress = st.progress(0) try: # 音声を.wavに変換 with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp: audio = AudioSegment.from_file(uploaded_file) audio.export(tmp.name, format="wav") audio_path = tmp.name progress.progress(20) status.info("🔍 話者分離中...") pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=token_input) diarization = pipeline(audio_path) progress.progress(50) status.info("📝 Whisperで文字起こし中...") model = WhisperModel(model_size, compute_type="int8") segments, _ = model.transcribe(audio_path, language="ja", vad_filter=True) # 話者割当 word_blocks = [] for segment in segments: start = segment.start speaker = "unknown" for turn in diarization.itertracks(yield_label=True): if turn[0].start <= start <= turn[0].end: speaker = turn[2] break word_blocks.append((speaker, segment.text.strip())) progress.progress(80) status.success("✅ 完了!") # 表示と色分け st.subheader("🗣️ 話者ごとの文字起こし") unique_speakers = sorted(set(s for s, _ in word_blocks)) colors = generate_color_palette(len(unique_speakers)) color_map = {spk: col for spk, col in zip(unique_speakers, colors)} for speaker, text in word_blocks: st.markdown( f"