File size: 3,661 Bytes
1baa979
 
 
 
 
 
ecedb67
9b1b90d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1baa979
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import streamlit as st
import torch
import base64
import tempfile
import os
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

# Runtime configuration: prefer GPU with fp16 when CUDA is present,
# otherwise fall back to CPU with full fp32 precision.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32
model_id = "KBLab/kb-whisper-tiny"

@st.cache_resource
def load_model():
    """Load the KB-Whisper model and wrap it in an ASR pipeline.

    Cached by Streamlit's ``st.cache_resource`` so the model is downloaded
    and initialized only once per server process, not on every rerun.

    Returns:
        transformers.Pipeline: an automatic-speech-recognition pipeline
        bound to the module-level ``device`` and ``torch_dtype``.
    """
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, use_safetensors=True, cache_dir="cache"
    )
    model.to(device)
    # Fix: use the same cache_dir as the model weights so all downloads land
    # in one place (the original cached the processor in the default HF cache).
    processor = AutoProcessor.from_pretrained(model_id, cache_dir="cache")
    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )

# Instantiate the (cached) ASR pipeline once at module import.
asr_pipeline = load_model()

st.title("Swedish Speech-to-Text Demo")

# Audio upload option — Swedish label: "Upload an audio file".
uploaded_file = st.file_uploader("Ladda upp en ljudfil", type=["wav", "mp3", "flac"])

# In-browser audio recorder: two buttons driving the MediaRecorder API.
#
# NOTE(review): on stop, the JS POSTs the base64 recording to '/save_audio',
# but no such route exists in a plain Streamlit app and nothing in this file
# serves it — so the recording presumably never reaches the Python side
# (st.session_state["audio_data"] is read below but never set here).
# Confirm whether a custom backend/component is supposed to handle this.
# NOTE(review): the Blob is labeled 'audio/wav', but MediaRecorder typically
# produces webm/ogg containers — the MIME label looks inaccurate; verify.
audio_recorder_js = """
<script>
let mediaRecorder;
let audioChunks = [];
let isRecording = false;

function startRecording() {
    if (!isRecording) {
        isRecording = true;
        navigator.mediaDevices.getUserMedia({ audio: true }).then(stream => {
            mediaRecorder = new MediaRecorder(stream);
            audioChunks = [];
            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
            };
            mediaRecorder.onstop = () => {
                const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
                const reader = new FileReader();
                reader.readAsDataURL(audioBlob);
                reader.onloadend = () => {
                    const base64Audio = reader.result.split(',')[1];
                    fetch('/save_audio', {
                        method: 'POST',
                        headers: { 'Content-Type': 'application/json' },
                        body: JSON.stringify({ audio: base64Audio })
                    }).then(response => response.json()).then(data => {
                        console.log(data);
                        window.location.reload();
                    });
                };
            };
            mediaRecorder.start();
        });
    }
}

function stopRecording() {
    if (isRecording) {
        isRecording = false;
        mediaRecorder.stop();
    }
}
</script>

<button onclick="startRecording()">🎤 Starta inspelning</button>
<button onclick="stopRecording()">⏹️ Stoppa inspelning</button>
"""

# Embed the recorder controls in the page (component return value unused).
st.components.v1.html(audio_recorder_js)

# Resolve the audio source (uploaded file takes precedence over a recording)
# into a temp-file path, since the ASR pipeline is fed a file path.
audio_path = None

if uploaded_file is not None:
    # Persist the upload to disk; keep the original extension so the audio
    # backend can sniff the container format.
    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[-1]) as temp_audio:
        temp_audio.write(uploaded_file.read())
        audio_path = temp_audio.name

elif st.session_state.get("audio_data"):
    # Decode base64 audio from the JavaScript recording.
    # NOTE(review): nothing in this file sets st.session_state["audio_data"],
    # so this branch looks dead unless an external route populates it — confirm.
    audio_bytes = base64.b64decode(st.session_state["audio_data"])
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio.write(audio_bytes)
        audio_path = temp_audio.name

# Transcribe if we have audio.
if audio_path:
    st.audio(audio_path, format="audio/wav")

    try:
        with st.spinner("Transkriberar..."):
            transcription = asr_pipeline(audio_path)["text"]

        st.subheader("📜 Transkription:")
        st.write(transcription)
    finally:
        # Fix: remove the temp file even when transcription raises — the
        # original leaked the file on any pipeline exception.
        os.remove(audio_path)