import spaces
import torch
import gradio as gr
from transformers import pipeline
from huggingface_hub import InferenceClient
import os
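
# Model and runtime configuration: Whisper checkpoint, batch size, upload limit, and GPU/CPU device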
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
device = 0 if torch.cuda.is_available() else "cpu"

# Initialize the Whisper pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Set up the Hugging Face Inference client
hf_client = InferenceClient(
    "CohereForAI/c4ai-command-r-plus-08-2024",
    token=os.getenv("HF_TOKEN")
)
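
# Request GPU access for this function when running on a ZeroGPU Space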
@spaces.GPU
def transcribe_summarize(audio_input, task):
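    """Transcribe (or translate) the audio input with Whisper and summarize the result."""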
    if audio_input is None:
        raise gr.Error("No audio file was submitted!")

    # Convert speech to text
    result = pipe(
        audio_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True
    )
    transcribed_text = result["text"]

    # Summarize the text (revised section)
    try:
        # Build the summarization prompt
        prompt = f"""Please briefly summarize the text below:
Text: {transcribed_text}
Summary:"""

        # Call the API
        response = hf_client.text_generation(
            model="CohereForAI/c4ai-command-r-plus-08-2024",
            prompt=prompt,
            max_new_tokens=150,
            temperature=0.3,
            top_p=0.9,
            repetition_penalty=1.2,
            stop_sequences=["\n", "Text:", "Summary:"]
        )

        # Handle the API response (revised section)
        if isinstance(response, str):
            summary_text = response
        else:
            summary_text = response.generated_text if hasattr(response, 'generated_text') else str(response)

        # Strip the prompt portion from the output
        if "Summary:" in summary_text:
            summary_text = summary_text.split("Summary:")[1].strip()
        if not summary_text:
            summary_text = "Unable to generate a summary."
    except Exception as e:
        print(f"Error while generating summary: {str(e)}")  # debug log
        summary_text = "Unable to generate a summary. Please try again in a moment."

    print(f"Transcribed text: {transcribed_text}")  # debug log
    print(f"Generated summary: {summary_text}")  # debug log
    return [transcribed_text, summary_text]

# CSS style
css = """
footer { visibility: hidden; }
"""

# File upload interface
file_transcribe = gr.Interface(
    fn=transcribe_summarize,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio File"),
        gr.Radio(
            choices=["transcribe", "translate"],
            label="Task",
            value="transcribe"
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text", lines=5),
        gr.Textbox(label="Summary", lines=3)
    ],
    title="Dictation AI: Convert Speech to Text and Summarize",
    flagging_mode="never"
)

# Microphone recording interface
mic_transcribe = gr.Interface(
    fn=transcribe_summarize,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(
            choices=["transcribe", "translate"],
            label="Task",
            value="transcribe"
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcribed Text", lines=5),
        gr.Textbox(label="Summary", lines=3)
    ],
    title="Dictation AI: Convert Speech to Text and Summarize",
    flagging_mode="never",
    css=css
)

# Main application
demo = gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css)
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Audio File", "Microphone"]
    )

# Launch the application
demo.queue().launch(ssr_mode=False)