File size: 3,923 Bytes
5d52c32
d8083da
6c226f9
 
a5099f1
88183ad
6c226f9
2362603
9d6fa91
66efbc3
6c226f9
 
 
d8083da
6c226f9
 
 
 
 
 
 
d8083da
 
 
 
 
6c226f9
5d52c32
d8083da
 
3f4e5b4
d8083da
a5099f1
d8083da
 
 
 
 
 
 
a5099f1
3f4e5b4
a5099f1
0f44b3b
 
 
 
 
 
3f4e5b4
 
 
0f44b3b
 
3f4e5b4
 
0f44b3b
 
 
3f4e5b4
 
0f44b3b
 
 
 
 
3f4e5b4
 
0f44b3b
 
3f4e5b4
0f44b3b
 
 
a5099f1
0f44b3b
 
3f4e5b4
0f44b3b
 
6b749d2
d8083da
3291a15
d8083da
ae3ef7d
d8083da
ae3ef7d
 
d8083da
0fdae18
67a946c
6c226f9
0fdae18
d8083da
 
 
 
 
3c0cd8e
d8083da
3f4e5b4
 
d8083da
 
 
3c0cd8e
 
d8083da
 
67a946c
3c0cd8e
0fdae18
d8083da
 
 
 
 
 
 
3f4e5b4
 
6c226f9
d8083da
a5099f1
d8083da
6c226f9
 
d8083da
d82704d
6c226f9
d8083da
 
 
 
67a946c
d8083da
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import spaces
import torch 
import gradio as gr
from transformers import pipeline
from huggingface_hub import InferenceClient
import os

# Whisper checkpoint and batch size used for speech recognition.
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000

# Run on CUDA device 0 when a GPU is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Initialise the Whisper speech-recognition pipeline (audio handled in 30 s chunks).
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)

# Hugging Face inference client used for summarization; the token comes from HF_TOKEN.
hf_client = InferenceClient(
    model="CohereForAI/c4ai-command-r-plus-08-2024",
    token=os.environ.get("HF_TOKEN"),
)

@spaces.GPU
def transcribe_summarize(audio_input, task):
    """Transcribe audio with the Whisper pipeline and summarize the transcript.

    Args:
        audio_input: Audio handed directly to ``pipe``. ``None`` means no
            audio was submitted.
        task: Whisper generation task, forwarded as
            ``generate_kwargs={"task": task}``.

    Returns:
        A two-item list ``[transcribed_text, summary_text]``. When no summary
        can be produced (blank transcript, blank model output, or a failed
        API call) ``summary_text`` is a user-facing fallback message.

    Raises:
        gr.Error: If ``audio_input`` is ``None``.
    """
    if audio_input is None:
        raise gr.Error("μ˜€λ””μ˜€ 파일이 μ œμΆœλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€!")

    # Speech-to-text.
    result = pipe(
        audio_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcribed_text = result["text"]

    if not transcribed_text.strip():
        # Nothing was recognised: skip the remote call rather than asking the
        # model to summarize an empty text.
        summary_text = "μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."
    else:
        # Summarization is best-effort: any failure is logged and replaced by
        # a fallback message so the transcript is still returned.
        try:
            # Build the summarization prompt.
            prompt = f"""μ•„λž˜ ν…μŠ€νŠΈλ₯Ό κ°„λ‹¨νžˆ μš”μ•½ν•΄μ£Όμ„Έμš”:

ν…μŠ€νŠΈ: {transcribed_text}

μš”μ•½:"""

            # Call the inference API.
            response = hf_client.text_generation(
                model="CohereForAI/c4ai-command-r-plus-08-2024",
                prompt=prompt,
                max_new_tokens=150,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.2,
                stop_sequences=["\n", "ν…μŠ€νŠΈ:", "μš”μ•½:"],
            )

            # The client may return a plain string or an object carrying
            # ``generated_text``; normalise both to a string.
            if isinstance(response, str):
                summary_text = response
            elif hasattr(response, "generated_text"):
                summary_text = response.generated_text
            else:
                summary_text = str(response)

            # Strip first so that whitespace-only output (e.g. a lone newline
            # produced by the "\n" stop sequence) counts as empty below.
            summary_text = summary_text.strip()

            # Drop an echoed prompt marker, keeping the text that follows it.
            if "μš”μ•½:" in summary_text:
                summary_text = summary_text.split("μš”μ•½:")[1].strip()

            if not summary_text:
                summary_text = "μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€."

        except Exception as e:
            print(f"μš”μ•½ 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")  # debug log
            summary_text = "μš”μ•½μ„ 생성할 수 μ—†μŠ΅λ‹ˆλ‹€. μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."

    print(f"λ³€ν™˜λœ ν…μŠ€νŠΈ: {transcribed_text}")  # debug log
    print(f"μƒμ„±λœ μš”μ•½: {summary_text}")  # debug log

    return [transcribed_text, summary_text]

# Custom CSS: hide the page footer.
css = "\nfooter { visibility: hidden; }\n"

# Interface for transcribing and summarizing an uploaded audio file.
file_transcribe = gr.Interface(
    transcribe_summarize,
    [
        gr.Audio(label="μ˜€λ””μ˜€ 파일", type="filepath", sources="upload"),
        gr.Radio(
            ["transcribe", "translate"],
            value="transcribe",
            label="μž‘μ—…",
        ),
    ],
    [
        gr.Textbox(lines=5, label="λ³€ν™˜λœ ν…μŠ€νŠΈ"),
        gr.Textbox(lines=3, label="μš”μ•½"),
    ],
    flagging_mode="never",
    title="λ°›μ•„μ“°κΈ° AI: μŒμ„±μ„ ν…μŠ€νŠΈλ‘œ λ³€ν™˜ν•˜κ³  μš”μ•½ν•˜κΈ°",
)

# Interface for transcribing and summarizing a microphone recording.
mic_transcribe = gr.Interface(
    transcribe_summarize,
    [
        gr.Audio(type="filepath", sources="microphone"),
        gr.Radio(
            ["transcribe", "translate"],
            value="transcribe",
            label="μž‘μ—…",
        ),
    ],
    [
        gr.Textbox(lines=5, label="λ³€ν™˜λœ ν…μŠ€νŠΈ"),
        gr.Textbox(lines=3, label="μš”μ•½"),
    ],
    css=css,
    flagging_mode="never",
    title="λ°›μ•„μ“°κΈ° AI: μŒμ„±μ„ ν…μŠ€νŠΈλ‘œ λ³€ν™˜ν•˜κ³  μš”μ•½ν•˜κΈ°",
)

# Main application: the two interfaces are shown as tabs inside one themed Blocks container.
with gr.Blocks(css=css, theme="Nymbo/Nymbo_Theme") as demo:
    gr.TabbedInterface(
        interface_list=[file_transcribe, mic_transcribe],
        tab_names=["μ˜€λ””μ˜€ 파일", "마이크"],
    )

# Run the application: enable the request queue, then launch with ssr_mode=False.
demo.queue().launch(ssr_mode=False)