# File size: 2,948 Bytes
# Revision: 1125bf9
import gradio as gr
from transformers import pipeline
import PyPDF2
import ebooklib
from ebooklib import epub
import re
import tempfile
import os
from pydub import AudioSegment
def read_pdf(file_path):
    """Return the extracted text of the PDF at *file_path*.

    Each page's text is followed by a newline, in page order.
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        # Pages are read while the stream is still open.
        page_texts = [page.extract_text() + "\n" for page in pdf.pages]
    return "".join(page_texts)
def read_epub(file_path):
    """Return the plain text of every document item in the EPUB at *file_path*.

    Markup tags are removed with a regular expression and HTML character
    references (``&amp;``, ``&#8217;``, ...) are decoded, so the result
    contains readable characters instead of entity codes. Each document's
    text is followed by a newline.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        The concatenated text of all document items.
    """
    # Local import keeps this block self-contained; ``html`` is stdlib.
    import html

    book = epub.read_epub(file_path)
    parts = []
    for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        markup = doc.get_content().decode()
        without_tags = re.sub('<[^<]+?>', '', markup)
        # Unescape only after tag removal so a decoded "&lt;" cannot be
        # mistaken for the start of a tag.
        parts.append(html.unescape(without_tags) + "\n")
    return "".join(parts)
def split_text_into_chunks(text, max_tokens=500):
    """Split *text* into chunks of at most *max_tokens* words.

    A "token" here is a whitespace-separated word, not a model token.
    Sentences (split after ``.``, ``!`` or ``?`` followed by spaces) are
    packed greedily into chunks. A single sentence longer than the limit is
    cut at word boundaries so that no chunk exceeds the limit; the words of
    such a sentence are re-joined with single spaces.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk. Values below 1 are
            treated as 1; non-integer values are truncated.

    Returns:
        A list of non-empty, stripped chunk strings (empty for blank input).
    """
    limit = max(1, int(max_tokens))
    chunks = []

    def emit(parts):
        # Never emit blank chunks: they would be passed on as empty input.
        chunk = " ".join(parts).strip()
        if chunk:
            chunks.append(chunk)

    pending = []        # sentences collected for the chunk being built
    pending_words = 0   # running word count of ``pending``

    for sentence in re.split(r'(?<=[.!?]) +', text):
        words = sentence.split()

        if len(words) > limit:
            # The sentence cannot fit in any chunk: close the current chunk
            # and cut the sentence into limit-sized pieces.
            emit(pending)
            pieces = [
                " ".join(words[start:start + limit])
                for start in range(0, len(words), limit)
            ]
            chunks.extend(pieces[:-1])
            # The last (possibly short) piece may still absorb later sentences.
            pending = [pieces[-1]]
            pending_words = len(words) - limit * (len(pieces) - 1)
            continue

        if pending_words + len(words) > limit:
            emit(pending)
            pending = []
            pending_words = 0

        pending.append(sentence)
        pending_words += len(words)

    emit(pending)
    return chunks
def tts_orpheus(text_chunks, token):
    """Synthesize every chunk with the Kartoffel Orpheus model and join the audio into one MP3.

    Args:
        text_chunks: Iterable of text pieces; each one is sent to the
            text-to-speech pipeline separately.
        token: Hugging Face access token handed to the pipeline loader.

    Returns:
        Path of a temporary ``.mp3`` file holding the concatenated audio.
        The file is created with ``delete=False``, so it is not removed
        automatically.
    """
    # Local import keeps this block self-contained.
    import numpy as np

    # NOTE(review): recent transformers releases name this argument ``token``
    # and deprecate ``use_auth_token`` - confirm against the pinned version.
    pipe = pipeline(
        "text-to-speech",
        model="SebastianBodza/Kartoffel_Orpheus-3B_german_synthetic-v0.1",
        use_auth_token=token,
    )

    combined_audio = AudioSegment.silent(duration=0)
    for chunk in text_chunks:
        output = pipe(chunk, forward_params={"speaker_id": 0})
        audio = output["audio"]

        if isinstance(audio, (str, os.PathLike)):
            # A path to a WAV file: load it as before.
            audio_seg = AudioSegment.from_file(audio, format="wav")
        else:
            # Otherwise treat the value as a waveform array accompanied by
            # ``sampling_rate`` and convert it to 16-bit PCM.
            # Assumes float samples in [-1, 1] and mono audio - TODO confirm
            # for this model.
            samples = np.asarray(audio, dtype=np.float32).reshape(-1)
            pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
            audio_seg = AudioSegment(
                data=pcm.tobytes(),
                sample_width=2,
                frame_rate=int(output["sampling_rate"]),
                channels=1,
            )
        combined_audio += audio_seg

    # Reserve a unique file name, then close the handle before exporting so
    # the export can open the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        mp3_path = tmp_file.name
    combined_audio.export(mp3_path, format="mp3")
    return mp3_path
def process_file(file, token, max_tokens):
    """Convert an uploaded PDF or EPUB into a single MP3 file.

    Args:
        file: The uploaded file: an object with a ``name`` attribute holding
            its path, a plain path string, or ``None`` when nothing was
            uploaded.
        token: Hugging Face access token forwarded to the synthesis step.
        max_tokens: Maximum number of words per text chunk.

    Returns:
        A ``(status_message, audio_path)`` tuple. ``audio_path`` is ``None``
        when nothing was uploaded, the extension is unsupported, or the
        document yielded no text.
    """
    if file is None:
        # Nothing uploaded yet: report it instead of failing on ``file.name``.
        return "Bitte zuerst eine Datei hochladen", None

    # Accept both upload objects exposing ``.name`` and plain path strings.
    path = getattr(file, "name", file)
    ext = os.path.splitext(path)[-1].lower()

    if ext == ".pdf":
        text = read_pdf(path)
    elif ext == ".epub":
        text = read_epub(path)
    else:
        return "Ungültiges Dateiformat", None

    chunks = split_text_into_chunks(text, max_tokens=max_tokens)
    if not chunks:
        # No text was extracted: skip the synthesis step entirely.
        return "Kein Text gefunden", None

    audio_path = tts_orpheus(chunks, token)
    return "Fertig!", audio_path
# Input widgets are created at module level and attached to the layout below
# with ``.render()``; merely assigning them to another name inside the Blocks
# context would not place them in the page.
token_input = gr.Textbox(label="Hugging Face Token", type="password")
file_input = gr.File(label="EPUB oder PDF hochladen")
max_tokens_input = gr.Slider(100, 500, value=500, step=50, label="Maximale Tokens pro Chunk")

with gr.Blocks() as demo:
    gr.Markdown("## Kartoffel Orpheus TTS - EPUB/PDF zu Audio")
    with gr.Row():
        # Left column: inputs and the start button.
        with gr.Column():
            token_box = token_input.render()
            file_box = file_input.render()
            token_limit_box = max_tokens_input.render()
            start_btn = gr.Button("Starten")
        # Right column: status text and the resulting audio file.
        with gr.Column():
            status = gr.Textbox(label="Status")
            audio_out = gr.Audio(label="Ergebnis MP3", type="filepath")
    # Input order matches process_file(file, token, max_tokens).
    start_btn.click(
        fn=process_file,
        inputs=[file_box, token_box, token_limit_box],
        outputs=[status, audio_out],
    )

if __name__ == "__main__":
    demo.launch()