# 122_ / app.py - Hugging Face Space file view (author: UnSinnlos; commit 1125bf9 "Update app.py", verified)
import gradio as gr
from transformers import pipeline
import PyPDF2
import ebooklib
from ebooklib import epub
import re
import tempfile
import os
from pydub import AudioSegment
def read_pdf(file_path):
    """Return the concatenated text of all pages in a PDF file.

    Args:
        file_path: Path of the PDF file to open (read in binary mode).

    Returns:
        A single string holding the extracted text of each page in
        document order, with a newline appended after every page.
    """
    with open(file_path, 'rb') as pdf_handle:
        pdf = PyPDF2.PdfReader(pdf_handle)
        # Extraction happens while the file is still open because the
        # reader pulls page data from the handle on demand.
        page_texts = [page.extract_text() + "\n" for page in pdf.pages]
    return "".join(page_texts)
def read_epub(file_path):
    """Return the plain text of every document item in an EPUB file.

    Markup tags are removed with a simple regular expression and HTML
    character references (``&amp;``, ``&nbsp;``, ``&#8211;`` ...) are
    converted to the characters they stand for, so they are not passed on
    verbatim to the speech synthesizer.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        A single string with the text of each document item in the order
        the book yields them, each followed by a newline.
    """
    import html  # local import: only this function needs it

    book = epub.read_epub(file_path)
    parts = []
    for doc in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        # A single undecodable byte should not abort the whole book, so
        # invalid sequences are replaced instead of raising.
        markup = doc.get_content().decode("utf-8", errors="replace")
        # Strip tags first, then unescape: unescaping first could turn an
        # escaped "&lt;" into a "<" that the tag regex would then eat.
        plain = re.sub('<[^<]+?>', '', markup)
        parts.append(html.unescape(plain) + "\n")
    return "".join(parts)
def split_text_into_chunks(text, max_tokens=500):
    """Split text into chunks of at most ``max_tokens`` whitespace words.

    The text is first cut into sentences after ``.``, ``!`` or ``?``
    followed by one or more spaces. Sentences are then packed greedily into
    chunks. A single sentence that is longer than the limit on its own is
    cut at word boundaries so that no chunk ever exceeds the limit.

    Args:
        text: The text to split.
        max_tokens: Maximum number of whitespace-separated words per chunk.
            Non-integer values are truncated; values below 1 are treated
            as 1.

    Returns:
        A list of non-empty chunk strings in reading order. Empty or
        whitespace-only input yields an empty list.
    """
    limit = max(1, int(max_tokens))
    chunks = []
    pending = []        # sentences collected for the chunk being built
    pending_words = 0   # running word count of ``pending``

    for sentence in re.split(r'(?<=[.!?]) +', text):
        words = sentence.split()
        if not words:
            continue

        # Flush only when something is pending, so an oversized first
        # sentence can no longer produce an empty chunk.
        if pending and pending_words + len(words) > limit:
            chunks.append(" ".join(pending).strip())
            pending, pending_words = [], 0

        if len(words) > limit:
            # The sentence alone is too long: cut it at word boundaries.
            # ``pending`` is necessarily empty here (it was flushed above).
            for start in range(0, len(words), limit):
                piece = words[start:start + limit]
                if len(piece) == limit:
                    chunks.append(" ".join(piece))
                else:
                    # Keep the short tail open so following sentences can
                    # still be packed behind it.
                    pending, pending_words = [" ".join(piece)], len(piece)
            continue

        pending.append(sentence)
        pending_words += len(words)

    if pending:
        chunks.append(" ".join(pending).strip())
    return chunks
def _segment_from_tts_output(output):
    """Convert one text-to-speech pipeline result into an ``AudioSegment``.

    The transformers text-to-audio pipeline documents its result as a dict
    with an ``audio`` waveform array and an integer ``sampling_rate``. A
    file path in ``audio`` is still accepted for backward compatibility.
    """
    import numpy as np  # local import: numpy is only needed for conversion

    audio = output["audio"]
    if isinstance(audio, (str, os.PathLike)):
        return AudioSegment.from_file(audio, format="wav")

    samples = np.atleast_1d(np.squeeze(np.asarray(audio, dtype=np.float32)))
    if samples.ndim == 1:
        channels = 1
    else:
        # Documented layout is (nb_channels, length); transpose so that the
        # raw bytes are interleaved frame by frame as PCM expects.
        channels = samples.shape[0]
        samples = samples.T
    # Float waveform in [-1, 1] -> signed 16-bit little-endian PCM.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype("<i2")
    return AudioSegment(
        data=pcm.tobytes(),
        sample_width=2,
        frame_rate=int(output["sampling_rate"]),
        channels=channels,
    )


def tts_orpheus(text_chunks, token):
    """Synthesize speech for all text chunks and export one MP3 file.

    Args:
        text_chunks: Iterable of text strings; blank entries are skipped.
        token: Hugging Face access token used when loading the model. An
            empty value is passed on as ``None``.

    Returns:
        Path of a temporary ``.mp3`` file containing the concatenated audio
        of all chunks. The file is created with ``delete=False``, so the
        caller is responsible for removing it.
    """
    pipe = pipeline(
        "text-to-speech",
        model="SebastianBodza/Kartoffel_Orpheus-3B_german_synthetic-v0.1",
        # ``use_auth_token`` is deprecated in transformers in favour of
        # ``token``.
        token=token or None,
    )

    combined_audio = AudioSegment.silent(duration=0)
    for chunk in text_chunks:
        if not chunk or not chunk.strip():
            continue
        # NOTE(review): ``speaker_id`` is kept from the original code;
        # confirm that this model's forward pass accepts it.
        output = pipe(chunk, forward_params={"speaker_id": 0})
        combined_audio += _segment_from_tts_output(output)

    # Reserve a unique file name, then close the handle before exporting:
    # writing to a still-open NamedTemporaryFile by name fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        mp3_path = tmp_file.name
    combined_audio.export(mp3_path, format="mp3")
    return mp3_path
def process_file(file, token, max_tokens):
    """Convert an uploaded EPUB or PDF file into a spoken MP3 file.

    Args:
        file: The upload as delivered by the Gradio file component: either
            a path string or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.
        token: Hugging Face access token forwarded to the TTS step.
        max_tokens: Maximum number of words per text chunk.

    Returns:
        A ``(status_message, audio_path)`` tuple. ``audio_path`` is ``None``
        whenever no audio was produced (no upload, unsupported file type,
        or no extractable text).
    """
    if file is None:
        return "Bitte eine Datei hochladen", None

    # Depending on the Gradio version/configuration the component yields a
    # plain path string or a tempfile-like wrapper with a ``name`` attribute.
    path = getattr(file, "name", file)

    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        text = read_pdf(path)
    elif ext == ".epub":
        text = read_epub(path)
    else:
        return "Ungültiges Dateiformat", None

    chunks = [
        chunk
        for chunk in split_text_into_chunks(text, max_tokens=max_tokens)
        if chunk
    ]
    if not chunks:
        # Nothing to synthesize (e.g. a scanned PDF without a text layer).
        return "Kein Text in der Datei gefunden", None

    audio_path = tts_orpheus(chunks, token)
    return "Fertig!", audio_path
# Input components are defined at module level so they stay importable under
# their original names; they are placed into the layout below via .render().
token_input = gr.Textbox(label="Hugging Face Token", type="password")
file_input = gr.File(label="EPUB oder PDF hochladen")
max_tokens_input = gr.Slider(100, 500, value=500, step=50, label="Maximale Tokens pro Chunk")

with gr.Blocks() as demo:
    gr.Markdown("## Kartoffel Orpheus TTS - EPUB/PDF zu Audio")
    with gr.Row():
        with gr.Column():
            # A component created outside a Blocks context is not part of
            # the layout until .render() is called inside it; merely
            # assigning it to a new name does not display it.
            token_box = token_input.render()
            file_box = file_input.render()
            token_limit_box = max_tokens_input.render()
            start_btn = gr.Button("Starten")
        with gr.Column():
            status = gr.Textbox(label="Status")
            audio_out = gr.Audio(label="Ergebnis MP3", type="filepath")
    # Input order must match process_file(file, token, max_tokens).
    start_btn.click(
        fn=process_file,
        inputs=[file_box, token_box, token_limit_box],
        outputs=[status, audio_out],
    )

if __name__ == "__main__":
    demo.launch()