PodMagic / app.py
Ribot's picture
Update app.py
734bffb verified
raw
history blame
3.37 kB
import requests
import re
import os
import zipfile
import tempfile
import gradio as gr
from pathlib import Path
from bs4 import BeautifulSoup
def sanitize_filename(name):
    """Turn an episode title into a string usable as a file name.

    The characters ``\\ / * ? : " < > |`` are removed, surrounding
    whitespace is trimmed, remaining spaces become underscores, and the
    result is cut to at most 100 characters.

    Args:
        name: The raw title text.

    Returns:
        The cleaned-up name (possibly empty).
    """
    forbidden_chars = r'[\\/*?:"<>|]'
    without_forbidden = re.sub(forbidden_chars, "", name)
    trimmed = without_forbidden.strip()
    underscored = trimmed.replace(" ", "_")
    # Truncate last so the 100-character cap applies to the final form.
    return underscored[:100]
def extract_episode_links(html_text, base_url):
    """Collect the MP3 episodes referenced by ``<audio>`` tags in a page.

    For every ``<audio>`` element, the first ``<source>`` child is inspected.
    It is kept when the *path* of its ``src`` URL ends with ``.mp3``
    (case-insensitive), so URLs carrying a query string or fragment such as
    ``episode.mp3?token=abc`` are recognised too.

    Args:
        html_text: The HTML document as text.
        base_url: URL of the page, used to resolve relative ``src`` values.

    Returns:
        A list of ``(title, absolute_url)`` tuples in document order, with
        duplicate URLs removed. The title comes from the audio tag's
        ``aria-label`` or ``title``, then the source's ``title``, and falls
        back to ``"episode"``.
    """
    # Local import keeps this block self-contained; these are the same
    # functions that requests.compat re-exports.
    from urllib.parse import urljoin, urlsplit

    soup = BeautifulSoup(html_text, "html.parser")
    episodes = []
    seen_urls = set()

    # Inspect each episode block.
    for audio_tag in soup.find_all("audio"):
        source = audio_tag.find("source")
        if not source:
            continue

        src = source.get("src", "")
        try:
            src_path = urlsplit(src).path
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 literal): skip it.
            continue
        # Test the path only, so "?query" / "#fragment" suffixes and an
        # upper-case ".MP3" do not hide a valid episode.
        if not src_path.lower().endswith(".mp3"):
            continue

        title = (
            audio_tag.get("aria-label")
            or audio_tag.get("title")
            or source.get("title")
            or "episode"
        )

        url = src
        # Anything that is not already an absolute http(s) URL (relative
        # paths, protocol-relative "//host/..." links) is resolved against
        # the page URL.
        if not url.startswith(("http://", "https://")):
            url = urljoin(base_url, url)

        if url not in seen_urls:
            seen_urls.add(url)
            episodes.append((title, url))

    return episodes
def download_podcast_series(url):
    """Download every MP3 episode found on a page and bundle them in a ZIP.

    The page at ``url`` is fetched, its episodes are discovered with
    ``extract_episode_links``, and each one is downloaded and added to a
    ``podcast.zip`` archive. Individual episode failures are logged with
    ``print`` and skipped.

    Args:
        url: Address of the podcast series page.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        the page cannot be loaded, contains no episode, or no episode could
        be downloaded. Otherwise it points to an archive in a fresh temporary
        directory that is deliberately left on disk for the caller.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        return f"Erreur lors du chargement de la page : {e}", None

    episodes = extract_episode_links(r.text, url)
    if not episodes:
        return "Aucun épisode audio trouvé sur la page.", None

    # The archive must outlive this function so the caller can read it.
    # A TemporaryDirectory would delete it on exit, so it gets its own
    # mkdtemp directory; only the staged MP3s use a self-cleaning one.
    output_dir = tempfile.mkdtemp(prefix="podcast_")
    zip_path = os.path.join(output_dir, "podcast.zip")

    used_filenames = set()
    downloaded = 0

    with tempfile.TemporaryDirectory() as work_dir, \
            zipfile.ZipFile(zip_path, "w") as zipf:
        for idx, (title, mp3_url) in enumerate(episodes, 1):
            base_name = f"{idx:02d}-" + sanitize_filename(title)
            filename = base_name + ".mp3"

            # Avoid duplicate names inside the archive.
            count = 1
            while filename in used_filenames:
                filename = f"{base_name}_{count}.mp3"
                count += 1
            used_filenames.add(filename)

            temp_mp3_path = os.path.join(work_dir, filename)
            try:
                # The context manager releases the connection even when the
                # transfer fails midway.
                with requests.get(mp3_url, stream=True, timeout=15) as audio:
                    audio.raise_for_status()
                    with open(temp_mp3_path, "wb") as f:
                        for chunk in audio.iter_content(8192):
                            f.write(chunk)
                # Only fully downloaded files reach the archive.
                zipf.write(temp_mp3_path, arcname=filename)
                downloaded += 1
            except Exception as e:
                print(f"Erreur lors du téléchargement de {mp3_url} : {e}")
            finally:
                # Drop the staged copy right away to keep disk usage at
                # roughly one episode plus the archive.
                if os.path.exists(temp_mp3_path):
                    os.remove(temp_mp3_path)

    if downloaded == 0:
        # Nothing usable: do not hand back an empty archive as a success.
        os.remove(zip_path)
        os.rmdir(output_dir)
        return "Aucun épisode n'a pu être téléchargé.", None

    return "Téléchargement terminé !", zip_path
# --- Gradio user interface -------------------------------------------------
# Components are created in the same order as before: URL input first, then
# the two outputs (status text and the downloadable archive).
_url_input = gr.Textbox(
    label="URL du podcast radio (ex: France Culture)",
    placeholder="https://www.radiofrance.fr/franceculture/podcasts/serie-le-capitaine-fracasse-de-theophile-gautier",
)
_status_output = gr.Textbox(label="Statut")
_zip_output = gr.File(label="Fichier ZIP des épisodes")

# Single-function app: one URL in, a status message and a ZIP file out.
interface = gr.Interface(
    fn=download_podcast_series,
    inputs=_url_input,
    outputs=[_status_output, _zip_output],
    title="Téléchargeur de Podcast Radio (.mp3)",
    description="Collez un lien vers une série de podcast Radio France (ex: France Culture). Seuls les fichiers .mp3 correspondant aux épisodes seront extraits et regroupés dans un fichier ZIP téléchargeable.",
    allow_flagging="never",
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    interface.launch()