"""Gradio app that bundles the MP3 episodes found on a podcast page into a ZIP.

The page at the given URL is fetched, every ``<audio><source src="...mp3">``
element is collected, each file is downloaded, and the result is offered as a
single ``podcast.zip`` download.
"""

import os
import re
import tempfile
import zipfile
from pathlib import Path
from typing import List, Optional, Tuple

import gradio as gr
import requests
from bs4 import BeautifulSoup


def sanitize_filename(name: str) -> str:
    """Return *name* made safe for use as a file name.

    Characters that are illegal in file names on common platforms are
    removed, surrounding whitespace is stripped, remaining spaces become
    underscores, and the result is truncated to 100 characters.
    """
    return re.sub(r'[\\/*?:"<>|]', "", name).strip().replace(" ", "_")[:100]


def _is_mp3_url(url: str) -> bool:
    """Return True when the path part of *url* ends with ``.mp3``."""
    # Drop the fragment and query string first so that links such as
    # "episode.mp3?token=abc" are still recognised as MP3 files.
    path = url.split("#", 1)[0].split("?", 1)[0]
    return path.lower().endswith(".mp3")


def extract_episode_links(html_text: str, base_url: str) -> List[Tuple[str, str]]:
    """Collect the MP3 episodes referenced by ``<audio>`` tags in a page.

    Args:
        html_text: Raw HTML of the podcast page.
        base_url: URL the HTML was fetched from; used to resolve relative
            ``src`` attributes.

    Returns:
        A list of ``(title, absolute_url)`` tuples in document order, with
        duplicate URLs removed. The title falls back to ``"episode"`` when
        neither the ``<audio>`` tag nor its ``<source>`` provides one.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    episodes = []
    seen_urls = set()

    # Inspect each episode block: only the first <source> of every <audio>
    # element is considered.
    for audio_tag in soup.find_all("audio"):
        source = audio_tag.find("source")
        if source is None or not _is_mp3_url(source.get("src", "")):
            continue

        title = (
            audio_tag.get("aria-label")
            or audio_tag.get("title")
            or source.get("title")
            or "episode"
        )

        url = source["src"]
        if not url.startswith("http"):
            url = requests.compat.urljoin(base_url, url)

        if url not in seen_urls:
            seen_urls.add(url)
            episodes.append((title, url))

    return episodes


def download_podcast_series(url: str) -> Tuple[str, Optional[str]]:
    """Download every MP3 episode linked from *url* and pack them into a ZIP.

    Args:
        url: Address of the podcast series page.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        the page could not be loaded, contains no episode, or no episode
        could be downloaded. Otherwise it is the path of a ``podcast.zip``
        file that still exists after this function returns.
    """
    url = (url or "").strip()

    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        return f"Erreur lors du chargement de la page : {e}", None

    episodes = extract_episode_links(r.text, url)
    if not episodes:
        return "Aucun épisode audio trouvé sur la page.", None

    # The archive must outlive this call because its path is handed back to
    # the caller, so it lives in a directory that is NOT removed
    # automatically. Only the intermediate MP3 files go into the
    # self-cleaning scratch directory.
    output_dir = tempfile.mkdtemp(prefix="podcast_")
    zip_path = os.path.join(output_dir, "podcast.zip")

    used_filenames = set()
    downloaded = 0

    with tempfile.TemporaryDirectory() as scratch_dir, \
            zipfile.ZipFile(zip_path, "w") as zipf:
        for idx, (title, mp3_url) in enumerate(episodes, 1):
            base_name = f"{idx:02d}-" + sanitize_filename(title)
            filename = base_name + ".mp3"

            # Avoid duplicate names inside the archive.
            count = 1
            while filename in used_filenames:
                filename = f"{base_name}_{count}.mp3"
                count += 1
            used_filenames.add(filename)

            temp_mp3_path = os.path.join(scratch_dir, filename)
            try:
                # Download to disk first so that a transfer failing halfway
                # never leaves a truncated entry in the archive. The context
                # manager releases the streamed connection in every case.
                with requests.get(mp3_url, stream=True, timeout=15) as audio:
                    audio.raise_for_status()
                    with open(temp_mp3_path, "wb") as f:
                        for chunk in audio.iter_content(8192):
                            f.write(chunk)
                zipf.write(temp_mp3_path, arcname=filename)
            except (requests.RequestException, OSError) as e:
                # Best effort: report the failure and carry on with the
                # remaining episodes.
                print(f"Erreur lors du téléchargement de {mp3_url} : {e}")
            else:
                downloaded += 1

    if not downloaded:
        # Do not hand back an empty archive labelled as a success.
        os.remove(zip_path)
        os.rmdir(output_dir)
        return "Aucun épisode n'a pu être téléchargé.", None

    return "Téléchargement terminé !", zip_path


interface = gr.Interface(
    fn=download_podcast_series,
    inputs=gr.Textbox(
        label="URL du podcast radio (ex: France Culture)",
        placeholder="https://www.radiofrance.fr/franceculture/podcasts/serie-le-capitaine-fracasse-de-theophile-gautier",
    ),
    outputs=[
        gr.Textbox(label="Statut"),
        gr.File(label="Fichier ZIP des épisodes"),
    ],
    title="Téléchargeur de Podcast Radio (.mp3)",
    description="Collez un lien vers une série de podcast Radio France (ex: France Culture). Seuls les fichiers .mp3 correspondant aux épisodes seront extraits et regroupés dans un fichier ZIP téléchargeable.",
    allow_flagging="never",
)

if __name__ == "__main__":
    interface.launch()