|
import os
import re
import tempfile
import zipfile
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import gradio as gr
import requests
from bs4 import BeautifulSoup
|
|
|
def sanitize_filename(name):
    """Turn an arbitrary title into a safe, portable file-name stem.

    Characters that are illegal in file names on common platforms
    (``\\ / * ? : " < > |``) and ASCII control characters are removed,
    surrounding whitespace is stripped, spaces become underscores and the
    result is capped at 100 characters.

    Args:
        name: Raw title, typically taken from an HTML attribute.

    Returns:
        The sanitized string (possibly empty), at most 100 characters long.
    """
    # Titles scraped from HTML attributes may span several lines; treat tabs
    # and line breaks as plain spaces so they end up as underscores instead of
    # leaking raw control characters into the file name.
    text = re.sub(r"[\t\n\r\f\v]", " ", name)
    # Drop reserved punctuation and any remaining control characters.
    text = re.sub(r'[\\/*?:"<>|\x00-\x1f\x7f]', "", text)
    return text.strip().replace(" ", "_")[:100]
|
|
|
def _resolve_mp3_url(src, base_url):
    """Return the absolute URL for *src* if it designates an MP3 file, else ``None``."""
    src = (src or "").strip()
    if not src:
        return None
    try:
        if src.lower().startswith(("http://", "https://")):
            url = src
        else:
            # Covers relative paths as well as protocol-relative ``//host/...``.
            url = urljoin(base_url, src)
        path = urlsplit(url).path
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): nothing usable here.
        return None
    # Check the URL *path* so that "file.mp3?token=..." is recognised; the raw
    # suffix test is kept so every link accepted previously is still accepted.
    if src.lower().endswith(".mp3") or path.lower().endswith(".mp3"):
        return url
    return None


def extract_episode_links(html_text, base_url):
    """Collect the MP3 episodes exposed through ``<audio>`` tags of a page.

    For each ``<audio>`` element the nested ``<source>`` tags are examined in
    document order, then the element's own ``src`` attribute; the first
    candidate pointing to an MP3 file is kept, so one ``<audio>`` tag yields
    at most one episode.

    Args:
        html_text: HTML markup of the podcast page.
        base_url: URL the page was fetched from, used to resolve relative
            links.

    Returns:
        A list of ``(title, url)`` tuples in document order with no duplicate
        URL. The title comes from the audio tag's ``aria-label`` or ``title``,
        then from the matching tag's ``title``, and defaults to ``"episode"``.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    episodes = []
    seen_urls = set()

    for audio_tag in soup.find_all("audio"):
        # Nested <source> tags take precedence over the audio tag's own src.
        candidates = list(audio_tag.find_all("source"))
        candidates.append(audio_tag)

        for tag in candidates:
            url = _resolve_mp3_url(tag.get("src"), base_url)
            if url is None:
                continue

            if url not in seen_urls:
                title = (
                    audio_tag.get("aria-label")
                    or audio_tag.get("title")
                    or tag.get("title")
                    or "episode"
                )
                seen_urls.add(url)
                episodes.append((title, url))
            # First MP3 candidate decides for this <audio> tag; alternative
            # encodings of the same episode must not become extra entries.
            break

    return episodes
|
|
|
def download_podcast_series(url):
    """Download every MP3 episode found on a podcast page into one ZIP archive.

    Args:
        url: Address of the page to scan for ``<audio>`` episodes.

    Returns:
        A ``(status, zip_path)`` tuple. ``status`` is a user-facing message.
        ``zip_path`` is the path of the generated ``podcast.zip``, or ``None``
        when the page could not be loaded, no episode was found on it, or no
        episode could be downloaded. Episodes whose download fails are logged
        with ``print`` and skipped.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing.
        return f"Erreur lors du chargement de la page : {e}", None

    episodes = extract_episode_links(r.text, url)
    if not episodes:
        return "Aucun épisode audio trouvé sur la page.", None

    # The archive is read by the caller *after* this function returns, so it
    # must not live in a TemporaryDirectory (which is deleted on exit).
    output_dir = tempfile.mkdtemp(prefix="podcast_")
    zip_path = os.path.join(output_dir, "podcast.zip")
    downloaded = 0

    # Scratch space for the individual MP3 files; removed once zipped.
    with tempfile.TemporaryDirectory() as work_dir:
        with zipfile.ZipFile(zip_path, "w") as zipf:
            for idx, (title, mp3_url) in enumerate(episodes, 1):
                # The running index makes every name unique, so no extra
                # collision handling is needed.
                filename = f"{idx:02d}-" + sanitize_filename(title) + ".mp3"
                temp_mp3_path = os.path.join(work_dir, filename)

                try:
                    # Context manager releases the streamed connection.
                    with requests.get(mp3_url, stream=True, timeout=15) as audio:
                        audio.raise_for_status()
                        with open(temp_mp3_path, "wb") as f:
                            for chunk in audio.iter_content(8192):
                                f.write(chunk)
                    # Only fully downloaded files reach the archive.
                    zipf.write(temp_mp3_path, arcname=filename)
                    downloaded += 1
                except Exception as e:
                    # Best effort: one broken episode must not abort the rest.
                    print(f"Erreur lors du téléchargement de {mp3_url} : {e}")
                finally:
                    # Free disk space right away instead of holding every MP3
                    # twice (raw + zipped) until the end.
                    if os.path.exists(temp_mp3_path):
                        os.remove(temp_mp3_path)

    if not downloaded:
        # Do not hand out an empty archive; clean up what was created.
        os.remove(zip_path)
        os.rmdir(output_dir)
        return "Aucun épisode n'a pu être téléchargé.", None

    return "Téléchargement terminé !", zip_path
|
|
|
# Gradio front end: a single URL text box as input; a status text box and a
# downloadable file as outputs, both filled by download_podcast_series.
interface = gr.Interface(
    title="Téléchargeur de Podcast Radio (.mp3)",
    description="Collez un lien vers une série de podcast Radio France (ex: France Culture). Seuls les fichiers .mp3 correspondant aux épisodes seront extraits et regroupés dans un fichier ZIP téléchargeable.",
    fn=download_podcast_series,
    inputs=gr.Textbox(
        label="URL du podcast radio (ex: France Culture)",
        placeholder="https://www.radiofrance.fr/franceculture/podcasts/serie-le-capitaine-fracasse-de-theophile-gautier",
    ),
    outputs=[
        gr.Textbox(label="Statut"),
        gr.File(label="Fichier ZIP des épisodes"),
    ],
    allow_flagging="never",
)


# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|
|