File size: 3,367 Bytes
734bffb 96682d9 7c806f5 734bffb 1f317db 734bffb 11fd592 734bffb 11fd592 734bffb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
import requests
import re
import os
import zipfile
import tempfile
import gradio as gr
from pathlib import Path
from bs4 import BeautifulSoup
def sanitize_filename(name):
    """Turn an arbitrary title into a safe, compact file-name stem.

    Characters that are illegal in file names on common platforms
    (``\\ / * ? : " < > |``) are removed, surrounding whitespace is
    stripped, every remaining whitespace character is replaced by an
    underscore, and the result is truncated to 100 characters.

    Args:
        name: The raw title text.

    Returns:
        The sanitized stem (without extension). May be empty when *name*
        contains nothing but removable characters.
    """
    cleaned = re.sub(r'[\\/*?:"<>|]', "", name).strip()
    # Replace *any* whitespace, not just the plain space: titles scraped from
    # HTML attributes may contain tabs or line breaks, which must not end up
    # inside a file name. One underscore per character keeps the output for
    # space-only titles identical to the previous behaviour.
    return re.sub(r"\s", "_", cleaned)[:100]
def extract_episode_links(html_text, base_url):
    """Collect the MP3 episodes referenced by ``<audio>`` tags in a page.

    For every ``<audio>`` element, the first ``<source>`` child is inspected.
    It is kept when its ``src`` points to an ``.mp3`` resource. The episode
    title is taken from the audio tag's ``aria-label`` or ``title``
    attribute, then from the source's ``title`` attribute, and finally falls
    back to ``"episode"``.

    Args:
        html_text: HTML markup of the page.
        base_url: URL of the page, used to resolve relative ``src`` values.

    Returns:
        A list of ``(title, url)`` tuples in document order, without
        duplicate URLs.
    """
    from urllib.parse import urljoin, urlparse

    soup = BeautifulSoup(html_text, "html.parser")
    episodes = []
    seen_urls = set()

    # Inspect each episode block.
    for audio_tag in soup.find_all("audio"):
        source = audio_tag.find("source")
        if source is None:
            continue

        src = (source.get("src") or "").strip()

        # Test the URL *path* in addition to the raw string so that links
        # carrying a query string or fragment ("ep1.mp3?token=...") are
        # recognised; the comparison is case-insensitive (".MP3").
        candidates = [src]
        try:
            candidates.append(urlparse(src).path)
        except ValueError:
            pass  # malformed URL: rely on the raw-string check only
        if not any(c.lower().endswith(".mp3") for c in candidates):
            continue

        title = (
            audio_tag.get("aria-label")
            or audio_tag.get("title")
            or source.get("title")
            or "episode"
        )

        url = src
        if not url.startswith("http"):
            url = urljoin(base_url, url)

        if url not in seen_urls:
            seen_urls.add(url)
            episodes.append((title, url))

    return episodes
def download_podcast_series(url):
    """Download every MP3 episode found on a page and bundle them in a ZIP.

    The page at *url* is fetched, its episodes are extracted with
    ``extract_episode_links`` and each MP3 is downloaded and added to a ZIP
    archive under a numbered, sanitized file name. Episodes that fail to
    download are reported on stdout and skipped.

    Args:
        url: Address of the page listing the episodes.

    Returns:
        A ``(status_message, zip_path)`` tuple. ``zip_path`` is ``None`` when
        the page could not be loaded, contains no episode, or when no
        episode could be downloaded.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except Exception as e:
        return f"Erreur lors du chargement de la page : {e}", None

    episodes = extract_episode_links(r.text, url)
    if not episodes:
        return "Aucun épisode audio trouvé sur la page.", None

    # The archive has to outlive this call because its path is handed back to
    # the caller. A TemporaryDirectory would be deleted as soon as its "with"
    # block is left (even through "return"), so the ZIP gets a directory of
    # its own that is NOT cleaned up automatically.
    output_dir = tempfile.mkdtemp(prefix="podcast_")
    zip_path = os.path.join(output_dir, "podcast.zip")
    used_filenames = set()
    downloaded = 0

    # Only the intermediate MP3 files live in the self-cleaning scratch dir.
    with tempfile.TemporaryDirectory() as scratch_dir, \
            zipfile.ZipFile(zip_path, "w") as zipf:
        for idx, (title, mp3_url) in enumerate(episodes, 1):
            base_name = f"{idx:02d}-" + sanitize_filename(title)
            filename = base_name + ".mp3"

            # Avoid duplicate entry names.
            count = 1
            while filename in used_filenames:
                filename = f"{base_name}_{count}.mp3"
                count += 1
            used_filenames.add(filename)

            temp_mp3_path = os.path.join(scratch_dir, filename)
            try:
                # Context manager releases the streamed connection even when
                # the download is interrupted.
                with requests.get(mp3_url, stream=True, timeout=15) as audio:
                    audio.raise_for_status()
                    with open(temp_mp3_path, "wb") as f:
                        for chunk in audio.iter_content(8192):
                            f.write(chunk)
                # Added only after a complete download, so the archive never
                # contains a truncated episode.
                zipf.write(temp_mp3_path, arcname=filename)
                downloaded += 1
            except Exception as e:
                # Best effort: one broken episode must not abort the series.
                print(f"Erreur lors du téléchargement de {mp3_url} : {e}")
            finally:
                # Free the disk space right away instead of keeping every
                # episode twice until the end of the run.
                if os.path.exists(temp_mp3_path):
                    os.remove(temp_mp3_path)

    if not downloaded:
        # Nothing usable was produced: drop the empty archive and say so
        # rather than returning an empty ZIP with a success message.
        try:
            os.remove(zip_path)
            os.rmdir(output_dir)
        except OSError:
            pass
        return "Aucun épisode n'a pu être téléchargé.", None

    return "Téléchargement terminé !", zip_path
# Gradio front end: a single text box for the page URL as input; a status
# message and the generated ZIP file as outputs.
interface = gr.Interface(
    fn=download_podcast_series,
    inputs=gr.Textbox(
        label="URL du podcast radio (ex: France Culture)",
        placeholder="https://www.radiofrance.fr/franceculture/podcasts/serie-le-capitaine-fracasse-de-theophile-gautier",
    ),
    outputs=[
        gr.Textbox(label="Statut"),
        gr.File(label="Fichier ZIP des épisodes"),
    ],
    title="Téléchargeur de Podcast Radio (.mp3)",
    description=(
        "Collez un lien vers une série de podcast Radio France "
        "(ex: France Culture). Seuls les fichiers .mp3 correspondant "
        "aux épisodes seront extraits et regroupés dans un fichier ZIP "
        "téléchargeable."
    ),
    allow_flagging="never",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|