import json
import os
import re
import shutil
import tempfile
import zipfile
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup


# (connect, read) timeouts in seconds. requests has no default timeout, so
# without this a stalled server would block the request indefinitely.
_REQUEST_TIMEOUT = (10, 60)

# Browser-like User-Agent sent with the page request and every MP3 request.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Captures the raw (still JSON-escaped) value of a "contentUrl" key that
# mentions ".mp3" inside a JSON-LD script body.
_CONTENT_URL_RE = re.compile(r'"contentUrl"\s*:\s*"([^"]+\.mp3[^"]*)"')


def _decode_json_string(raw):
    """Resolve JSON escapes (e.g. ``\\/``, ``\\u0026``) in a captured string body.

    *raw* is the text found between the quotes of a JSON string literal.
    If it cannot be decoded as JSON it is returned unchanged.
    """
    try:
        return json.loads(f'"{raw}"')
    except ValueError:
        return raw


def _extract_mp3_links(html, base_url):
    """Return absolute MP3 URLs found in *html*, de-duplicated, in page order.

    Two sources are scanned: ``<audio>`` elements (their own ``src`` and every
    nested ``<source>``), then JSON-LD scripts that mention ``"episode"``.
    Query strings are dropped and relative URLs are resolved against
    *base_url*.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = []

    for audio_tag in soup.find_all('audio'):
        # An <audio> may carry the URL itself or list several <source>
        # alternatives; the MP3 one is not necessarily the first.
        for tag in (audio_tag, *audio_tag.find_all('source')):
            src = tag.get('src')
            if src and '.mp3' in src:
                links.append(urljoin(base_url, src.split('?')[0]))

    for script in soup.find_all('script', type='application/ld+json'):
        content = script.string
        if content and '"episode"' in content:
            for match in _CONTENT_URL_RE.findall(content):
                # The regex sees raw JSON text, so escapes must be decoded
                # before the value is usable as a URL.
                decoded = _decode_json_string(match)
                links.append(urljoin(base_url, decoded.split('?')[0]))

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(links))


def _download_file(mp3_url, filepath):
    """Stream *mp3_url* into *filepath*; raises on HTTP, network or disk errors."""
    with requests.get(mp3_url, headers=_HEADERS, stream=True,
                      timeout=_REQUEST_TIMEOUT) as r:
        r.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)


def _remove_quietly(path):
    """Delete *path* if possible, ignoring filesystem errors (best-effort cleanup)."""
    try:
        os.remove(path)
    except OSError:
        pass


def process_url(url):
    """Download every MP3 episode referenced by a podcast page into a zip.

    The page at *url* is fetched, MP3 links are extracted from its
    ``<audio>`` elements and JSON-LD metadata, each file is downloaded into a
    fresh temporary directory, and the successfully downloaded files are
    packed into ``podcast.zip`` in that directory. Individual download
    failures are reported on stdout and skipped.

    Args:
        url: Address of the podcast page. Surrounding whitespace is ignored.

    Returns:
        A ``(zip_path, error)`` tuple: ``(path, None)`` on success, or
        ``(None, message)`` when the page cannot be fetched, no MP3 link is
        found, or every download fails.
    """
    url = (url or "").strip()

    try:
        response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        return None, f"Erreur lors de la récupération de la page : {e}"

    mp3_links = _extract_mp3_links(response.text, response.url)
    if not mp3_links:
        return None, "Aucun lien MP3 trouvé - Structure de page non reconnue"

    temp_dir = tempfile.mkdtemp()
    filenames = []

    for idx, mp3_url in enumerate(mp3_links, start=1):
        # Fall back to a generic name when the URL has no final path segment.
        basename = os.path.basename(mp3_url).split('?')[0] or "episode.mp3"
        filepath = os.path.join(temp_dir, f"{idx:02d}_{basename}")
        try:
            _download_file(mp3_url, filepath)
        except (requests.RequestException, OSError) as e:
            print(f"Erreur sur {mp3_url}: {str(e)}")
            # Do not leave a truncated file behind.
            _remove_quietly(filepath)
            continue
        filenames.append(filepath)

    if not filenames:
        # Nothing to return, so the temporary directory is of no further use.
        shutil.rmtree(temp_dir, ignore_errors=True)
        return None, "Échec du téléchargement des fichiers"

    zip_path = os.path.join(temp_dir, 'podcast.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))

    # The archive now holds the data; drop the loose copies to halve disk use.
    for file in filenames:
        _remove_quietly(file)

    return zip_path, None


def download_podcast(url):
    """Gradio callback: build the episode archive for *url* and return its path.

    Delegates to ``process_url``. When that call reports an error message,
    the message is raised as ``gr.Error``; otherwise the zip path it
    produced is returned.
    """
    archive_path, failure = process_url(url)
    if not failure:
        return archive_path
    raise gr.Error(failure)


# Text field where the user pastes the podcast series URL.
_url_box = gr.Textbox(
    label="URL du podcast Radio France",
    placeholder="https://www.radiofrance.fr/...",
)

# File component through which the generated zip archive is offered.
_archive_output = gr.File(label="Télécharger les épisodes")

# One example row (a single URL) offered alongside the form.
_example_rows = [[
    "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
]]

# Wire the URL field to download_podcast and expose its result as a file.
iface = gr.Interface(
    fn=download_podcast,
    inputs=_url_box,
    outputs=_archive_output,
    title="Téléchargeur Radio France",
    examples=_example_rows,
    description="Collez ici l'URL d'une série de podcasts Radio France pour récupérer tous les épisodes MP3",
)

# Launched unconditionally at module level, as soon as this file is executed.
iface.launch()