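"""Gradio app that scrapes a Radio France podcast page, downloads every MP3
episode it can find, and packages them into a single ZIP archive."""
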
import gradio as gr
import requests
from bs4 import BeautifulSoup
import os
import zipfile
import tempfile
import re
from urllib.parse import urljoin
def process_url(url):
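    """Fetch the page at `url`, collect episode MP3 links (from <audio> tags
    and JSON-LD metadata), download them, and zip them.

    Returns a (zip_path, error_message) tuple; exactly one element is None.
    """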
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.RequestException as e:
        return None, f"Error while fetching the page: {e}"

    soup = BeautifulSoup(response.text, 'html.parser')
    mp3_links = []

    # Look for MP3 sources in the page's <audio> tags
    for audio_tag in soup.find_all('audio'):
        source = audio_tag.find('source')
        if source and 'src' in source.attrs:
            mp3_url = source['src']
            if '.mp3' in mp3_url:
                absolute_url = urljoin(response.url, mp3_url.split('?')[0])  # drop the query string
                mp3_links.append(absolute_url)

    # Fallback: search the JSON-LD metadata for episode content URLs
    script_tags = soup.find_all('script', type='application/ld+json')
    for script in script_tags:
        content = script.string
        if content and '"episode"' in content:
            matches = re.findall(r'"contentUrl"\s*:\s*"([^"]+\.mp3[^"]*)"', content)
            for match in matches:
                absolute_url = urljoin(response.url, match.split('?')[0])
                mp3_links.append(absolute_url)

    # Remove duplicates while preserving order
    mp3_links = list(dict.fromkeys(mp3_links))

    if not mp3_links:
        return None, "No MP3 link found - page structure not recognized"

    # Download each episode into a temporary directory, numbering the files
    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, start=1):
        try:
            filename = f"{idx:02d}_{os.path.basename(mp3_url).split('?')[0]}"
            filepath = os.path.join(temp_dir, filename)
            with requests.get(mp3_url, headers=headers, stream=True) as r:
                r.raise_for_status()
                with open(filepath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            filenames.append(filepath)
        except Exception as e:
            print(f"Error on {mp3_url}: {str(e)}")
            continue

    if not filenames:
        return None, "Failed to download the files"

    # Bundle everything into a single ZIP archive
    zip_path = os.path.join(temp_dir, 'podcast.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))

    return zip_path, None

def download_podcast(url):
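    """Gradio callback: return the path to the ZIP archive, or raise gr.Error on failure."""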
    zip_path, error = process_url(url)
    if error:
        raise gr.Error(error)
    return zip_path

iface = gr.Interface(
    fn=download_podcast,
    inputs=gr.Textbox(label="Radio France podcast URL", placeholder="https://www.radiofrance.fr/..."),
    outputs=gr.File(label="Download the episodes"),
    title="Radio France Downloader",
    examples=[[
        "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
    ]],
    description="Paste the URL of a Radio France podcast series here to retrieve all of its MP3 episodes"
)

iface.launch()