PodMagic / app.py
Ribot's picture
Update app.py
0a6bdb5 verified
raw
history blame
3.34 kB
import gradio as gr
import requests
from bs4 import BeautifulSoup
import os
import zipfile
import tempfile
import re
from urllib.parse import urljoin
# Seconds to wait for a connection / for the next bytes of a response.
# requests applies no timeout at all unless one is passed explicitly.
_REQUEST_TIMEOUT = 30


def _collect_mp3_links(soup, base_url):
    """Return the de-duplicated, absolute MP3 URLs found in a parsed page.

    Two sources are scanned: ``<audio>`` elements (their own ``src`` and the
    ``src`` of every nested ``<source>``) and ``"contentUrl"`` entries inside
    JSON-LD ``<script>`` blocks that mention ``"episode"``. Query strings are
    stripped from every URL and first-seen order is preserved.
    """
    links = []

    # Look in audio tags: the URL may sit on <audio> itself or on any <source>.
    for audio_tag in soup.find_all('audio'):
        for tag in [audio_tag] + audio_tag.find_all('source'):
            src = tag.get('src')
            if src and '.mp3' in src:
                links.append(urljoin(base_url, src.split('?')[0]))

    # Fallback: look in the JSON-LD metadata.
    for script in soup.find_all('script', type='application/ld+json'):
        content = script.string
        if not content or '"episode"' not in content:
            continue
        matches = re.findall(r'"contentUrl"\s*:\s*"([^"]+\.mp3[^"]*)"', content)
        for match in matches:
            # The regex reads raw JSON text, where "/" may be escaped as "\/".
            raw_url = match.replace('\\/', '/')
            links.append(urljoin(base_url, raw_url.split('?')[0]))

    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(links))


def _download_file(url, filepath, headers):
    """Stream ``url`` into ``filepath``; delete the partial file on failure.

    Raises:
        requests.RequestException: on any HTTP/network failure.
        OSError: if the file cannot be written.
    """
    try:
        with requests.get(url, headers=headers, stream=True,
                          timeout=_REQUEST_TIMEOUT) as r:
            r.raise_for_status()
            with open(filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
    except (requests.RequestException, OSError):
        # Do not leave a truncated MP3 behind.
        if os.path.exists(filepath):
            try:
                os.remove(filepath)
            except OSError:
                pass
        raise


def process_url(url):
    """Download every MP3 referenced by a podcast page and bundle them in a zip.

    Args:
        url: Address of the page to scan for MP3 links.

    Returns:
        A ``(zip_path, error)`` tuple. On success ``zip_path`` is the path of
        a ``podcast.zip`` inside a fresh temporary directory and ``error`` is
        ``None``. On failure ``zip_path`` is ``None`` and ``error`` is a
        user-facing message.
    """
    url = (url or "").strip()
    if not url:
        # Nothing to fetch: report it without touching the network.
        return None, "Erreur lors de la récupération de la page : URL vide"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        return None, f"Erreur lors de la récupération de la page : {e}"

    soup = BeautifulSoup(response.text, 'html.parser')
    # response.url is the final URL after redirects, so relative links
    # resolve against the page that was actually served.
    mp3_links = _collect_mp3_links(soup, response.url)
    if not mp3_links:
        return None, "Aucun lien MP3 trouvé - Structure de page non reconnue"

    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, start=1):
        # Fall back to a generic name when the URL path ends with "/".
        base_name = os.path.basename(mp3_url).split('?')[0] or "episode.mp3"
        filepath = os.path.join(temp_dir, f"{idx:02d}_{base_name}")
        try:
            _download_file(mp3_url, filepath, headers)
        except (requests.RequestException, OSError) as e:
            # Best effort: one failed episode must not abort the others.
            print(f"Erreur sur {mp3_url}: {str(e)}")
            continue
        filenames.append(filepath)

    if not filenames:
        # Failed downloads leave no file behind, so the directory is empty.
        try:
            os.rmdir(temp_dir)
        except OSError:
            pass
        return None, "Échec du téléchargement des fichiers"

    zip_path = os.path.join(temp_dir, 'podcast.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))
    return zip_path, None
def download_podcast(url):
    """Gradio callback: return the zip path for ``url`` or raise ``gr.Error``.

    Delegates to ``process_url`` and turns its error message, when one is
    returned, into a ``gr.Error`` carrying that message.
    """
    archive_path, failure = process_url(url)
    if not failure:
        return archive_path
    raise gr.Error(failure)
# Input and output widgets, built first so the Interface call stays short.
_url_box = gr.Textbox(
    label="URL du podcast Radio France",
    placeholder="https://www.radiofrance.fr/...",
)
_zip_output = gr.File(label="Télécharger les épisodes")

# One sample URL offered to the user as a clickable example.
_sample_urls = [
    ["https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"],
]

# Single-function web UI: one URL in, one zip file out.
iface = gr.Interface(
    fn=download_podcast,
    inputs=_url_box,
    outputs=_zip_output,
    title="Téléchargeur Radio France",
    description="Collez ici l'URL d'une série de podcasts Radio France pour récupérer tous les épisodes MP3",
    examples=_sample_urls,
)

# Start the web server.
iface.launch()