import gradio as gr
import requests
import re
import os
import zipfile
import tempfile
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# (connect, read) timeouts in seconds, so a stalled server cannot hang the app.
_REQUEST_TIMEOUT = (10, 60)


def _remove_quietly(path, is_dir=False):
    """Delete a file (or an empty directory) and ignore filesystem errors."""
    try:
        if is_dir:
            os.rmdir(path)
        else:
            os.remove(path)
    except OSError:
        pass


def _extract_mp3_links(soup, html, page_url):
    """Return the de-duplicated MP3 URLs found in a page, in discovery order.

    Three strategies are tried in turn; a later one is only used when the
    earlier ones found nothing.
    """
    links = []

    # 1) "contentUrl" entries inside JSON-LD <script> blocks.
    for script in soup.find_all('script', type='application/ld+json'):
        if not script.string:
            continue
        for match in re.findall(r'"contentUrl"\s*:\s*"([^"]+?\.mp3)', script.string):
            # JSON may legally escape "/" as "\/"; undo that before resolving,
            # otherwise urljoin produces an unusable URL.
            links.append(urljoin(page_url, match.replace('\\/', '/').split('?')[0]))

    # 2) Fallback: tags whose data-url attribute mentions ".mp3".
    if not links:
        for tag in soup.find_all(attrs={"data-url": re.compile(r"\.mp3")}):
            links.append(urljoin(page_url, tag['data-url'].split('?')[0]))

    # 3) Last resort: any absolute .mp3 URL in the raw HTML that contains
    #    "podcast".
    if not links:
        for match in re.findall(r'(https?://[^\s"\']+?\.mp3)', html):
            candidate = urljoin(page_url, match.split('?')[0])
            if 'podcast' in candidate:
                links.append(candidate)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(links))


def _download_file(file_url, dest_path, headers):
    """Stream file_url into dest_path; return True on success, False otherwise.

    A failed download removes whatever was partially written so that no
    truncated file is left behind.
    """
    try:
        with requests.get(file_url, headers=headers, stream=True,
                          timeout=_REQUEST_TIMEOUT) as r:
            r.raise_for_status()
            with open(dest_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.RequestException, OSError):
        # Best effort: one broken episode must not abort the whole batch.
        _remove_quietly(dest_path)
        return False
    return True


def process_url(url):
    """Download every MP3 episode linked from a podcast page into one ZIP.

    Args:
        url: Absolute http(s) address of the podcast page.

    Returns:
        A ``(zip_path, error)`` tuple. On success ``zip_path`` is the path of
        the archive (inside a fresh temporary directory) and ``error`` is
        ``None``. On failure ``zip_path`` is ``None`` and ``error`` is a
        user-facing message.
    """
    # Reject obviously unusable input before touching the network.
    if not isinstance(url, str) or not url.strip().lower().startswith(('http://', 'https://')):
        return None, "URL invalide : saisissez une adresse complète commençant par http:// ou https://"
    url = url.strip()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Referer': 'https://www.radiofrance.fr/'
    }
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        return None, f"Erreur de connexion : {e}"

    soup = BeautifulSoup(response.text, 'html.parser')
    mp3_links = _extract_mp3_links(soup, response.text, url)

    if not mp3_links:
        return None, "Aucun épisode trouvé - Structure de page inconnue"

    # Download each episode; the numeric prefix keeps them in page order.
    temp_dir = tempfile.mkdtemp()
    filenames = []
    for idx, mp3_url in enumerate(mp3_links, 1):
        filepath = os.path.join(temp_dir, f"{idx:02d}_{os.path.basename(mp3_url)}")
        if _download_file(mp3_url, filepath, headers):
            filenames.append(filepath)

    if not filenames:
        # Nothing was kept, so the (now empty) temp directory is useless.
        _remove_quietly(temp_dir, is_dir=True)
        return None, "Échec du téléchargement"

    zip_path = os.path.join(temp_dir, 'podcast.zip')
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for file in filenames:
            zipf.write(file, arcname=os.path.basename(file))

    # The archive now holds every episode; drop the loose copies so the
    # download does not occupy twice the disk space.
    for file in filenames:
        _remove_quietly(file)

    return zip_path, None

def download_podcast(url):
    """Gradio handler: return the ZIP path for a podcast page URL.

    Delegates to ``process_url`` and converts a reported failure message
    into a ``gr.Error`` so that it is raised to the caller.
    """
    archive_path, failure = process_url(url)
    if not failure:
        return archive_path
    raise gr.Error(failure)

# Build the Gradio interface. Components are declared in display order, so
# the statement order below is significant.
with gr.Blocks() as app:
    gr.Markdown("## 🎧 Téléchargeur Radio France")
    with gr.Row():
        # Single-line text field that receives the podcast page URL.
        url_input = gr.Textbox(
            label="URL du podcast",
            placeholder="Ex: https://www.radiofrance.fr/...",
            max_lines=1
        )
    btn = gr.Button("Télécharger les épisodes", variant="primary")
    # File component that receives the value returned by download_podcast.
    output = gr.File(label="Fichier ZIP")
    
    # One sample URL, wired to the URL textbox.
    examples = gr.Examples(
        examples=[[
            "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
        ]],
        inputs=[url_input]
    )

    # Clicking the button calls download_podcast with the textbox value and
    # sends its return value to the file component. The handler is registered
    # under the API name "download".
    btn.click(
        fn=download_podcast,
        inputs=url_input,
        outputs=output,
        api_name="download"
    )

# NOTE(review): launch runs at module level (no __main__ guard), so importing
# this file also calls app.launch().
app.launch()