Ribot committed on
Commit
7c806f5
·
verified ·
1 Parent(s): c0153c3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -96
app.py CHANGED
@@ -1,110 +1,85 @@
1
  import os
2
  import re
3
- import sys
4
- import subprocess
5
- import importlib.util
6
- import shutil
7
- import tempfile
8
-
9
# === AUTOMATIC DEPENDENCY INSTALLATION ===
def install_if_missing(package_name, import_name=None):
    """Install ``package_name`` with pip when its module cannot be found.

    Args:
        package_name: Name handed to ``pip install``.
        import_name: Module name to probe for; falls back to
            ``package_name`` when omitted or falsy.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    module_name = import_name if import_name else package_name
    if importlib.util.find_spec(module_name) is not None:
        return  # already importable, nothing to install
    pip_command = [sys.executable, "-m", "pip", "install", package_name]
    subprocess.check_call(pip_command)
14
-
15
# Probe each third-party dependency and install it if it is missing.
for package_name, import_name in (
    ("requests", None),
    ("bs4", "bs4"),
    ("gradio", None),
):
    install_if_missing(package_name, import_name)
17
-
18
  import requests
19
- import gradio as gr
 
20
  from bs4 import BeautifulSoup
 
21
 
22
# === UTILITIES ===
def slugify(text, max_length=50):
    """Turn ``text`` into a lowercase, underscore-separated slug.

    Characters other than word characters, whitespace and hyphens are
    dropped, and every run of hyphens/whitespace becomes one underscore.
    The result is cut to ``max_length`` characters and then stripped of
    leading and trailing underscores.
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^\w\s-]', '', lowered)
    joined = re.sub(r'[-\s]+', '_', cleaned)
    truncated = joined[:max_length]
    return truncated.strip('_')
28
 
29
def get_episode_pages(main_url):
    """Collect the episode page URLs linked from a series page.

    Args:
        main_url: URL of the page to scan.

    Returns:
        A list of absolute URLs, de-duplicated in first-seen order, for
        every link whose href contains ``/franceculture/podcasts/`` and does
        not end with ``/serie``.

    Raises:
        requests.RequestException: If the page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    # A timeout keeps a stalled server from blocking the app forever.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    episode_urls = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if "/franceculture/podcasts/" not in href or href.endswith('/serie'):
            continue
        # Site-relative links are resolved against the Radio France origin.
        full_url = href if href.startswith("http") else f"https://www.radiofrance.fr{href}"
        episode_urls.append(full_url)

    # dict.fromkeys removes duplicates while keeping the page order.
    return list(dict.fromkeys(episode_urls))
 
 
 
 
42
 
43
def get_mp3_link_with_ithema(page_url):
    """Return the first ``ithema`` MP3 URL found in a page, or ``None``.

    Args:
        page_url: URL of the page to download and scan.

    Returns:
        The first ``https://…ithema….mp3`` URL in the page's HTML, or
        ``None`` when there is no such URL or the page cannot be fetched.
    """
    try:
        # A timeout keeps one stalled page from blocking the whole run.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: a page that cannot be fetched simply yields no link.
        return None

    match = re.search(r'https://[^"]*ithema[^"]*\.mp3', response.text)
    return match.group(0) if match else None
52
-
53
def get_podcast_title(main_url):
    """Derive a file-name-friendly title from a page's heading.

    The first ``<h1>`` is used, or ``<title>`` if there is no ``<h1>``, and
    its text is passed through ``slugify``.

    Args:
        main_url: URL of the page to read the title from.

    Returns:
        The slugified title, or ``"podcast"`` when the page cannot be
        fetched or parsed, has no heading, or the slug comes out empty.
    """
    try:
        response = requests.get(main_url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1') or soup.find('title')
    except Exception:
        # Best effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return "podcast"

    if not title_tag:
        return "podcast"
    # A title made only of punctuation slugifies to "", which would give
    # file names such as "_01.mp3"; use the default name instead.
    return slugify(title_tag.get_text()) or "podcast"
61
-
62
def download_and_zip_podcast(main_url):
    """Download every episode MP3 of a series and bundle them into a zip.

    Args:
        main_url: URL of the series page.

    Returns:
        A ``(message, zip_path)`` tuple. ``zip_path`` is the path of the
        created archive, or ``None`` when nothing was found or an error
        occurred (``message`` then describes the problem).
    """
    try:
        title = get_podcast_title(main_url)
        episode_pages = get_episode_pages(main_url)
        if not episode_pages:
            return "Aucune page d’épisode trouvée.", None

        mp3_links = [
            mp3
            for mp3 in map(get_mp3_link_with_ithema, episode_pages)
            if mp3
        ]
        if not mp3_links:
            return "Aucun fichier MP3 contenant 'ithema' trouvé.", None

        # MP3s go to a scratch directory that is removed once archived.
        with tempfile.TemporaryDirectory() as audio_dir:
            for i, mp3_url in enumerate(mp3_links, start=1):
                filepath = os.path.join(audio_dir, f"{title}_{i:02}.mp3")
                with requests.get(mp3_url, stream=True, timeout=30) as r:
                    r.raise_for_status()
                    with open(filepath, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)

            # The archive is written to a separate directory, so it is never
            # inside the tree being archived. That directory is left on disk
            # on purpose: the returned path must stay valid for the caller.
            output_dir = tempfile.mkdtemp()
            zip_path = shutil.make_archive(
                os.path.join(output_dir, title), 'zip', audio_dir
            )

        return f"{len(mp3_links)} fichier(s) téléchargé(s).", zip_path

    except Exception as e:
        return f"Erreur : {str(e)}", None
94
-
95
# === GRADIO INTERFACE ===
with gr.Blocks() as app:
    gr.Markdown("# Téléchargeur de Podcasts MP3 (France Culture)")
    url_input = gr.Textbox(
        label="URL de la série",
        placeholder="https://www.radiofrance.fr/...",
    )
    download_button = gr.Button("Télécharger et compresser")
    output_text = gr.Textbox(label="Message")
    file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])

    def process(url):
        """Button callback: return the (message, zip path) pair for ``url``."""
        return download_and_zip_podcast(url)

    download_button.click(
        fn=process,
        inputs=[url_input],
        outputs=[output_text, file_output],
    )


if __name__ == "__main__":
    app.launch(share=True)
 
1
  import os
2
  import re
3
+ import zipfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import requests
5
+ import tempfile
6
+ import subprocess
7
  from bs4 import BeautifulSoup
8
+ import gradio as gr
9
 
10
# Automatic dependency installation (meant to be run locally, once).
# NOTE(review): bs4 and gradio are already imported at the top of this file,
# so a missing package fails there first; this fallback only helps if it is
# moved above those imports.
try:
    import bs4  # noqa: F401 -- imported only to probe availability
except ImportError:
    import sys

    # "python -m pip" installs into the interpreter running this script,
    # which a bare "pip" on PATH does not guarantee. check=True surfaces a
    # failed install here instead of as a later, unrelated ImportError.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q",
         "gradio", "beautifulsoup4", "requests"],
        check=True,
    )
 
15
 
16
def sanitize_filename(name):
    """Return ``name`` with unsupported characters replaced by underscores.

    Surrounding whitespace is stripped, every character that is not a word
    character, hyphen, underscore or dot becomes ``_``, and the result is
    cut to 50 characters.
    """
    trimmed = name.strip()
    safe = re.sub(r"[^\w\-_.]", "_", trimmed)
    return safe[:50]
 
 
 
18
 
19
def extract_audio_links_from_html(html_text):
    """Return the distinct Radio France ITEMA audio URLs found in ``html_text``.

    Args:
        html_text: Raw HTML (or any text) to scan.

    Returns:
        A list of full ``.mp3``/``.m4a`` URLs hosted on
        ``media.radiofrance-podcast.net`` whose path contains ``ITEMA``,
        without duplicates and in order of first appearance.
    """
    # The extension group must be non-capturing: with a capturing group,
    # re.findall returns only the group ("mp3"/"m4a"), not the URL.
    pattern = r'https://media\.radiofrance-podcast\.net/[^"]*ITEMA[^"]*\.(?:mp3|m4a)'
    # dict.fromkeys de-duplicates while keeping page order, unlike set().
    return list(dict.fromkeys(re.findall(pattern, html_text)))
 
 
22
 
23
def extract_titles_and_links(html_text):
    """Pair each audio URL found in ``html_text`` with its sanitized title.

    Returns:
        A list of ``(sanitized_title, audio_url)`` tuples, one per match of
        the title/url pattern, in order of appearance.
    """
    # Look for embedded JSON-like segments carrying a title and an audio URL
    # (the quotes appear backslash-escaped in the page source).
    pattern = r'title:\\"(.*?)\\",url:\\"(https://media\.radiofrance-podcast\.net/[^"]*ITEMA[^"]*\.(mp3|m4a))\\"'
    pairs = []
    for raw_title, audio_url, _extension in re.findall(pattern, html_text):
        pairs.append((sanitize_filename(raw_title), audio_url))
    return pairs
28
 
29
def download_and_zip(url):
    """Download every audio file referenced by a page into one zip archive.

    Args:
        url: URL of the page to scan for audio links.

    Returns:
        A ``(message, zip_path)`` tuple. ``zip_path`` is the path of the
        created archive, or ``None`` when the page could not be fetched, no
        audio link was found, or none of the downloads succeeded. Failures
        of individual files are printed and skipped.
    """
    # Fetch the page HTML.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except Exception as e:
        return f"Erreur de téléchargement : {e}", None

    html_text = response.text

    # Extract titles and links.
    titles_links = extract_titles_and_links(html_text)
    if not titles_links:
        # Raw fallback when no titles could be extracted.
        urls = extract_audio_links_from_html(html_text)
        titles_links = [(f"track_{i+1:02d}", u) for i, u in enumerate(urls)]

    if not titles_links:
        return "Aucun fichier audio trouvé avec ITEMA dans l'URL", None

    # The archive must outlive this function so the caller can serve it, so
    # it lives in a directory that is NOT removed automatically. Only the
    # per-file scratch directory below is cleaned up.
    output_dir = tempfile.mkdtemp()
    zip_path = os.path.join(output_dir, "podcasts.zip")
    downloaded = 0

    with tempfile.TemporaryDirectory() as tmpdir, zipfile.ZipFile(zip_path, "w") as zipf:
        for idx, (title, audio_url) in enumerate(titles_links, 1):
            ext = ".mp3" if ".mp3" in audio_url else ".m4a"
            filename = f"{idx:02d}-{title}{ext}"
            filepath = os.path.join(tmpdir, filename)
            try:
                # Stream to disk instead of holding a whole file in memory.
                with requests.get(audio_url, stream=True, timeout=30) as audio_resp:
                    audio_resp.raise_for_status()
                    with open(filepath, "wb") as f:
                        for chunk in audio_resp.iter_content(chunk_size=8192):
                            f.write(chunk)
                # Only fully downloaded files are added to the archive.
                zipf.write(filepath, arcname=filename)
                downloaded += 1
            except Exception as e:
                print(f"Erreur téléchargement {audio_url} : {e}")

    if not downloaded:
        # Nothing usable was produced: remove the empty archive.
        os.remove(zip_path)
        os.rmdir(output_dir)
        return "Aucun fichier audio n'a pu être téléchargé", None

    return "Téléchargement terminé avec succès", zip_path
67
+
68
def gradio_interface(url):
    """Gradio callback: return the ``(message, zip_file)`` pair for ``url``."""
    return download_and_zip(url)
71
+
72
# Gradio interface: one URL text box in, a status message and a zip file out.
_url_input = gr.Textbox(
    label="URL de la page Radio France (Podcast)",
    placeholder="https://www.radiofrance.fr/franceculture/podcasts/...",
)
_outputs = [
    gr.Textbox(label="Message"),
    gr.File(label="Fichier ZIP des épisodes"),
]

demo = gr.Interface(
    fn=gradio_interface,
    inputs=_url_input,
    outputs=_outputs,
    title="Téléchargement de Podcasts Radio France",
    description="Collez une URL vers un podcast de Radio France pour télécharger tous les épisodes (mp3/m4a) avec les bons noms.",
)


if __name__ == "__main__":
    demo.launch()