Ribot committed on
Commit
c0153c3
·
verified ·
1 Parent(s): 1f317db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -42
app.py CHANGED
@@ -10,17 +10,11 @@ import tempfile
10
  def install_if_missing(package_name, import_name=None):
11
  import_name = import_name or package_name
12
  if importlib.util.find_spec(import_name) is None:
13
- print(f"Installation de {package_name}...")
14
  subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
15
 
16
# Each entry is (pip name[, import name]); the tuple is unpacked into
# install_if_missing so every dependency is importable before the imports below.
for package in (("requests",), ("bs4", "bs4"), ("gradio",)):
    install_if_missing(*package)
22
 
23
- # === IMPORTS ===
24
  import requests
25
  import gradio as gr
26
  from bs4 import BeautifulSoup
@@ -32,61 +26,54 @@ def slugify(text, max_length=50):
32
  text = re.sub(r'[-\s]+', '_', text)
33
  return text[:max_length].strip('_')
34
 
35
def get_episode_links(main_url):
    """Collect the URLs of the episode pages linked from the main page.

    Downloads ``main_url``, scans every ``<a href>`` element and keeps the
    links whose href contains ``/podcasts/``. Hrefs that do not start with
    ``http`` are prefixed with ``https://www.radiofrance.fr``.

    Args:
        main_url: URL of the series page to scan.

    Returns:
        The matching URLs in first-seen order, without duplicates and
        without ``main_url`` itself.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # A dict keeps insertion order and de-duplicates in O(1) per link.
    links = {}
    for a in soup.find_all('a', href=True):
        href = a['href']
        if "/podcasts/" not in href:
            continue
        full_url = href if href.startswith("http") else f"https://www.radiofrance.fr{href}"
        # Compare the resolved URL so a relative link back to the series page
        # is excluded as well.
        if full_url != main_url:
            links[full_url] = None

    return list(links)
50
 
51
def extract_mp3_from_episode(url):
    """Extract the MP3 link of an episode page.

    Args:
        url: URL of the episode page to download.

    Returns:
        The ``src`` attribute of the first ``<audio>`` element when it ends
        with ``.mp3``; ``None`` when there is no such element or when the
        page could not be fetched or parsed.
    """
    try:
        # Without a timeout a single stalled episode page would block the
        # whole series download.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        audio_tag = soup.find("audio")
        if audio_tag is not None:
            src = audio_tag.get("src", "")
            if src.endswith(".mp3"):
                return src
    except Exception:
        # Deliberate best effort: a failing episode is reported as "no MP3"
        # so the caller can carry on with the remaining episodes.
        return None
    return None
63
 
64
def get_podcast_title(url):
    """Extract the overall podcast title as a slug.

    The title is read from the first ``<h1>`` element, or from ``<title>``
    when the page has no ``<h1>``, and passed through ``slugify``.

    Args:
        url: URL of the series page.

    Returns:
        The slugified title, or ``"podcast"`` when no title element exists,
        the slug is empty, or anything goes wrong while fetching or parsing.
    """
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1') or soup.find('title')
        if title_tag is None:
            return "podcast"
        # A blank title slugifies to "", which would produce a nameless
        # archive; fall back to the default name instead.
        return slugify(title_tag.get_text()) or "podcast"
    except Exception:
        return "podcast"
73
 
74
- def download_and_zip_podcast_series(main_url):
75
  try:
76
  title = get_podcast_title(main_url)
77
- episode_pages = get_episode_links(main_url)
78
-
79
  if not episode_pages:
80
  return "Aucune page d’épisode trouvée.", None
81
 
82
  mp3_links = []
83
- for ep_url in episode_pages:
84
- mp3 = extract_mp3_from_episode(ep_url)
85
  if mp3:
86
  mp3_links.append(mp3)
87
 
88
  if not mp3_links:
89
- return "Aucun fichier MP3 trouvé dans les épisodes.", None
90
 
91
  temp_dir = tempfile.mkdtemp()
92
  for i, mp3_url in enumerate(mp3_links, start=1):
@@ -100,27 +87,24 @@ def download_and_zip_podcast_series(main_url):
100
 
101
  zip_path = os.path.join(temp_dir, f"{title}.zip")
102
  shutil.make_archive(zip_path.replace('.zip', ''), 'zip', temp_dir)
103
-
104
- return f"{len(mp3_links)} épisode(s) téléchargé(s).", zip_path
105
 
106
  except Exception as e:
107
  return f"Erreur : {str(e)}", None
108
 
109
  # === INTERFACE GRADIO ===
110
  with gr.Blocks() as app:
111
- gr.Markdown("# Téléchargeur de Podcasts MP3 - France Culture")
112
- with gr.Row():
113
- url_input = gr.Textbox(label="URL de la série", placeholder="https://www.radiofrance.fr/...")
114
  download_button = gr.Button("Télécharger et compresser")
115
  output_text = gr.Textbox(label="Message")
116
  file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])
117
 
118
  def process(url):
119
- message, zip_file = download_and_zip_podcast_series(url)
120
- return message, zip_file
121
 
122
  download_button.click(fn=process, inputs=[url_input], outputs=[output_text, file_output])
123
 
124
- # === LANCEMENT ===
125
  if __name__ == "__main__":
126
  app.launch(share=True)
 
10
  def install_if_missing(package_name, import_name=None):
11
  import_name = import_name or package_name
12
  if importlib.util.find_spec(import_name) is None:
 
13
  subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
14
 
15
# Each entry is (pip name[, import name]); the tuple is unpacked into
# install_if_missing so every dependency is importable before the imports below.
for package in (
    ("requests",),
    ("bs4", "bs4"),
    ("gradio",),
):
    install_if_missing(*package)
17
 
 
18
  import requests
19
  import gradio as gr
20
  from bs4 import BeautifulSoup
 
26
  text = re.sub(r'[-\s]+', '_', text)
27
  return text[:max_length].strip('_')
28
 
29
def get_episode_pages(main_url):
    """Collect the episode page URLs linked from a podcast series page.

    Downloads ``main_url``, scans every ``<a href>`` element and keeps the
    links whose href contains ``/franceculture/podcasts/`` and does not end
    with ``/serie``.

    Args:
        main_url: URL of the series page to scan.

    Returns:
        Absolute URLs in first-seen order, without duplicates.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP
            error status.
    """
    from urllib.parse import urljoin

    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(main_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # A dict keeps insertion order and de-duplicates as links are collected.
    episode_urls = {}
    for a in soup.find_all('a', href=True):
        href = a['href']
        if "/franceculture/podcasts/" in href and not href.endswith('/serie'):
            # urljoin leaves absolute URLs alone and resolves relative ones
            # against the site root, including hrefs without a leading slash.
            episode_urls[urljoin("https://www.radiofrance.fr", href)] = None

    return list(episode_urls)
42
+
43
def get_mp3_link_with_ithema(page_url):
    """Return the first ``ithema`` MP3 URL found in a page's raw HTML.

    The page source is searched for an ``https://`` URL that contains
    ``ithema``, ends with ``.mp3`` and contains no double quote.

    Args:
        page_url: URL of the episode page to download.

    Returns:
        The first matching URL, or ``None`` when nothing matches or the page
        could not be fetched.
    """
    try:
        # Without a timeout a single stalled episode page would block the
        # whole series download.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        # Only the first hit is used, so stop at it instead of collecting all.
        match = re.search(r'https://[^"]*ithema[^"]*\.mp3', response.text)
    except Exception:
        # Deliberate best effort: a failing page is reported as "no MP3" so
        # the caller can carry on with the remaining pages.
        return None
    return match.group(0) if match else None
 
52
 
53
def get_podcast_title(main_url):
    """Return the podcast title of a series page as a slug.

    The title is read from the first ``<h1>`` element, or from ``<title>``
    when the page has no ``<h1>``, and passed through ``slugify``.

    Args:
        main_url: URL of the series page.

    Returns:
        The slugified title, or ``"podcast"`` when no title element exists,
        the slug is empty, or anything goes wrong while fetching or parsing.
    """
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(main_url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('h1') or soup.find('title')
        if title_tag is None:
            return "podcast"
        # A blank title slugifies to "", which would produce a nameless
        # archive; fall back to the default name instead.
        return slugify(title_tag.get_text()) or "podcast"
    except Exception:
        # Not a bare ``except:`` so KeyboardInterrupt and SystemExit still
        # propagate.
        return "podcast"
61
 
62
+ def download_and_zip_podcast(main_url):
63
  try:
64
  title = get_podcast_title(main_url)
65
+ episode_pages = get_episode_pages(main_url)
 
66
  if not episode_pages:
67
  return "Aucune page d’épisode trouvée.", None
68
 
69
  mp3_links = []
70
+ for page in episode_pages:
71
+ mp3 = get_mp3_link_with_ithema(page)
72
  if mp3:
73
  mp3_links.append(mp3)
74
 
75
  if not mp3_links:
76
+ return "Aucun fichier MP3 contenant 'ithema' trouvé.", None
77
 
78
  temp_dir = tempfile.mkdtemp()
79
  for i, mp3_url in enumerate(mp3_links, start=1):
 
87
 
88
  zip_path = os.path.join(temp_dir, f"{title}.zip")
89
  shutil.make_archive(zip_path.replace('.zip', ''), 'zip', temp_dir)
90
+ return f"{len(mp3_links)} fichier(s) téléchargé(s).", zip_path
 
91
 
92
  except Exception as e:
93
  return f"Erreur : {str(e)}", None
94
 
95
  # === INTERFACE GRADIO ===
96
  with gr.Blocks() as app:
97
+ gr.Markdown("# Téléchargeur de Podcasts MP3 (France Culture)")
98
+ url_input = gr.Textbox(label="URL de la série", placeholder="https://www.radiofrance.fr/...")
 
99
  download_button = gr.Button("Télécharger et compresser")
100
  output_text = gr.Textbox(label="Message")
101
  file_output = gr.File(label="Fichier ZIP", file_types=[".zip"])
102
 
103
  def process(url):
104
+ msg, zip_path = download_and_zip_podcast(url)
105
+ return msg, zip_path
106
 
107
  download_button.click(fn=process, inputs=[url_input], outputs=[output_text, file_output])
108
 
 
109
  if __name__ == "__main__":
110
  app.launch(share=True)