Ribot committed on
Commit
2a11a1f
·
verified ·
1 Parent(s): 13af006

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -34
app.py CHANGED
@@ -8,50 +8,63 @@ from urllib.parse import urljoin
8
 
9
  def process_url(url):
10
  try:
11
- headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}
 
 
 
12
  response = requests.get(url, headers=headers)
13
  response.raise_for_status()
14
  except Exception as e:
15
  return None, f"Erreur de connexion : {str(e)}"
16
 
17
- # Extraction des URLs MP3
18
  mp3_links = []
19
- pattern = r'(?:contentUrl|url)"\s*:\s*"([^"]+?\.mp3)'
20
- matches = re.findall(pattern, response.text)
21
 
22
- for match in matches:
23
- clean_url = match.split('";')[0] if '";' in match else match
24
- absolute_url = urljoin(response.url, clean_url)
25
- if absolute_url not in mp3_links:
26
- mp3_links.append(absolute_url)
27
 
28
- if not mp3_links:
29
- return None, "Aucun lien MP3 détecté dans le code source"
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  # Téléchargement
32
  temp_dir = tempfile.mkdtemp()
33
  filenames = []
34
 
35
- for idx, mp3_url in enumerate(mp3_links, 1):
36
  try:
37
- filename = f"{idx:02d}_{os.path.basename(mp3_url).split('?')[0]}"
38
  filepath = os.path.join(temp_dir, filename)
39
 
40
- with requests.get(mp3_url, headers=headers, stream=True) as r:
41
  r.raise_for_status()
42
  with open(filepath, 'wb') as f:
43
  for chunk in r.iter_content(chunk_size=8192):
44
- if chunk:
45
- f.write(chunk)
46
  filenames.append(filepath)
47
  except Exception as e:
48
  continue
49
 
50
  if not filenames:
51
- return None, "Échec des téléchargements"
52
 
53
- # Création ZIP
54
- zip_path = os.path.join(temp_dir, 'podcast.zip')
55
  with zipfile.ZipFile(zip_path, 'w') as zipf:
56
  for file in filenames:
57
  zipf.write(file, arcname=os.path.basename(file))
@@ -64,33 +77,30 @@ def download_podcast(url):
64
  raise gr.Error(error)
65
  return zip_path
66
 
67
- # Configuration Gradio compatible Hugging Face
68
- with gr.Blocks() as app:
69
- gr.Markdown("# 📻 Téléchargeur de Podcasts Radio France")
70
-
71
  with gr.Row():
72
  url_input = gr.Textbox(
73
  label="URL de la série podcast",
74
- placeholder="Collez l'URL Radio France ici..."
 
75
  )
76
- download_btn = gr.Button("Télécharger les épisodes")
77
-
78
- output_file = gr.File(label="Fichier ZIP à télécharger")
79
- error_output = gr.Textbox(visible=False)
80
 
81
  examples = gr.Examples(
82
  examples=[[
83
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
84
  ]],
85
- inputs=[url_input]
 
86
  )
87
-
88
- download_btn.click(
89
  fn=download_podcast,
90
  inputs=url_input,
91
- outputs=output_file,
92
  api_name="download"
93
  )
94
 
95
- # Configuration spécifique pour Hugging Face
96
- app.launch(debug=False, show_error=True)
 
8
 
9
  def process_url(url):
10
  try:
11
+ headers = {
12
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
13
+ 'Referer': 'https://www.radiofrance.fr/'
14
+ }
15
  response = requests.get(url, headers=headers)
16
  response.raise_for_status()
17
  except Exception as e:
18
  return None, f"Erreur de connexion : {str(e)}"
19
 
20
+ # Nouvelle méthode de détection des MP3
21
  mp3_links = []
 
 
22
 
23
+ # 1. Recherche dans les balises script
24
+ scripts = re.findall(r'<script.*?>(.*?)</script>', response.text, re.DOTALL)
25
+ for script in scripts:
26
+ matches = re.findall(r'(https://[^\s"\']+?\.mp3)', script)
27
+ mp3_links.extend(matches)
28
 
29
+ # 2. Recherche dans les attributs HTML
30
+ html_matches = re.findall(r'(?:href|src|rl|contentUrl)\s*=\s*["\'](.*?\.mp3.*?)["\']', response.text)
31
+ mp3_links.extend([urljoin(url, m.split('";')[0]) for m in html_matches])
32
+
33
+ # 3. Suppression des paramètres et dédoublonnage
34
+ clean_links = []
35
+ seen = set()
36
+ for link in mp3_links:
37
+ clean = link.split('?')[0].split('";')[0]
38
+ if clean not in seen:
39
+ seen.add(clean)
40
+ clean_links.append(clean)
41
+
42
+ if not clean_links:
43
+ return None, "Aucun MP3 trouvé - Essayez avec l'URL complète d'une série"
44
 
45
  # Téléchargement
46
  temp_dir = tempfile.mkdtemp()
47
  filenames = []
48
 
49
+ for idx, mp3_url in enumerate(clean_links, 1):
50
  try:
51
+ filename = f"{idx:02d}_{os.path.basename(mp3_url)}"
52
  filepath = os.path.join(temp_dir, filename)
53
 
54
+ with requests.get(mp3_url, headers=headers, stream=True, timeout=10) as r:
55
  r.raise_for_status()
56
  with open(filepath, 'wb') as f:
57
  for chunk in r.iter_content(chunk_size=8192):
58
+ f.write(chunk)
 
59
  filenames.append(filepath)
60
  except Exception as e:
61
  continue
62
 
63
  if not filenames:
64
+ return None, "Tous les téléchargements ont échoué"
65
 
66
+ # Création du ZIP
67
+ zip_path = os.path.join(temp_dir, 'radiofrance_podcast.zip')
68
  with zipfile.ZipFile(zip_path, 'w') as zipf:
69
  for file in filenames:
70
  zipf.write(file, arcname=os.path.basename(file))
 
77
  raise gr.Error(error)
78
  return zip_path
79
 
80
+ with gr.Blocks(title="RadioFrance Podcaster") as app:
81
+ gr.Markdown("## 🎧 Téléchargement de podcasts Radio France")
 
 
82
  with gr.Row():
83
  url_input = gr.Textbox(
84
  label="URL de la série podcast",
85
+ placeholder="Ex: https://www.radiofrance.fr/.../mon-podcast",
86
+ max_lines=1
87
  )
88
+ btn = gr.Button("Télécharger les épisodes", variant="primary")
89
+ output = gr.File(label="Fichier ZIP contenant les MP3")
 
 
90
 
91
  examples = gr.Examples(
92
  examples=[[
93
  "https://www.radiofrance.fr/franceculture/podcasts/serie-le-secret-de-la-licorne-les-aventures-de-tintin"
94
  ]],
95
+ inputs=[url_input],
96
+ label="Exemple fonctionnel"
97
  )
98
+
99
+ btn.click(
100
  fn=download_podcast,
101
  inputs=url_input,
102
+ outputs=output,
103
  api_name="download"
104
  )
105
 
106
+ app.launch(show_error=True, share=False)