seawolf2357 committed on
Commit
b90144d
·
verified ·
1 Parent(s): 6332f2b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -32
app.py CHANGED
@@ -11,11 +11,8 @@ def get_url_content(url):
11
 
12
  def parse_html(html_content):
13
  soup = BeautifulSoup(html_content, 'html.parser')
14
- # 원하는 HTML 요소를 파싱하여 반환
15
- # 예: soup.find_all('p') 등
16
  return soup.prettify()
17
 
18
- # Gradio 인터페이스 함수
19
  def gradio_fetch_and_parse(url):
20
  html_content = get_url_content(url)
21
  parsed_content = parse_html(html_content)
@@ -24,14 +21,20 @@ def gradio_fetch_and_parse(url):
24
  def get_main_content(html_content):
25
  soup = BeautifulSoup(html_content, 'html.parser')
26
 
27
- # 클래스명이 "a-list-item"인 모든 <span> 태그 추출
28
- list_items_text = ' '.join([item.get_text(strip=True) for item in soup.find_all('span', class_='a-list-item')])
29
 
30
- # <meta> 태그의 content 속성 추출
31
- meta_content_text = ' '.join([meta.get('content', '') for meta in soup.find_all('meta') if meta.get('content')])
32
 
33
- # 두 콘텐츠를 결합
34
- combined_text = list_items_text + ' ' + meta_content_text
 
 
 
 
 
 
35
 
36
  if combined_text.strip():
37
  print("์ถ”์ถœ๋œ ํ…์ŠคํŠธ:", combined_text)
@@ -40,7 +43,6 @@ def get_main_content(html_content):
40
  print("๋ณธ๋ฌธ ์ฝ˜ํ…์ธ ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
41
  return ''
42
 
43
-
44
  def format_script(text):
45
  sentences = text.split('.')
46
  script = ""
@@ -49,10 +51,9 @@ def format_script(text):
49
  if i+1 < len(sentences):
50
  line += sentences[i+1].strip() + '\n'
51
  script += line
52
- print("ํ˜„์žฌ ์Šคํฌ๋ฆฝํŠธ:", script) # ๋””๋ฒ„๊น…์„ ์œ„ํ•œ ๋กœ๊ทธ
53
  return script
54
 
55
-
56
  def gradio_fetch_and_format_script(url):
57
  print("ํ•จ์ˆ˜ ํ˜ธ์ถœ๋จ:", url)
58
  html_content = get_url_content(url)
@@ -62,23 +63,7 @@ def gradio_fetch_and_format_script(url):
62
  print("์ƒ์„ฑ๋œ ์Šคํฌ๋ฆฝํŠธ:", script)
63
  return script
64
 
65
-
66
- # Gradio 인터페이스 구성
67
- iface_html = gr.Interface(
68
- fn=gradio_fetch_and_parse,
69
- inputs=gr.Textbox(label="URL์„ ์ž…๋ ฅํ•˜์„ธ์š”"),
70
- outputs=gr.Textbox(label="์Šคํฌ๋žฉ๋œ HTML ์ฝ˜ํ…์ธ ")
71
- )
72
-
73
- # Gradio 인터페이스 구성
74
- iface_script = gr.Interface(
75
- fn=gradio_fetch_and_format_script, # ์ด ํ•จ์ˆ˜๊ฐ€ ํ˜ธ์ถœ๋˜์–ด์•ผ ํ•จ
76
- inputs=gr.Textbox(label="URL์„ ์ž…๋ ฅํ•˜์„ธ์š”"), # ์ž…๋ ฅ ํ•„๋“œ
77
- outputs=gr.Textbox(label="์˜์ƒ์šฉ ์Šคํฌ๋ฆฝํŠธ") # ์ถœ๋ ฅ ํ•„๋“œ
78
- )
79
-
80
-
81
- # 두 인터페이스를 탭으로 구성하여 실행
82
- iface_combined = gr.TabbedInterface([iface_html, iface_script],
83
- ["HTML ๋ณด๊ธฐ", "์Šคํฌ๋ฆฝํŠธ ์ƒ์„ฑ"])
84
- iface_combined.launch()
 
11
 
12
  def parse_html(html_content):
13
  soup = BeautifulSoup(html_content, 'html.parser')
 
 
14
  return soup.prettify()
15
 
 
16
  def gradio_fetch_and_parse(url):
17
  html_content = get_url_content(url)
18
  parsed_content = parse_html(html_content)
 
21
  def get_main_content(html_content):
22
  soup = BeautifulSoup(html_content, 'html.parser')
23
 
24
+ # 다양한 요소에서 텍스트 추출
25
+ extracted_texts = []
26
 
27
+ # <span class="a-list-item">
28
+ extracted_texts.extend([item.get_text(strip=True) for item in soup.find_all('span', class_='a-list-item')])
29
 
30
+ # <meta content>
31
+ extracted_texts.extend([meta.get('content', '') for meta in soup.find_all('meta') if meta.get('content')])
32
+
33
+ # 추가 요소들
34
+ for tag in ['section', 'article', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'footer', 'aside', 'iframe']:
35
+ extracted_texts.extend([item.get_text(strip=True) for item in soup.find_all(tag)])
36
+
37
+ combined_text = ' '.join(extracted_texts)
38
 
39
  if combined_text.strip():
40
  print("์ถ”์ถœ๋œ ํ…์ŠคํŠธ:", combined_text)
 
43
  print("๋ณธ๋ฌธ ์ฝ˜ํ…์ธ ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
44
  return ''
45
 
 
46
  def format_script(text):
47
  sentences = text.split('.')
48
  script = ""
 
51
  if i+1 < len(sentences):
52
  line += sentences[i+1].strip() + '\n'
53
  script += line
54
+ print("ํ˜„์žฌ ์Šคํฌ๋ฆฝํŠธ:", script)
55
  return script
56
 
 
57
  def gradio_fetch_and_format_script(url):
58
  print("ํ•จ์ˆ˜ ํ˜ธ์ถœ๋จ:", url)
59
  html_content = get_url_content(url)
 
63
  print("์ƒ์„ฑ๋œ ์Šคํฌ๋ฆฝํŠธ:", script)
64
  return script
65
 
66
+ iface_html = gr.Interface(fn=gradio_fetch_and_parse, inputs=gr.Textbox(label="URL์„ ์ž…๋ ฅํ•˜์„ธ์š”"), outputs=gr.Textbox(label="์Šคํฌ๋žฉ๋œ HTML ์ฝ˜ํ…์ธ "))
67
+ iface_script = gr.Interface(fn=gradio_fetch_and_format_script, inputs=gr.Textbox(label="URL์„ ์ž…๋ ฅํ•˜์„ธ์š”"), outputs=gr.Textbox(label="์˜์ƒ์šฉ ์Šคํฌ๋ฆฝํŠธ"))
68
+ iface_combined = gr.TabbedInterface([iface_html, iface_script], ["HTML ๋ณด๊ธฐ", "์Šคํฌ๋ฆฝํŠธ ์ƒ์„ฑ"])
69
+ iface_combined.launch()