Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,8 @@ def get_url_content(url):
|
|
11 |
|
12 |
def parse_html(html_content):
|
13 |
soup = BeautifulSoup(html_content, 'html.parser')
|
14 |
-
# ์ํ๋ HTML ์์๋ฅผ ํ์ฑํ์ฌ ๋ฐํ
|
15 |
-
# ์: soup.find_all('p') ๋ฑ
|
16 |
return soup.prettify()
|
17 |
|
18 |
-
# Gradio ์ธํฐํ์ด์ค ํจ์
|
19 |
def gradio_fetch_and_parse(url):
|
20 |
html_content = get_url_content(url)
|
21 |
parsed_content = parse_html(html_content)
|
@@ -24,14 +21,20 @@ def gradio_fetch_and_parse(url):
|
|
24 |
def get_main_content(html_content):
|
25 |
soup = BeautifulSoup(html_content, 'html.parser')
|
26 |
|
27 |
-
#
|
28 |
-
|
29 |
|
30 |
-
# <
|
31 |
-
|
32 |
|
33 |
-
#
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
if combined_text.strip():
|
37 |
print("์ถ์ถ๋ ํ
์คํธ:", combined_text)
|
@@ -40,7 +43,6 @@ def get_main_content(html_content):
|
|
40 |
print("๋ณธ๋ฌธ ์ฝํ
์ธ ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
41 |
return ''
|
42 |
|
43 |
-
|
44 |
def format_script(text):
|
45 |
sentences = text.split('.')
|
46 |
script = ""
|
@@ -49,10 +51,9 @@ def format_script(text):
|
|
49 |
if i+1 < len(sentences):
|
50 |
line += sentences[i+1].strip() + '\n'
|
51 |
script += line
|
52 |
-
print("ํ์ฌ ์คํฌ๋ฆฝํธ:", script)
|
53 |
return script
|
54 |
|
55 |
-
|
56 |
def gradio_fetch_and_format_script(url):
|
57 |
print("ํจ์ ํธ์ถ๋จ:", url)
|
58 |
html_content = get_url_content(url)
|
@@ -62,23 +63,7 @@ def gradio_fetch_and_format_script(url):
|
|
62 |
print("์์ฑ๋ ์คํฌ๋ฆฝํธ:", script)
|
63 |
return script
|
64 |
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
inputs=gr.Textbox(label="URL์ ์
๋ ฅํ์ธ์"),
|
70 |
-
outputs=gr.Textbox(label="์คํฌ๋ฉ๋ HTML ์ฝํ
์ธ ")
|
71 |
-
)
|
72 |
-
|
73 |
-
# Gradio ์ธํฐํ์ด์ค ๊ตฌ์ฑ
|
74 |
-
iface_script = gr.Interface(
|
75 |
-
fn=gradio_fetch_and_format_script, # ์ด ํจ์๊ฐ ํธ์ถ๋์ด์ผ ํจ
|
76 |
-
inputs=gr.Textbox(label="URL์ ์
๋ ฅํ์ธ์"), # ์
๋ ฅ ํ๋
|
77 |
-
outputs=gr.Textbox(label="์์์ฉ ์คํฌ๋ฆฝํธ") # ์ถ๋ ฅ ํ๋
|
78 |
-
)
|
79 |
-
|
80 |
-
|
81 |
-
# ๋ ์ธํฐํ์ด์ค๋ฅผ ํญ์ผ๋ก ๊ตฌ์ฑํ์ฌ ์คํ
|
82 |
-
iface_combined = gr.TabbedInterface([iface_html, iface_script],
|
83 |
-
["HTML ๋ณด๊ธฐ", "์คํฌ๋ฆฝํธ ์์ฑ"])
|
84 |
-
iface_combined.launch()
|
|
|
11 |
|
12 |
def parse_html(html_content):
|
13 |
soup = BeautifulSoup(html_content, 'html.parser')
|
|
|
|
|
14 |
return soup.prettify()
|
15 |
|
|
|
16 |
def gradio_fetch_and_parse(url):
|
17 |
html_content = get_url_content(url)
|
18 |
parsed_content = parse_html(html_content)
|
|
|
21 |
def get_main_content(html_content):
|
22 |
soup = BeautifulSoup(html_content, 'html.parser')
|
23 |
|
24 |
+
# ๋ค์ํ ์์์์ ํ
์คํธ ์ถ์ถ
|
25 |
+
extracted_texts = []
|
26 |
|
27 |
+
# <span class="a-list-item">
|
28 |
+
extracted_texts.extend([item.get_text(strip=True) for item in soup.find_all('span', class_='a-list-item')])
|
29 |
|
30 |
+
# <meta content>
|
31 |
+
extracted_texts.extend([meta.get('content', '') for meta in soup.find_all('meta') if meta.get('content')])
|
32 |
+
|
33 |
+
# ์ถ๊ฐ ์์๋ค
|
34 |
+
for tag in ['section', 'article', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'footer', 'aside', 'iframe']:
|
35 |
+
extracted_texts.extend([item.get_text(strip=True) for item in soup.find_all(tag)])
|
36 |
+
|
37 |
+
combined_text = ' '.join(extracted_texts)
|
38 |
|
39 |
if combined_text.strip():
|
40 |
print("์ถ์ถ๋ ํ
์คํธ:", combined_text)
|
|
|
43 |
print("๋ณธ๋ฌธ ์ฝํ
์ธ ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
44 |
return ''
|
45 |
|
|
|
46 |
def format_script(text):
|
47 |
sentences = text.split('.')
|
48 |
script = ""
|
|
|
51 |
if i+1 < len(sentences):
|
52 |
line += sentences[i+1].strip() + '\n'
|
53 |
script += line
|
54 |
+
print("ํ์ฌ ์คํฌ๋ฆฝํธ:", script)
|
55 |
return script
|
56 |
|
|
|
57 |
def gradio_fetch_and_format_script(url):
|
58 |
print("ํจ์ ํธ์ถ๋จ:", url)
|
59 |
html_content = get_url_content(url)
|
|
|
63 |
print("์์ฑ๋ ์คํฌ๋ฆฝํธ:", script)
|
64 |
return script
|
65 |
|
66 |
+
iface_html = gr.Interface(fn=gradio_fetch_and_parse, inputs=gr.Textbox(label="URL์ ์
๋ ฅํ์ธ์"), outputs=gr.Textbox(label="์คํฌ๋ฉ๋ HTML ์ฝํ
์ธ "))
|
67 |
+
iface_script = gr.Interface(fn=gradio_fetch_and_format_script, inputs=gr.Textbox(label="URL์ ์
๋ ฅํ์ธ์"), outputs=gr.Textbox(label="์์์ฉ ์คํฌ๋ฆฝํธ"))
|
68 |
+
iface_combined = gr.TabbedInterface([iface_html, iface_script], ["HTML ๋ณด๊ธฐ", "์คํฌ๋ฆฝํธ ์์ฑ"])
|
69 |
+
iface_combined.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|