Spaces:
Running
Running
import gradio as gr | |
import requests | |
from bs4 import BeautifulSoup | |
def get_url_content(url, timeout=10):
    """Fetch ``url`` over HTTP and return the response body as text.

    Args:
        url: Address passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        ``response.text`` when the status code is 200; otherwise a fixed
        failure message string. The same message is returned when the
        request itself fails (connection error, timeout, malformed URL), so
        callers always receive a string.
    """
    failure_message = "URL์์ ์ฝํ ์ธ ๋ฅผ ๊ฐ์ ธ์ค๋ ๋ฐ ์คํจํ์ต๋๋ค."
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat transport-level failures like a non-200 response instead of
        # letting the exception escape into the UI callback.
        return failure_message
    if response.status_code == 200:
        return response.text
    return failure_message
def parse_html(html_content):
    """Parse ``html_content`` with BeautifulSoup and return its prettified form.

    Args:
        html_content: Markup string handed to the ``html.parser`` backend.

    Returns:
        The string produced by ``BeautifulSoup.prettify()``.
    """
    return BeautifulSoup(html_content, 'html.parser').prettify()
def gradio_fetch_and_parse(url):
    """Download ``url`` and return the prettified HTML for display.

    Chains ``get_url_content`` and ``parse_html``; whatever string the fetch
    step returns is passed straight to the parser.
    """
    return parse_html(get_url_content(url))
def get_main_content(html_content):
    """Collect readable text from an HTML document into one string.

    Text is gathered, in this order, from ``<span class="a-list-item">``
    elements, the ``content`` attribute of ``<meta>`` tags, and a fixed list
    of structural tags. Container tags include the text of their descendants,
    so the same fragment is often found more than once; blank fragments and
    exact repeats are dropped while keeping first-seen order.

    Args:
        html_content: Markup string to parse with ``html.parser``.

    Returns:
        The fragments joined by single spaces, or ``''`` when nothing was
        found. The result (or a not-found notice) is also printed.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    fragments = []
    # <span class="a-list-item">
    fragments.extend(
        item.get_text(strip=True)
        for item in soup.find_all('span', class_='a-list-item')
    )
    # <meta content="...">
    fragments.extend(
        meta.get('content', '')
        for meta in soup.find_all('meta')
        if meta.get('content')
    )
    # Additional structural elements
    for tag in ['section', 'article', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'footer', 'aside', 'iframe']:
        fragments.extend(item.get_text(strip=True) for item in soup.find_all(tag))

    # dict.fromkeys preserves insertion order, giving an ordered de-duplication.
    unique_fragments = list(
        dict.fromkeys(text for text in fragments if text and text.strip())
    )
    if not unique_fragments:
        print("๋ณธ๋ฌธ ์ฝํ ์ธ ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
        return ''

    combined_text = ' '.join(unique_fragments)
    print("์ถ์ถ๋ ํ ์คํธ:", combined_text)
    return combined_text
def format_script(text, max_sentences=10):
    """Turn ``text`` into a short script with two sentences per line.

    The text is split on ``'.'``; fragments that are empty after stripping
    (for example the one following a final period) are ignored. At most
    ``max_sentences`` sentences are used. Every sentence is written with its
    terminating period and every line ends with a newline, including a last
    line that holds a single unpaired sentence.

    Args:
        text: Source text to split into sentences.
        max_sentences: Upper bound on the number of sentences used; values
            below zero are treated as zero.

    Returns:
        The formatted script, or ``''`` when ``text`` has no sentences.
        The script built so far is printed after each line is added.
    """
    stripped = (part.strip() for part in text.split('.'))
    sentences = [part for part in stripped if part][:max(max_sentences, 0)]

    script = ""
    for start in range(0, len(sentences), 2):
        pair = sentences[start:start + 2]
        script += ' '.join(sentence + '.' for sentence in pair) + '\n'
        print("ํ์ฌ ์คํฌ๋ฆฝํธ:", script)
    return script
def gradio_fetch_and_format_script(url):
    """Fetch ``url``, extract its text and return it formatted as a script.

    Runs ``get_url_content`` -> ``get_main_content`` -> ``format_script`` and
    prints the URL, the extracted text and the final script along the way.
    """
    print("ํจ์ ํธ์ถ๋จ:", url)
    body_text = get_main_content(get_url_content(url))
    print("์ถ์ถ๋ ๋ณธ๋ฌธ:", body_text)
    result = format_script(body_text)
    print("์์ฑ๋ ์คํฌ๋ฆฝํธ:", result)
    return result
# Tab 1: show the prettified HTML of the requested page.
iface_html = gr.Interface(
    fn=gradio_fetch_and_parse,
    inputs=gr.Textbox(label="URL์ ์ ๋ ฅํ์ธ์"),
    outputs=gr.Textbox(label="์คํฌ๋ฉ๋ HTML ์ฝํ ์ธ "),
)

# Tab 2: show the script generated from the page's extracted text.
iface_script = gr.Interface(
    fn=gradio_fetch_and_format_script,
    inputs=gr.Textbox(label="URL์ ์ ๋ ฅํ์ธ์"),
    outputs=gr.Textbox(label="์์์ฉ ์คํฌ๋ฆฝํธ"),
)

# Combine both interfaces into one tabbed app and start serving it.
iface_combined = gr.TabbedInterface(
    [iface_html, iface_script],
    ["HTML ๋ณด๊ธฐ", "์คํฌ๋ฆฝํธ ์์ฑ"],
)
iface_combined.launch()