Spaces:
Running
Running
File size: 2,580 Bytes
c05ccf6 c35600d 1840ee6 ef1d2b4 6332f2b b90144d 6332f2b b90144d 6332f2b b90144d 6332f2b f31d39f 48e40bd f31d39f ef1d2b4 f31d39f ef1d2b4 f31d39f b90144d ef1d2b4 f31d39f ef1d2b4 bf141c7 ef1d2b4 bf141c7 ef1d2b4 b90144d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
def get_url_content(url):
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
return "URL์์ ์ฝํ
์ธ ๋ฅผ ๊ฐ์ ธ์ค๋ ๋ฐ ์คํจํ์ต๋๋ค."
def parse_html(html_content):
    """Return *html_content* re-serialized by BeautifulSoup's ``prettify()``.

    The markup is parsed with the ``html.parser`` backend.
    """
    return BeautifulSoup(html_content, 'html.parser').prettify()
def gradio_fetch_and_parse(url):
    """Fetch *url* with ``get_url_content`` and return it run through ``parse_html``."""
    fetched = get_url_content(url)
    return parse_html(fetched)
def get_main_content(html_content):
    """Collect text from an HTML document into one space-joined string.

    Fragments are gathered in this order:

    1. the text of every ``<span class="a-list-item">``;
    2. the ``content`` attribute of every ``<meta>`` tag that has one;
    3. the text of every element whose tag is listed in ``extra_tags``,
       one tag name at a time.

    Blank fragments are skipped and exact repeats are dropped (first
    occurrence wins), because a container and its only child otherwise
    contribute the same text twice.

    Args:
        html_content: HTML markup to parse with ``html.parser``.

    Returns:
        The joined text, or ``''`` when no non-blank fragment was found.
        A log line is printed in both cases.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract text from a variety of elements.
    fragments = []

    # <span class="a-list-item">
    fragments.extend(
        item.get_text(strip=True)
        for item in soup.find_all('span', class_='a-list-item')
    )

    # <meta content="...">
    fragments.extend(
        meta.get('content', '')
        for meta in soup.find_all('meta')
        if meta.get('content')
    )

    # Additional elements, queried tag by tag so the output order is stable.
    extra_tags = (
        'section', 'article', 'div', 'p',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'header', 'footer', 'aside', 'iframe',
    )
    for tag in extra_tags:
        fragments.extend(item.get_text(strip=True) for item in soup.find_all(tag))

    # dict.fromkeys keeps first-seen order while removing exact duplicates.
    unique_fragments = list(
        dict.fromkeys(text for text in fragments if text.strip())
    )
    combined_text = ' '.join(unique_fragments)

    # Log labels restored from mis-decoded (mojibake) literals that were
    # split across lines: "Extracted text:" / "Could not find body content."
    if not combined_text:
        print("본문 콘텐츠를 찾을 수 없습니다.")
        return ''

    print("추출된 텍스트:", combined_text)
    return combined_text
def format_script(text, *, max_sentences=10):
    """Turn *text* into a short script with two sentences per line.

    The text is split on ``'.'``; blank pieces are discarded, and at most
    *max_sentences* of the remaining sentences are used. Every sentence is
    emitted with its trailing period, and every line ends with a newline.

    Note: splitting on ``'.'`` also splits decimals and abbreviations.

    Args:
        text: Source text.
        max_sentences: Upper bound on the number of sentences used
            (keyword-only, defaults to the previous hard-coded 10).

    Returns:
        The script, or ``''`` when *text* contains no sentence.
    """
    stripped_parts = (part.strip() for part in text.split('.'))
    # Drop empty pieces first so they neither count toward the limit nor
    # produce lines consisting of a lone ". ".
    sentences = [part for part in stripped_parts if part][:max(max_sentences, 0)]

    lines = []
    for start in range(0, len(sentences), 2):
        pair = sentences[start:start + 2]
        lines.append(' '.join(sentence + '.' for sentence in pair) + '\n')

    script = ''.join(lines)
    # Label restored from a mis-decoded (mojibake) literal: "Current script:".
    print("현재 스크립트:", script)
    return script
def gradio_fetch_and_format_script(url):
    """Fetch *url*, extract its text, and return it formatted as a script.

    Chains ``get_url_content`` -> ``get_main_content`` -> ``format_script``
    and prints the input URL, the extracted text, and the final script.
    """
    # NOTE(review): the log labels below look like mis-decoded Korean text
    # (mojibake); they are reproduced verbatim -- confirm against the
    # original file before changing them.
    print("ํจ์ ํธ์ถ๋จ:", url)
    body_text = get_main_content(get_url_content(url))
    print("์ถ์ถ๋ ๋ณธ๋ฌธ:", body_text)
    result = format_script(body_text)
    print("์์ฑ๋ ์คํฌ๋ฆฝํธ:", result)
    return result
iface_html = gr.Interface(fn=gradio_fetch_and_parse, inputs=gr.Textbox(label="URL์ ์
๋ ฅํ์ธ์"), outputs=gr.Textbox(label="์คํฌ๋ฉ๋ HTML ์ฝํ
์ธ "))
iface_script = gr.Interface(fn=gradio_fetch_and_format_script, inputs=gr.Textbox(label="URL์ ์
๋ ฅํ์ธ์"), outputs=gr.Textbox(label="์์์ฉ ์คํฌ๋ฆฝํธ"))
iface_combined = gr.TabbedInterface([iface_html, iface_script], ["HTML ๋ณด๊ธฐ", "์คํฌ๋ฆฝํธ ์์ฑ"])
iface_combined.launch()
|