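# Gradio app: fetch a web page by URL, show its prettified HTML, and extract
# visible text to draft a short video script.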
import gradio as gr
import requests
from bs4 import BeautifulSoup

def get_url_content(url):
    # Fetch the raw HTML for the given URL; return an error message on failure.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return "Failed to fetch content from the URL."
    if response.status_code == 200:
        return response.text
    else:
        return "Failed to fetch content from the URL."

def parse_html(html_content):
    # Pretty-print the HTML so it is easier to read in the output box.
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.prettify()

def gradio_fetch_and_parse(url):
    html_content = get_url_content(url)
    parsed_content = parse_html(html_content)
    return parsed_content

def get_main_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract text from a variety of elements
    extracted_texts = []

    # <span class="a-list-item">
    extracted_texts.extend([item.get_text(strip=True) for item in soup.find_all('span', class_='a-list-item')])

    # <meta content>
    extracted_texts.extend([meta.get('content', '') for meta in soup.find_all('meta') if meta.get('content')])

    # Additional structural and text elements
    for tag in ['section', 'article', 'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'footer', 'aside', 'iframe']:
        extracted_texts.extend([item.get_text(strip=True) for item in soup.find_all(tag)])

    combined_text = ' '.join(extracted_texts)

    if combined_text.strip():
        print("Extracted text:", combined_text)
        return combined_text
    else:
        print("Could not find any main content.")
        return ''

def format_script(text):
    # Split into sentences and keep at most the first 10, pairing two per line.
    sentences = text.split('.')
    script = ""
    for i in range(0, min(len(sentences), 10), 2):
        line = sentences[i].strip() + '. '
        if i + 1 < len(sentences):
            line += sentences[i + 1].strip() + '.\n'
        script += line
        print("Current script:", script)
    return script

def gradio_fetch_and_format_script(url):
    print("Function called with:", url)
    html_content = get_url_content(url)
    main_content = get_main_content(html_content)
    print("Extracted main content:", main_content)
    script = format_script(main_content)
    print("Generated script:", script)
    return script

iface_html = gr.Interface(fn=gradio_fetch_and_parse, inputs=gr.Textbox(label="Enter a URL"), outputs=gr.Textbox(label="Scraped HTML content"))
iface_script = gr.Interface(fn=gradio_fetch_and_format_script, inputs=gr.Textbox(label="Enter a URL"), outputs=gr.Textbox(label="Video script"))
iface_combined = gr.TabbedInterface([iface_html, iface_script], ["View HTML", "Generate script"])
iface_combined.launch()
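
# Note: launch() also accepts optional arguments such as share=True (temporary
# public link) or server_port=7860 if the default port is in use, e.g.:
# iface_combined.launch(share=True)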