|
import tempfile
import urllib.parse

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
|
def scrape_news(keyword):
    """Scrape Naver news search results for *keyword*.

    Fetches the first page of Naver's news-tab search, extracts publisher,
    date, title, summary and link for each result card, and returns:

      - an HTML table (str) of the results (links rendered as <a> tags), or a
        "no results" paragraph when nothing was found, and
      - the path (str) to a temporary .xlsx file holding the same rows, with
        links written as Excel =HYPERLINK() formulas.
    """
    base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
    # Percent-encode the keyword so spaces, Hangul and '&' survive in the URL.
    url = base_url + urllib.parse.quote(keyword)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    }
    # timeout keeps the Gradio UI from hanging forever on a dead connection.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, "html.parser")

    news_list = []

    news_areas = soup.find_all("div", class_="news_area")
    for area in news_areas:
        try:
            info_group = area.find("div", class_="news_info").find("div", class_="info_group")
            publisher_tag = info_group.find("a", class_="info press")
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else ""
            date_tag = info_group.find("span", class_="info")
            date = date_tag.get_text(strip=True) if date_tag else ""

            # Prefer the full title from the 'title' attribute; fall back to text.
            title_tag = area.find("a", class_="news_tit")
            title = title_tag.get("title", title_tag.get_text(strip=True)) if title_tag else ""
            link = title_tag.get("href") if title_tag else ""

            brief_tag = area.find("a", class_="api_txt_lines dsc_txt_wrap")
            brief = brief_tag.get_text(strip=True) if brief_tag else ""

            news_list.append({
                "μ λ¬Έμ¬": publisher,
                "λ°νμΌ": date,
                "μ λͺ©": title,
                "λ΄μ€κ°λ΅μ 보": brief,
                "λ§ν¬": link
            })
        except AttributeError:
            # A malformed result card (a missing sub-tag makes .find() return
            # None, which is then dereferenced) is skipped, not fatal.
            continue

    df = pd.DataFrame(news_list)
    # Keep a pristine copy for the Excel export BEFORE the HTML-only mutation.
    df_excel = df.copy()

    if df.empty:
        html_table = "<p>κ²μ κ²°κ³Όκ° μμ΅λλ€.</p>"
    else:
        df["λ§ν¬"] = df["λ§ν¬"].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
        html_table = df.to_html(escape=False, index=False)

    if not df_excel.empty:
        # Excel gets clickable =HYPERLINK() formulas instead of raw URLs.
        df_excel["λ§ν¬"] = df_excel["λ§ν¬"].apply(lambda x: f'=HYPERLINK("{x}", "λ§ν¬")')

    # Reserve a temp path, then close the handle before ExcelWriter reopens it
    # (writing while the NamedTemporaryFile is still open fails on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp_path = tmp.name
    with pd.ExcelWriter(tmp_path, engine="openpyxl") as writer:
        df_excel.to_excel(writer, index=False, sheet_name="News")

    return html_table, tmp_path
|
|
|
# --- Gradio UI wiring -------------------------------------------------------
# NOTE(review): the Korean string literals below are mojibake from a bad
# encoding round-trip (some contain control bytes rendered as line breaks);
# they are runtime strings and are left byte-for-byte untouched here.
with gr.Blocks() as demo:
    # Page title / description (Korean, garbled by the encoding issue).
    gr.Markdown("# λ€μ΄λ² λ΄μ€ μ€ν¬λν")
    gr.Markdown("μ
λ ₯ν κ²μμ΄λ₯Ό κΈ°λ°μΌλ‘ λ€μ΄λ² λ΄μ€ μ 보λ₯Ό μ€ν¬λννκ³ HTML νμ μμ
λ€μ΄λ‘λλ₯Ό μ 곡ν©λλ€.")

    # Inputs: search keyword textbox + a button to trigger the scrape.
    keyword_input = gr.Textbox(label="κ²μμ΄", placeholder="λ΄μ€ κ²μμ΄λ₯Ό μ
λ ₯νμΈμ.")
    search_button = gr.Button("κ²μ")
    # Outputs: rendered HTML table and a downloadable .xlsx file.
    news_html = gr.HTML(label="λ΄μ€ κ²°κ³Ό")
    excel_file = gr.File(label="μμ
λ€μ΄λ‘λ")

    # scrape_news returns (html_table, tmp_path), matching the two outputs.
    search_button.click(fn=scrape_news, inputs=keyword_input, outputs=[news_html, excel_file])

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()