# 5-2_t1 / app.py — Naver news scraping Gradio app (author: Kims12, commit 0c4db8b)
import tempfile
from urllib.parse import quote

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
def scrape_news(keyword):
    """Scrape Naver news search results for *keyword*.

    Parameters:
        keyword (str): search term typed by the user.

    Returns:
        tuple[str, str]:
            - HTML table of the scraped rows (links rendered as clickable
              ``<a>`` tags), or a "no results" paragraph when nothing matched.
            - Path to a temporary ``.xlsx`` file with the same rows, where
              links are Excel ``HYPERLINK`` formulas so they stay clickable.
    """
    # Build the Naver news search URL. quote() keeps spaces and special
    # characters in the keyword from producing a malformed query string.
    base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
    url = base_url + quote(keyword)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    }
    # timeout prevents the UI from hanging forever on a stalled request.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, "html.parser")
    news_list = []
    # Each news item lives inside a div with class "news_area".
    news_areas = soup.find_all("div", class_="news_area")
    for area in news_areas:
        try:
            # Publisher and publication date come from news_info > info_group.
            info_group = area.find("div", class_="news_info").find("div", class_="info_group")
            publisher_tag = info_group.find("a", class_="info press")
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else ""
            date_tag = info_group.find("span", class_="info")
            date = date_tag.get_text(strip=True) if date_tag else ""
            # Title and link come from the anchor with class "news_tit".
            title_tag = area.find("a", class_="news_tit")
            title = title_tag.get("title", title_tag.get_text(strip=True)) if title_tag else ""
            link = title_tag.get("href") if title_tag else ""
            # Short summary text from "api_txt_lines dsc_txt_wrap".
            brief_tag = area.find("a", class_="api_txt_lines dsc_txt_wrap")
            brief = brief_tag.get_text(strip=True) if brief_tag else ""
            news_list.append({
                "신문사": publisher,
                "λ°œν–‰μΌ": date,
                "제λͺ©": title,
                "λ‰΄μŠ€κ°„λž΅μ •λ³΄": brief,
                "링크": link
            })
        except Exception:
            # Best-effort scraping: an item with unexpected markup is
            # skipped rather than aborting the whole result set.
            continue
    # Build the result DataFrame for UI display.
    df = pd.DataFrame(news_list)
    # HTML output: render links as clickable anchor tags.
    if df.empty:
        html_table = "<p>검색 κ²°κ³Όκ°€ μ—†μŠ΅λ‹ˆλ‹€.</p>"
    else:
        df["링크"] = df["링크"].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
        html_table = df.to_html(escape=False, index=False)
    # Excel output: links as HYPERLINK formulas so they are clickable in Excel.
    df_excel = pd.DataFrame(news_list)
    if not df_excel.empty:
        df_excel["링크"] = df_excel["링크"].apply(lambda x: f'=HYPERLINK("{x}", "링크")')
    # gr.File needs a file path, so write the workbook to a named temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp_path = tmp.name
    with pd.ExcelWriter(tmp_path, engine="openpyxl") as writer:
        df_excel.to_excel(writer, index=False, sheet_name="News")
    return html_table, tmp_path
# Assemble the Gradio UI: a vertical stack of search box, button,
# HTML result area, and an Excel download component.
with gr.Blocks() as demo:
    gr.Markdown("# 넀이버 λ‰΄μŠ€ μŠ€ν¬λž˜ν•‘")
    gr.Markdown("μž…λ ₯ν•œ 검색어λ₯Ό 기반으둜 넀이버 λ‰΄μŠ€ 정보λ₯Ό μŠ€ν¬λž˜ν•‘ν•˜κ³  HTML ν‘œμ™€ μ—‘μ…€ λ‹€μš΄λ‘œλ“œλ₯Ό μ œκ³΅ν•©λ‹ˆλ‹€.")
    query_box = gr.Textbox(label="검색어", placeholder="λ‰΄μŠ€ 검색어λ₯Ό μž…λ ₯ν•˜μ„Έμš”.")
    run_button = gr.Button("검색")
    result_html = gr.HTML(label="λ‰΄μŠ€ κ²°κ³Ό")
    download_file = gr.File(label="μ—‘μ…€ λ‹€μš΄λ‘œλ“œ")
    # Wire the button: one keyword in, HTML table + Excel file path out.
    run_button.click(
        fn=scrape_news,
        inputs=query_box,
        outputs=[result_html, download_file],
    )

if __name__ == "__main__":
    demo.launch()