|
import tempfile
import urllib.parse

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
|
|
|
def scrape_news(keyword):
    """Scrape Naver news search results for *keyword*.

    Fetches the first page of Naver's news-tab search, extracts publisher,
    date, title, summary and link for each result card, and returns:

      - an HTML table (str) of the results (links rendered as <a> tags), or a
        "no results" paragraph when nothing was found, and
      - the path (str) to a temporary .xlsx file holding the same rows, with
        links written as Excel =HYPERLINK() formulas.
    """
    base_url = "https://search.naver.com/search.naver?sm=tab_hty.top&where=news&ssc=tab.news.all&query="
    # Percent-encode the keyword so spaces, Hangul and '&' survive in the URL.
    url = base_url + urllib.parse.quote(keyword)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    }
    # timeout keeps the Gradio UI from hanging forever on a dead connection.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, "html.parser")

    news_list = []

    news_areas = soup.find_all("div", class_="news_area")
    for area in news_areas:
        try:
            info_group = area.find("div", class_="news_info").find("div", class_="info_group")
            publisher_tag = info_group.find("a", class_="info press")
            publisher = publisher_tag.get_text(strip=True) if publisher_tag else ""
            date_tag = info_group.find("span", class_="info")
            date = date_tag.get_text(strip=True) if date_tag else ""

            # Prefer the full title from the 'title' attribute; fall back to text.
            title_tag = area.find("a", class_="news_tit")
            title = title_tag.get("title", title_tag.get_text(strip=True)) if title_tag else ""
            link = title_tag.get("href") if title_tag else ""

            brief_tag = area.find("a", class_="api_txt_lines dsc_txt_wrap")
            brief = brief_tag.get_text(strip=True) if brief_tag else ""

            news_list.append({
                "μ λ¬Έμ¬": publisher,
                "λ°νμΌ": date,
                "μ λͺ©": title,
                "λ΄μ€κ°λ΅μ 보": brief,
                "λ§ν¬": link
            })
        except AttributeError:
            # A malformed result card (a missing sub-tag makes .find() return
            # None, which is then dereferenced) is skipped, not fatal.
            continue

    df = pd.DataFrame(news_list)
    # Keep a pristine copy for the Excel export BEFORE the HTML-only mutation.
    df_excel = df.copy()

    if df.empty:
        html_table = "<p>κ²μ κ²°κ³Όκ° μμ΅λλ€.</p>"
    else:
        df["λ§ν¬"] = df["λ§ν¬"].apply(lambda x: f'<a href="{x}" target="_blank">{x}</a>')
        html_table = df.to_html(escape=False, index=False)

    if not df_excel.empty:
        # Excel gets clickable =HYPERLINK() formulas instead of raw URLs.
        df_excel["λ§ν¬"] = df_excel["λ§ν¬"].apply(lambda x: f'=HYPERLINK("{x}", "λ§ν¬")')

    # Reserve a temp path, then close the handle before ExcelWriter reopens it
    # (writing while the NamedTemporaryFile is still open fails on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        tmp_path = tmp.name
    with pd.ExcelWriter(tmp_path, engine="openpyxl") as writer:
        df_excel.to_excel(writer, index=False, sheet_name="News")

    return html_table, tmp_path
|
|
|
# --- Gradio UI wiring -------------------------------------------------------
# NOTE(review): the Korean string literals below are mojibake from a bad
# encoding round-trip (some contain control bytes rendered as line breaks);
# they are runtime strings and are left byte-for-byte untouched here.
with gr.Blocks() as demo:
    # Page title / description (Korean, garbled by the encoding issue).
    gr.Markdown("# λ€μ΄λ² λ΄μ€ μ€ν¬λν")
    gr.Markdown("μ
λ ₯ν κ²μμ΄λ₯Ό κΈ°λ°μΌλ‘ λ€μ΄λ² λ΄μ€ μ 보λ₯Ό μ€ν¬λννκ³ HTML νμ μμ
λ€μ΄λ‘λλ₯Ό μ 곡ν©λλ€.")

    # Inputs: search keyword textbox + a button to trigger the scrape.
    keyword_input = gr.Textbox(label="κ²μμ΄", placeholder="λ΄μ€ κ²μμ΄λ₯Ό μ
λ ₯νμΈμ.")
    search_button = gr.Button("κ²μ")
    # Outputs: rendered HTML table and a downloadable .xlsx file.
    news_html = gr.HTML(label="λ΄μ€ κ²°κ³Ό")
    excel_file = gr.File(label="μμ
λ€μ΄λ‘λ")

    # scrape_news returns (html_table, tmp_path), matching the two outputs.
    search_button.click(fn=scrape_news, inputs=keyword_input, outputs=[news_html, excel_file])

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()