# app.py
import os
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import json
from datetime import datetime, timedelta
import openai
import schedule
import threading
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# ─── Setup: temp directory, NLTK data ────────────────────────────────────────
# Create the temp directory
TMP = "/tmp"
NLP_DATA = os.path.join(TMP, "nltk_data")
os.makedirs(NLP_DATA, exist_ok=True)
# Add it to the NLTK data search path
nltk.data.path.insert(0, NLP_DATA)
# Download the required NLTK resources; "punkt" lives under tokenizers/
# but "stopwords" under corpora/, so each package is probed at its own path
for pkg, probe in [("punkt", "tokenizers/punkt"), ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(probe)
    except LookupError:
        nltk.download(pkg, download_dir=NLP_DATA)
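# /tmp is used because the app directory is typically read-only at runtime on
# hosted containers such as Hugging Face Spaces; /tmp is writable but ephemeral,
# so saved articles and NLTK data disappear whenever the Space restarts.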
# ─── Load the OpenAI API key ─────────────────────────────────────────────────
# Precedence: environment variable, then st.secrets, finally sidebar input
OPENAI_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_KEY:
    try:
        OPENAI_KEY = st.secrets.get("OPENAI_API_KEY")
    except Exception:
        # st.secrets raises when no secrets.toml is configured
        OPENAI_KEY = None
if not OPENAI_KEY:
    # Fall back to a sidebar prompt while the app is running
    with st.sidebar:
        st.markdown("### 🔑 OpenAI API Key")
        key_input = st.text_input("Enter your OpenAI API Key:", type="password")
        if key_input:
            OPENAI_KEY = key_input
if OPENAI_KEY:
    openai.api_key = OPENAI_KEY
else:
    st.sidebar.error("No OpenAI API key has been set.")
# ─── Streamlit page & menu setup ─────────────────────────────────────────────
st.set_page_config(page_title="📰 News Tool", layout="wide")

with st.sidebar:
    st.title("News Article Tool")
    menu = st.radio("Select menu", [
        "Crawl News Articles", "Analyze Articles", "Generate New Article", "Schedule News Crawling"
    ])
# ─── File path helper ────────────────────────────────────────────────────────
def _tmp_path(*paths):
    """Join path components under /tmp, creating parent directories as needed."""
    full = os.path.join(TMP, *paths)
    os.makedirs(os.path.dirname(full), exist_ok=True)
    return full
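# Usage sketch: _tmp_path("saved_articles", "articles.json") returns
# "/tmp/saved_articles/articles.json" and ensures "/tmp/saved_articles" exists.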
# ─── Load/save collected articles ────────────────────────────────────────────
def load_saved_articles():
    path = _tmp_path("saved_articles", "articles.json")
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    return []

def save_articles(articles):
    path = _tmp_path("saved_articles", "articles.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
# ─── Naver News crawler ──────────────────────────────────────────────────────
def crawl_naver_news(keyword, num_articles=5):
    url = f"https://search.naver.com/search.naver?where=news&query={keyword}"
    results = []
    try:
        resp = requests.get(url, timeout=5)
        soup = BeautifulSoup(resp.text, "html.parser")
        items = soup.select("div.sds-comps-base-layout.sds-comps-full-layout")
        for i, it in enumerate(items):
            if i >= num_articles:
                break
            title_el = it.select_one("a.X0fMYp2dHd0TCUS2hjww span")
            link_el = it.select_one("a.X0fMYp2dHd0TCUS2hjww")
            src_el = it.select_one("div.sds-comps-profile-info-title span")
            date_el = it.select_one("span.r0VOr")
            desc_el = it.select_one("a.X0fMYp2dHd0TCUS2hjww.IaKmSOGPdofdPwPE6cyU > span")
            if not title_el or not link_el:
                continue
            results.append({
                "title": title_el.text.strip(),
                "link": link_el["href"],
                "source": src_el.text.strip() if src_el else "Unknown",
                "date": date_el.text.strip() if date_el else "Unknown",
                "description": desc_el.text.strip() if desc_el else "",
                "content": ""
            })
    except Exception as e:
        st.error(f"Crawling error: {e}")
    return results
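# Caveat: the CSS selectors above (including hashed class names such as
# X0fMYp2dHd0TCUS2hjww) are tied to Naver's current search markup and will
# silently stop matching when Naver ships new markup; expect to re-derive
# them from the live page periodically.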
# ─── Fetch the article body ──────────────────────────────────────────────────
def get_article_content(url):
    try:
        resp = requests.get(url, timeout=5)
        soup = BeautifulSoup(resp.text, "html.parser")
        cont = soup.select_one("#dic_area") or soup.select_one(".article_body, .news-content-inner")
        if cont:
            text = re.sub(r"\s+", " ", cont.text.strip())
            return text
    except Exception:
        pass
    return "Could not fetch the article body."
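# Note: "#dic_area" is the body container on Naver News article pages; the
# ".article_body, .news-content-inner" selectors are fallbacks for other
# layouts, so articles hosted elsewhere may still return the failure string.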
# ─── Keyword analysis & word cloud ───────────────────────────────────────────
def analyze_keywords(text, top_n=10):
    # Korean particles and function words to filter out
    stop_kr = ["이", "그", "저", "것", "및", "등", "를", "을", "에", "에서", "의", "으로", "로"]
    tokens = [w for w in word_tokenize(text) if w.isalnum() and len(w) > 1 and w not in stop_kr]
    freq = Counter(tokens)
    return freq.most_common(top_n)
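# Note on tokenization: word_tokenize relies on NLTK's English "punkt" model,
# which splits Korean only on whitespace and punctuation, so particles stay
# attached to nouns. A morphological analyzer such as KoNLPy's Okt (e.g.,
# Okt().morphs(text)) could be swapped into the keyword functions here for
# far more accurate Korean tokens.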
def extract_for_wordcloud(text, top_n=50):
    tokens = [w for w in word_tokenize(text.lower()) if w.isalnum()]
    stop_en = set(stopwords.words("english"))
    # Korean particles and function words
    korea_sw = {"및", "등", "를", "이", "의", "가", "에", "는"}
    sw = stop_en.union(korea_sw)
    filtered = [w for w in tokens if w not in sw and len(w) > 1]
    freq = Counter(filtered)
    return dict(freq.most_common(top_n))
def generate_wordcloud(freq_dict):
    try:
        wc = WordCloud(width=800, height=400, background_color="white")\
            .generate_from_frequencies(freq_dict)
        return wc
    except Exception as e:
        st.error(f"Word cloud generation error: {e}")
        return None
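# Caveat: WordCloud's bundled default font has no Hangul glyphs, so Korean
# words render as empty boxes; passing a Korean font file, e.g.
# WordCloud(font_path="/path/to/NanumGothic.ttf", ...), fixes that. The path
# is illustrative, not a font shipped with this app.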
# ─── OpenAI-based article & image generation ─────────────────────────────────
def generate_article(orig, prompt_text):
    if not openai.api_key:
        return "No API key has been set."
    try:
        resp = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a professional news reporter."},
                {"role": "user", "content": f"{prompt_text}\n\n{orig[:1000]}"}
            ],
            max_tokens=1500
        )
        return resp.choices[0].message["content"]
    except Exception as e:
        return f"Article generation error: {e}"
def generate_image(prompt):
    if not openai.api_key:
        return None
    try:
        resp = openai.Image.create(prompt=prompt, n=1, size="512x512")
        return resp["data"][0]["url"]
    except Exception as e:
        st.error(f"Image generation error: {e}")
        return None
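# Note: openai.ChatCompletion and openai.Image are the pre-1.0 SDK interfaces
# and were removed in openai>=1.0, where the equivalents are
# client.chat.completions.create(...) and client.images.generate(...).
# Pin openai<1.0 in requirements.txt to keep this file as-is.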
# ─── Scheduler state ─────────────────────────────────────────────────────────
class SchedulerState:
    def __init__(self):
        self.is_running = False
        self.thread = None
        self.last_run = None
        self.next_run = None
        self.jobs = []
        self.results = []

global_scheduler = SchedulerState()
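# Caveat: module-level state like global_scheduler lives per server process,
# so it survives Streamlit's script reruns but is shared by every browser
# session and is lost whenever the process restarts.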
def perform_news_task(task_type, kw, n, prefix):
    arts = crawl_naver_news(kw, n)
    for a in arts:
        a["content"] = get_article_content(a["link"])
        time.sleep(0.5)
    fname = _tmp_path("scheduled_news", f"{prefix}_{task_type}_{datetime.now():%Y%m%d_%H%M%S}.json")
    with open(fname, "w", encoding="utf-8") as f:
        json.dump(arts, f, ensure_ascii=False, indent=2)
    global_scheduler.last_run = datetime.now()
    global_scheduler.results.append({
        "type": task_type, "keyword": kw,
        "count": len(arts), "file": fname,
        "timestamp": global_scheduler.last_run
    })
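# Caveat: scheduled jobs run on the background thread below, where Streamlit
# calls (such as st.error inside crawl_naver_news) have no script context and
# their output never reaches the page; global_scheduler.results also grows
# without bound for as long as the process lives.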
def run_scheduler():
    while global_scheduler.is_running:
        schedule.run_pending()
        time.sleep(1)

def start_scheduler(daily, interval):
    if global_scheduler.is_running:
        return
    schedule.clear()
    global_scheduler.jobs = []
    # Daily jobs
    for t in daily:
        hh, mm = t["hour"], t["minute"]
        tag = f"d_{t['keyword']}_{hh}{mm}"
        schedule.every().day.at(f"{hh:02d}:{mm:02d}")\
            .do(perform_news_task, "daily", t["keyword"], t["num_articles"], tag).tag(tag)
        global_scheduler.jobs.append(tag)
    # Interval jobs
    for t in interval:
        tag = f"i_{t['keyword']}_{t['interval']}"
        if t["immediate"]:
            perform_news_task("interval", t["keyword"], t["num_articles"], tag)
        schedule.every(t["interval"]).minutes\
            .do(perform_news_task, "interval", t["keyword"], t["num_articles"], tag).tag(tag)
        global_scheduler.jobs.append(tag)
    global_scheduler.next_run = schedule.next_run()
    global_scheduler.is_running = True
    th = threading.Thread(target=run_scheduler, daemon=True)
    th.start()
    global_scheduler.thread = th

def stop_scheduler():
    global_scheduler.is_running = False
    schedule.clear()
    global_scheduler.jobs = []
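# Note: stop_scheduler only flips the flag; the daemon thread observes it on
# its next one-second poll in run_scheduler and then exits on its own.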
# ─── Render screens: one block per menu entry ────────────────────────────────
if menu == "Crawl News Articles":
    st.header("Crawl News Articles")
    kw = st.text_input("🔍 Search term", "인공지능")  # default query: "artificial intelligence"
    num = st.slider("Number of articles to fetch", 1, 20, 5)
    if st.button("Fetch articles"):
        arts = crawl_naver_news(kw, num)
        prog = st.progress(0.0)
        for i, a in enumerate(arts):
            prog.progress((i + 1) / len(arts))
            a["content"] = get_article_content(a["link"])
            time.sleep(0.3)
        save_articles(arts)
        st.success(f"Saved {len(arts)} articles")
        for a in arts:
            with st.expander(a["title"]):
                st.write(f"Source: {a['source']} | Date: {a['date']}")
                st.write(a["description"])
                st.write(a["content"][:300] + "…")
elif menu == "Analyze Articles":
    st.header("Analyze Articles")
    arts = load_saved_articles()
    if not arts:
        st.warning("Collect articles first via the 'Crawl News Articles' menu.")
    else:
        titles = [a["title"] for a in arts]
        sel = st.selectbox("Select an article to analyze", titles)
        art = next(a for a in arts if a["title"] == sel)
        st.subheader(art["title"])
        with st.expander("View body"):
            st.write(art["content"])
        mode = st.radio("Analysis mode", ["Keyword analysis", "Text statistics"])
        if mode == "Keyword analysis" and st.button("Run"):
            kw_list = analyze_keywords(art["content"])
            df = pd.DataFrame(kw_list, columns=["word", "count"])
            st.bar_chart(df.set_index("word"))
            st.write("Top keywords:")
            for w, c in kw_list:
                st.write(f"- {w}: {c}")
            # Word cloud
            wc_data = extract_for_wordcloud(art["content"])
            wc = generate_wordcloud(wc_data)
            if wc:
                fig, ax = plt.subplots(figsize=(8, 4))
                ax.imshow(wc, interpolation="bilinear")
                ax.axis("off")
                st.pyplot(fig)
        if mode == "Text statistics" and st.button("Run"):
            txt = art["content"]
            wcnt = len(re.findall(r"\w+", txt))
            scnt = len(re.split(r"[.!?]+", txt))
            st.metric("Word count", wcnt)
            st.metric("Sentence count", scnt)
elif menu == "Generate New Article":
    st.header("Generate New Article")
    arts = load_saved_articles()
    if not arts:
        st.warning("Please collect articles first.")
    else:
        sel = st.selectbox("Select a source article", [a["title"] for a in arts])
        art = next(a for a in arts if a["title"] == sel)
        st.write(art["content"][:200] + "…")
        prompt = st.text_area("Writing instructions", "Rewrite this in standard news article format.")
        gen_img = st.checkbox("Also generate an image", value=True)
        if st.button("Generate"):
            new = generate_article(art["content"], prompt)
            st.subheader("Generated article")
            st.write(new)
            if gen_img:
                url = generate_image(f"Article title: {art['title']}\n\n{prompt}")
                if url:
                    st.image(url)
elif menu == "Schedule News Crawling":
    st.header("Schedule News Crawling")
    tab1, tab2, tab3 = st.tabs(["Daily schedule", "Interval schedule", "Status"])
    # Daily
    with tab1:
        dkw = st.text_input("Keyword (daily)", "인공지능", key="dk")  # "artificial intelligence"
        dnum = st.number_input("Articles", 1, 20, 5, key="dn")
        dhh = st.number_input("Hour", 0, 23, 9, key="dh")
        dmm = st.number_input("Minute", 0, 59, 0, key="dm")
        if st.button("Add", key="addd"):
            st.session_state.setdefault("daily", []).append({
                "keyword": dkw, "num_articles": dnum,
                "hour": dhh, "minute": dmm
            })
        if st.session_state.get("daily"):
            st.write(st.session_state["daily"])
    # Interval
    with tab2:
        ikw = st.text_input("Keyword (interval)", "빅데이터", key="ik")  # "big data"
        inum = st.number_input("Articles", 1, 20, 5, key="in")
        inter = st.number_input("Interval (minutes)", 1, 1440, 60, key="ii")
        imm = st.checkbox("Run immediately", True, key="im")
        if st.button("Add", key="addi"):
            st.session_state.setdefault("interval", []).append({
                "keyword": ikw, "num_articles": inum,
                "interval": inter, "immediate": imm
            })
        if st.session_state.get("interval"):
            st.write(st.session_state["interval"])
    # Status
    with tab3:
        if not global_scheduler.is_running and st.button("Start"):
            start_scheduler(st.session_state.get("daily", []),
                            st.session_state.get("interval", []))
        if global_scheduler.is_running and st.button("Stop"):
            stop_scheduler()
        st.write("Running:", global_scheduler.is_running)
        st.write("Last run:", global_scheduler.last_run)
        st.write("Next run:", global_scheduler.next_run)
        st.write("Jobs:", global_scheduler.jobs)
        st.dataframe(pd.DataFrame(global_scheduler.results))
# ─── Footer ──────────────────────────────────────────────────────────────────
st.markdown("---")
st.markdown("© 2025 News Tool @conanssam")