# example-ai-crawler / src/streamlit_app.py
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import json
import os
from datetime import datetime, timedelta
import traceback
import plotly.graph_objects as go
import schedule
import threading
import matplotlib.pyplot as plt
from pathlib import Path
import openai
from dotenv import load_dotenv
# ν—ˆκΉ…νŽ˜μ΄μŠ€ Spaces ν™˜κ²½μ— 맞게 μž„μ‹œ 디렉토리 μ„€μ •
# /tmp ν΄λ”λŠ” μ‘΄μž¬ν•  수 μžˆμ§€λ§Œ κΆŒν•œ λ¬Έμ œκ°€ μžˆμ„ 수 μžˆμœΌλ―€λ‘œ ν˜„μž¬ μž‘μ—… 디렉토리 기반으둜 λ³€κ²½
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
DATA_DIR = os.path.join(CURRENT_DIR, "data")
NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
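# For reference, the constants above map to this on-disk layout (all created just below if missing):
#   data/
#     saved_articles.json   <- articles saved from the crawling menu
#     nltk_data/            <- user-writable NLTK download target
#     scheduled_news/       <- JSON dumps written by the scheduler jobs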
# Helper that creates a directory if it does not exist
def ensure_directory(directory):
try:
os.makedirs(directory, exist_ok=True)
return True
except Exception as e:
st.error(f"디렉토리 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
return False
# Create every required directory
ensure_directory(DATA_DIR)
ensure_directory(NLTK_DATA_DIR)
ensure_directory(SCHEDULED_NEWS_DIR)
# NLTK setup - use a user-specified directory to work around permission issues
import nltk
nltk.data.path.append(NLTK_DATA_DIR)
# Download the required NLTK data (works around permission issues)
try:
# μ‚¬μš©μž μ§€μ • 디렉토리에 데이터 λ‹€μš΄λ‘œλ“œ
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt', download_dir=NLTK_DATA_DIR)
try:
nltk.data.find('corpora/stopwords')
except LookupError:
nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
except Exception as e:
st.warning(f"NLTK 데이터 λ‹€μš΄λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}. κΈ°λ³Έ ν† ν¬λ‚˜μ΄μ§• 방식을 μ‚¬μš©ν•©λ‹ˆλ‹€.")
# ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ§•μ„ μœ„ν•œ λŒ€μ²΄ ν•¨μˆ˜ (KoNLPy λŒ€μ‹  μ‚¬μš©)
def tokenize_korean(text):
try:
# 1. λ¨Όμ € transformers λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ–΄ μžˆλŠ”μ§€ 확인
try:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
return tokenizer.tokenize(text)
        except Exception as e:
            print(f"Transformers ν† ν¬λ‚˜μ΄μ € λ‘œλ“œ μ‹€νŒ¨: {e}")  # st.debug() does not exist in Streamlit; log to the console instead
# 2. soynlp μ‹œλ„
try:
from soynlp.tokenizer import LTokenizer
tokenizer = LTokenizer()
return tokenizer.tokenize(text)
        except Exception as e:
            print(f"soynlp ν† ν¬λ‚˜μ΄μ € λ‘œλ“œ μ‹€νŒ¨: {e}")  # fall through to the kss attempt
# 3. kss μ‹œλ„
try:
import kss
tokens = []
for sentence in kss.split_sentences(text):
tokens.extend(sentence.split())
return tokens
        except Exception as e:
            print(f"kss ν† ν¬λ‚˜μ΄μ € λ‘œλ“œ μ‹€νŒ¨: {e}")  # fall through to the regex fallback
    except Exception as e:
        print(f"ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ§• μ‹€νŒ¨: {e}")  # the regex fallback below still runs
# 4. κΈ°λ³Έ μ •κ·œμ‹ 기반 ν† ν¬λ‚˜μ΄μ € - λͺ¨λ“  방법이 μ‹€νŒ¨ν–ˆμ„ λ•Œ 폴백
return re.findall(r'[κ°€-힣]+|[a-zA-Z]+|[0-9]+|[^\sκ°€-힣a-zA-Z0-9]+', text)
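# Illustrative behaviour of the regex fallback above (the actual output depends on which
# tokenizer loads first; the tokens shown come from the re.findall pattern only):
#   tokenize_korean("AI κ·œμ œ 법안 2024 λ°œν‘œ") -> ['AI', 'κ·œμ œ', '법안', '2024', 'λ°œν‘œ']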
# μ›Œλ“œν΄λΌμš°λ“œ μΆ”κ°€ (선택적 μ‚¬μš©)
try:
from wordcloud import WordCloud
wordcloud_available = True
except ImportError:
wordcloud_available = False
# μŠ€μΌ€μ€„λŸ¬ μƒνƒœ 클래슀 μΆ”κ°€
class SchedulerState:
def __init__(self):
self.is_running = False
self.thread = None
self.last_run = None
self.next_run = None
self.scheduled_jobs = []
self.scheduled_results = []
# μ „μ—­ μŠ€μΌ€μ€„λŸ¬ μƒνƒœ 객체 생성 (μŠ€λ ˆλ“œ μ•ˆμ—μ„œ μ‚¬μš©)
global_scheduler_state = SchedulerState()
# Session-state initialization for API key management
if 'openai_api_key' not in st.session_state:
st.session_state.openai_api_key = None
# Load the API key (Hugging Face environment variable first, then Streamlit secrets, then a .env file)
if st.session_state.openai_api_key is None:
st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY') # Hugging Face
if st.session_state.openai_api_key is None:
try:
if 'OPENAI_API_KEY' in st.secrets: # Streamlit Cloud
st.session_state.openai_api_key = st.secrets['OPENAI_API_KEY']
except Exception: # st.secretsκ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠλŠ” ν™˜κ²½ (둜컬 λ“±)
pass
if st.session_state.openai_api_key is None:
load_dotenv() # 둜컬 .env 파일
st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')
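# The last fallback reads a local .env file via python-dotenv. A minimal .env next to the
# project would contain a single line like this (placeholder value, not a real key):
#   OPENAI_API_KEY=sk-...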
# νŽ˜μ΄μ§€ μ„€μ •
st.set_page_config(page_title="λ‰΄μŠ€ 기사 도ꡬ", page_icon="πŸ“°", layout="wide")
# μ‚¬μ΄λ“œλ°” 메뉴 μ„€μ •
st.sidebar.title("λ‰΄μŠ€ 기사 도ꡬ")
menu = st.sidebar.radio(
"메뉴 선택",
["λ‰΄μŠ€ 기사 크둀링", "기사 λΆ„μ„ν•˜κΈ°", "μƒˆ 기사 μƒμ„±ν•˜κΈ°", "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°"]
)
# OpenAI API key input (sidebar)
openai_api_key = st.sidebar.text_input("OpenAI API ν‚€ (선택사항)",
value=st.session_state.openai_api_key if st.session_state.openai_api_key else "",
type="password")
if openai_api_key:
st.session_state.openai_api_key = openai_api_key
openai.api_key = openai_api_key
# Load previously saved articles
def load_saved_articles():
try:
if os.path.exists(SAVED_ARTICLES_PATH):
with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
st.error(f"기사 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
return []
return []
# Save articles to disk
def save_articles(articles):
try:
with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
json.dump(articles, f, ensure_ascii=False, indent=2)
return True
except Exception as e:
st.error(f"기사 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
return False
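# saved_articles.json holds a flat list of article dicts using the fields produced by
# crawl_naver_news() below (values illustrative):
#   [{"title": "...", "link": "https://...", "description": "...",
#     "source": "...", "date": "...", "content": "..."}]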
@st.cache_data
def crawl_naver_news(keyword, num_articles=5):
"""
넀이버 λ‰΄μŠ€ 기사λ₯Ό μˆ˜μ§‘ν•˜λŠ” ν•¨μˆ˜
"""
url = f"https://search.naver.com/search.naver?where=news&query={keyword}"
results = []
try:
# νŽ˜μ΄μ§€ μš”μ²­
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# λ‰΄μŠ€ μ•„μ΄ν…œ μ°ΎκΈ°
news_items = soup.select('div.sds-comps-base-layout.sds-comps-full-layout')
# 각 λ‰΄μŠ€ μ•„μ΄ν…œμ—μ„œ 정보 μΆ”μΆœ
for i, item in enumerate(news_items):
if i >= num_articles:
break
try:
# 제λͺ©κ³Ό 링크 μΆ”μΆœ
title_element = item.select_one('a.X0fMYp2dHd0TCUS2hjww span')
if not title_element:
continue
title = title_element.text.strip()
link_element = item.select_one('a.X0fMYp2dHd0TCUS2hjww')
link = link_element['href'] if link_element else ""
# 언둠사 μΆ”μΆœ
press_element = item.select_one('div.sds-comps-profile-info-title span.sds-comps-text-type-body2')
source = press_element.text.strip() if press_element else "μ•Œ 수 μ—†μŒ"
# λ‚ μ§œ μΆ”μΆœ
date_element = item.select_one('span.r0VOr')
date = date_element.text.strip() if date_element else "μ•Œ 수 μ—†μŒ"
# 미리보기 λ‚΄μš© μΆ”μΆœ
desc_element = item.select_one('a.X0fMYp2dHd0TCUS2hjww.IaKmSOGPdofdPwPE6cyU > span')
description = desc_element.text.strip() if desc_element else "λ‚΄μš© μ—†μŒ"
results.append({
'title': title,
'link': link,
'description': description,
'source': source,
'date': date,
'content': "" # λ‚˜μ€‘μ— 원문 λ‚΄μš©μ„ μ €μž₯ν•  ν•„λ“œ
})
except Exception as e:
st.error(f"기사 정보 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
continue
except Exception as e:
st.error(f"νŽ˜μ΄μ§€ μš”μ²­ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
return results
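# Hedged sketch (defined but not wired in): search.naver.com may throttle requests that
# carry no User-Agent, and the auto-generated CSS class names used above change without
# notice. A slightly more defensive fetch helper could look like this; the helper name,
# header values and timeout are illustrative assumptions, not part of the original app.
def fetch_search_page(url, timeout=10):
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; news-tool/0.1)",
        "Accept-Language": "ko,en;q=0.8",
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
    return BeautifulSoup(response.text, "html.parser")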
# Fetch the full text of an article
def get_article_content(url):
try:
response = requests.get(url, timeout=5)
soup = BeautifulSoup(response.text, 'html.parser')
# 넀이버 λ‰΄μŠ€ λ³Έλ¬Έ μ°ΎκΈ°
content = soup.select_one('#dic_area')
if content:
text = content.text.strip()
text = re.sub(r'\s+', ' ', text) # μ—¬λŸ¬ 곡백 제거
return text
# λ‹€λ₯Έ λ‰΄μŠ€ μ‚¬μ΄νŠΈ λ³Έλ¬Έ μ°ΎκΈ° (μ—¬λŸ¬ μ‚¬μ΄νŠΈ λŒ€μ‘ ν•„μš”)
content = soup.select_one('.article_body, .article-body, .article-content, .news-content-inner')
if content:
text = content.text.strip()
text = re.sub(r'\s+', ' ', text)
return text
return "본문을 κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€."
except Exception as e:
return f"였λ₯˜ λ°œμƒ: {str(e)}"
# NLTKλ₯Ό μ΄μš©ν•œ ν‚€μ›Œλ“œ 뢄석 (ν•œκ΅­μ–΄ λŒ€μ‘ μΆ”κ°€)
def analyze_keywords(text, top_n=10):
# ν•œκ΅­μ–΄ λΆˆμš©μ–΄ λͺ©λ‘
korean_stopwords = [
'이', 'κ·Έ', 'μ €', '것', '및', 'λ“±', 'λ₯Ό', '을', '에', 'μ—μ„œ', '의', '으둜', '둜',
'μ—κ²Œ', '뿐', 'λ‹€', 'λŠ”', 'κ°€', '이닀', 'μ—κ²Œμ„œ', '께', 'κ»˜μ„œ', 'λΆ€ν„°', 'κΉŒμ§€'
]
# μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
is_korean = bool(re.search(r'[κ°€-힣]', text))
if is_korean:
# ν•œκ΅­μ–΄ ν…μŠ€νŠΈμΈ 경우 ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
tokens = tokenize_korean(text)
else:
# ν•œκΈ€μ΄ μ—†λŠ” 경우 NLTK ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
try:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
except Exception:
# NLTKκ°€ μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ ν† ν¬λ‚˜μ΄μ €λ‘œ λŒ€μ²΄
tokens = re.findall(r'\b\w+\b', text.lower())
# λΆˆμš©μ–΄ 필터링
tokens = [word for word in tokens if len(word) > 1 and word.lower() not in korean_stopwords]
# λΉˆλ„ 계산
from collections import Counter
word_count = Counter(tokens)
top_keywords = word_count.most_common(top_n)
return top_keywords
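# analyze_keywords() returns Counter.most_common() pairs, e.g. (illustrative values):
#   [('인곡지λŠ₯', 12), ('κ·œμ œ', 7), ('산업', 5), ...]
# which is the shape the keyword bar chart in the analysis menu consumes.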
# μ›Œλ“œ ν΄λΌμš°λ“œμš© 뢄석
def extract_keywords_for_wordcloud(text, top_n=50):
if not text or len(text.strip()) < 10:
return {}
try:
# μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
is_korean = bool(re.search(r'[κ°€-힣]', text))
if is_korean:
# ν•œκ΅­μ–΄ ν…μŠ€νŠΈμΈ 경우 ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
tokens = tokenize_korean(text.lower())
else:
# μ˜μ–΄ λ˜λŠ” 기타 μ–Έμ–΄λŠ” NLTK μ‚¬μš© μ‹œλ„
try:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text.lower())
except Exception:
# μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ ν† ν¬λ‚˜μ΄μ§•
tokens = text.lower().split()
# λΆˆμš©μ–΄ μ„€μ •
stop_words = set()
# μ˜μ–΄ λΆˆμš©μ–΄ (NLTK 있으면 μ‚¬μš©)
try:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
except Exception:
# κΈ°λ³Έ μ˜μ–΄ λΆˆμš©μ–΄
stop_words = {
'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
'will', 'shall', 'can', 'may', 'must', 'ought'
}
# ν•œκ΅­μ–΄ λΆˆμš©μ–΄
korea_stop_words = {
'및', 'λ“±', 'λ₯Ό', '이', '의', 'κ°€', '에', 'λŠ”', '으둜', 'μ—μ„œ', 'κ·Έ', '또', 'λ˜λŠ”', 'ν•˜λŠ”', 'ν• ', 'ν•˜κ³ ',
'μžˆλ‹€', '이닀', 'μœ„ν•΄', '것이닀', '것은', 'λŒ€ν•œ', 'λ•Œλ¬Έ', '그리고', 'ν•˜μ§€λ§Œ', 'κ·ΈλŸ¬λ‚˜', 'κ·Έλž˜μ„œ',
'μž…λ‹ˆλ‹€', 'ν•©λ‹ˆλ‹€', 'μŠ΅λ‹ˆλ‹€', 'μš”', 'μ£ ', 'κ³ ', 'κ³Ό', '와', '도', '은', '수', '것', 'λ“€', '제', 'μ €',
'λ…„', 'μ›”', '일', 'μ‹œ', 'λΆ„', '초', 'μ§€λ‚œ', 'μ˜¬ν•΄', 'λ‚΄λ…„', '졜근', 'ν˜„μž¬', '였늘', '내일', 'μ–΄μ œ',
'μ˜€μ „', 'μ˜€ν›„', 'λΆ€ν„°', 'κΉŒμ§€', 'μ—κ²Œ', 'κ»˜μ„œ', '이라고', '라고', 'ν•˜λ©°', 'ν•˜λ©΄μ„œ', '따라', '톡해',
'κ΄€λ ¨', 'ν•œνŽΈ', '특히', 'κ°€μž₯', '맀우', '더', '덜', '많이', '쑰금', '항상', '자주', '가끔', '거의',
'μ „ν˜€', 'λ°”λ‘œ', '정말', 'λ§Œμ•½', 'λΉ„λ‘―ν•œ', '등을', '등이', 'λ“±μ˜', 'λ“±κ³Ό', '등도', '등에', 'λ“±μ—μ„œ',
'기자', 'λ‰΄μŠ€', '사진', 'μ—°ν•©λ‰΄μŠ€', 'λ‰΄μ‹œμŠ€', '제곡', '무단', 'μ „μž¬', '재배포', 'κΈˆμ§€', '액컀', '멘트',
'일보', '데일리', '경제', 'μ‚¬νšŒ', 'μ •μΉ˜', '세계', 'κ³Όν•™', '아이티', 'λ‹·μ»΄', '씨넷', 'λΈ”λ‘œν„°', 'μ „μžμ‹ λ¬Έ'
}
stop_words.update(korea_stop_words)
# 1κΈ€μž 이상이고 λΆˆμš©μ–΄κ°€ μ•„λ‹Œ ν† ν°λ§Œ 필터링
filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
# 단어 λΉˆλ„ 계산
word_freq = {}
for word in filtered_tokens:
if word.isalnum(): # μ•ŒνŒŒλ²³κ³Ό 숫자만 ν¬ν•¨λœ λ‹¨μ–΄λ§Œ ν—ˆμš©
word_freq[word] = word_freq.get(word, 0) + 1
# λΉˆλ„μˆœμœΌλ‘œ μ •λ ¬ν•˜μ—¬ μƒμœ„ n개 λ°˜ν™˜
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
if not sorted_words:
return {"data": 1, "analysis": 1, "news": 1}
return dict(sorted_words[:top_n])
except Exception as e:
st.error(f"ν‚€μ›Œλ“œ μΆ”μΆœ 쀑 였λ₯˜λ°œμƒ {str(e)}")
return {"data": 1, "analysis": 1, "news": 1}
# μ›Œλ“œ ν΄λΌμš°λ“œ 생성 ν•¨μˆ˜
def generate_wordcloud(keywords_dict):
if not wordcloud_available:
st.warning("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μœ„ν•œ λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
return None
try:
# λ‚˜λˆ”κ³ λ”• 폰트 확인 (μ—†μœΌλ©΄ κΈ°λ³Έ 폰트 μ‚¬μš©)
font_path = os.path.join(CURRENT_DIR, "NanumGothic.ttf")
if not os.path.exists(font_path):
# κΈ°λ³Έ 폰트 μ‚¬μš©
wc = WordCloud(
width=800,
height=400,
background_color='white',
colormap='viridis',
max_font_size=150,
random_state=42
).generate_from_frequencies(keywords_dict)
else:
# λ‚˜λˆ”κ³ λ”• 폰트 μ‚¬μš©
wc = WordCloud(
font_path=font_path,
width=800,
height=400,
background_color='white',
colormap='viridis',
max_font_size=150,
random_state=42
).generate_from_frequencies(keywords_dict)
return wc
except Exception as e:
st.error(f"μ›Œλ“œν΄λΌμš°λ“œ 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
return None
# News analysis helper
def analyze_news_content(news_df):
if news_df.empty:
return "데이터가 μ—†μŠ΅λ‹ˆλ‹€"
results = {}
# μΉ΄ν…Œκ³ λ¦¬λ³„ 뢄석
if 'source' in news_df.columns:
results['source_counts'] = news_df['source'].value_counts().to_dict()
if 'date' in news_df.columns:
results['date_counts'] = news_df['date'].value_counts().to_dict()
# ν‚€μ›Œλ“œ 뢄석
all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))
if len(all_text.strip()) > 0:
results['top_keywords_for_wordcloud'] = extract_keywords_for_wordcloud(all_text, top_n=50)
results['top_keywords'] = analyze_keywords(all_text)
else:
results['top_keywords_for_wordcloud'] = {}
results['top_keywords'] = []
return results
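# analyze_news_content() returns a dict shaped roughly as follows (keys taken from the
# code above; values depend on the DataFrame passed in):
#   {'source_counts': {source: count, ...}, 'date_counts': {date: count, ...},
#    'top_keywords': [(word, count), ...],
#    'top_keywords_for_wordcloud': {word: freq, ...}}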
# OpenAI APIλ₯Ό μ΄μš©ν•œ μƒˆ 기사 생성
def generate_article(original_content, prompt_text):
if not st.session_state.openai_api_key:
return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
try:
# API ν‚€ μ„€μ •
openai.api_key = st.session_state.openai_api_key
# API 호좜
response = openai.chat.completions.create(
model="gpt-4.1-mini", # λ˜λŠ” λ‹€λ₯Έ μ‚¬μš© κ°€λŠ₯ν•œ λͺ¨λΈ
messages=[
{"role": "system", "content": "당신은 전문적인 λ‰΄μŠ€ κΈ°μžμž…λ‹ˆλ‹€. μ£Όμ–΄μ§„ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ μƒˆλ‘œμš΄ 기사λ₯Ό μž‘μ„±ν•΄μ£Όμ„Έμš”."},
{"role": "user", "content": f"λ‹€μŒ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ {prompt_text}\n\n{original_content[:1000]}"}
],
max_tokens=2000
)
return response.choices[0].message.content
except Exception as e:
return f"기사 생성 였λ₯˜: {str(e)}"
# OpenAI APIλ₯Ό μ΄μš©ν•œ 이미지 생성
def generate_image(prompt):
if not st.session_state.openai_api_key:
return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
try:
# API ν‚€ μ„€μ •
openai.api_key = st.session_state.openai_api_key
# API 호좜
response = openai.images.generate(
model="gpt-image-1",
prompt=prompt
)
image_base64 = response.data[0].b64_json
return f"data:image/png;base64,{image_base64}"
except Exception as e:
return f"이미지 생성 였λ₯˜: {str(e)}"
# Scheduler-related functions
def get_next_run_time(hour, minute):
now = datetime.now()
next_run = now.replace(hour=hour, minute=minute, second=0, microsecond=0)
if next_run <= now:
next_run += timedelta(days=1)
return next_run
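# Worked example for get_next_run_time() (illustrative): if now is 2024-01-01 14:30 and
# the requested slot is hour=9, minute=0, that time has already passed today, so the
# function returns 2024-01-02 09:00; a later slot such as 18:00 stays on the same day.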
def run_scheduled_task():
try:
while global_scheduler_state.is_running:
schedule.run_pending()
time.sleep(1)
except Exception as e:
print(f"μŠ€μΌ€μ€„λŸ¬ μ—λŸ¬ λ°œμƒ: {e}")
traceback.print_exc()
def perform_news_task(task_type, keyword, num_articles, file_prefix):
try:
articles = crawl_naver_news(keyword, num_articles)
# 기사 λ‚΄μš© κ°€μ Έμ˜€κΈ°
for article in articles:
article['content'] = get_article_content(article['link'])
time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
# κ²°κ³Ό μ €μž₯
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")
try:
with open(filename, 'w', encoding='utf-8') as f:
json.dump(articles, f, ensure_ascii=False, indent=2)
except Exception as e:
print(f"파일 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
return
global_scheduler_state.last_run = datetime.now()
print(f"{datetime.now()} - {task_type} λ‰΄μŠ€ 기사 μˆ˜μ§‘ μ™„λ£Œ: {keyword}")
# μ „μ—­ μƒνƒœμ— μˆ˜μ§‘ κ²°κ³Όλ₯Ό μ €μž₯ (UI μ—…λ°μ΄νŠΈμš©)
result_item = {
'task_type': task_type,
'keyword': keyword,
'timestamp': timestamp,
'num_articles': len(articles),
'filename': filename
}
global_scheduler_state.scheduled_results.append(result_item)
except Exception as e:
print(f"μž‘μ—… μ‹€ν–‰ 쀑 였λ₯˜ λ°œμƒ: {e}")
traceback.print_exc()
def start_scheduler(daily_tasks, interval_tasks):
if not global_scheduler_state.is_running:
schedule.clear()
global_scheduler_state.scheduled_jobs = []
# 일별 νƒœμŠ€ν¬ 등둝
for task in daily_tasks:
hour = task['hour']
minute = task['minute']
keyword = task['keyword']
num_articles = task['num_articles']
job_id = f"daily_{keyword}_{hour}_{minute}"
schedule.every().day.at(f"{hour:02d}:{minute:02d}").do(
perform_news_task, "daily", keyword, num_articles, job_id
).tag(job_id)
global_scheduler_state.scheduled_jobs.append({
'id': job_id,
'type': 'daily',
'time': f"{hour:02d}:{minute:02d}",
'keyword': keyword,
'num_articles': num_articles
})
# μ‹œκ°„ 간격 νƒœμŠ€ν¬ 등둝
for task in interval_tasks:
interval_minutes = task['interval_minutes']
keyword = task['keyword']
num_articles = task['num_articles']
run_immediately = task['run_immediately']
job_id = f"interval_{keyword}_{interval_minutes}"
if run_immediately:
# μ¦‰μ‹œ μ‹€ν–‰
perform_news_task("interval", keyword, num_articles, job_id)
# λΆ„ κ°„κ²©μœΌλ‘œ μ˜ˆμ•½
schedule.every(interval_minutes).minutes.do(
perform_news_task, "interval", keyword, num_articles, job_id
).tag(job_id)
global_scheduler_state.scheduled_jobs.append({
'id': job_id,
'type': 'interval',
'interval': f"{interval_minutes}λΆ„λ§ˆλ‹€",
'keyword': keyword,
'num_articles': num_articles,
'run_immediately': run_immediately
})
# λ‹€μŒ μ‹€ν–‰ μ‹œκ°„ 계산
next_run = schedule.next_run()
if next_run:
global_scheduler_state.next_run = next_run
# μŠ€μΌ€μ€„λŸ¬ μ“°λ ˆλ“œ μ‹œμž‘
global_scheduler_state.is_running = True
global_scheduler_state.thread = threading.Thread(
target=run_scheduled_task, daemon=True
)
global_scheduler_state.thread.start()
# μƒνƒœλ₯Ό μ„Έμ…˜ μƒνƒœλ‘œλ„ 볡사 (UI ν‘œμ‹œμš©)
if 'scheduler_status' not in st.session_state:
st.session_state.scheduler_status = {}
st.session_state.scheduler_status = {
'is_running': global_scheduler_state.is_running,
'last_run': global_scheduler_state.last_run,
'next_run': global_scheduler_state.next_run,
'jobs_count': len(global_scheduler_state.scheduled_jobs)
}
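# Minimal standalone sketch of what start_scheduler() and run_scheduled_task() do with
# the schedule library (illustrative placeholders, not app code):
#   schedule.every().day.at("09:00").do(some_job)
#   schedule.every(30).minutes.do(some_job)
#   while keep_running:
#       schedule.run_pending()
#       time.sleep(1)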
def stop_scheduler():
if global_scheduler_state.is_running:
global_scheduler_state.is_running = False
schedule.clear()
if global_scheduler_state.thread:
global_scheduler_state.thread.join(timeout=1)
global_scheduler_state.next_run = None
global_scheduler_state.scheduled_jobs = []
# UI μƒνƒœ μ—…λ°μ΄νŠΈ
if 'scheduler_status' in st.session_state:
st.session_state.scheduler_status['is_running'] = False
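# Illustrative start/stop pairing (task dicts are hypothetical but use the same keys the
# scheduling UI below collects):
#   start_scheduler(
#       daily_tasks=[{'hour': 9, 'minute': 0, 'keyword': '인곡지λŠ₯', 'num_articles': 5}],
#       interval_tasks=[{'interval_minutes': 30, 'keyword': '빅데이터',
#                        'num_articles': 5, 'run_immediately': True}],
#   )
#   ...
#   stop_scheduler()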
# Render the screen for the selected menu
if menu == "λ‰΄μŠ€ 기사 크둀링":
st.header("λ‰΄μŠ€ 기사 크둀링")
keyword = st.text_input("검색어 μž…λ ₯", "인곡지λŠ₯")
num_articles = st.slider("κ°€μ Έμ˜¬ 기사 수", min_value=1, max_value=20, value=5)
if st.button("기사 κ°€μ Έμ˜€κΈ°"):
with st.spinner("기사λ₯Ό μˆ˜μ§‘ μ€‘μž…λ‹ˆλ‹€..."):
articles = crawl_naver_news(keyword, num_articles)
# 기사 λ‚΄μš© κ°€μ Έμ˜€κΈ°
progress_bar = st.progress(0)
for i, article in enumerate(articles):
progress_bar.progress((i + 1) / len(articles))
article['content'] = get_article_content(article['link'])
time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
# κ²°κ³Ό μ €μž₯ 및 ν‘œμ‹œ
save_articles(articles)
st.success(f"{len(articles)}개의 기사λ₯Ό μˆ˜μ§‘ν–ˆμŠ΅λ‹ˆλ‹€!")
# μˆ˜μ§‘ν•œ 기사 ν‘œμ‹œ
for article in articles:
with st.expander(f"{article['title']} - {article['source']}"):
st.write(f"**좜처:** {article['source']}")
st.write(f"**λ‚ μ§œ:** {article['date']}")
st.write(f"**μš”μ•½:** {article['description']}")
st.write(f"**링크:** {article['link']}")
st.write("**본문 미리보기:**")
st.write(article['content'][:300] + "..." if len(article['content']) > 300 else article['content'])
elif menu == "기사 λΆ„μ„ν•˜κΈ°":
st.header("기사 λΆ„μ„ν•˜κΈ°")
articles = load_saved_articles()
if not articles:
st.warning("μ €μž₯된 기사가 μ—†μŠ΅λ‹ˆλ‹€. λ¨Όμ € 'λ‰΄μŠ€ 기사 크둀링' λ©”λ‰΄μ—μ„œ 기사λ₯Ό μˆ˜μ§‘ν•΄μ£Όμ„Έμš”.")
else:
# 기사 선택
titles = [article['title'] for article in articles]
selected_title = st.selectbox("뢄석할 기사 선택", titles)
selected_article = next((a for a in articles if a['title'] == selected_title), None)
if selected_article:
st.write(f"**제λͺ©:** {selected_article['title']}")
st.write(f"**좜처:** {selected_article['source']}")
# λ³Έλ¬Έ ν‘œμ‹œ
with st.expander("기사 λ³Έλ¬Έ 보기"):
st.write(selected_article['content'])
# 뢄석 방법 선택
analysis_type = st.radio(
"뢄석 방법",
["ν‚€μ›Œλ“œ 뢄석", "감정 뢄석", "ν…μŠ€νŠΈ 톡계"]
)
if analysis_type == "ν‚€μ›Œλ“œ 뢄석":
if st.button("ν‚€μ›Œλ“œ λΆ„μ„ν•˜κΈ°"):
with st.spinner("ν‚€μ›Œλ“œλ₯Ό 뢄석 μ€‘μž…λ‹ˆλ‹€..."):
keyword_tab1, keyword_tab2 = st.tabs(["ν‚€μ›Œλ“œ λΉˆλ„", "μ›Œλ“œν΄λΌμš°λ“œ"])
with keyword_tab1:
keywords = analyze_keywords(selected_article['content'])
# μ‹œκ°ν™”
df = pd.DataFrame(keywords, columns=['단어', 'λΉˆλ„μˆ˜'])
st.bar_chart(df.set_index('단어'))
st.write("**μ£Όμš” ν‚€μ›Œλ“œ:**")
for word, count in keywords:
st.write(f"- {word}: {count}회")
with keyword_tab2:
keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])
if wordcloud_available:
wc = generate_wordcloud(keyword_dict)
if wc:
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')
st.pyplot(fig)
# ν‚€μ›Œλ“œ μƒμœ„ 20개 ν‘œμ‹œ
st.write("**μƒμœ„ 20개 ν‚€μ›Œλ“œ:**")
top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
st.dataframe(keyword_df)
else:
st.error("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
else:
# μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μ‚¬μš©ν•  수 μ—†λŠ” 경우 λŒ€μ²΄ ν‘œμ‹œ
st.warning("μ›Œλ“œν΄λΌμš°λ“œ κΈ°λŠ₯을 μ‚¬μš©ν•  수 μ—†μŠ΅λ‹ˆλ‹€. ν•„μš”ν•œ νŒ¨ν‚€μ§€κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
# λŒ€μ‹  ν‚€μ›Œλ“œλ§Œ ν‘œμ‹œ
st.write("**μƒμœ„ ν‚€μ›Œλ“œ:**")
top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:30]
keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
st.dataframe(keyword_df)
# λ§‰λŒ€ 차트둜 ν‘œμ‹œ
st.bar_chart(keyword_df.set_index('ν‚€μ›Œλ“œ').head(15))
elif analysis_type == "ν…μŠ€νŠΈ 톡계":
if st.button("ν…μŠ€νŠΈ 톡계 뢄석"):
content = selected_article['content']
# ν…μŠ€νŠΈ 톡계 계산
word_count = len(re.findall(r'\b\w+\b', content))
char_count = len(content)
sentence_count = len(re.split(r'[.!?]+', content))
avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
# 톡계 ν‘œμ‹œ
st.subheader("ν…μŠ€νŠΈ 톡계")
col1, col2, col3 = st.columns(3)
with col1:
st.metric("단어 수", f"{word_count:,}")
with col2:
st.metric("문자 수", f"{char_count:,}")
with col3:
st.metric("λ¬Έμž₯ 수", f"{sentence_count:,}")
col1, col2 = st.columns(2)
with col1:
st.metric("평균 단어 길이", f"{avg_word_length:.1f}자")
with col2:
st.metric("평균 λ¬Έμž₯ 길이", f"{avg_sentence_length:.1f}단어")
# ν…μŠ€νŠΈ λ³΅μž‘μ„± 점수 (κ°„λ‹¨ν•œ μ˜ˆμ‹œ)
complexity_score = min(10, (avg_sentence_length / 10) * 5 + (avg_word_length / 5) * 5)
st.progress(complexity_score / 10)
st.write(f"ν…μŠ€νŠΈ λ³΅μž‘μ„± 점수: {complexity_score:.1f}/10")
# μΆœν˜„ λΉˆλ„ λ§‰λŒ€ κ·Έλž˜ν”„
st.subheader("ν’ˆμ‚¬λ³„ 뢄포")
# μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
is_korean = bool(re.search(r'[κ°€-힣]', content))
try:
# μ˜μ–΄/ν•œκ΅­μ–΄ 토큰화 및 ν’ˆμ‚¬ 뢄석
if is_korean:
# ν•œκ΅­μ–΄μΈ 경우 (κ°„λ‹¨ν•œ ν˜•νƒœμ†Œ μœ μ‚¬ 뢄석)
try:
# transformers ν† ν¬λ‚˜μ΄μ € μ‹œλ„
try:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
tokens = tokenizer.tokenize(content[:5000]) # λ„ˆλ¬΄ κΈ΄ ν…μŠ€νŠΈλŠ” μž˜λΌμ„œ 뢄석
# κ°„λ‹¨ν•œ νŒ¨ν„΄ 맀칭으둜 ν’ˆμ‚¬ μΆ”μ •
pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '기타': 0}
for token in tokens:
if token.endswith("λ‹€") or token.endswith("μš”"):
pos_counts['동사'] += 1
elif token.endswith("게") or token.endswith("히"):
pos_counts['뢀사'] += 1
elif token.endswith("은") or token.endswith("λŠ”") or token.endswith("이") or token.endswith("κ°€"):
pos_counts['λͺ…사'] += 1
else:
if len(token) > 1:
pos_counts['λͺ…사'] += 1
else:
pos_counts['기타'] += 1
except Exception:
# μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ ν† ν°ν™”λ‘œ λŒ€μ²΄
tokens = tokenize_korean(content[:5000])
pos_counts = {
'λͺ…사λ₯˜': len([t for t in tokens if len(t) > 1 and not any(t.endswith(s) for s in ["λ‹€", "μš”", "게", "히", "은", "λŠ”"])]),
'기타': len([t for t in tokens if len(t) <= 1 or any(t.endswith(s) for s in ["λ‹€", "μš”", "게", "히", "은", "λŠ”"])])
}
except Exception as e:
st.error(f"ν•œκ΅­μ–΄ ν’ˆμ‚¬ 뢄석 μ‹€νŒ¨: {str(e)}")
pos_counts = {'데이터': len(content) // 10, '뢄석': len(content) // 15, '였λ₯˜': len(content) // 20}
else:
# μ˜μ–΄ λ¬Έμ„œμΈ 경우 (NLTK μ‹œλ„)
try:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
# ν•„μš”ν•œ 데이터 λ‹€μš΄λ‘œλ“œ
try:
nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
except Exception:
pass
tokens = word_tokenize(content[:5000])
tagged = pos_tag(tokens)
# μ˜μ–΄ ν’ˆμ‚¬ λ§€ν•‘
pos_dict = {
'NN': 'λͺ…사', 'NNS': 'λͺ…사', 'NNP': '고유λͺ…사', 'NNPS': '고유λͺ…사',
'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
'JJ': 'ν˜•μš©μ‚¬', 'JJR': 'ν˜•μš©μ‚¬', 'JJS': 'ν˜•μš©μ‚¬',
'RB': '뢀사', 'RBR': '뢀사', 'RBS': '뢀사'
}
pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '기타': 0}
for _, pos in tagged:
if pos in pos_dict:
pos_counts[pos_dict[pos]] += 1
else:
pos_counts['기타'] += 1
except Exception:
# μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ κ·œμΉ™μœΌλ‘œ ν’ˆμ‚¬ μœ μΆ”
tokens = re.findall(r'\b\w+\b', content.lower())
pos_counts = {
'λͺ…사': len([t for t in tokens if not t.endswith(('ly', 'ing', 'ed'))]),
'동사': len([t for t in tokens if t.endswith(('ing', 'ed', 's'))]),
'뢀사': len([t for t in tokens if t.endswith('ly')]),
'기타': len([t for t in tokens if len(t) <= 2])
}
# κ²°κ³Ό μ‹œκ°ν™”
pos_df = pd.DataFrame({
'ν’ˆμ‚¬': list(pos_counts.keys()),
'λΉˆλ„': list(pos_counts.values())
})
st.bar_chart(pos_df.set_index('ν’ˆμ‚¬'))
if is_korean:
st.info("ν•œκ΅­μ–΄ ν…μŠ€νŠΈκ°€ κ°μ§€λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
else:
st.info("μ˜μ–΄ ν…μŠ€νŠΈκ°€ κ°μ§€λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
except Exception as e:
st.error(f"ν’ˆμ‚¬ 뢄석 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
st.error(traceback.format_exc())
elif analysis_type == "감정 뢄석":
if st.button("감정 λΆ„μ„ν•˜κΈ°"):
if st.session_state.openai_api_key:
with st.spinner("κΈ°μ‚¬μ˜ 감정을 뢄석 μ€‘μž…λ‹ˆλ‹€..."):
try:
# API ν‚€ μ„€μ •
openai.api_key = st.session_state.openai_api_key
# API 호좜
response = openai.chat.completions.create(
model="gpt-4.1-mini",
messages=[
{"role": "system", "content": "당신은 ν…μŠ€νŠΈμ˜ 감정과 λ…Όμ‘°λ₯Ό λΆ„μ„ν•˜λŠ” μ „λ¬Έκ°€μž…λ‹ˆλ‹€. λ‹€μŒ λ‰΄μŠ€ κΈ°μ‚¬μ˜ 감정과 λ…Όμ‘°λ₯Ό λΆ„μ„ν•˜κ³ , '긍정적', '뢀정적', '쀑립적' 쀑 ν•˜λ‚˜λ‘œ λΆ„λ₯˜ν•΄ μ£Όμ„Έμš”. λ˜ν•œ κΈ°μ‚¬μ—μ„œ λ“œλŸ¬λ‚˜λŠ” 핡심 감정 ν‚€μ›Œλ“œλ₯Ό 5개 μΆ”μΆœν•˜κ³ , 각 ν‚€μ›Œλ“œλ³„λ‘œ 1-10 μ‚¬μ΄μ˜ 강도 점수λ₯Ό λ§€κ²¨μ£Όμ„Έμš”. JSON ν˜•μ‹μœΌλ‘œ λ‹€μŒκ³Ό 같이 μ‘λ‹΅ν•΄μ£Όμ„Έμš”: {'sentiment': '긍정적/뢀정적/쀑립적', 'reason': '이유 μ„€λͺ…...', 'keywords': [{'word': 'ν‚€μ›Œλ“œ1', 'score': 8}, {'word': 'ν‚€μ›Œλ“œ2', 'score': 7}, ...]}"},
{"role": "user", "content": f"λ‹€μŒ λ‰΄μŠ€ 기사λ₯Ό 뢄석해 μ£Όμ„Έμš”:\n\n제λͺ©: {selected_article['title']}\n\nλ‚΄μš©: {selected_article['content'][:1500]}"}
],
max_tokens=800,
response_format={"type": "json_object"}
)
# JSON νŒŒμ‹±
analysis_result = json.loads(response.choices[0].message.content)
# κ²°κ³Ό μ‹œκ°ν™”
st.subheader("감정 뢄석 κ²°κ³Ό")
# 1. 감정 νƒ€μž…μ— λ”°λ₯Έ μ‹œκ°μ  ν‘œν˜„
sentiment_type = analysis_result.get('sentiment', '쀑립적')
col1, col2, col3 = st.columns([1, 3, 1])
with col2:
if sentiment_type == "긍정적":
st.markdown(f"""
<div style="background-color:#DCEDC8; padding:20px; border-radius:10px; text-align:center;">
<h1 style="color:#388E3C; font-size:28px;">πŸ˜€ 긍정적 λ…Όμ‘° πŸ˜€</h1>
<p style="font-size:16px;">감정 강도: λ†’μŒ</p>
</div>
""", unsafe_allow_html=True)
elif sentiment_type == "뢀정적":
st.markdown(f"""
<div style="background-color:#FFCDD2; padding:20px; border-radius:10px; text-align:center;">
<h1 style="color:#D32F2F; font-size:28px;">😞 뢀정적 λ…Όμ‘° 😞</h1>
<p style="font-size:16px;">감정 강도: λ†’μŒ</p>
</div>
""", unsafe_allow_html=True)
else:
st.markdown(f"""
<div style="background-color:#E0E0E0; padding:20px; border-radius:10px; text-align:center;">
<h1 style="color:#616161; font-size:28px;">😐 쀑립적 λ…Όμ‘° 😐</h1>
<p style="font-size:16px;">감정 강도: 쀑간</p>
</div>
""", unsafe_allow_html=True)
# 2. 이유 μ„€λͺ…
st.markdown("### 뢄석 κ·Όκ±°")
st.markdown(f"<div style='background-color:#F5F5F5; padding:15px; border-radius:5px;'>{analysis_result.get('reason', '')}</div>", unsafe_allow_html=True)
# 3. 감정 ν‚€μ›Œλ“œ μ‹œκ°ν™”
st.markdown("### 핡심 감정 ν‚€μ›Œλ“œ")
# ν‚€μ›Œλ“œ 데이터 μ€€λΉ„
keywords = analysis_result.get('keywords', [])
if keywords:
# λ§‰λŒ€ 차트용 데이터
keyword_names = [item.get('word', '') for item in keywords]
keyword_scores = [item.get('score', 0) for item in keywords]
# λ ˆμ΄λ” 차트 생성
fig = go.Figure()
# 색상 μ„€μ •
if sentiment_type == "긍정적":
fill_color = 'rgba(76, 175, 80, 0.3)' # μ—°ν•œ μ΄ˆλ‘μƒ‰
line_color = 'rgba(76, 175, 80, 1)' # μ§„ν•œ μ΄ˆλ‘μƒ‰
elif sentiment_type == "뢀정적":
fill_color = 'rgba(244, 67, 54, 0.3)' # μ—°ν•œ 빨간색
line_color = 'rgba(244, 67, 54, 1)' # μ§„ν•œ 빨간색
else:
fill_color = 'rgba(158, 158, 158, 0.3)' # μ—°ν•œ νšŒμƒ‰
line_color = 'rgba(158, 158, 158, 1)' # μ§„ν•œ νšŒμƒ‰
# λ ˆμ΄λ” 차트 데이터 μ€€λΉ„
radar_keywords = keyword_names.copy()
radar_scores = keyword_scores.copy()
# λ ˆμ΄λ” 차트 생성
fig.add_trace(go.Scatterpolar(
r=radar_scores,
theta=radar_keywords,
fill='toself',
fillcolor=fill_color,
line=dict(color=line_color, width=2),
name='감정 ν‚€μ›Œλ“œ'
))
# λ ˆμ΄λ” 차트 λ ˆμ΄μ•„μ›ƒ μ„€μ •
fig.update_layout(
polar=dict(
radialaxis=dict(
visible=True,
range=[0, 10],
tickmode='linear',
tick0=0,
dtick=2
)
),
showlegend=False,
title={
'text': '감정 ν‚€μ›Œλ“œ λ ˆμ΄λ” 뢄석',
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'
},
height=500,
width=500,
margin=dict(l=80, r=80, t=80, b=80)
)
# 차트 쀑앙에 ν‘œμ‹œ
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
st.plotly_chart(fig)
# ν‚€μ›Œλ“œ μΉ΄λ“œλ‘œ ν‘œμ‹œ
st.markdown("#### ν‚€μ›Œλ“œ μ„ΈλΆ€ μ„€λͺ…")
cols = st.columns(min(len(keywords), 5))
for i, keyword in enumerate(keywords):
with cols[i % len(cols)]:
word = keyword.get('word', '')
score = keyword.get('score', 0)
# μ μˆ˜μ— λ”°λ₯Έ 색상 계산
r, g, b = 0, 0, 0
if sentiment_type == "긍정적":
g = min(200 + score * 5, 255)
r = max(255 - score * 20, 100)
elif sentiment_type == "뢀정적":
r = min(200 + score * 5, 255)
g = max(255 - score * 20, 100)
else:
r = g = b = 128
# μΉ΄λ“œ 생성
st.markdown(f"""
<div style="background-color:rgba({r},{g},{b},0.2); padding:10px; border-radius:5px; text-align:center; margin:5px;">
<h3 style="margin:0;">{word}</h3>
<div style="background-color:#E0E0E0; border-radius:3px; margin-top:5px;">
<div style="width:{score*10}%; background-color:rgba({r},{g},{b},0.8); height:10px; border-radius:3px;"></div>
</div>
<p style="margin:2px; font-size:12px;">강도: {score}/10</p>
</div>
""", unsafe_allow_html=True)
else:
st.info("ν‚€μ›Œλ“œλ₯Ό μΆ”μΆœν•˜μ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€.")
# 4. μš”μ•½ 톡계
st.markdown("### μ£Όμš” 톡계")
col1, col2, col3 = st.columns(3)
with col1:
st.metric(label="긍정/λΆ€μ • 점수", value=f"{7 if sentiment_type == '긍정적' else 3 if sentiment_type == '뢀정적' else 5}/10")
with col2:
st.metric(label="ν‚€μ›Œλ“œ 수", value=len(keywords))
with col3:
avg_score = sum(keyword_scores) / len(keyword_scores) if keyword_scores else 0
st.metric(label="평균 강도", value=f"{avg_score:.1f}/10")
except Exception as e:
st.error(f"감정 뢄석 였λ₯˜: {str(e)}")
st.code(traceback.format_exc())
else:
st.warning("OpenAI API ν‚€κ°€ μ„€μ •λ˜μ–΄ μžˆμ§€ μ•ŠμŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ API ν‚€λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”.")
elif menu == "μƒˆ 기사 μƒμ„±ν•˜κΈ°":
st.header("μƒˆ 기사 μƒμ„±ν•˜κΈ°")
articles = load_saved_articles()
if not articles:
st.warning("μ €μž₯된 기사가 μ—†μŠ΅λ‹ˆλ‹€. λ¨Όμ € 'λ‰΄μŠ€ 기사 크둀링' λ©”λ‰΄μ—μ„œ 기사λ₯Ό μˆ˜μ§‘ν•΄μ£Όμ„Έμš”.")
else:
# 기사 선택
titles = [article['title'] for article in articles]
selected_title = st.selectbox("원본 기사 선택", titles)
selected_article = next((a for a in articles if a['title'] == selected_title), None)
if selected_article:
st.write(f"**원본 제λͺ©:** {selected_article['title']}")
with st.expander("원본 기사 λ‚΄μš©"):
st.write(selected_article['content'])
prompt_text = st.text_area("생성 μ§€μΉ¨",
"""λ‹€μŒ 기사 양식을 λ”°λΌμ„œ λ‹€μ‹œ μž‘μ„±ν•΄μ€˜.
μ—­ν• : 당신은 μ‹ λ¬Έμ‚¬μ˜ κΈ°μžμž…λ‹ˆλ‹€.
μž‘μ—…: 졜근 μΌμ–΄λ‚œ 사건에 λŒ€ν•œ λ³΄λ„μžλ£Œλ₯Ό μž‘μ„±ν•΄μ•Ό ν•©λ‹ˆλ‹€. μžλ£ŒλŠ” 사싀을 기반으둜 ν•˜λ©°, 객관적이고 μ •ν™•ν•΄μ•Ό ν•©λ‹ˆλ‹€.
μ§€μΉ¨:
제곡된 정보λ₯Ό λ°”νƒ•μœΌλ‘œ μ‹ λ¬Έ λ³΄λ„μžλ£Œ ν˜•μ‹μ— 맞좰 기사λ₯Ό μž‘μ„±ν•˜μ„Έμš”.
기사 제λͺ©μ€ 주제λ₯Ό λͺ…ν™•νžˆ λ°˜μ˜ν•˜κ³  λ…μžμ˜ 관심을 끌 수 μžˆλ„λ‘ μž‘μ„±ν•©λ‹ˆλ‹€.
기사 λ‚΄μš©μ€ μ •ν™•ν•˜κ³  κ°„κ²°ν•˜λ©° 섀득λ ₯ μžˆλŠ” λ¬Έμž₯으둜 κ΅¬μ„±ν•©λ‹ˆλ‹€.
κ΄€λ ¨μžμ˜ 인터뷰λ₯Ό 인용 ν˜•νƒœλ‘œ λ„£μ–΄μ£Όμ„Έμš”.
μœ„μ˜ 정보와 지침을 μ°Έκ³ ν•˜μ—¬ μ‹ λ¬Έ λ³΄λ„μžλ£Œ ν˜•μ‹μ˜ 기사λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”""", height=200)
# 이미지 생성 μ—¬λΆ€ 선택 μ˜΅μ…˜ μΆ”κ°€
generate_image_too = st.checkbox("기사 생성 ν›„ 이미지도 ν•¨κ»˜ μƒμ„±ν•˜κΈ°", value=True)
if st.button("μƒˆ 기사 μƒμ„±ν•˜κΈ°"):
if st.session_state.openai_api_key:
with st.spinner("기사λ₯Ό 생성 μ€‘μž…λ‹ˆλ‹€..."):
new_article = generate_article(selected_article['content'], prompt_text)
st.write("**μƒμ„±λœ 기사:**")
st.write(new_article)
# 이미지 μƒμ„±ν•˜κΈ° (μ˜΅μ…˜μ΄ μ„ νƒλœ 경우)
if generate_image_too:
with st.spinner("기사 κ΄€λ ¨ 이미지λ₯Ό 생성 μ€‘μž…λ‹ˆλ‹€..."):
# 이미지 생성 ν”„λ‘¬ν”„νŠΈ μ€€λΉ„
image_prompt = f"""신문기사 제λͺ© "{selected_article['title']}" 을 보고 이미지λ₯Ό λ§Œλ“€μ–΄μ€˜
μ΄λ―Έμ§€μ—λŠ” λ‹€μŒ μš”μ†Œκ°€ ν¬ν•¨λ˜μ–΄μ•Ό ν•©λ‹ˆλ‹€:
- 기사λ₯Ό 이해할 수 μžˆλŠ” 도식
- 기사 λ‚΄μš©κ³Ό κ΄€λ ¨λœ ν…μŠ€νŠΈ
- μ‹¬ν”Œν•˜κ²Œ 처리
"""
# 이미지 생성
image_url = generate_image(image_prompt)
if image_url and not image_url.startswith("이미지 생성 였λ₯˜") and not image_url.startswith("였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."):
st.subheader("μƒμ„±λœ 이미지:")
st.image(image_url)
else:
st.error(image_url)
# μƒμ„±λœ 기사 μ €μž₯ μ˜΅μ…˜
if st.button("μƒμ„±λœ 기사 μ €μž₯"):
new_article_data = {
'title': f"[생성됨] {selected_article['title']}",
'source': f"AI 생성 (원본: {selected_article['source']})",
'date': datetime.now().strftime("%Y-%m-%d %H:%M"),
'description': new_article[:100] + "...",
'link': "",
'content': new_article
}
articles.append(new_article_data)
save_articles(articles)
st.success("μƒμ„±λœ 기사가 μ €μž₯λ˜μ—ˆμŠ΅λ‹ˆλ‹€!")
else:
st.warning("OpenAI API ν‚€λ₯Ό μ‚¬μ΄λ“œλ°”μ—μ„œ μ„€μ •ν•΄μ£Όμ„Έμš”.")
elif menu == "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°":
st.header("λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°")
# νƒ­ 생성
tab1, tab2, tab3 = st.tabs(["일별 μ˜ˆμ•½", "μ‹œκ°„ 간격 μ˜ˆμ•½", "μŠ€μΌ€μ€„λŸ¬ μƒνƒœ"])
# 일별 μ˜ˆμ•½ νƒ­
with tab1:
st.subheader("맀일 μ •ν•΄μ§„ μ‹œκ°„μ— 기사 μˆ˜μ§‘ν•˜κΈ°")
# ν‚€μ›Œλ“œ μž…λ ₯
daily_keyword = st.text_input("검색 ν‚€μ›Œλ“œ", value="인곡지λŠ₯", key="daily_keyword")
daily_num_articles = st.slider("μˆ˜μ§‘ν•  기사 수", min_value=1, max_value=20, value=5, key="daily_num_articles")
# μ‹œκ°„ μ„€μ •
daily_col1, daily_col2 = st.columns(2)
with daily_col1:
daily_hour = st.selectbox("μ‹œ", range(24), format_func=lambda x: f"{x:02d}μ‹œ", key="daily_hour")
with daily_col2:
daily_minute = st.selectbox("λΆ„", range(0, 60, 5), format_func=lambda x: f"{x:02d}λΆ„", key="daily_minute")
# 일별 μ˜ˆμ•½ 리슀트
if 'daily_tasks' not in st.session_state:
st.session_state.daily_tasks = []
if st.button("일별 μ˜ˆμ•½ μΆ”κ°€"):
st.session_state.daily_tasks.append({
'hour': daily_hour,
'minute': daily_minute,
'keyword': daily_keyword,
'num_articles': daily_num_articles
})
st.success(f"일별 μ˜ˆμ•½μ΄ μΆ”κ°€λ˜μ—ˆμŠ΅λ‹ˆλ‹€: 맀일 {daily_hour:02d}:{daily_minute:02d} - '{daily_keyword}'")
# μ˜ˆμ•½ λͺ©λ‘ ν‘œμ‹œ
if st.session_state.daily_tasks:
st.subheader("일별 μ˜ˆμ•½ λͺ©λ‘")
for i, task in enumerate(st.session_state.daily_tasks):
st.write(f"{i+1}. 맀일 {task['hour']:02d}:{task['minute']:02d} - '{task['keyword']}' ({task['num_articles']}개)")
if st.button("일별 μ˜ˆμ•½ μ΄ˆκΈ°ν™”"):
st.session_state.daily_tasks = []
st.warning("일별 μ˜ˆμ•½μ΄ λͺ¨λ‘ μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
# μ‹œκ°„ 간격 μ˜ˆμ•½ νƒ­
with tab2:
st.subheader("μ‹œκ°„ κ°„κ²©μœΌλ‘œ 기사 μˆ˜μ§‘ν•˜κΈ°")
# ν‚€μ›Œλ“œ μž…λ ₯
interval_keyword = st.text_input("검색 ν‚€μ›Œλ“œ", value="빅데이터", key="interval_keyword")
interval_num_articles = st.slider("μˆ˜μ§‘ν•  기사 수", min_value=1, max_value=20, value=5, key="interval_num_articles")
# μ‹œκ°„ 간격 μ„€μ •
interval_minutes = st.number_input("μ‹€ν–‰ 간격(λΆ„)", min_value=1, max_value=60*24, value=30, key="interval_minutes")
# μ¦‰μ‹œ μ‹€ν–‰ μ—¬λΆ€
run_immediately = st.checkbox("μ¦‰μ‹œ μ‹€ν–‰", value=True, help="μ²΄ν¬ν•˜λ©΄ μŠ€μΌ€μ€„λŸ¬ μ‹œμž‘ μ‹œ μ¦‰μ‹œ μ‹€ν–‰ν•©λ‹ˆλ‹€.")
# μ‹œκ°„ 간격 μ˜ˆμ•½ 리슀트
if 'interval_tasks' not in st.session_state:
st.session_state.interval_tasks = []
if st.button("μ‹œκ°„ 간격 μ˜ˆμ•½ μΆ”κ°€"):
st.session_state.interval_tasks.append({
'interval_minutes': interval_minutes,
'keyword': interval_keyword,
'num_articles': interval_num_articles,
'run_immediately': run_immediately
})
st.success(f"μ‹œκ°„ 간격 μ˜ˆμ•½μ΄ μΆ”κ°€λ˜μ—ˆμŠ΅λ‹ˆλ‹€: {interval_minutes}λΆ„λ§ˆλ‹€ - '{interval_keyword}'")
# μ˜ˆμ•½ λͺ©λ‘ ν‘œμ‹œ
if st.session_state.interval_tasks:
st.subheader("μ‹œκ°„ 간격 μ˜ˆμ•½ λͺ©λ‘")
for i, task in enumerate(st.session_state.interval_tasks):
immediate_text = "μ¦‰μ‹œ μ‹€ν–‰ ν›„ " if task['run_immediately'] else ""
st.write(f"{i+1}. {immediate_text}{task['interval_minutes']}λΆ„λ§ˆλ‹€ - '{task['keyword']}' ({task['num_articles']}개)")
if st.button("μ‹œκ°„ 간격 μ˜ˆμ•½ μ΄ˆκΈ°ν™”"):
st.session_state.interval_tasks = []
st.warning("μ‹œκ°„ 간격 μ˜ˆμ•½μ΄ λͺ¨λ‘ μ΄ˆκΈ°ν™”λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
# μŠ€μΌ€μ€„λŸ¬ μƒνƒœ νƒ­
with tab3:
st.subheader("μŠ€μΌ€μ€„λŸ¬ μ œμ–΄ 및 μƒνƒœ")
col1, col2 = st.columns(2)
with col1:
# μŠ€μΌ€μ€„λŸ¬ μ‹œμž‘/쀑지 λ²„νŠΌ
if not global_scheduler_state.is_running:
if st.button("μŠ€μΌ€μ€„λŸ¬ μ‹œμž‘"):
if not st.session_state.daily_tasks and not st.session_state.interval_tasks:
st.error("μ˜ˆμ•½λœ μž‘μ—…μ΄ μ—†μŠ΅λ‹ˆλ‹€. λ¨Όμ € 일별 μ˜ˆμ•½ λ˜λŠ” μ‹œκ°„ 간격 μ˜ˆμ•½μ„ μΆ”κ°€ν•΄μ£Όμ„Έμš”.")
else:
start_scheduler(st.session_state.daily_tasks, st.session_state.interval_tasks)
st.success("μŠ€μΌ€μ€„λŸ¬κ°€ μ‹œμž‘λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
else:
if st.button("μŠ€μΌ€μ€„λŸ¬ 쀑지"):
stop_scheduler()
st.warning("μŠ€μΌ€μ€„λŸ¬κ°€ μ€‘μ§€λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
with col2:
# μŠ€μΌ€μ€„λŸ¬ μƒνƒœ ν‘œμ‹œ
if 'scheduler_status' in st.session_state:
st.write(f"μƒνƒœ: {'싀행쀑' if global_scheduler_state.is_running else '쀑지'}")
if global_scheduler_state.last_run:
st.write(f"λ§ˆμ§€λ§‰ μ‹€ν–‰: {global_scheduler_state.last_run.strftime('%Y-%m-%d %H:%M:%S')}")
if global_scheduler_state.next_run and global_scheduler_state.is_running:
st.write(f"λ‹€μŒ μ‹€ν–‰: {global_scheduler_state.next_run.strftime('%Y-%m-%d %H:%M:%S')}")
else:
st.write("μƒνƒœ: 쀑지")
# μ˜ˆμ•½λœ μž‘μ—… λͺ©λ‘
if global_scheduler_state.scheduled_jobs:
st.subheader("ν˜„μž¬ μ‹€ν–‰ 쀑인 μ˜ˆμ•½ μž‘μ—…")
for i, job in enumerate(global_scheduler_state.scheduled_jobs):
if job['type'] == 'daily':
st.write(f"{i+1}. [일별] 맀일 {job['time']} - '{job['keyword']}' ({job['num_articles']}개)")
else:
immediate_text = "[μ¦‰μ‹œ μ‹€ν–‰ ν›„] " if job.get('run_immediately', False) else ""
st.write(f"{i+1}. [간격] {immediate_text}{job['interval']} - '{job['keyword']}' ({job['num_articles']}개)")
# μŠ€μΌ€μ€„λŸ¬ μ‹€ν–‰ κ²°κ³Ό
if global_scheduler_state.scheduled_results:
st.subheader("μŠ€μΌ€μ€„λŸ¬ μ‹€ν–‰ κ²°κ³Ό")
# κ²°κ³Όλ₯Ό UI에 ν‘œμ‹œν•˜κΈ° 전에 볡사
results_for_display = global_scheduler_state.scheduled_results.copy()
if results_for_display:
result_df = pd.DataFrame(results_for_display)
result_df['μ‹€ν–‰μ‹œκ°„'] = result_df['timestamp'].apply(lambda x: datetime.strptime(x, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M:%S"))
result_df = result_df.rename(columns={
'task_type': 'μž‘μ—…μœ ν˜•',
'keyword': 'ν‚€μ›Œλ“œ',
'num_articles': 'κΈ°μ‚¬μˆ˜',
'filename': '파일λͺ…'
})
result_df['μž‘μ—…μœ ν˜•'] = result_df['μž‘μ—…μœ ν˜•'].apply(lambda x: '일별' if x == 'daily' else 'μ‹œκ°„κ°„κ²©')
st.dataframe(
result_df[['μž‘μ—…μœ ν˜•', 'ν‚€μ›Œλ“œ', 'κΈ°μ‚¬μˆ˜', 'μ‹€ν–‰μ‹œκ°„', '파일λͺ…']],
hide_index=True
)
# μˆ˜μ§‘λœ 파일 보기
if os.path.exists(SCHEDULED_NEWS_DIR):
files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
if files:
st.subheader("μˆ˜μ§‘λœ 파일 μ—΄κΈ°")
selected_file = st.selectbox("파일 선택", files, index=len(files)-1 if files else 0)
if selected_file and st.button("파일 λ‚΄μš© 보기"):
with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
articles = json.load(f)
st.write(f"**파일λͺ…:** {selected_file}")
st.write(f"**μˆ˜μ§‘ 기사 수:** {len(articles)}개")
for article in articles:
with st.expander(f"{article['title']} - {article['source']}"):
st.write(f"**좜처:** {article['source']}")
st.write(f"**λ‚ μ§œ:** {article['date']}")
st.write(f"**링크:** {article['link']}")
st.write("**λ³Έλ¬Έ:**")
st.write(article['content'][:500] + "..." if len(article['content']) > 500 else article['content'])
# Footer
st.markdown("---")
st.markdown("Β© λ‰΄μŠ€ 기사 도ꡬ @conanssam")