Update src/streamlit_app.py
src/streamlit_app.py CHANGED (+322 -236)
@@ -4,33 +4,100 @@ import requests
 from bs4 import BeautifulSoup
 import re
 import time
-import nltk
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from collections import Counter
 import json
 import os
 from datetime import datetime, timedelta
-import openai
-from dotenv import load_dotenv
 import traceback
 import plotly.graph_objects as go
 import schedule
 import threading
 import matplotlib.pyplot as plt

-#
-
-
-
-# Add wordcloud
 try:
     from wordcloud import WordCloud
 except ImportError:
-
-
-
 # Scheduler state class
 class SchedulerState:
     def __init__(self):
@@ -61,31 +128,6 @@ if st.session_state.openai_api_key is None:
     load_dotenv()  # local .env file
     st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')

-# Download the required NLTK data
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt')
-
-# try:
-#     nltk.data.find('tokenizers/punkt_tab')
-# except LookupError:
-#     nltk.download('punkt_tab')
-
-try:
-    nltk.data.find('corpora/stopwords')
-except LookupError:
-    nltk.download('stopwords')
-
-# Set the OpenAI API key
-# Either switch every API call to read st.session_state.openai_api_key right before the call,
-# or set the key once at app startup. The latter is chosen here.
-if st.session_state.openai_api_key:
-    openai.api_key = st.session_state.openai_api_key
-else:
-    # The key may be missing when the UI first loads, so make sure openai.api_key is set later once a key is entered
-    pass
-
 # Page configuration
 st.set_page_config(page_title="News Article Tool", page_icon="📰", layout="wide")

@@ -96,21 +138,17 @@ menu = st.sidebar.radio(
     ["News Article Crawling", "Article Analysis", "Generate New Article", "Summarize News Articles"]
 )

-#
-
-
-
-
-
-
-
-        return False
-    return True
-
 # Function to load saved articles
 def load_saved_articles():
     try:
-        ensure_directory(TMP_DIR)
         if os.path.exists(SAVED_ARTICLES_PATH):
             with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
                 return json.load(f)
@@ -122,15 +160,12 @@ def load_saved_articles():
 # Function to save articles
 def save_articles(articles):
     try:
-        ensure_directory(TMP_DIR)
         with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
             json.dump(articles, f, ensure_ascii=False, indent=2)
-
-        os.chmod(SAVED_ARTICLES_PATH, 0o666)
     except Exception as e:
         st.error(f"Error while saving articles: {str(e)}")
         return False
-    return True

 @st.cache_data
 def crawl_naver_news(keyword, num_articles=5):
@@ -217,47 +252,89 @@ def get_article_content(url):
     except Exception as e:
         return f"Error occurred: {str(e)}"

-# Keyword analysis using NLTK
 def analyze_keywords(text, top_n=10):
-    # Korean stopword list
-    korean_stopwords = [
-    ...
-
-    tokens = [word for word in tokens if
     word_count = Counter(tokens)
     top_keywords = word_count.most_common(top_n)

     return top_keywords

-
 def extract_keywords_for_wordcloud(text, top_n=50):
     if not text or len(text.strip()) < 10:
         return {}

     try:
-
-
-        except Exception as e:
-            st.warning(f"Error occurred: {str(e)}")
-            tokens = text.lower().split()
-    ...
         stop_words = set()
         try:
             stop_words = set(stopwords.words('english'))
         except Exception:
-
         korea_stop_words = {
             '및', '등', '를', '이', '의', '가', '에', '는', '으로', '에서', '그', '도', '또는', '하는', '할', '하고',
-
-
-
-
-
-
-
-
         }
         stop_words.update(korea_stop_words)
@@ -279,51 +356,45 @@ def extract_keywords_for_wordcloud(text, top_n=50):
         return dict(sorted_words[:top_n])

     except Exception as e:
-        st.error(f"Error occurred {str(e)}")
         return {"data": 1, "analysis": 1, "news": 1}
-

 # Wordcloud generation function
-
 def generate_wordcloud(keywords_dict):
-    ...
-            random_state=42
-        ).generate_from_frequencies(keywords_dict)

-

-
-
-
-        st.warning("Failed to generate the wordcloud. It may be a font problem. Check that NanumGothic.ttf is in the project root.")
-        return None

 # News analysis function
 def analyze_news_content(news_df):
@@ -331,32 +402,37 @@ def analyze_news_content(news_df):
         return "No data available"

     results = {}
-
     if 'source' in news_df.columns:
-        ...
-    # By category
     if 'date' in news_df.columns:
-        ...

-
     all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))

     if len(all_text.strip()) > 0:
-        results['top_keywords_for_wordcloud']= extract_keywords_for_wordcloud(all_text, top_n=50)
         results['top_keywords'] = analyze_keywords(all_text)
     else:
-        results['top_keywords_for_wordcloud']={}
         results['top_keywords'] = []
     return results

 # Generate a new article using the OpenAI API
 def generate_article(original_content, prompt_text):
     if not st.session_state.openai_api_key:
         return "Error: OpenAI API key is not set. Enter a key in the sidebar or set the environment variable."
-
     try:
-        ...
         response = openai.chat.completions.create(
-            model="gpt-4.1-mini",
             messages=[
                 {"role": "system", "content": "You are a professional news reporter. Write a new article based on the given content."},
                 {"role": "user", "content": f"Based on the following content, {prompt_text}\n\n{original_content[:1000]}"}
@@ -371,13 +447,17 @@ def generate_article(original_content, prompt_text):
 def generate_image(prompt):
     if not st.session_state.openai_api_key:
         return "Error: OpenAI API key is not set. Enter a key in the sidebar or set the environment variable."
-
     try:
         response = openai.images.generate(
             model="gpt-image-1",
             prompt=prompt
         )
-        image_base64=response.data[0].b64_json
         return f"data:image/png;base64,{image_base64}"
     except Exception as e:
         return f"Image generation error: {str(e)}"
@@ -409,18 +489,12 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
         time.sleep(0.5)  # avoid overloading the server

     # Save the results
-    if not ensure_directory(SCHEDULED_NEWS_DIR):
-        print(f"Failed to create the scheduled-news directory")
-        return
-
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")

     try:
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(articles, f, ensure_ascii=False, indent=2)
-        # Set file permissions
-        os.chmod(filename, 0o666)
     except Exception as e:
         print(f"Error while saving file: {e}")
         return
@@ -542,8 +616,9 @@ if menu == "News Article Crawling":
             articles = crawl_naver_news(keyword, num_articles)

             # Fetch article content
             for i, article in enumerate(articles):
-
                 article['content'] = get_article_content(article['link'])
                 time.sleep(0.5)  # avoid overloading the server
@@ -559,7 +634,7 @@ if menu == "News Article Crawling":
                     st.write(f"**Summary:** {article['description']}")
                     st.write(f"**Link:** {article['link']}")
                     st.write("**Body preview:**")
-                    st.write(article['content'][:300] + "...")

 elif menu == "Article Analysis":
     st.header("Article Analysis")
@@ -594,7 +669,6 @@ elif menu == "Article Analysis":
         keyword_tab1, keyword_tab2 = st.tabs(["Keyword Frequency", "Wordcloud"])

         with keyword_tab1:
-
             keywords = analyze_keywords(selected_article['content'])

             # Visualization
@@ -604,23 +678,38 @@ elif menu == "Article Analysis":
             st.write("**Top keywords:**")
             for word, count in keywords:
                 st.write(f"- {word}: {count} times")
         with keyword_tab2:
             keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])
-            wc = generate_wordcloud(keyword_dict)

-            if
-    ...

-            #
-            st.write("**Top
-            top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:
             keyword_df = pd.DataFrame(top_keywords, columns=['Keyword', 'Frequency'])
             st.dataframe(keyword_df)
-
-

     elif analysis_type == "Text Statistics":
         if st.button("Analyze Text Statistics"):
@@ -655,79 +744,87 @@ elif menu == "Article Analysis":
             st.write(f"Text complexity score: {complexity_score:.1f}/10")

             # Word-frequency bar chart
-            st.subheader("Part-of-speech distribution
             try:
-                #
-    ...
-            except LookupError:
-                nltk.download('averaged_perceptron_tagger')
-
-            # Try using the correct resource name as shown in the error message
-            try:
-                nltk.data.find('averaged_perceptron_tagger_eng')
-            except LookupError:
-                nltk.download('averaged_perceptron_tagger_eng')
-
-            # Language detection (simple check for Hangul characters)
-            is_korean = bool(re.search(r'[가-힣]', content))
-
-            if is_korean and konlpy_installed:
-                # Korean morphological analysis
-                okt = Okt()
-                tagged = okt.pos(content)
-
-                # Korean POS mapping
-                pos_dict = {
-                    'Noun': 'Noun', 'NNG': 'Noun', 'NNP': 'Proper noun',
-                    'Verb': 'Verb', 'VV': 'Verb', 'VA': 'Adjective',
-                    'Adjective': 'Adjective',
-                    'Adverb': 'Adverb',
-                    'Josa': 'Particle', 'Punctuation': 'Punctuation',
-                    'Determiner': 'Determiner', 'Exclamation': 'Exclamation'
-                }
-
-                pos_counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0, 'Particle': 0, 'Punctuation': 0, 'Determiner': 0, 'Exclamation': 0, 'Other': 0}
-
-                for _, pos in tagged:
-                    if pos in pos_dict:
-                        pos_counts[pos_dict[pos]] += 1
-                    elif pos.startswith('N'):  # other noun types
-                        pos_counts['Noun'] += 1
-                    elif pos.startswith('V'):  # other verb types
-                        pos_counts['Verb'] += 1
-                    else:
-                        pos_counts['Other'] += 1
             else:
-                # English
-    ...

             # Visualize the results
             pos_df = pd.DataFrame({
@@ -750,14 +847,10 @@ elif menu == "Article Analysis":
         if st.session_state.openai_api_key:
             with st.spinner("Analyzing the sentiment of the article..."):
                 try:
-                    #
-    ...
-                    else:
-                        st.error("The OpenAI API key is not set.")
-                        st.stop()
-
                     response = openai.chat.completions.create(
                         model="gpt-4.1-mini",
                         messages=[
@@ -829,7 +922,7 @@ elif menu == "Article Analysis":
                 fill_color = 'rgba(158, 158, 158, 0.3)'  # light gray
                 line_color = 'rgba(158, 158, 158, 1)'    # dark gray

-                # Prepare the radar chart data
                 radar_keywords = keyword_names.copy()
                 radar_scores = keyword_scores.copy()

@@ -941,7 +1034,8 @@ elif menu == "Generate New Article":
     with st.expander("Original article content"):
         st.write(selected_article['content'])

-    prompt_text =""
Role: You are a reporter at a newspaper.
Task: You need to write a press release about a recent event. The material must be fact-based, objective, and accurate.
Guidelines:
@@ -949,14 +1043,13 @@ elif menu == "Generate New Article":
Write the headline so that it clearly reflects the topic and draws the reader's interest.
Compose the body with accurate, concise, and persuasive sentences.
Include quotes from interviews with the people involved.
-Referring to the information and guidelines above, write the article in the format of a newspaper press release"""

     # Option to also generate an image
     generate_image_too = st.checkbox("Also generate an image after the article", value=True)

     if st.button("Generate New Article"):
         if st.session_state.openai_api_key:
-            # openai.api_key = st.session_state.openai_api_key  # already set above, or set right before each call
             with st.spinner("Generating the article..."):
                 new_article = generate_article(selected_article['content'], prompt_text)
@@ -975,13 +1068,6 @@ elif menu == "Generate New Article":
                 """

                 # Generate the image
-                # Check and set the key before calling the image generation API
-                if not openai.api_key:
-                    if st.session_state.openai_api_key:
-                        openai.api_key = st.session_state.openai_api_key
-                    else:
-                        st.error("The OpenAI API key is not set.")
-                        st.stop()
                 image_url = generate_image(image_prompt)

                 if image_url and not image_url.startswith("Image generation error") and not image_url.startswith("Error: OpenAI API key is not set."):
@@ -1157,7 +1243,7 @@ elif menu == "Summarize News Articles":
         files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
         if files:
             st.subheader("Open collected files")
-            selected_file = st.selectbox("Select file", files, index=len(files)-1 if files else 0)
             if selected_file and st.button("View file contents"):
                 with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
                     articles = json.load(f)
@@ -1175,4 +1261,4 @@ elif menu == "Summarize News Articles":

# Footer
st.markdown("---")
-st.markdown("© News Article Tool @conanssam")
src/streamlit_app.py (updated file, new side)

 from bs4 import BeautifulSoup
 import re
 import time
 import json
 import os
 from datetime import datetime, timedelta
 import traceback
 import plotly.graph_objects as go
 import schedule
 import threading
 import matplotlib.pyplot as plt
+from pathlib import Path
+import openai
+from dotenv import load_dotenv
+
+# Set up working directories for the Hugging Face Spaces environment
+# /tmp may exist but can have permission problems, so switch to paths under the current working directory
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
+DATA_DIR = os.path.join(CURRENT_DIR, "data")
+NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
+SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
+SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
+
+# Directory creation helper
+def ensure_directory(directory):
+    try:
+        os.makedirs(directory, exist_ok=True)
+        return True
+    except Exception as e:
+        st.error(f"Error while creating directory: {str(e)}")
+        return False
+
+# Create every directory we need
+ensure_directory(DATA_DIR)
+ensure_directory(NLTK_DATA_DIR)
+ensure_directory(SCHEDULED_NEWS_DIR)

+# NLTK setup - use a user-writable directory to avoid permission problems
+import nltk
+nltk.data.path.append(NLTK_DATA_DIR)
+
+# Download the required NLTK data (avoiding permission issues)
+try:
+    # Download into the user-specified directory
+    try:
+        nltk.data.find('tokenizers/punkt')
+    except LookupError:
+        nltk.download('punkt', download_dir=NLTK_DATA_DIR)
+
+    try:
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
+except Exception as e:
+    st.warning(f"Error while downloading NLTK data: {str(e)}. Falling back to basic tokenization.")
+
+# Fallback function for Korean tokenization (used instead of KoNLPy)
+def tokenize_korean(text):
+    try:
+        # 1. First check whether the transformers library is installed
+        try:
+            from transformers import AutoTokenizer
+            tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
+            return tokenizer.tokenize(text)
+        except (ImportError, Exception) as e:
+            st.debug(f"Failed to load the transformers tokenizer: {str(e)}")
+
+        # 2. Try soynlp
+        try:
+            from soynlp.tokenizer import LTokenizer
+            tokenizer = LTokenizer()
+            return tokenizer.tokenize(text)
+        except (ImportError, Exception) as e:
+            st.debug(f"Failed to load the soynlp tokenizer: {str(e)}")
+
+        # 3. Try kss
+        try:
+            import kss
+            tokens = []
+            for sentence in kss.split_sentences(text):
+                tokens.extend(sentence.split())
+            return tokens
+        except (ImportError, Exception) as e:
+            st.debug(f"Failed to load the kss tokenizer: {str(e)}")
+    except Exception as e:
+        st.debug(f"Korean tokenization failed: {str(e)}")
+
+    # 4. Basic regex tokenizer - fallback when every other method fails
+    return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)

+# Wordcloud import (optional)
 try:
     from wordcloud import WordCloud
+    wordcloud_available = True
 except ImportError:
+    wordcloud_available = False
+
 # Scheduler state class
 class SchedulerState:
     def __init__(self):
...
     load_dotenv()  # local .env file
     st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')
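A quick usage sketch of the fallback tokenizer defined above (illustrative only; the sample sentence and the print call are not part of the diff):

# tokenize_korean() tries transformers ("klue/bert-base"), then soynlp, then kss,
# and finally the regex fallback, so it still returns tokens when none of the
# optional Korean NLP packages are installed.
sample = "오늘 주식 시장이 크게 상승했다 (stocks up 3%)"
print(tokenize_korean(sample))
# With only the regex fallback available, this yields the Hangul runs, the Latin run,
# the digits and the punctuation as separate tokens, which is enough for frequency counting.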
 # Page configuration
 st.set_page_config(page_title="News Article Tool", page_icon="📰", layout="wide")
...
     ["News Article Crawling", "Article Analysis", "Generate New Article", "Summarize News Articles"]
 )

+# OpenAI API key input (sidebar)
+openai_api_key = st.sidebar.text_input("OpenAI API key (optional)",
+                                       value=st.session_state.openai_api_key if st.session_state.openai_api_key else "",
+                                       type="password")
+if openai_api_key:
+    st.session_state.openai_api_key = openai_api_key
+    openai.api_key = openai_api_key
+
 # Function to load saved articles
 def load_saved_articles():
     try:
         if os.path.exists(SAVED_ARTICLES_PATH):
             with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
                 return json.load(f)
...
 # Function to save articles
 def save_articles(articles):
     try:
         with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
             json.dump(articles, f, ensure_ascii=False, indent=2)
+        return True
     except Exception as e:
         st.error(f"Error while saving articles: {str(e)}")
         return False

 @st.cache_data
 def crawl_naver_news(keyword, num_articles=5):
...
     except Exception as e:
         return f"Error occurred: {str(e)}"

+# Keyword analysis using NLTK (with Korean support added)
 def analyze_keywords(text, top_n=10):
+    # Korean stopword list
+    korean_stopwords = [
+        '이', '그', '저', '것', '및', '등', '를', '은', '는', '에서', '의', '으로', '로',
+        '에게', '뿐', '다', '도', '가', '이다', '에게서', '께', '께서', '부터', '까지'
+    ]
+
+    # Language detection (simple check for Hangul characters)
+    is_korean = bool(re.search(r'[가-힣]', text))
+
+    if is_korean:
+        # Use the Korean tokenizer for Korean text
+        tokens = tokenize_korean(text)
+    else:
+        # Use the NLTK tokenizer for non-Korean text
+        try:
+            from nltk.tokenize import word_tokenize
+            tokens = word_tokenize(text)
+        except Exception:
+            # Fall back to a simple tokenizer if NLTK fails
+            tokens = re.findall(r'\b\w+\b', text.lower())

+    # Stopword filtering
+    tokens = [word for word in tokens if len(word) > 1 and word.lower() not in korean_stopwords]

+    # Frequency counting
+    from collections import Counter
     word_count = Counter(tokens)
     top_keywords = word_count.most_common(top_n)

     return top_keywords

+# Analysis for the wordcloud
 def extract_keywords_for_wordcloud(text, top_n=50):
     if not text or len(text.strip()) < 10:
         return {}

     try:
+        # Language detection (simple check for Hangul characters)
+        is_korean = bool(re.search(r'[가-힣]', text))

+        if is_korean:
+            # Use the Korean tokenizer for Korean text
+            tokens = tokenize_korean(text.lower())
+        else:
+            # For English or other languages, try NLTK
+            try:
+                from nltk.tokenize import word_tokenize
+                tokens = word_tokenize(text.lower())
+            except Exception:
+                # Simple tokenization if that fails
+                tokens = text.lower().split()
+
+        # Stopword setup
         stop_words = set()
+
+        # English stopwords (use NLTK when available)
         try:
+            from nltk.corpus import stopwords
             stop_words = set(stopwords.words('english'))
         except Exception:
+            # Basic English stopwords
+            stop_words = {
+                'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+                'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+                'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+                'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+                'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+                'will', 'shall', 'can', 'may', 'must', 'ought'
+            }

+        # Korean stopwords
         korea_stop_words = {
             '및', '등', '를', '이', '의', '가', '에', '는', '으로', '에서', '그', '도', '또는', '하는', '할', '하고',
+            '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
+            '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '도', '은', '수', '것', '들', '제', '저',
+            '년', '월', '일', '시', '분', '초', '지난', '올해', '내년', '최근', '현재', '오늘', '내일', '어제',
+            '오전', '오후', '부터', '까지', '에게', '께서', '이라고', '라고', '하며', '하면서', '따라', '통해',
+            '관련', '한편', '특히', '가장', '매우', '더', '덜', '많이', '조금', '항상', '자주', '가끔', '거의',
+            '전혀', '바로', '정말', '만약', '비롯한', '등을', '등이', '등의', '등과', '등도', '등에', '등에서',
+            '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
+            '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
         }
         stop_words.update(korea_stop_words)
...
         return dict(sorted_words[:top_n])

     except Exception as e:
+        st.error(f"Error during keyword extraction: {str(e)}")
         return {"data": 1, "analysis": 1, "news": 1}

 # Wordcloud generation function
 def generate_wordcloud(keywords_dict):
+    if not wordcloud_available:
+        st.warning("The library required for the wordcloud is not installed.")
+        return None
+
+    try:
+        # Look for the NanumGothic font (fall back to the default font if it is missing)
+        font_path = os.path.join(CURRENT_DIR, "NanumGothic.ttf")
+        if not os.path.exists(font_path):
+            # Use the default font
+            wc = WordCloud(
+                width=800,
+                height=400,
+                background_color='white',
+                colormap='viridis',
+                max_font_size=150,
+                random_state=42
+            ).generate_from_frequencies(keywords_dict)
+        else:
+            # Use the NanumGothic font
+            wc = WordCloud(
+                font_path=font_path,
+                width=800,
+                height=400,
+                background_color='white',
+                colormap='viridis',
+                max_font_size=150,
+                random_state=42
+            ).generate_from_frequencies(keywords_dict)

+        return wc

+    except Exception as e:
+        st.error(f"Error while generating the wordcloud: {str(e)}")
+        return None
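A minimal sketch of how the three helpers above fit together (illustrative only; the sample text and output comments are assumptions, not part of the diff):

text = "정부가 새 경제 정책을 발표했다. 경제 전문가들은 정책 효과를 두고 엇갈린 전망을 내놨다."
top = analyze_keywords(text, top_n=5)         # list of (token, count) pairs, e.g. ('경제', 2)
freqs = extract_keywords_for_wordcloud(text)  # dict of stopword-filtered token frequencies
wc = generate_wordcloud(freqs)                # WordCloud object, or None when the package or font is unavailable
if wc is not None:
    wc.to_file("wordcloud.png")               # to_file() is part of the wordcloud API; the app itself renders via st.pyplot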
 # News analysis function
 def analyze_news_content(news_df):
...
         return "No data available"

     results = {}
+
+    # Analysis by category
     if 'source' in news_df.columns:
+        results['source_counts'] = news_df['source'].value_counts().to_dict()
     if 'date' in news_df.columns:
+        results['date_counts'] = news_df['date'].value_counts().to_dict()

+    # Keyword analysis
     all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))

     if len(all_text.strip()) > 0:
+        results['top_keywords_for_wordcloud'] = extract_keywords_for_wordcloud(all_text, top_n=50)
         results['top_keywords'] = analyze_keywords(all_text)
     else:
+        results['top_keywords_for_wordcloud'] = {}
         results['top_keywords'] = []
+
     return results

 # Generate a new article using the OpenAI API
 def generate_article(original_content, prompt_text):
     if not st.session_state.openai_api_key:
         return "Error: OpenAI API key is not set. Enter a key in the sidebar or set the environment variable."
+
     try:
+        # Set the API key
+        openai.api_key = st.session_state.openai_api_key
+
+        # Call the API
         response = openai.chat.completions.create(
+            model="gpt-4.1-mini",  # or another available model
             messages=[
                 {"role": "system", "content": "You are a professional news reporter. Write a new article based on the given content."},
                 {"role": "user", "content": f"Based on the following content, {prompt_text}\n\n{original_content[:1000]}"}
...
 def generate_image(prompt):
     if not st.session_state.openai_api_key:
         return "Error: OpenAI API key is not set. Enter a key in the sidebar or set the environment variable."
+
     try:
+        # Set the API key
+        openai.api_key = st.session_state.openai_api_key
+
+        # Call the API
         response = openai.images.generate(
             model="gpt-image-1",
             prompt=prompt
         )
+        image_base64 = response.data[0].b64_json
         return f"data:image/png;base64,{image_base64}"
     except Exception as e:
         return f"Image generation error: {str(e)}"
...
         time.sleep(0.5)  # avoid overloading the server

     # Save the results
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")

     try:
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(articles, f, ensure_ascii=False, indent=2)
     except Exception as e:
         print(f"Error while saving file: {e}")
         return
...
             articles = crawl_naver_news(keyword, num_articles)

             # Fetch article content
+            progress_bar = st.progress(0)
             for i, article in enumerate(articles):
+                progress_bar.progress((i + 1) / len(articles))
                 article['content'] = get_article_content(article['link'])
                 time.sleep(0.5)  # avoid overloading the server
...
                     st.write(f"**Summary:** {article['description']}")
                     st.write(f"**Link:** {article['link']}")
                     st.write("**Body preview:**")
+                    st.write(article['content'][:300] + "..." if len(article['content']) > 300 else article['content'])

 elif menu == "Article Analysis":
     st.header("Article Analysis")
...
         keyword_tab1, keyword_tab2 = st.tabs(["Keyword Frequency", "Wordcloud"])

         with keyword_tab1:
             keywords = analyze_keywords(selected_article['content'])

             # Visualization
...
             st.write("**Top keywords:**")
             for word, count in keywords:
                 st.write(f"- {word}: {count} times")
+
         with keyword_tab2:
             keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])

+            if wordcloud_available:
+                wc = generate_wordcloud(keyword_dict)
+
+                if wc:
+                    fig, ax = plt.subplots(figsize=(10, 5))
+                    ax.imshow(wc, interpolation='bilinear')
+                    ax.axis('off')
+                    st.pyplot(fig)
+
+                    # Show the top 20 keywords
+                    st.write("**Top 20 keywords:**")
+                    top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
+                    keyword_df = pd.DataFrame(top_keywords, columns=['Keyword', 'Frequency'])
+                    st.dataframe(keyword_df)
+                else:
+                    st.error("Could not generate the wordcloud.")
+            else:
+                # Fallback display when the wordcloud cannot be used
+                st.warning("The wordcloud feature is unavailable. The required package is not installed.")

+                # Show only the keywords instead
+                st.write("**Top keywords:**")
+                top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:30]
                 keyword_df = pd.DataFrame(top_keywords, columns=['Keyword', 'Frequency'])
                 st.dataframe(keyword_df)
+
+                # Show a bar chart as well
+                st.bar_chart(keyword_df.set_index('Keyword').head(15))

     elif analysis_type == "Text Statistics":
         if st.button("Analyze Text Statistics"):
...
             st.write(f"Text complexity score: {complexity_score:.1f}/10")

             # Word-frequency bar chart
+            st.subheader("Part-of-speech distribution")
+
+            # Language detection (simple check for Hangul characters)
+            is_korean = bool(re.search(r'[가-힣]', content))
+
             try:
+                # Tokenization and POS analysis for English/Korean
+                if is_korean:
+                    # Korean text (rough morpheme-like analysis)
+                    try:
+                        # Try the transformers tokenizer
+                        try:
+                            from transformers import AutoTokenizer
+                            tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
+                            tokens = tokenizer.tokenize(content[:5000])  # truncate very long text before analysis
+
+                            # Estimate the part of speech with simple pattern matching
+                            pos_counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0, 'Other': 0}
+
+                            for token in tokens:
+                                if token.endswith("다") or token.endswith("요"):
+                                    pos_counts['Verb'] += 1
+                                elif token.endswith("게") or token.endswith("히"):
+                                    pos_counts['Adverb'] += 1
+                                elif token.endswith("은") or token.endswith("는") or token.endswith("이") or token.endswith("가"):
+                                    pos_counts['Noun'] += 1
+                                else:
+                                    if len(token) > 1:
+                                        pos_counts['Noun'] += 1
+                                    else:
+                                        pos_counts['Other'] += 1
+
+                        except Exception:
+                            # Fall back to simple tokenization if that fails
+                            tokens = tokenize_korean(content[:5000])
+                            pos_counts = {
+                                'Noun-like': len([t for t in tokens if len(t) > 1 and not any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])]),
+                                'Other': len([t for t in tokens if len(t) <= 1 or any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])])
+                            }
+                    except Exception as e:
+                        st.error(f"Korean part-of-speech analysis failed: {str(e)}")
+                        pos_counts = {'data': len(content) // 10, 'analysis': len(content) // 15, 'error': len(content) // 20}
                 else:
+                    # English document (try NLTK)
+                    try:
+                        from nltk import pos_tag
+                        from nltk.tokenize import word_tokenize
+
+                        # Download the required data
+                        try:
+                            nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
+                        except Exception:
+                            pass
+
+                        tokens = word_tokenize(content[:5000])
+                        tagged = pos_tag(tokens)
+
+                        # English POS mapping
+                        pos_dict = {
+                            'NN': 'Noun', 'NNS': 'Noun', 'NNP': 'Proper noun', 'NNPS': 'Proper noun',
+                            'VB': 'Verb', 'VBD': 'Verb', 'VBG': 'Verb', 'VBN': 'Verb', 'VBP': 'Verb', 'VBZ': 'Verb',
+                            'JJ': 'Adjective', 'JJR': 'Adjective', 'JJS': 'Adjective',
+                            'RB': 'Adverb', 'RBR': 'Adverb', 'RBS': 'Adverb'
+                        }
+
+                        pos_counts = {'Noun': 0, 'Verb': 0, 'Adjective': 0, 'Adverb': 0, 'Other': 0}
+
+                        for _, pos in tagged:
+                            if pos in pos_dict:
+                                pos_counts[pos_dict[pos]] += 1
+                            else:
+                                pos_counts['Other'] += 1
+                    except Exception:
+                        # Fall back to rough rules for guessing the part of speech
+                        tokens = re.findall(r'\b\w+\b', content.lower())
+                        pos_counts = {
+                            'Noun': len([t for t in tokens if not t.endswith(('ly', 'ing', 'ed'))]),
+                            'Verb': len([t for t in tokens if t.endswith(('ing', 'ed', 's'))]),
+                            'Adverb': len([t for t in tokens if t.endswith('ly')]),
+                            'Other': len([t for t in tokens if len(t) <= 2])
+                        }

             # Visualize the results
             pos_df = pd.DataFrame({
...
         if st.session_state.openai_api_key:
             with st.spinner("Analyzing the sentiment of the article..."):
                 try:
+                    # Set the API key
+                    openai.api_key = st.session_state.openai_api_key
+
+                    # Call the API
                     response = openai.chat.completions.create(
                         model="gpt-4.1-mini",
                         messages=[
...
                 fill_color = 'rgba(158, 158, 158, 0.3)'  # light gray
                 line_color = 'rgba(158, 158, 158, 1)'    # dark gray

+                # Prepare the radar chart data
                 radar_keywords = keyword_names.copy()
                 radar_scores = keyword_scores.copy()
...
     with st.expander("Original article content"):
         st.write(selected_article['content'])

+    prompt_text = st.text_area("Writing instructions",
+        """Rewrite this following the article format below.
Role: You are a reporter at a newspaper.
Task: You need to write a press release about a recent event. The material must be fact-based, objective, and accurate.
Guidelines:
...
Write the headline so that it clearly reflects the topic and draws the reader's interest.
Compose the body with accurate, concise, and persuasive sentences.
Include quotes from interviews with the people involved.
+Referring to the information and guidelines above, write the article in the format of a newspaper press release""", height=200)

     # Option to also generate an image
     generate_image_too = st.checkbox("Also generate an image after the article", value=True)

     if st.button("Generate New Article"):
         if st.session_state.openai_api_key:
             with st.spinner("Generating the article..."):
                 new_article = generate_article(selected_article['content'], prompt_text)
...
                 """

                 # Generate the image
                 image_url = generate_image(image_prompt)

                 if image_url and not image_url.startswith("Image generation error") and not image_url.startswith("Error: OpenAI API key is not set."):
...
         files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
         if files:
             st.subheader("Open collected files")
+            selected_file = st.selectbox("Select file", files, index=len(files)-1 if files else 0)
             if selected_file and st.button("View file contents"):
                 with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
                     articles = json.load(f)
...

 # Footer
 st.markdown("---")
+st.markdown("© News Article Tool @conanssam")
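A small sketch of how the JSON files written by this version can be inspected outside the app (the file name shown is hypothetical; real names follow the f"{file_prefix}_{task_type}_{timestamp}.json" pattern used above, under the new data/ layout):

import json, os

DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
path = os.path.join(DATA_DIR, "scheduled_news", "news_crawl_20240101_090000.json")
with open(path, "r", encoding="utf-8") as f:
    articles = json.load(f)   # typically a list of dicts with title, link, description and content keys
print(len(articles), articles[0]["title"])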