JUNGU committed
Commit 31658d4 · verified · 1 Parent(s): 3e823c4

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +322 -236
src/streamlit_app.py CHANGED
@@ -4,33 +4,100 @@ import requests
4
  from bs4 import BeautifulSoup
5
  import re
6
  import time
7
- import nltk
8
- from nltk.tokenize import word_tokenize
9
- from nltk.corpus import stopwords
10
- from collections import Counter
11
  import json
12
  import os
13
  from datetime import datetime, timedelta
14
- import openai
15
- from dotenv import load_dotenv
16
  import traceback
17
  import plotly.graph_objects as go
18
  import schedule
19
  import threading
20
  import matplotlib.pyplot as plt
21
 
22
- # /tmp 경둜 μ„€μ •
23
- TMP_DIR = "/tmp"
24
- SAVED_ARTICLES_PATH = os.path.join(TMP_DIR, "saved_articles.json")
25
- SCHEDULED_NEWS_DIR = os.path.join(TMP_DIR, "scheduled_news")
26
 
27
- # μ›Œλ“œν΄λΌμš°λ“œ μΆ”κ°€
28
  try:
29
  from wordcloud import WordCloud
 
30
  except ImportError:
31
- st.error("wordcloud νŒ¨ν‚€μ§€λ₯Ό μ„€μΉ˜ν•΄μ£Όμ„Έμš”: pip install wordcloud")
32
- WordCloud = None
33
-
34
  # μŠ€μΌ€μ€„λŸ¬ μƒνƒœ 클래슀 μΆ”κ°€
35
  class SchedulerState:
36
  def __init__(self):
@@ -61,31 +128,6 @@ if st.session_state.openai_api_key is None:
61
  load_dotenv() # 둜컬 .env 파일
62
  st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')
63
 
64
- # ν•„μš”ν•œ NLTK 데이터 λ‹€μš΄λ‘œλ“œ
65
- try:
66
- nltk.data.find('tokenizers/punkt')
67
- except LookupError:
68
- nltk.download('punkt')
69
-
70
- # try:
71
- # nltk.data.find('tokenizers/punkt_tab')
72
- # except LookupError:
73
- # nltk.download('punkt_tab')
74
-
75
- try:
76
- nltk.data.find('corpora/stopwords')
77
- except LookupError:
78
- nltk.download('stopwords')
79
-
80
- # OpenAI API ν‚€ μ„€μ •
81
- # openai.api_key 섀정은 각 API 호좜 직전에 st.session_state.openai_api_key μ‚¬μš©ν•˜λ„λ‘ λ³€κ²½ν•˜κ±°λ‚˜,
82
- # μ•± μ‹œμž‘ μ‹œμ μ— ν•œ 번 μ„€μ •ν•©λ‹ˆλ‹€. μ—¬κΈ°μ„œλŠ” ν›„μžλ₯Ό μ„ νƒν•©λ‹ˆλ‹€.
83
- if st.session_state.openai_api_key:
84
- openai.api_key = st.session_state.openai_api_key
85
- else:
86
- # UI μ΄ˆκΈ°μ—λŠ” ν‚€κ°€ 없을 수 μžˆοΏ½οΏ½λ―€λ‘œ, λ‚˜μ€‘μ— ν‚€ μž…λ ₯ μ‹œ openai.api_keyκ°€ μ„€μ •λ˜λ„λ‘ μœ λ„
87
- pass
88
-
89
  # νŽ˜μ΄μ§€ μ„€μ •
90
  st.set_page_config(page_title="λ‰΄μŠ€ 기사 도ꡬ", page_icon="📰", layout="wide")
91
 
@@ -96,21 +138,17 @@ menu = st.sidebar.radio(
96
  ["λ‰΄μŠ€ 기사 크둀링", "기사 λΆ„μ„ν•˜κΈ°", "μƒˆ 기사 μƒμ„±ν•˜κΈ°", "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°"]
97
  )
98
 
99
- # 디렉토리 생성 ν•¨μˆ˜
100
- def ensure_directory(directory):
101
- try:
102
- os.makedirs(directory, mode=0o777, exist_ok=True)
103
- # 디렉토리 κΆŒν•œ μ„€μ •
104
- os.chmod(directory, 0o777)
105
- except Exception as e:
106
- st.error(f"디렉토리 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
107
- return False
108
- return True
109
-
110
  # μ €μž₯된 기사λ₯Ό λΆˆλŸ¬μ˜€λŠ” ν•¨μˆ˜
111
  def load_saved_articles():
112
  try:
113
- ensure_directory(TMP_DIR)
114
  if os.path.exists(SAVED_ARTICLES_PATH):
115
  with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
116
  return json.load(f)
@@ -122,15 +160,12 @@ def load_saved_articles():
122
  # 기사λ₯Ό μ €μž₯ν•˜λŠ” ν•¨μˆ˜
123
  def save_articles(articles):
124
  try:
125
- ensure_directory(TMP_DIR)
126
  with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
127
  json.dump(articles, f, ensure_ascii=False, indent=2)
128
- # 파일 κΆŒν•œ μ„€μ •
129
- os.chmod(SAVED_ARTICLES_PATH, 0o666)
130
  except Exception as e:
131
  st.error(f"기사 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
132
  return False
133
- return True
134
 
135
  @st.cache_data
136
  def crawl_naver_news(keyword, num_articles=5):
@@ -217,47 +252,89 @@ def get_article_content(url):
217
  except Exception as e:
218
  return f"였λ₯˜ λ°œμƒ: {str(e)}"
219
 
220
- # NLTKλ₯Ό μ΄μš©ν•œ ν‚€μ›Œλ“œ 뢄석
221
  def analyze_keywords(text, top_n=10):
222
- # ν•œκ΅­μ–΄ λΆˆμš©μ–΄ λͺ©λ‘ (직접 μ •μ˜ν•΄μ•Ό ν•©λ‹ˆλ‹€)
223
- korean_stopwords = ['이', 'κ·Έ', 'μ €', '것', '및', 'λ“±', 'λ₯Ό', '을', '에', 'μ—μ„œ', '의', '으둜', '둜']
224
 
225
- tokens = word_tokenize(text)
226
- tokens = [word for word in tokens if word.isalnum() and len(word) > 1 and word not in korean_stopwords]
227
 
 
 
228
  word_count = Counter(tokens)
229
  top_keywords = word_count.most_common(top_n)
230
 
231
  return top_keywords
232
 
233
- #μ›Œλ“œ ν΄λΌμš°λ“œμš© 뢄석
234
  def extract_keywords_for_wordcloud(text, top_n=50):
235
  if not text or len(text.strip()) < 10:
236
  return {}
237
 
238
  try:
239
- try:
240
- tokens = word_tokenize(text.lower())
241
- except Exception as e:
242
- st.warning(f"{str(e)} 였λ₯˜λ°œμƒ")
243
- tokens = text.lower().split()
244
 
 
245
  stop_words = set()
 
 
246
  try:
 
247
  stop_words = set(stopwords.words('english'))
248
  except Exception:
249
- pass
 
250
 
 
251
  korea_stop_words = {
252
  '및', 'λ“±', 'λ₯Ό', '이', '의', 'κ°€', '에', 'λŠ”', '으둜', 'μ—μ„œ', 'κ·Έ', '또', 'λ˜λŠ”', 'ν•˜λŠ”', 'ν• ', 'ν•˜κ³ ',
253
- 'μžˆλ‹€', '이닀', 'μœ„ν•΄', '것이닀', '것은', 'λŒ€ν•œ', 'λ•Œλ¬Έ', '그리고', 'ν•˜μ§€λ§Œ', 'κ·ΈλŸ¬λ‚˜', 'κ·Έλž˜μ„œ',
254
- 'μž…λ‹ˆλ‹€', 'ν•©λ‹ˆλ‹€', 'μŠ΅λ‹ˆλ‹€', 'μš”', 'μ£ ', 'κ³ ', 'κ³Ό', '와', '도', '은', '수', '것', 'λ“€', '제', 'μ €',
255
- 'λ…„', 'μ›”', '일', 'μ‹œ', 'λΆ„', '초', 'μ§€λ‚œ', 'μ˜¬ν•΄', 'λ‚΄λ…„', '졜근', 'ν˜„μž¬', '였늘', '내일', 'μ–΄μ œ',
256
- 'μ˜€μ „', 'μ˜€ν›„', 'λΆ€ν„°', 'κΉŒμ§€', 'μ—κ²Œ', 'κ»˜μ„œ', '이라고', '라고', 'ν•˜λ©°', 'ν•˜λ©΄μ„œ', '따라', '톡해',
257
- 'κ΄€λ ¨', 'ν•œνŽΈ', '특히', 'κ°€μž₯', '맀우', '더', '덜', '많이', '쑰금', '항상', '자주', '가끔', '거의',
258
- 'μ „ν˜€', 'λ°”λ‘œ', '정말', 'λ§Œμ•½', 'λΉ„λ‘―ν•œ', '등을', '등이', 'λ“±μ˜', 'λ“±κ³Ό', '등도', '등에', 'λ“±μ—μ„œ',
259
- '기자', 'λ‰΄μŠ€', '사진', 'μ—°ν•©λ‰΄μŠ€', 'λ‰΄μ‹œμŠ€', '제곡', '무단', 'μ „μž¬', '재배포', 'κΈˆμ§€', '액컀', '멘트',
260
- '일보', '데일리', '경제', 'μ‚¬νšŒ', 'μ •μΉ˜', '세계', 'κ³Όν•™', '아이티', 'λ‹·μ»΄', '씨넷', 'λΈ”λ‘œν„°', 'μ „μžμ‹ λ¬Έ'
261
  }
262
  stop_words.update(korea_stop_words)
263
 
@@ -279,51 +356,45 @@ def extract_keywords_for_wordcloud(text, top_n=50):
279
  return dict(sorted_words[:top_n])
280
 
281
  except Exception as e:
282
- st.error(f"였λ₯˜λ°œμƒ {str(e)}")
283
  return {"data": 1, "analysis": 1, "news": 1}
284
-
285
 
286
  # μ›Œλ“œ ν΄λΌμš°λ“œ 생성 ν•¨μˆ˜
287
-
288
  def generate_wordcloud(keywords_dict):
289
- if not WordCloud:
290
- st.warning("μ›Œλ“œν΄λΌμš°λ“œ μ„€μΉ˜μ•ˆλ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€.")
291
- return None
292
- try:
293
- # ν”„λ‘œμ νŠΈ λ£¨νŠΈμ— NanumGothic.ttfκ°€ μžˆλ‹€κ³  κ°€μ •
294
- font_path = "NanumGothic.ttf"
295
-
296
- # λ‘œμ»¬μ— 폰트 파일이 μžˆλŠ”μ§€ 확인, μ—†μœΌλ©΄ 기본으둜 μ‹œλ„
297
- if not os.path.exists(font_path):
298
- st.warning(f"폰트 파일({font_path})을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. κΈ°λ³Έ 폰트둜 μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μƒμ„±ν•©λ‹ˆλ‹€. ν•œκΈ€μ΄ 깨질 수 μžˆμŠ΅λ‹ˆλ‹€.")
299
- # font_path = None # λ˜λŠ” μ‹œμŠ€ν…œ κΈ°λ³Έ 폰트 경둜λ₯Ό μ§€μ • (ν”Œλž«νΌλ§ˆλ‹€ 닀름)
300
- # WordCloud μƒμ„±μžμ—μ„œ font_pathλ₯Ό None으둜 두면 μ‹œμŠ€ν…œ 기본값을 μ‹œλ„ν•˜κ±°λ‚˜, μ•„μ˜ˆ λΉΌκ³  호좜
301
- wc = WordCloud(
302
- width=800,
303
- height=400,
304
- background_color='white',
305
- colormap='viridis',
306
- max_font_size=150,
307
- random_state=42
308
- ).generate_from_frequencies(keywords_dict)
309
- else:
310
- wc= WordCloud(
311
- font_path=font_path,
312
- width=800,
313
- height=400,
314
- background_color = 'white',
315
- colormap = 'viridis',
316
- max_font_size=150,
317
- random_state=42
318
- ).generate_from_frequencies(keywords_dict)
319
 
320
- return wc
321
 
322
- except Exception as e:
323
- st.error(f"μ›Œλ“œν΄λΌμš°λ“œ 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
324
- # traceback.print_exc() # 디버깅 μ‹œ μ‚¬μš©
325
- st.warning("μ›Œλ“œν΄λΌμš°λ“œ 생성에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€. 폰트 문제일 수 μžˆμŠ΅λ‹ˆλ‹€. NanumGothic.ttf 파일이 ν”„λ‘œμ νŠΈ λ£¨νŠΈμ— μžˆλŠ”μ§€ ν™•μΈν•΄μ£Όμ„Έμš”.")
326
- return None
327
 
328
  # λ‰΄μŠ€ 뢄석 ν•¨μˆ˜
329
  def analyze_news_content(news_df):
@@ -331,32 +402,37 @@ def analyze_news_content(news_df):
331
  return "데이터가 μ—†μŠ΅λ‹ˆλ‹€"
332
 
333
  results = {}
334
- #μΉ΄ν…Œκ³ λ¦¬λ³„
 
335
  if 'source' in news_df.columns:
336
- results['source_counts'] = news_df['source'].value_counts().to_dict()
337
- #μΉ΄ν…Œκ³ λ¦¬λ³„
338
  if 'date' in news_df.columns:
339
- results['date_counts'] = news_df['date'].value_counts().to_dict()
340
 
341
- #ν‚€μ›Œλ“œλΆ„μ„
342
  all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))
343
 
344
  if len(all_text.strip()) > 0:
345
- results['top_keywords_for_wordcloud']= extract_keywords_for_wordcloud(all_text, top_n=50)
346
  results['top_keywords'] = analyze_keywords(all_text)
347
  else:
348
- results['top_keywords_for_wordcloud']={}
349
  results['top_keywords'] = []
 
350
  return results
351
 
352
  # OpenAI APIλ₯Ό μ΄μš©ν•œ μƒˆ 기사 생성
353
  def generate_article(original_content, prompt_text):
354
  if not st.session_state.openai_api_key:
355
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
356
- openai.api_key = st.session_state.openai_api_key
357
  try:
 
 
 
 
358
  response = openai.chat.completions.create(
359
- model="gpt-4.1-mini",
360
  messages=[
361
  {"role": "system", "content": "당신은 전문적인 λ‰΄μŠ€ κΈ°μžμž…λ‹ˆλ‹€. μ£Όμ–΄μ§„ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ μƒˆλ‘œμš΄ 기사λ₯Ό μž‘μ„±ν•΄μ£Όμ„Έμš”."},
362
  {"role": "user", "content": f"λ‹€μŒ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ {prompt_text}\n\n{original_content[:1000]}"}
@@ -371,13 +447,17 @@ def generate_article(original_content, prompt_text):
371
  def generate_image(prompt):
372
  if not st.session_state.openai_api_key:
373
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
374
- openai.api_key = st.session_state.openai_api_key
375
  try:
 
 
 
 
376
  response = openai.images.generate(
377
  model="gpt-image-1",
378
  prompt=prompt
379
  )
380
- image_base64=response.data[0].b64_json
381
  return f"data:image/png;base64,{image_base64}"
382
  except Exception as e:
383
  return f"이미지 생성 였λ₯˜: {str(e)}"
@@ -409,18 +489,12 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
409
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
410
 
411
  # κ²°κ³Ό μ €μž₯
412
- if not ensure_directory(SCHEDULED_NEWS_DIR):
413
- print(f"μŠ€μΌ€μ€„λœ λ‰΄μŠ€ 디렉토리 생성 μ‹€νŒ¨")
414
- return
415
-
416
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
417
  filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")
418
 
419
  try:
420
  with open(filename, 'w', encoding='utf-8') as f:
421
  json.dump(articles, f, ensure_ascii=False, indent=2)
422
- # 파일 κΆŒν•œ μ„€μ •
423
- os.chmod(filename, 0o666)
424
  except Exception as e:
425
  print(f"파일 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
426
  return
@@ -542,8 +616,9 @@ if menu == "λ‰΄μŠ€ 기사 크둀링":
542
  articles = crawl_naver_news(keyword, num_articles)
543
 
544
  # 기사 λ‚΄μš© κ°€μ Έμ˜€κΈ°
 
545
  for i, article in enumerate(articles):
546
- st.progress((i + 1) / len(articles))
547
  article['content'] = get_article_content(article['link'])
548
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
549
 
@@ -559,7 +634,7 @@ if menu == "λ‰΄μŠ€ 기사 크둀링":
559
  st.write(f"**μš”μ•½:** {article['description']}")
560
  st.write(f"**링크:** {article['link']}")
561
  st.write("**λ³Έλ¬Έ 미리보기:**")
562
- st.write(article['content'][:300] + "...")
563
 
564
  elif menu == "기사 λΆ„μ„ν•˜κΈ°":
565
  st.header("기사 λΆ„μ„ν•˜κΈ°")
@@ -594,7 +669,6 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
594
  keyword_tab1, keyword_tab2 = st.tabs(["ν‚€μ›Œλ“œ λΉˆλ„", "μ›Œλ“œν΄λΌμš°λ“œ"])
595
 
596
  with keyword_tab1:
597
-
598
  keywords = analyze_keywords(selected_article['content'])
599
 
600
  # μ‹œκ°ν™”
@@ -604,23 +678,38 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
604
  st.write("**μ£Όμš” ν‚€μ›Œλ“œ:**")
605
  for word, count in keywords:
606
  st.write(f"- {word}: {count}회")
 
607
  with keyword_tab2:
608
  keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])
609
- wc = generate_wordcloud(keyword_dict)
610
 
611
- if wc:
612
- fig, ax = plt.subplots(figsize=(10, 5))
613
- ax.imshow(wc, interpolation='bilinear')
614
- ax.axis('off')
615
- st.pyplot(fig)
 
 
616
 
617
- # ν‚€μ›Œλ“œ μƒμœ„ 20개 ν‘œμ‹œ
618
- st.write("**μƒμœ„ 20개 ν‚€μ›Œλ“œ:**")
619
- top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
620
  keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
621
  st.dataframe(keyword_df)
622
- else:
623
- st.error("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
 
624
 
625
  elif analysis_type == "ν…μŠ€νŠΈ 톡계":
626
  if st.button("ν…μŠ€νŠΈ 톡계 뢄석"):
@@ -655,79 +744,87 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
655
  st.write(f"ν…μŠ€νŠΈ λ³΅μž‘μ„± 점수: {complexity_score:.1f}/10")
656
 
657
  # μΆœν˜„ λΉˆλ„ λ§‰λŒ€ κ·Έλž˜ν”„
658
- st.subheader("ν’ˆμ‚¬λ³„ 뢄포 (ν•œκ΅­μ–΄/μ˜μ–΄ 지원)")
 
 
 
 
659
  try:
660
- # KoNLPy μ„€μΉ˜ 확인
661
- try:
662
- from konlpy.tag import Okt
663
- konlpy_installed = True
664
- except ImportError:
665
- konlpy_installed = False
666
- st.warning("ν•œκ΅­μ–΄ ν˜•νƒœμ†Œ 뢄석을 μœ„ν•΄ KoNLPyλ₯Ό μ„€μΉ˜ν•΄μ£Όμ„Έμš”: pip install konlpy")
667
-
668
- # μ˜μ–΄ POS tagger μ€€λΉ„
669
- from nltk import pos_tag
670
- try:
671
- nltk.data.find('taggers/averaged_perceptron_tagger')
672
- except LookupError:
673
- nltk.download('averaged_perceptron_tagger')
674
-
675
- # Try using the correct resource name as shown in the error message
676
- try:
677
- nltk.data.find('averaged_perceptron_tagger_eng')
678
- except LookupError:
679
- nltk.download('averaged_perceptron_tagger_eng')
680
-
681
- # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•œ 방식)
682
- is_korean = bool(re.search(r'[κ°€-힣]', content))
683
-
684
- if is_korean and konlpy_installed:
685
- # ν•œκ΅­μ–΄ ν˜•νƒœμ†Œ 뢄석
686
- okt = Okt()
687
- tagged = okt.pos(content)
688
-
689
- # ν•œκ΅­μ–΄ ν’ˆμ‚¬ λ§€ν•‘
690
- pos_dict = {
691
- 'Noun': 'λͺ…사', 'NNG': 'λͺ…사', 'NNP': '고유λͺ…사',
692
- 'Verb': '동사', 'VV': '동사', 'VA': 'ν˜•μš©μ‚¬',
693
- 'Adjective': 'ν˜•μš©μ‚¬',
694
- 'Adverb': '뢀사',
695
- 'Josa': '쑰사', 'Punctuation': 'ꡬ두점',
696
- 'Determiner': 'κ΄€ν˜•μ‚¬', 'Exclamation': '감탄사'
697
- }
698
-
699
- pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '쑰사': 0, 'ꡬ두점': 0, 'κ΄€ν˜•μ‚¬': 0, '감탄사': 0, '기타': 0}
700
-
701
- for _, pos in tagged:
702
- if pos in pos_dict:
703
- pos_counts[pos_dict[pos]] += 1
704
- elif pos.startswith('N'): # 기타 λͺ…사λ₯˜
705
- pos_counts['λͺ…사'] += 1
706
- elif pos.startswith('V'): # 기타 동사λ₯˜
707
- pos_counts['동사'] += 1
708
- else:
709
- pos_counts['기타'] += 1
710
 
 
711
  else:
712
- # μ˜μ–΄ POS νƒœκΉ…
713
- tokens = word_tokenize(content.lower())
714
- tagged = pos_tag(tokens)
715
-
716
- # μ˜μ–΄ ν’ˆμ‚¬ λ§€ν•‘
717
- pos_dict = {
718
- 'NN': 'λͺ…사', 'NNS': 'λͺ…사', 'NNP': '고유λͺ…사', 'NNPS': '고유λͺ…사',
719
- 'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
720
- 'JJ': 'ν˜•μš©μ‚¬', 'JJR': 'ν˜•μš©μ‚¬', 'JJS': 'ν˜•μš©μ‚¬',
721
- 'RB': '뢀사', 'RBR': '뢀사', 'RBS': '뢀사'
722
- }
723
-
724
- pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '기타': 0}
725
-
726
- for _, pos in tagged:
727
- if pos in pos_dict:
728
- pos_counts[pos_dict[pos]] += 1
729
- else:
730
- pos_counts['기타'] += 1
 
 
731
 
732
  # κ²°κ³Ό μ‹œκ°ν™”
733
  pos_df = pd.DataFrame({
@@ -750,14 +847,10 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
750
  if st.session_state.openai_api_key:
751
  with st.spinner("κΈ°μ‚¬μ˜ 감정을 뢄석 μ€‘μž…λ‹ˆλ‹€..."):
752
  try:
753
- # 감정 뢄석 API 호좜 전에 ν‚€ 확인 및 μ„€μ •
754
- if not openai.api_key:
755
- if st.session_state.openai_api_key:
756
- openai.api_key = st.session_state.openai_api_key
757
- else:
758
- st.error("OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
759
- st.stop()
760
-
761
  response = openai.chat.completions.create(
762
  model="gpt-4.1-mini",
763
  messages=[
@@ -829,7 +922,7 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
829
  fill_color = 'rgba(158, 158, 158, 0.3)' # μ—°ν•œ νšŒμƒ‰
830
  line_color = 'rgba(158, 158, 158, 1)' # μ§„ν•œ νšŒμƒ‰
831
 
832
- # λ ˆμ΄λ” 차트 데이터 μ€€λΉ„ - λ§ˆμ§€λ§‰ 점이 첫 점과 μ—°κ²°λ˜λ„λ‘ 데이터 μΆ”κ°€
833
  radar_keywords = keyword_names.copy()
834
  radar_scores = keyword_scores.copy()
835
 
@@ -941,7 +1034,8 @@ elif menu == "μƒˆ 기사 μƒμ„±ν•˜κΈ°":
941
  with st.expander("원본 기사 λ‚΄μš©"):
942
  st.write(selected_article['content'])
943
 
944
- prompt_text ="""λ‹€μŒ 기사 양식을 λ”°λΌμ„œ λ‹€μ‹œ μž‘μ„±ν•΄μ€˜.
 
945
  μ—­ν• : 당신은 μ‹ λ¬Έμ‚¬μ˜ κΈ°μžμž…λ‹ˆλ‹€.
946
  μž‘μ—…: 졜근 μΌμ–΄λ‚œ 사건에 λŒ€ν•œ λ³΄λ„μžλ£Œλ₯Ό μž‘μ„±ν•΄μ•Ό ν•©λ‹ˆλ‹€. μžλ£ŒλŠ” 사싀을 기반으둜 ν•˜λ©°, 객관적이고 μ •ν™•ν•΄μ•Ό ν•©λ‹ˆλ‹€.
947
  μ§€μΉ¨:
@@ -949,14 +1043,13 @@ elif menu == "μƒˆ 기사 μƒμ„±ν•˜κΈ°":
949
  기사 제λͺ©μ€ 주제λ₯Ό λͺ…ν™•νžˆ λ°˜μ˜ν•˜κ³  λ…μžμ˜ 관심을 끌 수 μžˆλ„λ‘ μž‘μ„±ν•©λ‹ˆλ‹€.
950
  기사 λ‚΄μš©μ€ μ •ν™•ν•˜κ³  κ°„κ²°ν•˜λ©° 섀득λ ₯ μžˆλŠ” λ¬Έμž₯으둜 κ΅¬μ„±ν•©λ‹ˆλ‹€.
951
  κ΄€λ ¨μžμ˜ 인터뷰λ₯Ό 인용 ν˜•νƒœλ‘œ λ„£μ–΄μ£Όμ„Έμš”.
952
- μœ„μ˜ 정보와 지침을 μ°Έκ³ ν•˜μ—¬ μ‹ λ¬Έ λ³΄λ„μžλ£Œ ν˜•μ‹μ˜ 기사λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”"""
953
 
954
  # 이미지 생성 μ—¬λΆ€ 선택 μ˜΅μ…˜ μΆ”κ°€
955
  generate_image_too = st.checkbox("기사 생성 ν›„ 이미지도 ν•¨κ»˜ μƒμ„±ν•˜κΈ°", value=True)
956
 
957
  if st.button("μƒˆ 기사 μƒμ„±ν•˜κΈ°"):
958
  if st.session_state.openai_api_key:
959
- # openai.api_key = st.session_state.openai_api_key # 이미 μƒλ‹¨μ—μ„œ 섀정됨 λ˜λŠ” 각 ν•¨μˆ˜ 호좜 μ‹œ μ„€μ •
960
  with st.spinner("기사λ₯Ό 생성 μ€‘μž…λ‹ˆλ‹€..."):
961
  new_article = generate_article(selected_article['content'], prompt_text)
962
 
@@ -975,13 +1068,6 @@ elif menu == "μƒˆ 기사 μƒμ„±ν•˜κΈ°":
975
  """
976
 
977
  # 이미지 생성
978
- # 이미지 생성 API 호좜 전에 ν‚€ 확인 및 μ„€μ •
979
- if not openai.api_key:
980
- if st.session_state.openai_api_key:
981
- openai.api_key = st.session_state.openai_api_key
982
- else:
983
- st.error("OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
984
- st.stop()
985
  image_url = generate_image(image_prompt)
986
 
987
  if image_url and not image_url.startswith("이미지 생성 였λ₯˜") and not image_url.startswith("였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."):
@@ -1157,7 +1243,7 @@ elif menu == "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°":
1157
  files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
1158
  if files:
1159
  st.subheader("μˆ˜μ§‘λœ 파일 μ—΄κΈ°")
1160
- selected_file = st.selectbox("파일 선택", files, index=len(files)-1 if files else 0) # filesκ°€ λΉ„μ–΄μžˆμ„ 경우 λŒ€λΉ„
1161
  if selected_file and st.button("파일 λ‚΄μš© 보기"):
1162
  with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
1163
  articles = json.load(f)
@@ -1175,4 +1261,4 @@ elif menu == "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°":
1175
 
1176
  # ν‘Έν„°
1177
  st.markdown("---")
1178
- st.markdown("© λ‰΄μŠ€ 기사 도ꡬ @conanssam")
 
4
  from bs4 import BeautifulSoup
5
  import re
6
  import time
 
 
 
 
7
  import json
8
  import os
9
  from datetime import datetime, timedelta
 
 
10
  import traceback
11
  import plotly.graph_objects as go
12
  import schedule
13
  import threading
14
  import matplotlib.pyplot as plt
15
+ from pathlib import Path
16
+ import openai
17
+ from dotenv import load_dotenv
18
+
19
+ # ν—ˆκΉ…νŽ˜μ΄μŠ€ Spaces ν™˜κ²½μ— 맞게 μž„μ‹œ 디렉토리 μ„€μ •
20
+ # /tmp ν΄λ”λŠ” μ‘΄μž¬ν•  수 μžˆμ§€λ§Œ κΆŒν•œ λ¬Έμ œκ°€ μžˆμ„ 수 μžˆμœΌλ―€λ‘œ ν˜„μž¬ μž‘μ—… 디렉토리 기반으둜 λ³€κ²½
21
+ CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
22
+ DATA_DIR = os.path.join(CURRENT_DIR, "data")
23
+ NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
24
+ SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
25
+ SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
26
+
27
+ # 디렉토리 생성 ν•¨μˆ˜
28
+ def ensure_directory(directory):
29
+ try:
30
+ os.makedirs(directory, exist_ok=True)
31
+ return True
32
+ except Exception as e:
33
+ st.error(f"디렉토리 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
34
+ return False
35
+
36
+ # ν•„μš”ν•œ λͺ¨λ“  디렉토리 생성
37
+ ensure_directory(DATA_DIR)
38
+ ensure_directory(NLTK_DATA_DIR)
39
+ ensure_directory(SCHEDULED_NEWS_DIR)
40
 
41
+ # NLTK μ„€μ • - κΆŒν•œ 문제 해결을 μœ„ν•΄ μ‚¬μš©μž μ§€μ • 디렉토리 μ‚¬μš©
42
+ import nltk
43
+ nltk.data.path.append(NLTK_DATA_DIR)
44
+
45
+ # ν•„μš”ν•œ NLTK 데이터 λ‹€μš΄λ‘œλ“œ (κΆŒν•œ 문제 ν•΄κ²°)
46
+ try:
47
+ # μ‚¬μš©μž μ§€μ • 디렉토리에 데이터 λ‹€μš΄λ‘œλ“œ
48
+ try:
49
+ nltk.data.find('tokenizers/punkt')
50
+ except LookupError:
51
+ nltk.download('punkt', download_dir=NLTK_DATA_DIR)
52
+
53
+ try:
54
+ nltk.data.find('corpora/stopwords')
55
+ except LookupError:
56
+ nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
57
+ except Exception as e:
58
+ st.warning(f"NLTK 데이터 λ‹€μš΄λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {str(e)}. κΈ°λ³Έ ν† ν¬λ‚˜μ΄μ§• 방식을 μ‚¬μš©ν•©λ‹ˆλ‹€.")
59
+
60
+ # ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ§•μ„ μœ„ν•œ λŒ€μ²΄ ν•¨μˆ˜ (KoNLPy λŒ€μ‹  μ‚¬μš©)
61
+ def tokenize_korean(text):
62
+ try:
63
+ # 1. λ¨Όμ € transformers λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ–΄ μžˆλŠ”μ§€ 확인
64
+ try:
65
+ from transformers import AutoTokenizer
66
+ tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
67
+ return tokenizer.tokenize(text)
68
+ except (ImportError, Exception) as e:
69
+ pass  # Streamlit에 st.debugκ°€ μ—†μœΌλ―€λ‘œ Transformers λ‘œλ“œ μ‹€νŒ¨ μ‹œ 쑰용히 λ‹€μŒ ν† ν¬λ‚˜μ΄μ €λ‘œ 폴백
70
+
71
+ # 2. soynlp μ‹œλ„
72
+ try:
73
+ from soynlp.tokenizer import LTokenizer
74
+ tokenizer = LTokenizer()
75
+ return tokenizer.tokenize(text)
76
+ except (ImportError, Exception) as e:
77
+ pass  # soynlp λ‘œλ“œ μ‹€νŒ¨ μ‹œ λ‹€μŒ ν† ν¬λ‚˜μ΄μ €λ‘œ 폴백
78
+
79
+ # 3. kss μ‹œλ„
80
+ try:
81
+ import kss
82
+ tokens = []
83
+ for sentence in kss.split_sentences(text):
84
+ tokens.extend(sentence.split())
85
+ return tokens
86
+ except (ImportError, Exception) as e:
87
+ pass  # kss λ‘œλ“œ μ‹€νŒ¨ μ‹œ μ •κ·œμ‹ ν† ν¬λ‚˜μ΄μ €λ‘œ 폴백
88
+ except Exception as e:
89
+ pass  # λͺ¨λ“  ν† ν¬λ‚˜μ΄μ € μ‹€νŒ¨ μ‹œ μ•„λž˜ μ •κ·œμ‹ 폴백 μ‚¬μš©
90
+
91
+ # 4. κΈ°λ³Έ μ •κ·œμ‹ 기반 ν† ν¬λ‚˜μ΄μ € - λͺ¨λ“  방법이 μ‹€νŒ¨ν–ˆμ„ λ•Œ 폴백
92
+ return re.findall(r'[κ°€-힣]+|[a-zA-Z]+|[0-9]+|[^\sκ°€-힣a-zA-Z0-9]+', text)
93
 
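A quick way to sanity-check the fallback chain above: with none of the optional tokenizers (transformers, soynlp, kss) installed, tokenize_korean() falls through to the regex branch. A minimal sketch:

    # Assumes no optional tokenizer is installed, so the regex fallback is exercised.
    sample = "μ˜€ν”ˆAIκ°€ μƒˆ λͺ¨λΈμ„ κ³΅κ°œν–ˆλ‹€ in 2024"
    print(tokenize_korean(sample))
    # -> ['μ˜€ν”ˆ', 'AI', 'κ°€', 'μƒˆ', 'λͺ¨λΈμ„', 'κ³΅κ°œν–ˆλ‹€', 'in', '2024']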
94
+ # μ›Œλ“œν΄λΌμš°λ“œ μΆ”κ°€ (선택적 μ‚¬μš©)
95
  try:
96
  from wordcloud import WordCloud
97
+ wordcloud_available = True
98
  except ImportError:
99
+ wordcloud_available = False
100
+
 
101
  # μŠ€μΌ€μ€„λŸ¬ μƒνƒœ 클래슀 μΆ”κ°€
102
  class SchedulerState:
103
  def __init__(self):
 
128
  load_dotenv() # 둜컬 .env 파일
129
  st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')
130
 
 
 
131
  # νŽ˜μ΄μ§€ μ„€μ •
132
  st.set_page_config(page_title="λ‰΄μŠ€ 기사 도ꡬ", page_icon="📰", layout="wide")
133
 
 
138
  ["λ‰΄μŠ€ 기사 크둀링", "기사 λΆ„μ„ν•˜κΈ°", "μƒˆ 기사 μƒμ„±ν•˜κΈ°", "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°"]
139
  )
140
 
141
+ # OpenAI API ν‚€ μž…λ ₯ (μ‚¬μ΄λ“œλ°”)
142
+ openai_api_key = st.sidebar.text_input("OpenAI API ν‚€ (선택사항)",
143
+ value=st.session_state.openai_api_key if st.session_state.openai_api_key else "",
144
+ type="password")
145
+ if openai_api_key:
146
+ st.session_state.openai_api_key = openai_api_key
147
+ openai.api_key = openai_api_key
148
+
 
 
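Note on the key handling above: assigning openai.api_key at module level works with the openai 1.x SDK, but the same key can also be passed to an explicit client object. A sketch of that alternative (not part of this commit):

    from openai import OpenAI

    # Hypothetical client-style setup using the key kept in session state.
    client = OpenAI(api_key=st.session_state.openai_api_key)
    # client.chat.completions.create(...) would then replace openai.chat.completions.create(...)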
 
149
  # μ €μž₯된 기사λ₯Ό λΆˆλŸ¬μ˜€λŠ” ν•¨μˆ˜
150
  def load_saved_articles():
151
  try:
 
152
  if os.path.exists(SAVED_ARTICLES_PATH):
153
  with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
154
  return json.load(f)
 
160
  # 기사λ₯Ό μ €μž₯ν•˜λŠ” ν•¨μˆ˜
161
  def save_articles(articles):
162
  try:
 
163
  with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
164
  json.dump(articles, f, ensure_ascii=False, indent=2)
165
+ return True
 
166
  except Exception as e:
167
  st.error(f"기사 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
168
  return False
 
169
 
170
  @st.cache_data
171
  def crawl_naver_news(keyword, num_articles=5):
 
252
  except Exception as e:
253
  return f"였λ₯˜ λ°œμƒ: {str(e)}"
254
 
255
+ # NLTKλ₯Ό μ΄μš©ν•œ ν‚€μ›Œλ“œ 뢄석 (ν•œκ΅­μ–΄ λŒ€μ‘ μΆ”κ°€)
256
  def analyze_keywords(text, top_n=10):
257
+ # ν•œκ΅­μ–΄ λΆˆμš©μ–΄ λͺ©λ‘
258
+ korean_stopwords = [
259
+ '이', 'κ·Έ', 'μ €', '것', '및', 'λ“±', 'λ₯Ό', '을', '에', 'μ—μ„œ', '의', '으둜', '둜',
260
+ 'μ—κ²Œ', '뿐', 'λ‹€', 'λŠ”', 'κ°€', '이닀', 'μ—κ²Œμ„œ', '께', 'κ»˜μ„œ', 'λΆ€ν„°', 'κΉŒμ§€'
261
+ ]
262
+
263
+ # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
264
+ is_korean = bool(re.search(r'[κ°€-힣]', text))
265
+
266
+ if is_korean:
267
+ # ν•œκ΅­μ–΄ ν…μŠ€νŠΈμΈ 경우 ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
268
+ tokens = tokenize_korean(text)
269
+ else:
270
+ # ν•œκΈ€μ΄ μ—†λŠ” 경우 NLTK ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
271
+ try:
272
+ from nltk.tokenize import word_tokenize
273
+ tokens = word_tokenize(text)
274
+ except Exception:
275
+ # NLTKκ°€ μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ ν† ν¬λ‚˜μ΄μ €λ‘œ λŒ€μ²΄
276
+ tokens = re.findall(r'\b\w+\b', text.lower())
277
 
278
+ # λΆˆμš©μ–΄ 필터링
279
+ tokens = [word for word in tokens if len(word) > 1 and word.lower() not in korean_stopwords]
280
 
281
+ # λΉˆλ„ 계산
282
+ from collections import Counter
283
  word_count = Counter(tokens)
284
  top_keywords = word_count.most_common(top_n)
285
 
286
  return top_keywords
287
 
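For reference, analyze_keywords() returns a list of (word, count) pairs, most frequent first; the exact tokens depend on which tokenizer is available. A small example:

    text = "삼성전자 μ‹€μ  λ°œν‘œ 삼성전자 주가 κΈ‰λ“±"
    print(analyze_keywords(text, top_n=2))
    # e.g. [('삼성전자', 2), ('μ‹€μ ', 1)]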
288
+ # μ›Œλ“œ ν΄λΌμš°λ“œμš© 뢄석
289
  def extract_keywords_for_wordcloud(text, top_n=50):
290
  if not text or len(text.strip()) < 10:
291
  return {}
292
 
293
  try:
294
+ # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
295
+ is_korean = bool(re.search(r'[κ°€-힣]', text))
 
 
 
296
 
297
+ if is_korean:
298
+ # ν•œκ΅­μ–΄ ν…μŠ€νŠΈμΈ 경우 ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
299
+ tokens = tokenize_korean(text.lower())
300
+ else:
301
+ # μ˜μ–΄ λ˜λŠ” 기타 μ–Έμ–΄λŠ” NLTK μ‚¬μš© μ‹œλ„
302
+ try:
303
+ from nltk.tokenize import word_tokenize
304
+ tokens = word_tokenize(text.lower())
305
+ except Exception:
306
+ # μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ ν† ν¬λ‚˜μ΄μ§•
307
+ tokens = text.lower().split()
308
+
309
+ # λΆˆμš©μ–΄ μ„€μ •
310
  stop_words = set()
311
+
312
+ # μ˜μ–΄ λΆˆμš©μ–΄ (NLTK 있으면 μ‚¬μš©)
313
  try:
314
+ from nltk.corpus import stopwords
315
  stop_words = set(stopwords.words('english'))
316
  except Exception:
317
+ # κΈ°λ³Έ μ˜μ–΄ 뢈용���
318
+ stop_words = {
319
+ 'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
320
+ 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
321
+ 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
322
+ 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
323
+ 'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
324
+ 'will', 'shall', 'can', 'may', 'must', 'ought'
325
+ }
326
 
327
+ # ν•œκ΅­μ–΄ λΆˆμš©μ–΄
328
  korea_stop_words = {
329
  '및', 'λ“±', 'λ₯Ό', '이', '의', 'κ°€', '에', 'λŠ”', '으둜', 'μ—μ„œ', 'κ·Έ', '또', 'λ˜λŠ”', 'ν•˜λŠ”', 'ν• ', 'ν•˜κ³ ',
330
+ 'μžˆλ‹€', '이닀', 'μœ„ν•΄', '것이닀', '것은', 'λŒ€ν•œ', 'λ•Œλ¬Έ', '그리고', 'ν•˜μ§€λ§Œ', 'κ·ΈλŸ¬λ‚˜', 'κ·Έλž˜μ„œ',
331
+ 'μž…λ‹ˆλ‹€', 'ν•©λ‹ˆλ‹€', 'μŠ΅λ‹ˆλ‹€', 'μš”', 'μ£ ', 'κ³ ', 'κ³Ό', '와', '도', '은', '수', '것', 'λ“€', '제', 'μ €',
332
+ 'λ…„', 'μ›”', '일', 'μ‹œ', 'λΆ„', '초', 'μ§€λ‚œ', 'μ˜¬ν•΄', 'λ‚΄λ…„', '졜근', 'ν˜„μž¬', '였늘', '내일', 'μ–΄μ œ',
333
+ 'μ˜€μ „', 'μ˜€ν›„', 'λΆ€ν„°', 'κΉŒμ§€', 'μ—κ²Œ', 'κ»˜μ„œ', '이라고', '라고', 'ν•˜λ©°', 'ν•˜λ©΄μ„œ', '따라', '톡해',
334
+ 'κ΄€λ ¨', 'ν•œνŽΈ', '특히', 'κ°€μž₯', '맀우', '더', '덜', '많이', '쑰금', '항상', '자주', '가끔', '거의',
335
+ 'μ „ν˜€', 'λ°”λ‘œ', '정말', 'λ§Œμ•½', 'λΉ„λ‘―ν•œ', '등을', '등이', 'λ“±μ˜', 'λ“±κ³Ό', '등도', '등에', 'λ“±μ—μ„œ',
336
+ '기자', 'λ‰΄μŠ€', '사진', 'μ—°ν•©λ‰΄μŠ€', 'λ‰΄μ‹œμŠ€', '제곡', '무단', 'μ „μž¬', '재배포', 'κΈˆμ§€', '액컀', '멘트',
337
+ '일보', '데일리', '경제', 'μ‚¬νšŒ', 'μ •μΉ˜', '세계', 'κ³Όν•™', '아이티', 'λ‹·μ»΄', '씨넷', 'λΈ”λ‘œν„°', 'μ „μžμ‹ λ¬Έ'
338
  }
339
  stop_words.update(korea_stop_words)
340
 
 
356
  return dict(sorted_words[:top_n])
357
 
358
  except Exception as e:
359
+ st.error(f"ν‚€μ›Œλ“œ μΆ”μΆœ 쀑 였λ₯˜λ°œμƒ {str(e)}")
360
  return {"data": 1, "analysis": 1, "news": 1}
 
361
 
362
  # μ›Œλ“œ ν΄λΌμš°λ“œ 생성 ν•¨μˆ˜
 
363
  def generate_wordcloud(keywords_dict):
364
+ if not wordcloud_available:
365
+ st.warning("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μœ„ν•œ λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
366
+ return None
367
+
368
+ try:
369
+ # λ‚˜λˆ”κ³ λ”• 폰트 확인 (μ—†μœΌλ©΄ κΈ°λ³Έ 폰트 μ‚¬μš©)
370
+ font_path = os.path.join(CURRENT_DIR, "NanumGothic.ttf")
371
+ if not os.path.exists(font_path):
372
+ # κΈ°λ³Έ 폰트 μ‚¬μš©
373
+ wc = WordCloud(
374
+ width=800,
375
+ height=400,
376
+ background_color='white',
377
+ colormap='viridis',
378
+ max_font_size=150,
379
+ random_state=42
380
+ ).generate_from_frequencies(keywords_dict)
381
+ else:
382
+ # λ‚˜λˆ”κ³ λ”• 폰트 μ‚¬μš©
383
+ wc = WordCloud(
384
+ font_path=font_path,
385
+ width=800,
386
+ height=400,
387
+ background_color='white',
388
+ colormap='viridis',
389
+ max_font_size=150,
390
+ random_state=42
391
+ ).generate_from_frequencies(keywords_dict)
 
 
392
 
393
+ return wc
394
 
395
+ except Exception as e:
396
+ st.error(f"μ›Œλ“œν΄λΌμš°λ“œ 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
397
+ return None
 
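generate_wordcloud() feeds its input straight into WordCloud.generate_from_frequencies(), so any plain word-to-weight mapping works. A minimal sketch with made-up weights:

    freqs = {"λ‰΄μŠ€": 12, "경제": 8, "AI": 5}   # hypothetical frequencies
    wc = generate_wordcloud(freqs)            # WordCloud object, or None on failure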
 
398
 
399
  # λ‰΄μŠ€ 뢄석 ν•¨μˆ˜
400
  def analyze_news_content(news_df):
 
402
  return "데이터가 μ—†μŠ΅λ‹ˆλ‹€"
403
 
404
  results = {}
405
+
406
+ # μΉ΄ν…Œκ³ λ¦¬λ³„ 뢄석
407
  if 'source' in news_df.columns:
408
+ results['source_counts'] = news_df['source'].value_counts().to_dict()
 
409
  if 'date' in news_df.columns:
410
+ results['date_counts'] = news_df['date'].value_counts().to_dict()
411
 
412
+ # ν‚€μ›Œλ“œ 뢄석
413
  all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))
414
 
415
  if len(all_text.strip()) > 0:
416
+ results['top_keywords_for_wordcloud'] = extract_keywords_for_wordcloud(all_text, top_n=50)
417
  results['top_keywords'] = analyze_keywords(all_text)
418
  else:
419
+ results['top_keywords_for_wordcloud'] = {}
420
  results['top_keywords'] = []
421
+
422
  return results
423
 
424
  # OpenAI APIλ₯Ό μ΄μš©ν•œ μƒˆ 기사 생성
425
  def generate_article(original_content, prompt_text):
426
  if not st.session_state.openai_api_key:
427
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
428
+
429
  try:
430
+ # API ν‚€ μ„€μ •
431
+ openai.api_key = st.session_state.openai_api_key
432
+
433
+ # API 호좜
434
  response = openai.chat.completions.create(
435
+ model="gpt-4.1-mini", # λ˜λŠ” λ‹€λ₯Έ μ‚¬μš© κ°€λŠ₯ν•œ λͺ¨λΈ
436
  messages=[
437
  {"role": "system", "content": "당신은 전문적인 λ‰΄μŠ€ κΈ°μžμž…λ‹ˆλ‹€. μ£Όμ–΄μ§„ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ μƒˆλ‘œμš΄ 기사λ₯Ό μž‘μ„±ν•΄μ£Όμ„Έμš”."},
438
  {"role": "user", "content": f"λ‹€μŒ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ {prompt_text}\n\n{original_content[:1000]}"}
 
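The hunk above is truncated before the response is read back; with the 1.x SDK that step typically looks like the following (a sketch, not taken from this commit):

    # Pull the generated article text out of the chat.completions response.
    article_text = response.choices[0].message.content
    return article_text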
447
  def generate_image(prompt):
448
  if not st.session_state.openai_api_key:
449
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
450
+
451
  try:
452
+ # API ν‚€ μ„€μ •
453
+ openai.api_key = st.session_state.openai_api_key
454
+
455
+ # API 호좜
456
  response = openai.images.generate(
457
  model="gpt-image-1",
458
  prompt=prompt
459
  )
460
+ image_base64 = response.data[0].b64_json
461
  return f"data:image/png;base64,{image_base64}"
462
  except Exception as e:
463
  return f"이미지 생성 였λ₯˜: {str(e)}"
 
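generate_image() returns a data: URI string on success and an error message otherwise. One way to render it, assuming the caller wants to show the image immediately (sketch only):

    import base64

    image_url = generate_image("a newspaper front page, flat illustration")
    if image_url.startswith("data:image/png;base64,"):
        image_bytes = base64.b64decode(image_url.split(",", 1)[1])
        st.image(image_bytes)   # st.image accepts raw bytes
    else:
        st.error(image_url)     # propagate the error string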
489
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
490
 
491
  # κ²°κ³Ό μ €μž₯
 
 
 
 
492
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
493
  filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")
494
 
495
  try:
496
  with open(filename, 'w', encoding='utf-8') as f:
497
  json.dump(articles, f, ensure_ascii=False, indent=2)
 
 
498
  except Exception as e:
499
  print(f"파일 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
500
  return
 
616
  articles = crawl_naver_news(keyword, num_articles)
617
 
618
  # 기사 λ‚΄μš© κ°€μ Έμ˜€κΈ°
619
+ progress_bar = st.progress(0)
620
  for i, article in enumerate(articles):
621
+ progress_bar.progress((i + 1) / len(articles))
622
  article['content'] = get_article_content(article['link'])
623
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
624
 
 
634
  st.write(f"**μš”μ•½:** {article['description']}")
635
  st.write(f"**링크:** {article['link']}")
636
  st.write("**λ³Έλ¬Έ 미리보기:**")
637
+ st.write(article['content'][:300] + "..." if len(article['content']) > 300 else article['content'])
638
 
639
  elif menu == "기사 λΆ„μ„ν•˜κΈ°":
640
  st.header("기사 λΆ„μ„ν•˜κΈ°")
 
669
  keyword_tab1, keyword_tab2 = st.tabs(["ν‚€μ›Œλ“œ λΉˆλ„", "μ›Œλ“œν΄λΌμš°λ“œ"])
670
 
671
  with keyword_tab1:
 
672
  keywords = analyze_keywords(selected_article['content'])
673
 
674
  # μ‹œκ°ν™”
 
678
  st.write("**μ£Όμš” ν‚€μ›Œλ“œ:**")
679
  for word, count in keywords:
680
  st.write(f"- {word}: {count}회")
681
+
682
  with keyword_tab2:
683
  keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])
 
684
 
685
+ if wordcloud_available:
686
+ wc = generate_wordcloud(keyword_dict)
687
+
688
+ if wc:
689
+ fig, ax = plt.subplots(figsize=(10, 5))
690
+ ax.imshow(wc, interpolation='bilinear')
691
+ ax.axis('off')
692
+ st.pyplot(fig)
693
+
694
+ # ν‚€μ›Œλ“œ μƒμœ„ 20개 ν‘œμ‹œ
695
+ st.write("**μƒμœ„ 20개 ν‚€μ›Œλ“œ:**")
696
+ top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
697
+ keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
698
+ st.dataframe(keyword_df)
699
+ else:
700
+ st.error("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
701
+ else:
702
+ # μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μ‚¬μš©ν•  수 μ—†λŠ” 경우 λŒ€μ²΄ ν‘œμ‹œ
703
+ st.warning("μ›Œλ“œν΄λΌμš°λ“œ κΈ°λŠ₯을 μ‚¬μš©ν•  수 μ—†μŠ΅λ‹ˆλ‹€. ν•„μš”ν•œ νŒ¨ν‚€μ§€κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
704
 
705
+ # λŒ€μ‹  ν‚€μ›Œλ“œλ§Œ ν‘œμ‹œ
706
+ st.write("**μƒμœ„ ν‚€μ›Œλ“œ:**")
707
+ top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:30]
708
  keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
709
  st.dataframe(keyword_df)
710
+
711
+ # λ§‰λŒ€ 차트둜 ν‘œμ‹œ
712
+ st.bar_chart(keyword_df.set_index('ν‚€μ›Œλ“œ').head(15))
713
 
714
  elif analysis_type == "ν…μŠ€νŠΈ 톡계":
715
  if st.button("ν…μŠ€νŠΈ 톡계 뢄석"):
 
744
  st.write(f"ν…μŠ€νŠΈ λ³΅μž‘μ„± 점수: {complexity_score:.1f}/10")
745
 
746
  # μΆœν˜„ λΉˆλ„ λ§‰λŒ€ κ·Έλž˜ν”„
747
+ st.subheader("ν’ˆμ‚¬λ³„ 뢄포")
748
+
749
+ # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
750
+ is_korean = bool(re.search(r'[κ°€-힣]', content))
751
+
752
  try:
753
+ # μ˜μ–΄/ν•œκ΅­μ–΄ 토큰화 및 ν’ˆμ‚¬ 뢄석
754
+ if is_korean:
755
+ # ν•œκ΅­μ–΄μΈ 경우 (κ°„λ‹¨ν•œ ν˜•νƒœμ†Œ μœ μ‚¬ 뢄석)
756
+ try:
757
+ # transformers ν† ν¬λ‚˜μ΄μ € μ‹œλ„
758
+ try:
759
+ from transformers import AutoTokenizer
760
+ tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
761
+ tokens = tokenizer.tokenize(content[:5000]) # λ„ˆλ¬΄ κΈ΄ ν…μŠ€νŠΈλŠ” μž˜λΌμ„œ 뢄석
762
+
763
+ # κ°„λ‹¨ν•œ νŒ¨ν„΄ 맀칭으둜 ν’ˆμ‚¬ μΆ”μ •
764
+ pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '기타': 0}
 
 
765
 
766
+ for token in tokens:
767
+ if token.endswith("λ‹€") or token.endswith("μš”"):
768
+ pos_counts['동사'] += 1
769
+ elif token.endswith("게") or token.endswith("히"):
770
+ pos_counts['뢀사'] += 1
771
+ elif token.endswith("은") or token.endswith("λŠ”") or token.endswith("이") or token.endswith("κ°€"):
772
+ pos_counts['λͺ…사'] += 1
773
+ else:
774
+ if len(token) > 1:
775
+ pos_counts['λͺ…사'] += 1
776
+ else:
777
+ pos_counts['기타'] += 1
778
+
779
+ except Exception:
780
+ # μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ ν† ν°ν™”λ‘œ λŒ€μ²΄
781
+ tokens = tokenize_korean(content[:5000])
782
+ pos_counts = {
783
+ 'λͺ…사λ₯˜': len([t for t in tokens if len(t) > 1 and not any(t.endswith(s) for s in ["λ‹€", "μš”", "게", "히", "은", "λŠ”"])]),
784
+ '기타': len([t for t in tokens if len(t) <= 1 or any(t.endswith(s) for s in ["λ‹€", "μš”", "게", "히", "은", "λŠ”"])])
785
+ }
786
+ except Exception as e:
787
+ st.error(f"ν•œκ΅­μ–΄ ν’ˆμ‚¬ 뢄석 μ‹€νŒ¨: {str(e)}")
788
+ pos_counts = {'데이터': len(content) // 10, '뢄석': len(content) // 15, '였λ₯˜': len(content) // 20}
789
  else:
790
+ # μ˜μ–΄ λ¬Έμ„œμΈ 경우 (NLTK μ‹œλ„)
791
+ try:
792
+ from nltk import pos_tag
793
+ from nltk.tokenize import word_tokenize
794
+
795
+ # ν•„μš”ν•œ 데이터 λ‹€μš΄λ‘œλ“œ
796
+ try:
797
+ nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
798
+ except Exception:
799
+ pass
800
+
801
+ tokens = word_tokenize(content[:5000])
802
+ tagged = pos_tag(tokens)
803
+
804
+ # μ˜μ–΄ ν’ˆμ‚¬ λ§€ν•‘
805
+ pos_dict = {
806
+ 'NN': 'λͺ…사', 'NNS': 'λͺ…사', 'NNP': '고유λͺ…사', 'NNPS': '고유λͺ…사',
807
+ 'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
808
+ 'JJ': 'ν˜•μš©μ‚¬', 'JJR': 'ν˜•μš©μ‚¬', 'JJS': 'ν˜•μš©μ‚¬',
809
+ 'RB': '뢀사', 'RBR': '뢀사', 'RBS': '뢀사'
810
+ }
811
+
812
+ pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '기타': 0}
813
+
814
+ for _, pos in tagged:
815
+ if pos in pos_dict:
816
+ pos_counts[pos_dict[pos]] += 1
817
+ else:
818
+ pos_counts['기타'] += 1
819
+ except Exception:
820
+ # μ‹€νŒ¨ν•˜λ©΄ κ°„λ‹¨ν•œ κ·œμΉ™μœΌλ‘œ ν’ˆμ‚¬ μœ μΆ”
821
+ tokens = re.findall(r'\b\w+\b', content.lower())
822
+ pos_counts = {
823
+ 'λͺ…사': len([t for t in tokens if not t.endswith(('ly', 'ing', 'ed'))]),
824
+ '동사': len([t for t in tokens if t.endswith(('ing', 'ed', 's'))]),
825
+ '뢀사': len([t for t in tokens if t.endswith('ly')]),
826
+ '기타': len([t for t in tokens if len(t) <= 2])
827
+ }
828
 
829
  # κ²°κ³Ό μ‹œκ°ν™”
830
  pos_df = pd.DataFrame({
 
847
  if st.session_state.openai_api_key:
848
  with st.spinner("κΈ°μ‚¬μ˜ 감정을 뢄석 μ€‘μž…λ‹ˆλ‹€..."):
849
  try:
850
+ # API ν‚€ μ„€μ •
851
+ openai.api_key = st.session_state.openai_api_key
852
+
853
+ # API 호좜
 
 
 
 
854
  response = openai.chat.completions.create(
855
  model="gpt-4.1-mini",
856
  messages=[
 
922
  fill_color = 'rgba(158, 158, 158, 0.3)' # μ—°ν•œ νšŒμƒ‰
923
  line_color = 'rgba(158, 158, 158, 1)' # μ§„ν•œ νšŒμƒ‰
924
 
925
+ # λ ˆμ΄λ” 차트 데이터 μ€€λΉ„
926
  radar_keywords = keyword_names.copy()
927
  radar_scores = keyword_scores.copy()
928
 
 
1034
  with st.expander("원본 기사 λ‚΄μš©"):
1035
  st.write(selected_article['content'])
1036
 
1037
+ prompt_text = st.text_area("생성 μ§€μΉ¨",
1038
+ """λ‹€μŒ 기사 양식을 λ”°λΌμ„œ λ‹€μ‹œ μž‘μ„±ν•΄μ€˜.
1039
  μ—­ν• : 당신은 μ‹ λ¬Έμ‚¬μ˜ κΈ°μžμž…λ‹ˆλ‹€.
1040
  μž‘μ—…: 졜근 μΌμ–΄λ‚œ 사건에 λŒ€ν•œ λ³΄λ„μžλ£Œλ₯Ό μž‘μ„±ν•΄μ•Ό ν•©λ‹ˆλ‹€. μžλ£ŒλŠ” 사싀을 기반으둜 ν•˜λ©°, 객관적이고 μ •ν™•ν•΄μ•Ό ν•©λ‹ˆλ‹€.
1041
  μ§€μΉ¨:
 
1043
  기사 제λͺ©μ€ 주제λ₯Ό λͺ…ν™•νžˆ λ°˜μ˜ν•˜κ³  λ…μžμ˜ 관심을 끌 수 μžˆλ„λ‘ μž‘μ„±ν•©λ‹ˆλ‹€.
1044
  기사 λ‚΄μš©μ€ μ •ν™•ν•˜κ³  κ°„κ²°ν•˜λ©° 섀득λ ₯ μžˆλŠ” λ¬Έμž₯으둜 κ΅¬μ„±ν•©λ‹ˆλ‹€.
1045
  κ΄€λ ¨μžμ˜ 인터뷰λ₯Ό 인용 ν˜•νƒœλ‘œ λ„£μ–΄μ£Όμ„Έμš”.
1046
+ μœ„μ˜ 정보와 지침을 μ°Έκ³ ν•˜μ—¬ μ‹ λ¬Έ λ³΄λ„μžλ£Œ ν˜•μ‹μ˜ 기사λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”""", height=200)
1047
 
1048
  # 이미지 생성 μ—¬λΆ€ 선택 μ˜΅μ…˜ μΆ”κ°€
1049
  generate_image_too = st.checkbox("기사 생성 ν›„ 이미지도 ν•¨κ»˜ μƒμ„±ν•˜κΈ°", value=True)
1050
 
1051
  if st.button("μƒˆ 기사 μƒμ„±ν•˜κΈ°"):
1052
  if st.session_state.openai_api_key:
 
1053
  with st.spinner("기사λ₯Ό 생성 μ€‘μž…λ‹ˆλ‹€..."):
1054
  new_article = generate_article(selected_article['content'], prompt_text)
1055
 
 
1068
  """
1069
 
1070
  # 이미지 생성
 
 
 
 
 
 
 
1071
  image_url = generate_image(image_prompt)
1072
 
1073
  if image_url and not image_url.startswith("이미지 생성 였λ₯˜") and not image_url.startswith("였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."):
 
1243
  files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
1244
  if files:
1245
  st.subheader("μˆ˜μ§‘λœ 파일 μ—΄κΈ°")
1246
+ selected_file = st.selectbox("파일 선택", files, index=len(files)-1 if files else 0)
1247
  if selected_file and st.button("파일 λ‚΄μš© 보기"):
1248
  with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
1249
  articles = json.load(f)
 
1261
 
1262
  # ν‘Έν„°
1263
  st.markdown("---")
1264
+ st.markdown("© λ‰΄μŠ€ 기사 도ꡬ @conanssam")