JUNGU committed on
Commit fd4dd5b · verified
1 Parent(s): 82eb0eb

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +239 -258
src/streamlit_app.py CHANGED
@@ -4,73 +4,33 @@ import requests
4
  from bs4 import BeautifulSoup
5
  import re
6
  import time
 
 
 
 
7
  import json
8
  import os
9
  from datetime import datetime, timedelta
 
 
10
  import traceback
11
  import plotly.graph_objects as go
12
  import schedule
13
  import threading
14
  import matplotlib.pyplot as plt
15
- from pathlib import Path
16
- import openai
17
- from dotenv import load_dotenv
18
-
19
- # ν—ˆκΉ…νŽ˜μ΄μŠ€ Spaces ν™˜κ²½μ— 맞게 μž„μ‹œ 디렉토리 μ„€μ •
20
- # /tmp ν΄λ”λŠ” μ‘΄μž¬ν•  수 μžˆμ§€λ§Œ κΆŒν•œ λ¬Έμ œκ°€ μžˆμ„ 수 μžˆμœΌλ―€λ‘œ ν˜„μž¬ μž‘μ—… 디렉토리 기반으둜 λ³€κ²½
21
- CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
22
- DATA_DIR = os.path.join(CURRENT_DIR, "data")
23
- SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
24
- SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
25
-
26
- # 디렉토리 생성 ν•¨μˆ˜
27
- def ensure_directory(directory):
28
- try:
29
- os.makedirs(directory, exist_ok=True)
30
- return True
31
- except Exception as e:
32
- st.error(f"디렉토리 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
33
- return False
34
 
35
- # ν•„μš”ν•œ λͺ¨λ“  디렉토리 생성
36
- ensure_directory(DATA_DIR)
37
- ensure_directory(SCHEDULED_NEWS_DIR)
 
38
 
39
- # ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ§•μ„ μœ„ν•œ KSS μ„€μ •
40
- try:
41
- import kss
42
- kss_available = True
43
- except ImportError:
44
- st.warning("KSS λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ–΄ μžˆμ§€ μ•ŠμŠ΅λ‹ˆλ‹€. 'pip install kss'둜 μ„€μΉ˜ν•˜μ„Έμš”.")
45
- kss_available = False
46
-
47
- # ν•œκ΅­μ–΄ ν† ν¬λ‚˜μ΄μ§• ν•¨μˆ˜ (KSS μ‚¬μš©)
48
- def tokenize_korean(text):
49
- try:
50
- if kss_available:
51
- tokens = []
52
- # λ¬Έμž₯ 뢄리 ν›„ 각 λ¬Έμž₯μ—μ„œ 단어 μΆ”μΆœ
53
- for sentence in kss.split_sentences(text):
54
- # κΈ°λ³Έ 곡백 기반 토큰화에 μ •κ·œμ‹ νŒ¨ν„΄ μΆ”κ°€ν•˜μ—¬ 더 μ •κ΅ν•˜κ²Œ 처리
55
- raw_tokens = sentence.split()
56
- for token in raw_tokens:
57
- # 쑰사, 특수문자 등을 뢄리
58
- sub_tokens = re.findall(r'[κ°€-힣]+|[a-zA-Z]+|[0-9]+|[^\sκ°€-힣a-zA-Z0-9]+', token)
59
- tokens.extend(sub_tokens)
60
- return tokens
61
- except Exception as e:
62
- st.debug(f"KSS ν† ν¬λ‚˜μ΄μ§• μ‹€νŒ¨: {str(e)}")
63
-
64
- # KSS μ‚¬μš© λΆˆκ°€λŠ₯ν•˜κ±°λ‚˜ 였λ₯˜ λ°œμƒμ‹œ κΈ°λ³Έ μ •κ·œμ‹ 기반 ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
65
- return re.findall(r'[κ°€-힣]+|[a-zA-Z]+|[0-9]+|[^\sκ°€-힣a-zA-Z0-9]+', text)
66
-
67
- # μ›Œλ“œν΄λΌμš°λ“œ μΆ”κ°€ (선택적 μ‚¬μš©)
68
  try:
69
  from wordcloud import WordCloud
70
- wordcloud_available = True
71
  except ImportError:
72
- wordcloud_available = False
73
-
 
74
  # μŠ€μΌ€μ€„λŸ¬ μƒνƒœ 클래슀 μΆ”κ°€
75
  class SchedulerState:
76
  def __init__(self):
@@ -101,6 +61,31 @@ if st.session_state.openai_api_key is None:
101
  load_dotenv() # 둜컬 .env 파일
102
  st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')
103
 
104
  # νŽ˜μ΄μ§€ μ„€μ •
105
  st.set_page_config(page_title="λ‰΄μŠ€ 기사 도ꡬ", page_icon="📰", layout="wide")
106
 
@@ -111,17 +96,21 @@ menu = st.sidebar.radio(
111
  ["λ‰΄μŠ€ 기사 크둀링", "기사 λΆ„μ„ν•˜κΈ°", "μƒˆ 기사 μƒμ„±ν•˜κΈ°", "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°"]
112
  )
113
 
114
- # OpenAI API ν‚€ μž…λ ₯ (μ‚¬μ΄λ“œλ°”)
115
- openai_api_key = st.sidebar.text_input("OpenAI API ν‚€ (선택사항)",
116
- value=st.session_state.openai_api_key if st.session_state.openai_api_key else "",
117
- type="password")
118
- if openai_api_key:
119
- st.session_state.openai_api_key = openai_api_key
120
- openai.api_key = openai_api_key
121
-
 
 
 
122
  # μ €μž₯된 기사λ₯Ό λΆˆλŸ¬μ˜€λŠ” ν•¨μˆ˜
123
  def load_saved_articles():
124
  try:
 
125
  if os.path.exists(SAVED_ARTICLES_PATH):
126
  with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
127
  return json.load(f)
@@ -133,12 +122,15 @@ def load_saved_articles():
133
  # 기사λ₯Ό μ €μž₯ν•˜λŠ” ν•¨μˆ˜
134
  def save_articles(articles):
135
  try:
 
136
  with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
137
  json.dump(articles, f, ensure_ascii=False, indent=2)
138
- return True
 
139
  except Exception as e:
140
  st.error(f"기사 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
141
  return False
 
142
 
143
  @st.cache_data
144
  def crawl_naver_news(keyword, num_articles=5):
@@ -225,89 +217,49 @@ def get_article_content(url):
225
  except Exception as e:
226
  return f"였λ₯˜ λ°œμƒ: {str(e)}"
227
 
228
- # KSSλ₯Ό μ΄μš©ν•œ ν‚€μ›Œλ“œ 뢄석
229
  def analyze_keywords(text, top_n=10):
230
- # ν•œκ΅­μ–΄ λΆˆμš©μ–΄ λͺ©λ‘ (ν™•μž₯)
231
- korean_stopwords = [
232
- '이', 'κ·Έ', 'μ €', '것', '및', 'λ“±', 'λ₯Ό', '을', '에', 'μ—μ„œ', '의', '으둜', '둜',
233
- 'μ—κ²Œ', '뿐', 'λ‹€', 'λŠ”', 'κ°€', '이닀', 'μ—κ²Œμ„œ', '께', 'κ»˜μ„œ', 'λΆ€ν„°', 'κΉŒμ§€',
234
- '이런', 'μ €λŸ°', '그런', 'μ–΄λ–€', '무슨', '이것', '저것', '그것', '이번', 'μ €λ²ˆ', '그번',
235
- '이거', 'μ €κ±°', 'κ·Έκ±°', 'ν•˜λ‹€', 'λ˜λ‹€', 'μžˆλ‹€', 'μ—†λ‹€', 'κ°™λ‹€', '보닀', '이렇닀', 'κ·Έλ ‡λ‹€',
236
- 'ν•˜λŠ”', 'λ˜λŠ”', 'μžˆλŠ”', 'μ—†λŠ”', '같은', 'λ³΄λŠ”', '이런', '그런', 'μ €λŸ°', 'ν–ˆλ‹€', '됐닀',
237
- 'μžˆμ—ˆλ‹€', 'μ—†μ—ˆλ‹€', 'κ°™μ•˜λ‹€', 'λ΄€λ‹€', '또', 'λ˜ν•œ', '그리고', 'ν•˜μ§€λ§Œ', 'κ·ΈλŸ¬λ‚˜', 'κ·Έλž˜μ„œ',
238
- 'λ•Œλ¬Έμ—', 'λ”°λΌμ„œ', 'ν•˜λ©°', '되며', '있으며', 'μ—†μœΌλ©°', 'κ°™μœΌλ©°', '보며', 'ν•˜κ³ ', '되고',
239
- '있고', 'μ—†κ³ ', 'κ°™κ³ ', '보고', '톡해', 'μœ„ν•΄', 'λ•Œ', '쀑', 'ν›„'
240
- ]
241
 
242
- # μ˜μ–΄ λΆˆμš©μ–΄ λͺ©λ‘
243
- english_stopwords = [
244
- 'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
245
- 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
246
- 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
247
- 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
248
- 'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
249
- 'will', 'shall', 'can', 'may', 'must', 'ought'
250
- ]
251
 
252
- # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
253
- is_korean = bool(re.search(r'[κ°€-힣]', text))
254
-
255
- if is_korean:
256
- # ν•œκ΅­μ–΄ ν…μŠ€νŠΈμΈ 경우 KSS 기반 ν† ν¬λ‚˜μ΄μ € μ‚¬μš©
257
- tokens = tokenize_korean(text)
258
- else:
259
- # μ˜μ–΄ λ˜λŠ” 기타 μ–Έμ–΄λŠ” κ°„λ‹¨ν•œ μ •κ·œμ‹ 토큰화
260
- tokens = re.findall(r'\b\w+\b', text.lower())
261
-
262
- # λΆˆμš©μ–΄ 필터링 (언어에 따라 λ‹€λ₯Έ λΆˆμš©μ–΄ 적용)
263
- stopwords = korean_stopwords if is_korean else english_stopwords
264
- tokens = [word for word in tokens if len(word) > 1 and word.lower() not in stopwords]
265
-
266
- # λΉˆλ„ 계산
267
- from collections import Counter
268
  word_count = Counter(tokens)
269
  top_keywords = word_count.most_common(top_n)
270
 
271
  return top_keywords
272
 
273
- # μ›Œλ“œ ν΄λΌμš°λ“œμš© 뢄석
274
  def extract_keywords_for_wordcloud(text, top_n=50):
275
  if not text or len(text.strip()) < 10:
276
  return {}
277
 
278
  try:
279
- # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
280
- is_korean = bool(re.search(r'[κ°€-힣]', text))
281
-
282
- # 토큰화 (KSS μ‚¬μš©)
283
- tokens = tokenize_korean(text.lower())
284
 
285
- # λΆˆμš©μ–΄ μ„€μ •
286
- # μ˜μ–΄ λΆˆμš©μ–΄ λͺ©λ‘
287
- english_stopwords = {
288
- 'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
289
- 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
290
- 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
291
- 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
292
- 'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
293
- 'will', 'shall', 'can', 'may', 'must', 'ought'
294
- }
295
 
296
- # ν•œκ΅­μ–΄ λΆˆμš©μ–΄
297
- korean_stopwords = {
298
  '및', 'λ“±', 'λ₯Ό', '이', '의', 'κ°€', '에', 'λŠ”', '으둜', 'μ—μ„œ', 'κ·Έ', '또', 'λ˜λŠ”', 'ν•˜λŠ”', 'ν• ', 'ν•˜κ³ ',
299
- 'μžˆλ‹€', '이닀', 'μœ„ν•΄', '것이닀', '것은', 'λŒ€ν•œ', 'λ•Œλ¬Έ', '그리고', 'ν•˜μ§€λ§Œ', 'κ·ΈλŸ¬λ‚˜', 'κ·Έλž˜μ„œ',
300
- 'μž…λ‹ˆλ‹€', 'ν•©λ‹ˆλ‹€', 'μŠ΅λ‹ˆλ‹€', 'μš”', 'μ£ ', 'κ³ ', 'κ³Ό', '와', '도', '은', '수', '것', 'λ“€', '제', 'μ €',
301
- 'λ…„', 'μ›”', '일', 'μ‹œ', 'λΆ„', '초', 'μ§€λ‚œ', 'μ˜¬ν•΄', 'λ‚΄λ…„', '졜근', 'ν˜„μž¬', '였늘', '내일', 'μ–΄μ œ',
302
- 'μ˜€μ „', 'μ˜€ν›„', 'λΆ€ν„°', 'κΉŒμ§€', 'μ—κ²Œ', 'κ»˜μ„œ', '이라고', '라고', 'ν•˜λ©°', 'ν•˜λ©΄μ„œ', '따라', '톡해',
303
- 'κ΄€λ ¨', 'ν•œνŽΈ', '특히', 'κ°€μž₯', '맀우', '더', '덜', '많이', '쑰금', '항상', '자주', '가끔', '거의',
304
- 'μ „ν˜€', 'λ°”λ‘œ', '정말', 'λ§Œμ•½', 'λΉ„λ‘―ν•œ', '등을', '등이', 'λ“±μ˜', 'λ“±κ³Ό', '등도', '등에', 'λ“±μ—μ„œ',
305
- '기자', 'λ‰΄μŠ€', '사진', 'μ—°ν•©λ‰΄μŠ€', 'λ‰΄μ‹œμŠ€', '제곡', '무단', 'μ „μž¬', '재배포', 'κΈˆμ§€', '액컀', '멘트',
306
- '일보', '데일리', '경제', 'μ‚¬νšŒ', 'μ •μΉ˜', '세계', 'κ³Όν•™', '아이티', 'λ‹·μ»΄', '씨넷', 'λΈ”λ‘œν„°', 'μ „μžμ‹ λ¬Έ'
307
  }
308
-
309
- # 언어에 따라 λΆˆμš©μ–΄ 선택
310
- stop_words = korean_stopwords if is_korean else english_stopwords
311
 
312
  # 1κΈ€μž 이상이고 λΆˆμš©μ–΄κ°€ μ•„λ‹Œ ν† ν°λ§Œ 필터링
313
  filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
@@ -327,45 +279,51 @@ def extract_keywords_for_wordcloud(text, top_n=50):
327
  return dict(sorted_words[:top_n])
328
 
329
  except Exception as e:
330
- st.error(f"ν‚€μ›Œλ“œ μΆ”μΆœ 쀑 였λ₯˜λ°œμƒ {str(e)}")
331
  return {"data": 1, "analysis": 1, "news": 1}
 
332
 
333
  # μ›Œλ“œ ν΄λΌμš°λ“œ 생성 ν•¨μˆ˜
 
334
  def generate_wordcloud(keywords_dict):
335
- if not wordcloud_available:
336
- st.warning("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μœ„ν•œ λΌμ΄λΈŒλŸ¬λ¦¬κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
337
- return None
338
-
339
- try:
340
- # λ‚˜λˆ”κ³ λ”• 폰트 확인 (μ—†μœΌλ©΄ κΈ°λ³Έ 폰트 μ‚¬μš©)
341
- font_path = os.path.join(CURRENT_DIR, "NanumGothic.ttf")
342
- if not os.path.exists(font_path):
343
- # κΈ°λ³Έ 폰트 μ‚¬μš©
344
- wc = WordCloud(
345
- width=800,
346
- height=400,
347
- background_color='white',
348
- colormap='viridis',
349
- max_font_size=150,
350
- random_state=42
351
- ).generate_from_frequencies(keywords_dict)
352
- else:
353
- # λ‚˜λˆ”κ³ λ”• 폰트 μ‚¬μš©
354
- wc = WordCloud(
355
- font_path=font_path,
356
- width=800,
357
- height=400,
358
- background_color='white',
359
- colormap='viridis',
360
- max_font_size=150,
361
- random_state=42
362
- ).generate_from_frequencies(keywords_dict)
 
 
363
 
364
- return wc
365
 
366
- except Exception as e:
367
- st.error(f"μ›Œλ“œν΄λΌμš°λ“œ 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
368
- return None
 
 
369
 
370
  # λ‰΄μŠ€ 뢄석 ν•¨μˆ˜
371
  def analyze_news_content(news_df):
@@ -373,37 +331,32 @@ def analyze_news_content(news_df):
373
  return "데이터가 μ—†μŠ΅λ‹ˆλ‹€"
374
 
375
  results = {}
376
-
377
- # μΉ΄ν…Œκ³ λ¦¬λ³„ 뢄석
378
  if 'source' in news_df.columns:
379
- results['source_counts'] = news_df['source'].value_counts().to_dict()
 
380
  if 'date' in news_df.columns:
381
- results['date_counts'] = news_df['date'].value_counts().to_dict()
382
 
383
- # ν‚€μ›Œλ“œ 뢄석
384
  all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))
385
 
386
  if len(all_text.strip()) > 0:
387
- results['top_keywords_for_wordcloud'] = extract_keywords_for_wordcloud(all_text, top_n=50)
388
  results['top_keywords'] = analyze_keywords(all_text)
389
  else:
390
- results['top_keywords_for_wordcloud'] = {}
391
  results['top_keywords'] = []
392
-
393
  return results
394
 
395
  # OpenAI APIλ₯Ό μ΄μš©ν•œ μƒˆ 기사 생성
396
  def generate_article(original_content, prompt_text):
397
  if not st.session_state.openai_api_key:
398
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
399
-
400
  try:
401
- # API ν‚€ μ„€μ •
402
- openai.api_key = st.session_state.openai_api_key
403
-
404
- # API 호좜
405
  response = openai.chat.completions.create(
406
- model="gpt-4.1-mini", # λ˜λŠ” λ‹€λ₯Έ μ‚¬μš© κ°€λŠ₯ν•œ λͺ¨λΈ
407
  messages=[
408
  {"role": "system", "content": "당신은 전문적인 λ‰΄μŠ€ κΈ°μžμž…λ‹ˆλ‹€. μ£Όμ–΄μ§„ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ μƒˆλ‘œμš΄ 기사λ₯Ό μž‘μ„±ν•΄μ£Όμ„Έμš”."},
409
  {"role": "user", "content": f"λ‹€μŒ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ {prompt_text}\n\n{original_content[:1000]}"}
@@ -418,17 +371,13 @@ def generate_article(original_content, prompt_text):
418
  def generate_image(prompt):
419
  if not st.session_state.openai_api_key:
420
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
421
-
422
  try:
423
- # API ν‚€ μ„€μ •
424
- openai.api_key = st.session_state.openai_api_key
425
-
426
- # API 호좜
427
  response = openai.images.generate(
428
  model="gpt-image-1",
429
  prompt=prompt
430
  )
431
- image_base64 = response.data[0].b64_json
432
  return f"data:image/png;base64,{image_base64}"
433
  except Exception as e:
434
  return f"이미지 생성 였λ₯˜: {str(e)}"
@@ -460,12 +409,18 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
460
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
461
 
462
  # κ²°κ³Ό μ €μž₯
 
 
 
 
463
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
464
  filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")
465
 
466
  try:
467
  with open(filename, 'w', encoding='utf-8') as f:
468
  json.dump(articles, f, ensure_ascii=False, indent=2)
 
 
469
  except Exception as e:
470
  print(f"파일 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
471
  return
@@ -587,9 +542,8 @@ if menu == "λ‰΄μŠ€ 기사 크둀링":
587
  articles = crawl_naver_news(keyword, num_articles)
588
 
589
  # 기사 λ‚΄μš© κ°€μ Έμ˜€κΈ°
590
- progress_bar = st.progress(0)
591
  for i, article in enumerate(articles):
592
- progress_bar.progress((i + 1) / len(articles))
593
  article['content'] = get_article_content(article['link'])
594
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
595
 
@@ -605,7 +559,7 @@ if menu == "λ‰΄μŠ€ 기사 크둀링":
605
  st.write(f"**μš”μ•½:** {article['description']}")
606
  st.write(f"**링크:** {article['link']}")
607
  st.write("**λ³Έλ¬Έ 미리보기:**")
608
- st.write(article['content'][:300] + "..." if len(article['content']) > 300 else article['content'])
609
 
610
  elif menu == "기사 λΆ„μ„ν•˜κΈ°":
611
  st.header("기사 λΆ„μ„ν•˜κΈ°")
@@ -640,6 +594,7 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
640
  keyword_tab1, keyword_tab2 = st.tabs(["ν‚€μ›Œλ“œ λΉˆλ„", "μ›Œλ“œν΄λΌμš°λ“œ"])
641
 
642
  with keyword_tab1:
 
643
  keywords = analyze_keywords(selected_article['content'])
644
 
645
  # μ‹œκ°ν™”
@@ -649,38 +604,23 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
649
  st.write("**μ£Όμš” ν‚€μ›Œλ“œ:**")
650
  for word, count in keywords:
651
  st.write(f"- {word}: {count}회")
652
-
653
  with keyword_tab2:
654
  keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])
 
655
 
656
- if wordcloud_available:
657
- wc = generate_wordcloud(keyword_dict)
 
 
 
658
 
659
- if wc:
660
- fig, ax = plt.subplots(figsize=(10, 5))
661
- ax.imshow(wc, interpolation='bilinear')
662
- ax.axis('off')
663
- st.pyplot(fig)
664
-
665
- # ν‚€μ›Œλ“œ μƒμœ„ 20개 ν‘œμ‹œ
666
- st.write("**μƒμœ„ 20개 ν‚€μ›Œλ“œ:**")
667
- top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
668
- keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
669
- st.dataframe(keyword_df)
670
- else:
671
- st.error("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
672
- else:
673
- # μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μ‚¬μš©ν•  수 μ—†λŠ” 경우 λŒ€μ²΄ ν‘œμ‹œ
674
- st.warning("μ›Œλ“œν΄λΌμš°λ“œ κΈ°λŠ₯을 μ‚¬μš©ν•  수 μ—†μŠ΅λ‹ˆλ‹€. ν•„μš”ν•œ νŒ¨ν‚€μ§€κ°€ μ„€μΉ˜λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
675
-
676
- # λŒ€μ‹  ν‚€μ›Œλ“œλ§Œ ν‘œμ‹œ
677
- st.write("**μƒμœ„ ν‚€μ›Œλ“œ:**")
678
- top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:30]
679
  keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
680
  st.dataframe(keyword_df)
681
-
682
- # λ§‰λŒ€ 차트둜 ν‘œμ‹œ
683
- st.bar_chart(keyword_df.set_index('ν‚€μ›Œλ“œ').head(15))
684
 
685
  elif analysis_type == "ν…μŠ€νŠΈ 톡계":
686
  if st.button("ν…μŠ€νŠΈ 톡계 뢄석"):
@@ -689,18 +629,7 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
689
  # ν…μŠ€νŠΈ 톡계 계산
690
  word_count = len(re.findall(r'\b\w+\b', content))
691
  char_count = len(content)
692
-
693
- # KSSλ₯Ό μ‚¬μš©ν•˜μ—¬ λ¬Έμž₯ 뢄리
694
- if kss_available:
695
- try:
696
- sentences = kss.split_sentences(content)
697
- sentence_count = len(sentences)
698
- except Exception:
699
- # KSS μ‹€νŒ¨ μ‹œ κ°„λ‹¨ν•œ λ¬Έμž₯ 뢄리
700
- sentence_count = len(re.split(r'[.!?]+', content))
701
- else:
702
- sentence_count = len(re.split(r'[.!?]+', content))
703
-
704
  avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
705
  avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
706
 
@@ -726,38 +655,79 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
726
  st.write(f"ν…μŠ€νŠΈ λ³΅μž‘μ„± 점수: {complexity_score:.1f}/10")
727
 
728
  # μΆœν˜„ λΉˆλ„ λ§‰λŒ€ κ·Έλž˜ν”„
729
- st.subheader("ν’ˆμ‚¬λ³„ 뢄포")
730
-
731
- # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•˜κ²Œ ν•œκΈ€ 포함 μ—¬λΆ€λ‘œ 체크)
732
- is_korean = bool(re.search(r'[κ°€-힣]', content))
733
-
734
  try:
735
- # KSSλ₯Ό μ‚¬μš©ν•˜μ—¬ κ°„λ‹¨ν•œ ν’ˆμ‚¬ μœ μ‚¬ 뢄석
736
- tokens = tokenize_korean(content[:5000]) # λ„ˆλ¬΄ κΈ΄ ν…μŠ€νŠΈλŠ” μž˜λΌμ„œ 뢄석
 
 
 
 
 
737
 
738
- if is_korean:
739
- # ν•œκ΅­μ–΄μΈ 경우 κ°„λ‹¨ν•œ νŒ¨ν„΄ 맀칭으둜 ν’ˆμ‚¬ μΆ”μ •
740
- pos_counts = {'λͺ…사/λŒ€λͺ…사': 0, '동사/ν˜•μš©μ‚¬': 0, '뢀사/쑰사': 0, '기타': 0}
 
741
 
742
- for token in tokens:
743
- if token.endswith(("λ‹€", "μš”", "까", "μ£ ", "λ„€", "κ΅°", "λ‹ˆλ‹€", "μ„Έμš”")):
744
- pos_counts['동사/ν˜•μš©μ‚¬'] += 1
745
- elif token.endswith(("게", "히", "이", "μ§€")):
746
- pos_counts['뢀사/쑰사'] += 1
747
- elif token.endswith(("은", "λŠ”", "이", "κ°€", "을", "λ₯Ό", "에", "의")):
748
- pos_counts['뢀사/쑰사'] += 1
749
- elif len(token) > 1:
750
- pos_counts['λͺ…사/λŒ€λͺ…사'] += 1
751
  else:
752
  pos_counts['기타'] += 1
 
753
  else:
754
- # μ˜μ–΄ λ¬Έμ„œμΈ 경우 κ°„λ‹¨ν•œ νŒ¨ν„΄ λ§€μΉ­
755
- pos_counts = {
756
- 'λͺ…사/λŒ€λͺ…사': len([t for t in tokens if not t.lower().endswith(('ly', 'ing', 'ed'))]),
757
- '동사': len([t for t in tokens if t.lower().endswith(('ing', 'ed', 's'))]),
758
- '뢀사/ν˜•μš©μ‚¬': len([t for t in tokens if t.lower().endswith('ly')]),
759
- '기타': len([t for t in tokens if len(t) <= 2])
 
 
 
 
760
  }
 
 
 
 
 
 
 
 
761
 
762
  # κ²°κ³Ό μ‹œκ°ν™”
763
  pos_df = pd.DataFrame({
@@ -780,10 +750,14 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
780
  if st.session_state.openai_api_key:
781
  with st.spinner("κΈ°μ‚¬μ˜ 감정을 뢄석 μ€‘μž…λ‹ˆλ‹€..."):
782
  try:
783
- # API ν‚€ μ„€μ •
784
- openai.api_key = st.session_state.openai_api_key
785
-
786
- # API 호좜
 
 
 
 
787
  response = openai.chat.completions.create(
788
  model="gpt-4.1-mini",
789
  messages=[
@@ -855,7 +829,7 @@ elif menu == "기사 λΆ„μ„ν•˜κΈ°":
855
  fill_color = 'rgba(158, 158, 158, 0.3)' # μ—°ν•œ νšŒμƒ‰
856
  line_color = 'rgba(158, 158, 158, 1)' # μ§„ν•œ νšŒμƒ‰
857
 
858
- # λ ˆμ΄λ” 차트 데이터 μ€€λΉ„
859
  radar_keywords = keyword_names.copy()
860
  radar_scores = keyword_scores.copy()
861
 
@@ -967,8 +941,7 @@ elif menu == "μƒˆ 기사 μƒμ„±ν•˜κΈ°":
967
  with st.expander("원본 기사 λ‚΄μš©"):
968
  st.write(selected_article['content'])
969
 
970
- prompt_text = st.text_area("생성 μ§€μΉ¨",
971
- """λ‹€μŒ 기사 양식을 λ”°λΌμ„œ λ‹€μ‹œ μž‘μ„±ν•΄μ€˜.
972
  μ—­ν• : 당신은 μ‹ λ¬Έμ‚¬μ˜ κΈ°μžμž…λ‹ˆλ‹€.
973
  μž‘μ—…: 졜근 μΌμ–΄λ‚œ 사건에 λŒ€ν•œ λ³΄λ„μžλ£Œλ₯Ό μž‘μ„±ν•΄μ•Ό ν•©λ‹ˆλ‹€. μžλ£ŒλŠ” 사싀을 기반으둜 ν•˜λ©°, 객관적이고 μ •ν™•ν•΄μ•Ό ν•©λ‹ˆλ‹€.
974
  μ§€μΉ¨:
@@ -976,13 +949,14 @@ elif menu == "μƒˆ 기사 μƒμ„±ν•˜κΈ°":
976
  기사 제λͺ©μ€ 주제λ₯Ό λͺ…ν™•νžˆ λ°˜μ˜ν•˜κ³  λ…μžμ˜ 관심을 끌 수 μžˆλ„λ‘ μž‘μ„±ν•©λ‹ˆλ‹€.
977
  기사 λ‚΄μš©μ€ μ •ν™•ν•˜κ³  κ°„κ²°ν•˜λ©° 섀득λ ₯ μžˆλŠ” λ¬Έμž₯으둜 κ΅¬μ„±ν•©λ‹ˆλ‹€.
978
  κ΄€λ ¨μžμ˜ 인터뷰λ₯Ό 인용 ν˜•νƒœλ‘œ λ„£μ–΄μ£Όμ„Έμš”.
979
- μœ„μ˜ 정보와 지침을 μ°Έκ³ ν•˜μ—¬ μ‹ λ¬Έ λ³΄λ„μžλ£Œ ν˜•μ‹μ˜ 기사λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”""", height=200)
980
 
981
  # 이미지 생성 μ—¬λΆ€ 선택 μ˜΅μ…˜ μΆ”κ°€
982
  generate_image_too = st.checkbox("기사 생성 ν›„ 이미지도 ν•¨κ»˜ μƒμ„±ν•˜κΈ°", value=True)
983
 
984
  if st.button("μƒˆ 기사 μƒμ„±ν•˜κΈ°"):
985
  if st.session_state.openai_api_key:
 
986
  with st.spinner("기사λ₯Ό 생성 μ€‘μž…λ‹ˆλ‹€..."):
987
  new_article = generate_article(selected_article['content'], prompt_text)
988
 
@@ -1001,6 +975,13 @@ elif menu == "μƒˆ 기사 μƒμ„±ν•˜κΈ°":
1001
  """
1002
 
1003
  # 이미지 생성
 
 
 
 
 
 
 
1004
  image_url = generate_image(image_prompt)
1005
 
1006
  if image_url and not image_url.startswith("이미지 생성 였λ₯˜") and not image_url.startswith("였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."):
@@ -1176,7 +1157,7 @@ elif menu == "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°":
1176
  files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
1177
  if files:
1178
  st.subheader("μˆ˜μ§‘λœ 파일 μ—΄κΈ°")
1179
- selected_file = st.selectbox("파일 선택", files, index=len(files)-1 if files else 0)
1180
  if selected_file and st.button("파일 λ‚΄μš© 보기"):
1181
  with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
1182
  articles = json.load(f)
@@ -1194,4 +1175,4 @@ elif menu == "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°":
1194
 
1195
  # ν‘Έν„°
1196
  st.markdown("---")
1197
- st.markdown("© λ‰΄μŠ€ 기사 도ꡬ @conanssam")
 
4
  from bs4 import BeautifulSoup
5
  import re
6
  import time
7
+ import nltk
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.corpus import stopwords
10
+ from collections import Counter
11
  import json
12
  import os
13
  from datetime import datetime, timedelta
14
+ import openai
15
+ from dotenv import load_dotenv
16
  import traceback
17
  import plotly.graph_objects as go
18
  import schedule
19
  import threading
20
  import matplotlib.pyplot as plt
21
 
22
+ # /tmp 경둜 μ„€μ •
23
+ TMP_DIR = "/tmp"
24
+ SAVED_ARTICLES_PATH = os.path.join(TMP_DIR, "saved_articles.json")
25
+ SCHEDULED_NEWS_DIR = os.path.join(TMP_DIR, "scheduled_news")
26
 
27
+ # μ›Œλ“œν΄λΌμš°λ“œ μΆ”κ°€
28
  try:
29
  from wordcloud import WordCloud
 
30
  except ImportError:
31
+ st.error("wordcloud νŒ¨ν‚€μ§€λ₯Ό μ„€μΉ˜ν•΄μ£Όμ„Έμš”: pip install wordcloud")
32
+ WordCloud = None
33
+
34
  # μŠ€μΌ€μ€„λŸ¬ μƒνƒœ 클래슀 μΆ”κ°€
35
  class SchedulerState:
36
  def __init__(self):
 
61
  load_dotenv() # 둜컬 .env 파일
62
  st.session_state.openai_api_key = os.getenv('OPENAI_API_KEY')
63
 
64
+ # ν•„μš”ν•œ NLTK 데이터 λ‹€μš΄λ‘œλ“œ
65
+ try:
66
+ nltk.data.find('tokenizers/punkt')
67
+ except LookupError:
68
+ nltk.download('punkt')
69
+
70
+ try:
71
+ nltk.data.find('tokenizers/punkt_tab')
72
+ except LookupError:
73
+ nltk.download('punkt_tab')
74
+
75
+ try:
76
+ nltk.data.find('corpora/stopwords')
77
+ except LookupError:
78
+ nltk.download('stopwords')
79
+
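The three try/except pairs above all follow the same find-then-download pattern; a minimal sketch of driving them from one list is shown below (the helper name `ensure_nltk_data` and the resource list are illustrative, not part of this commit):

```python
import nltk

def ensure_nltk_data(resources):
    # Download each NLTK resource only if it is not already available locally.
    for lookup_path, package in resources:
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package)

ensure_nltk_data([
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
])
```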
80
+ # OpenAI API ν‚€ μ„€μ •
81
+ # openai.api_key 섀정은 각 API 호좜 직전에 st.session_state.openai_api_key μ‚¬μš©ν•˜λ„λ‘ λ³€κ²½ν•˜κ±°λ‚˜,
82
+ # μ•± μ‹œμž‘ μ‹œμ μ— ν•œ 번 μ„€μ •ν•©λ‹ˆλ‹€. μ—¬κΈ°μ„œλŠ” ν›„μžλ₯Ό μ„ νƒν•©λ‹ˆλ‹€.
83
+ if st.session_state.openai_api_key:
84
+ openai.api_key = st.session_state.openai_api_key
85
+ else:
86
+ # UI μ΄ˆκΈ°μ—λŠ” ν‚€κ°€ 없을 수 μžˆμœΌλ―€λ‘œ, λ‚˜μ€‘μ— ν‚€ μž…λ ₯ μ‹œ openai.api_keyκ°€ μ„€μ •λ˜λ„λ‘ μœ λ„
87
+ pass
88
+
89
  # νŽ˜μ΄μ§€ μ„€μ •
90
  st.set_page_config(page_title="λ‰΄μŠ€ 기사 도ꡬ", page_icon="📰", layout="wide")
91
 
 
96
  ["λ‰΄μŠ€ 기사 크둀링", "기사 λΆ„μ„ν•˜κΈ°", "μƒˆ 기사 μƒμ„±ν•˜κΈ°", "λ‰΄μŠ€ 기사 μ˜ˆμ•½ν•˜κΈ°"]
97
  )
98
 
99
+ # 디렉토리 생성 ν•¨μˆ˜
100
+ def ensure_directory(directory):
101
+ try:
102
+ os.makedirs(directory, mode=0o777, exist_ok=True)
103
+ # 디렉토리 κΆŒν•œ μ„€μ •
104
+ os.chmod(directory, 0o777)
105
+ except Exception as e:
106
+ st.error(f"디렉토리 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
107
+ return False
108
+ return True
109
+
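Since the commit pins the data paths to /tmp, a hedged fallback for environments where /tmp is not writable could look like the sketch below; `pick_data_dir` is a hypothetical helper, not code from this commit:

```python
import os
import tempfile

def pick_data_dir(preferred="/tmp"):
    # Prefer the configured directory; otherwise fall back to the interpreter's
    # temp directory (tempfile.gettempdir()), which is expected to be writable.
    base = preferred if os.access(preferred, os.W_OK) else tempfile.gettempdir()
    os.makedirs(base, exist_ok=True)
    return base
```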
110
  # μ €μž₯된 기사λ₯Ό λΆˆλŸ¬μ˜€λŠ” ν•¨μˆ˜
111
  def load_saved_articles():
112
  try:
113
+ ensure_directory(TMP_DIR)
114
  if os.path.exists(SAVED_ARTICLES_PATH):
115
  with open(SAVED_ARTICLES_PATH, 'r', encoding='utf-8') as f:
116
  return json.load(f)
 
122
  # 기사λ₯Ό μ €μž₯ν•˜λŠ” ν•¨μˆ˜
123
  def save_articles(articles):
124
  try:
125
+ ensure_directory(TMP_DIR)
126
  with open(SAVED_ARTICLES_PATH, 'w', encoding='utf-8') as f:
127
  json.dump(articles, f, ensure_ascii=False, indent=2)
128
+ # 파일 κΆŒν•œ μ„€μ •
129
+ os.chmod(SAVED_ARTICLES_PATH, 0o666)
130
  except Exception as e:
131
  st.error(f"기사 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
132
  return False
133
+ return True
134
 
135
  @st.cache_data
136
  def crawl_naver_news(keyword, num_articles=5):
 
217
  except Exception as e:
218
  return f"였λ₯˜ λ°œμƒ: {str(e)}"
219
 
220
+ # NLTKλ₯Ό μ΄μš©ν•œ ν‚€μ›Œλ“œ 뢄석
221
  def analyze_keywords(text, top_n=10):
222
+ # ν•œκ΅­μ–΄ λΆˆμš©μ–΄ λͺ©λ‘ (직접 μ •μ˜ν•΄μ•Ό ν•©λ‹ˆλ‹€)
223
+ korean_stopwords = ['이', 'κ·Έ', 'μ €', '것', '및', 'λ“±', 'λ₯Ό', '을', '에', 'μ—μ„œ', '의', '으둜', '둜']
224
 
225
+ tokens = word_tokenize(text)
226
+ tokens = [word for word in tokens if word.isalnum() and len(word) > 1 and word not in korean_stopwords]
227
228
  word_count = Counter(tokens)
229
  top_keywords = word_count.most_common(top_n)
230
 
231
  return top_keywords
232
 
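For reference, a quick way to exercise the NLTK-based analyze_keywords above outside of Streamlit, assuming the punkt data has already been downloaded; the sample sentence is illustrative only:

```python
sample = "넀이버 λ‰΄μŠ€ 기사λ₯Ό μˆ˜μ§‘ν•˜κ³  ν‚€μ›Œλ“œλ₯Ό λΆ„μ„ν•©λ‹ˆλ‹€. ν‚€μ›Œλ“œ 뢄석 κ²°κ³ΌλŠ” 차트둜 ν‘œμ‹œλ©λ‹ˆλ‹€."

# Prints the most common alphanumeric tokens with their counts,
# e.g. [('ν‚€μ›Œλ“œ', ...), ('λ‰΄μŠ€', ...)]; exact counts depend on tokenization.
print(analyze_keywords(sample, top_n=5))
```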
233
+ #μ›Œλ“œ ν΄λΌμš°λ“œμš© 뢄석
234
  def extract_keywords_for_wordcloud(text, top_n=50):
235
  if not text or len(text.strip()) < 10:
236
  return {}
237
 
238
  try:
239
+ try:
240
+ tokens = word_tokenize(text.lower())
241
+ except Exception as e:
242
+ st.warning(f"{str(e)} 였λ₯˜λ°œμƒ")
243
+ tokens = text.lower().split()
244
 
245
+ stop_words = set()
246
+ try:
247
+ stop_words = set(stopwords.words('english'))
248
+ except Exception:
249
+ pass
 
 
 
 
 
250
 
251
+ korea_stop_words = {
 
252
  '및', 'λ“±', 'λ₯Ό', '이', '의', 'κ°€', '에', 'λŠ”', '으둜', 'μ—μ„œ', 'κ·Έ', '또', 'λ˜λŠ”', 'ν•˜λŠ”', 'ν• ', 'ν•˜κ³ ',
253
+ 'μžˆλ‹€', '이닀', 'μœ„ν•΄', '것이닀', '것은', 'λŒ€ν•œ', 'λ•Œλ¬Έ', '그리고', 'ν•˜μ§€λ§Œ', 'κ·ΈλŸ¬λ‚˜', 'κ·Έλž˜μ„œ',
254
+ 'μž…λ‹ˆλ‹€', 'ν•©λ‹ˆλ‹€', 'μŠ΅λ‹ˆλ‹€', 'μš”', 'μ£ ', 'κ³ ', 'κ³Ό', '와', '도', '은', '수', '것', 'λ“€', '제', 'μ €',
255
+ 'λ…„', 'μ›”', '일', 'μ‹œ', 'λΆ„', '초', 'μ§€λ‚œ', 'μ˜¬ν•΄', 'λ‚΄λ…„', '졜근', 'ν˜„μž¬', '였늘', '내일', 'μ–΄μ œ',
256
+ 'μ˜€μ „', 'μ˜€ν›„', 'λΆ€ν„°', 'κΉŒμ§€', 'μ—κ²Œ', 'κ»˜μ„œ', '이라고', '라고', 'ν•˜λ©°', 'ν•˜λ©΄μ„œ', '따라', '톡해',
257
+ 'κ΄€λ ¨', 'ν•œνŽΈ', '특히', 'κ°€μž₯', '맀우', '더', '덜', '많이', '쑰금', '항상', '자주', '가끔', '거의',
258
+ 'μ „ν˜€', 'λ°”λ‘œ', '정말', 'λ§Œμ•½', 'λΉ„λ‘―ν•œ', '등을', '등이', 'λ“±μ˜', 'λ“±κ³Ό', '등도', '등에', 'λ“±μ—μ„œ',
259
+ '기자', 'λ‰΄μŠ€', '사진', 'μ—°ν•©λ‰΄μŠ€', 'λ‰΄μ‹œμŠ€', '제곡', '무단', 'μ „μž¬', '재배포', 'κΈˆμ§€', '액컀', '멘트',
260
+ '일보', '데일리', '경제', 'μ‚¬νšŒ', 'μ •μΉ˜', '세계', 'κ³Όν•™', '아이티', 'λ‹·μ»΄', '씨넷', 'λΈ”λ‘œν„°', 'μ „μžμ‹ λ¬Έ'
261
  }
262
+ stop_words.update(korea_stop_words)
 
 
263
 
264
  # 1κΈ€μž 이상이고 λΆˆμš©μ–΄κ°€ μ•„λ‹Œ ν† ν°λ§Œ 필터링
265
  filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
 
279
  return dict(sorted_words[:top_n])
280
 
281
  except Exception as e:
282
+ st.error(f"였λ₯˜λ°œμƒ {str(e)}")
283
  return {"data": 1, "analysis": 1, "news": 1}
284
+
285
 
286
  # μ›Œλ“œ ν΄λΌμš°λ“œ 생성 ν•¨μˆ˜
287
+
288
  def generate_wordcloud(keywords_dict):
289
+ if not WordCloud:
290
+ st.warning("μ›Œλ“œν΄λΌμš°λ“œ μ„€μΉ˜μ•ˆλ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€.")
291
+ return None
292
+ try:
293
+ # ν”„λ‘œμ νŠΈ λ£¨νŠΈμ— NanumGothic.ttfκ°€ μžˆλ‹€κ³  κ°€μ •
294
+ font_path = "NanumGothic.ttf"
295
+
296
+ # λ‘œμ»¬μ— 폰트 파일이 μžˆλŠ”μ§€ 확인, μ—†μœΌλ©΄ 기본으둜 μ‹œλ„
297
+ if not os.path.exists(font_path):
298
+ st.warning(f"폰트 파일({font_path})을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. κΈ°λ³Έ 폰트둜 μ›Œλ“œν΄λΌμš°λ“œλ₯Ό μƒμ„±ν•©λ‹ˆλ‹€. ν•œκΈ€μ΄ 깨질 수 μžˆμŠ΅λ‹ˆλ‹€.")
299
+ # font_path = None # λ˜λŠ” μ‹œμŠ€ν…œ κΈ°λ³Έ 폰트 경둜λ₯Ό μ§€μ • (ν”Œλž«νΌλ§ˆλ‹€ 닀름)
300
+ # WordCloud μƒμ„±μžμ—μ„œ font_pathλ₯Ό None으둜 두면 μ‹œμŠ€ν…œ 기본값을 μ‹œλ„ν•˜κ±°λ‚˜, μ•„μ˜ˆ λΉΌκ³  호좜
301
+ wc = WordCloud(
302
+ width=800,
303
+ height=400,
304
+ background_color='white',
305
+ colormap='viridis',
306
+ max_font_size=150,
307
+ random_state=42
308
+ ).generate_from_frequencies(keywords_dict)
309
+ else:
310
+ wc= WordCloud(
311
+ font_path=font_path,
312
+ width=800,
313
+ height=400,
314
+ background_color = 'white',
315
+ colormap = 'viridis',
316
+ max_font_size=150,
317
+ random_state=42
318
+ ).generate_from_frequencies(keywords_dict)
319
 
320
+ return wc
321
 
322
+ except Exception as e:
323
+ st.error(f"μ›Œλ“œν΄λΌμš°λ“œ 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
324
+ # traceback.print_exc() # 디버깅 μ‹œ μ‚¬μš©
325
+ st.warning("μ›Œλ“œν΄λΌμš°λ“œ 생성에 μ‹€νŒ¨ν–ˆμŠ΅λ‹ˆλ‹€. 폰트 문제일 수 μžˆμŠ΅λ‹ˆλ‹€. NanumGothic.ttf 파일이 ν”„λ‘œμ νŠΈ λ£¨νŠΈμ— μžˆλŠ”μ§€ ν™•μΈν•΄μ£Όμ„Έμš”.")
326
+ return None
327
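When NanumGothic.ttf is not bundled with the Space, one option is to search the fonts matplotlib already knows about before falling back to the default; `find_korean_font` is a hypothetical sketch and the name filter is only a heuristic. generate_wordcloud above could pass such a path as font_path when the bundled file is missing.

```python
from matplotlib import font_manager

def find_korean_font():
    # Return the path of the first installed font whose family name suggests
    # Korean glyph coverage; None if nothing plausible is installed.
    for font in font_manager.fontManager.ttflist:
        if any(key in font.name for key in ("Nanum", "Malgun", "Noto Sans CJK")):
            return font.fname
    return None
```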
 
328
  # λ‰΄μŠ€ 뢄석 ν•¨μˆ˜
329
  def analyze_news_content(news_df):
 
331
  return "데이터가 μ—†μŠ΅λ‹ˆλ‹€"
332
 
333
  results = {}
334
+ #μΉ΄ν…Œκ³ λ¦¬λ³„
 
335
  if 'source' in news_df.columns:
336
+ results['source_counts'] = news_df['source'].value_counts().to_dict()
337
+ #μΉ΄ν…Œκ³ λ¦¬λ³„
338
  if 'date' in news_df.columns:
339
+ results['date_counts'] = news_df['date'].value_counts().to_dict()
340
 
341
+ #ν‚€μ›Œλ“œλΆ„μ„
342
  all_text = " ".join(news_df['title'].fillna('') + " " + news_df['content'].fillna(''))
343
 
344
  if len(all_text.strip()) > 0:
345
+ results['top_keywords_for_wordcloud']= extract_keywords_for_wordcloud(all_text, top_n=50)
346
  results['top_keywords'] = analyze_keywords(all_text)
347
  else:
348
+ results['top_keywords_for_wordcloud']={}
349
  results['top_keywords'] = []
 
350
  return results
351
 
352
  # OpenAI APIλ₯Ό μ΄μš©ν•œ μƒˆ 기사 생성
353
  def generate_article(original_content, prompt_text):
354
  if not st.session_state.openai_api_key:
355
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
356
+ openai.api_key = st.session_state.openai_api_key
357
  try:
 
 
 
 
358
  response = openai.chat.completions.create(
359
+ model="gpt-4.1-mini",
360
  messages=[
361
  {"role": "system", "content": "당신은 전문적인 λ‰΄μŠ€ κΈ°μžμž…λ‹ˆλ‹€. μ£Όμ–΄μ§„ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ μƒˆλ‘œμš΄ 기사λ₯Ό μž‘μ„±ν•΄μ£Όμ„Έμš”."},
362
  {"role": "user", "content": f"λ‹€μŒ λ‚΄μš©μ„ λ°”νƒ•μœΌλ‘œ {prompt_text}\n\n{original_content[:1000]}"}
 
371
  def generate_image(prompt):
372
  if not st.session_state.openai_api_key:
373
  return "였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. μ‚¬μ΄λ“œλ°”μ—μ„œ ν‚€λ₯Ό μž…λ ₯ν•˜κ±°λ‚˜ ν™˜κ²½ λ³€μˆ˜λ₯Ό μ„€μ •ν•΄μ£Όμ„Έμš”."
374
+ openai.api_key = st.session_state.openai_api_key
375
  try:
 
 
 
 
376
  response = openai.images.generate(
377
  model="gpt-image-1",
378
  prompt=prompt
379
  )
380
+ image_base64=response.data[0].b64_json
381
  return f"data:image/png;base64,{image_base64}"
382
  except Exception as e:
383
  return f"이미지 생성 였λ₯˜: {str(e)}"
 
409
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
410
 
411
  # κ²°κ³Ό μ €μž₯
412
+ if not ensure_directory(SCHEDULED_NEWS_DIR):
413
+ print(f"μŠ€μΌ€μ€„λœ λ‰΄μŠ€ 디렉토리 생성 μ‹€νŒ¨")
414
+ return
415
+
416
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
417
  filename = os.path.join(SCHEDULED_NEWS_DIR, f"{file_prefix}_{task_type}_{timestamp}.json")
418
 
419
  try:
420
  with open(filename, 'w', encoding='utf-8') as f:
421
  json.dump(articles, f, ensure_ascii=False, indent=2)
422
+ # 파일 κΆŒν•œ μ„€μ •
423
+ os.chmod(filename, 0o666)
424
  except Exception as e:
425
  print(f"파일 μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
426
  return
 
542
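A small sketch of how the timestamped files written above could be read back, assuming a single file prefix so that lexicographic order matches chronological order; `load_latest_scheduled` is an illustrative helper, not part of this commit:

```python
import glob
import json
import os

def load_latest_scheduled(directory):
    # Pick the newest *.json file; the %Y%m%d_%H%M%S suffix sorts chronologically.
    files = sorted(glob.glob(os.path.join(directory, "*.json")))
    if not files:
        return []
    with open(files[-1], "r", encoding="utf-8") as f:
        return json.load(f)
```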
  articles = crawl_naver_news(keyword, num_articles)
543
 
544
  # 기사 λ‚΄μš© κ°€μ Έμ˜€κΈ°
 
545
  for i, article in enumerate(articles):
546
+ st.progress((i + 1) / len(articles))
547
  article['content'] = get_article_content(article['link'])
548
  time.sleep(0.5) # μ„œλ²„ λΆ€ν•˜ λ°©μ§€
549
 
 
559
  st.write(f"**μš”μ•½:** {article['description']}")
560
  st.write(f"**링크:** {article['link']}")
561
  st.write("**λ³Έλ¬Έ 미리보기:**")
562
+ st.write(article['content'][:300] + "...")
563
 
564
  elif menu == "기사 λΆ„μ„ν•˜κΈ°":
565
  st.header("기사 λΆ„μ„ν•˜κΈ°")
 
594
  keyword_tab1, keyword_tab2 = st.tabs(["ν‚€μ›Œλ“œ λΉˆλ„", "μ›Œλ“œν΄λΌμš°λ“œ"])
595
 
596
  with keyword_tab1:
597
+
598
  keywords = analyze_keywords(selected_article['content'])
599
 
600
  # μ‹œκ°ν™”
 
604
  st.write("**μ£Όμš” ν‚€μ›Œλ“œ:**")
605
  for word, count in keywords:
606
  st.write(f"- {word}: {count}회")
 
607
  with keyword_tab2:
608
  keyword_dict = extract_keywords_for_wordcloud(selected_article['content'])
609
+ wc = generate_wordcloud(keyword_dict)
610
 
611
+ if wc:
612
+ fig, ax = plt.subplots(figsize=(10, 5))
613
+ ax.imshow(wc, interpolation='bilinear')
614
+ ax.axis('off')
615
+ st.pyplot(fig)
616
 
617
+ # ν‚€μ›Œλ“œ μƒμœ„ 20개 ν‘œμ‹œ
618
+ st.write("**μƒμœ„ 20개 ν‚€μ›Œλ“œ:**")
619
+ top_keywords = sorted(keyword_dict.items(), key=lambda x: x[1], reverse=True)[:20]
620
  keyword_df = pd.DataFrame(top_keywords, columns=['ν‚€μ›Œλ“œ', 'λΉˆλ„'])
621
  st.dataframe(keyword_df)
622
+ else:
623
+ st.error("μ›Œλ“œν΄λΌμš°λ“œλ₯Ό 생성할 수 μ—†μŠ΅λ‹ˆλ‹€.")
 
624
 
625
  elif analysis_type == "ν…μŠ€νŠΈ 톡계":
626
  if st.button("ν…μŠ€νŠΈ 톡계 뢄석"):
 
629
  # ν…μŠ€νŠΈ 톡계 계산
630
  word_count = len(re.findall(r'\b\w+\b', content))
631
  char_count = len(content)
632
+ sentence_count = len(re.split(r'[.!?]+', content))
633
  avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
634
  avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
635
 
 
655
  st.write(f"ν…μŠ€νŠΈ λ³΅μž‘μ„± 점수: {complexity_score:.1f}/10")
656
 
657
  # μΆœν˜„ λΉˆλ„ λ§‰λŒ€ κ·Έλž˜ν”„
658
+ st.subheader("ν’ˆμ‚¬λ³„ 뢄포 (ν•œκ΅­μ–΄/μ˜μ–΄ 지원)")
 
 
 
 
659
  try:
660
+ # KoNLPy μ„€μΉ˜ 확인
661
+ try:
662
+ from konlpy.tag import Okt
663
+ konlpy_installed = True
664
+ except ImportError:
665
+ konlpy_installed = False
666
+ st.warning("ν•œκ΅­μ–΄ ν˜•νƒœμ†Œ 뢄석을 μœ„ν•΄ KoNLPyλ₯Ό μ„€μΉ˜ν•΄μ£Όμ„Έμš”: pip install konlpy")
667
 
668
+ # μ˜μ–΄ POS tagger μ€€λΉ„
669
+ from nltk import pos_tag
670
+ try:
671
+ nltk.data.find('taggers/averaged_perceptron_tagger')
672
+ except LookupError:
673
+ nltk.download('averaged_perceptron_tagger')
674
+
675
+ # Try using the correct resource name as shown in the error message
676
+ try:
677
+ nltk.data.find('averaged_perceptron_tagger_eng')
678
+ except LookupError:
679
+ nltk.download('averaged_perceptron_tagger_eng')
680
+
681
+ # μ–Έμ–΄ 감지 (κ°„λ‹¨ν•œ 방식)
682
+ is_korean = bool(re.search(r'[κ°€-힣]', content))
683
+
684
+ if is_korean and konlpy_installed:
685
+ # ν•œκ΅­μ–΄ ν˜•νƒœμ†Œ 뢄석
686
+ okt = Okt()
687
+ tagged = okt.pos(content)
688
+
689
+ # ν•œκ΅­μ–΄ ν’ˆμ‚¬ λ§€ν•‘
690
+ pos_dict = {
691
+ 'Noun': 'λͺ…사', 'NNG': 'λͺ…사', 'NNP': '고유λͺ…사',
692
+ 'Verb': '동사', 'VV': '동사', 'VA': 'ν˜•μš©μ‚¬',
693
+ 'Adjective': 'ν˜•μš©μ‚¬',
694
+ 'Adverb': '뢀사',
695
+ 'Josa': '쑰사', 'Punctuation': 'ꡬ두점',
696
+ 'Determiner': 'κ΄€ν˜•μ‚¬', 'Exclamation': '감탄사'
697
+ }
698
 
699
+ pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '쑰사': 0, 'ꡬ두점': 0, 'κ΄€ν˜•μ‚¬': 0, '감탄사': 0, '기타': 0}
700
+
701
+ for _, pos in tagged:
702
+ if pos in pos_dict:
703
+ pos_counts[pos_dict[pos]] += 1
704
+ elif pos.startswith('N'): # 기타 λͺ…사λ₯˜
705
+ pos_counts['λͺ…사'] += 1
706
+ elif pos.startswith('V'): # 기타 동사λ₯˜
707
+ pos_counts['동사'] += 1
708
  else:
709
  pos_counts['기타'] += 1
710
+
711
  else:
712
+ # μ˜μ–΄ POS νƒœκΉ…
713
+ tokens = word_tokenize(content.lower())
714
+ tagged = pos_tag(tokens)
715
+
716
+ # μ˜μ–΄ ν’ˆμ‚¬ λ§€ν•‘
717
+ pos_dict = {
718
+ 'NN': 'λͺ…사', 'NNS': 'λͺ…사', 'NNP': '고유λͺ…사', 'NNPS': '고유λͺ…사',
719
+ 'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
720
+ 'JJ': 'ν˜•μš©μ‚¬', 'JJR': 'ν˜•μš©μ‚¬', 'JJS': 'ν˜•μš©μ‚¬',
721
+ 'RB': '뢀사', 'RBR': '뢀사', 'RBS': '뢀사'
722
  }
723
+
724
+ pos_counts = {'λͺ…사': 0, '동사': 0, 'ν˜•μš©μ‚¬': 0, '뢀사': 0, '기타': 0}
725
+
726
+ for _, pos in tagged:
727
+ if pos in pos_dict:
728
+ pos_counts[pos_dict[pos]] += 1
729
+ else:
730
+ pos_counts['기타'] += 1
731
 
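To make the Korean branch above concrete, a minimal standalone check of KoNLPy's Okt tagger (assuming konlpy and a Java runtime are installed); the sample sentence and tags are illustrative only:

```python
from konlpy.tag import Okt

okt = Okt()
# pos() returns (surface, tag) pairs such as ('기사', 'Noun') or ('λ₯Ό', 'Josa'),
# which the mapping above folds into the Korean part-of-speech buckets.
for word, tag in okt.pos("κΈ°μžκ°€ μƒˆ 기사λ₯Ό μž‘μ„±ν–ˆλ‹€"):
    print(word, tag)
```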
732
  # κ²°κ³Ό μ‹œκ°ν™”
733
  pos_df = pd.DataFrame({
 
750
  if st.session_state.openai_api_key:
751
  with st.spinner("κΈ°μ‚¬μ˜ 감정을 뢄석 μ€‘μž…λ‹ˆλ‹€..."):
752
  try:
753
+ # 감정 뢄석 API 호좜 전에 ν‚€ 확인 및 μ„€μ •
754
+ if not openai.api_key:
755
+ if st.session_state.openai_api_key:
756
+ openai.api_key = st.session_state.openai_api_key
757
+ else:
758
+ st.error("OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
759
+ st.stop()
760
+
761
  response = openai.chat.completions.create(
762
  model="gpt-4.1-mini",
763
  messages=[
 
829
  fill_color = 'rgba(158, 158, 158, 0.3)' # μ—°ν•œ νšŒμƒ‰
830
  line_color = 'rgba(158, 158, 158, 1)' # μ§„ν•œ νšŒμƒ‰
831
 
832
+ # λ ˆμ΄λ” 차트 데이터 μ€€λΉ„ - λ§ˆμ§€λ§‰ 점이 첫 점과 μ—°κ²°λ˜λ„λ‘ 데이터 μΆ”κ°€
833
  radar_keywords = keyword_names.copy()
834
  radar_scores = keyword_scores.copy()
835
 
 
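The fill and line colors prepared above feed a plotly radar (Scatterpolar) trace; a minimal hedged sketch with made-up scores, closing the polygon by repeating the first point as the comment above describes:

```python
import plotly.graph_objects as go

# Illustrative values only; the app derives these from the sentiment analysis.
radar_keywords = ["긍정성", "객관성", "신뒰도"]
radar_scores = [7, 5, 8]

fig = go.Figure(
    go.Scatterpolar(
        r=radar_scores + radar_scores[:1],          # repeat the first value to close the loop
        theta=radar_keywords + radar_keywords[:1],
        fill="toself",
        fillcolor="rgba(158, 158, 158, 0.3)",
        line=dict(color="rgba(158, 158, 158, 1)"),
    )
)
fig.update_layout(polar=dict(radialaxis=dict(range=[0, 10])))
```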
941
  with st.expander("원본 기사 λ‚΄μš©"):
942
  st.write(selected_article['content'])
943
 
944
+ prompt_text ="""λ‹€μŒ 기사 양식을 λ”°λΌμ„œ λ‹€μ‹œ μž‘μ„±ν•΄μ€˜.
 
945
  μ—­ν• : 당신은 μ‹ λ¬Έμ‚¬μ˜ κΈ°μžμž…λ‹ˆλ‹€.
946
  μž‘μ—…: 졜근 μΌμ–΄λ‚œ 사건에 λŒ€ν•œ λ³΄λ„μžλ£Œλ₯Ό μž‘μ„±ν•΄μ•Ό ν•©λ‹ˆλ‹€. μžλ£ŒλŠ” 사싀을 기반으둜 ν•˜λ©°, 객관적이고 μ •ν™•ν•΄μ•Ό ν•©λ‹ˆλ‹€.
947
  μ§€μΉ¨:
 
949
  기사 제λͺ©μ€ 주제λ₯Ό λͺ…ν™•νžˆ λ°˜μ˜ν•˜κ³  λ…μžμ˜ 관심을 끌 수 μžˆλ„λ‘ μž‘μ„±ν•©λ‹ˆλ‹€.
950
  기사 λ‚΄μš©μ€ μ •ν™•ν•˜κ³  κ°„κ²°ν•˜λ©° 섀득λ ₯ μžˆλŠ” λ¬Έμž₯으둜 κ΅¬μ„±ν•©λ‹ˆλ‹€.
951
  κ΄€λ ¨μžμ˜ 인터뷰λ₯Ό 인용 ν˜•νƒœλ‘œ λ„£μ–΄μ£Όμ„Έμš”.
952
+ μœ„μ˜ 정보와 지침을 μ°Έκ³ ν•˜μ—¬ μ‹ λ¬Έ λ³΄λ„μžλ£Œ ν˜•μ‹μ˜ 기사λ₯Ό μž‘μ„±ν•΄ μ£Όμ„Έμš”"""
953
 
954
  # 이미지 생성 μ—¬λΆ€ 선택 μ˜΅μ…˜ μΆ”κ°€
955
  generate_image_too = st.checkbox("기사 생성 ν›„ 이미지도 ν•¨κ»˜ μƒμ„±ν•˜κΈ°", value=True)
956
 
957
  if st.button("μƒˆ 기사 μƒμ„±ν•˜κΈ°"):
958
  if st.session_state.openai_api_key:
959
+ # openai.api_key = st.session_state.openai_api_key # 이미 μƒλ‹¨μ—μ„œ 섀정됨 λ˜λŠ” 각 ν•¨μˆ˜ 호좜 μ‹œ μ„€μ •
960
  with st.spinner("기사λ₯Ό 생성 μ€‘μž…λ‹ˆλ‹€..."):
961
  new_article = generate_article(selected_article['content'], prompt_text)
962
 
 
975
  """
976
 
977
  # 이미지 생성
978
+ # 이미지 생성 API 호좜 전에 ν‚€ 확인 및 μ„€μ •
979
+ if not openai.api_key:
980
+ if st.session_state.openai_api_key:
981
+ openai.api_key = st.session_state.openai_api_key
982
+ else:
983
+ st.error("OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
984
+ st.stop()
985
  image_url = generate_image(image_prompt)
986
 
987
  if image_url and not image_url.startswith("이미지 생성 였λ₯˜") and not image_url.startswith("였λ₯˜: OpenAI API ν‚€κ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."):
 
1157
  files = [f for f in os.listdir(SCHEDULED_NEWS_DIR) if f.endswith('.json')]
1158
  if files:
1159
  st.subheader("μˆ˜μ§‘λœ 파일 μ—΄κΈ°")
1160
+ selected_file = st.selectbox("파일 선택", files, index=len(files)-1 if files else 0) # filesκ°€ λΉ„μ–΄μžˆμ„ 경우 λŒ€λΉ„
1161
  if selected_file and st.button("파일 λ‚΄μš© 보기"):
1162
  with open(os.path.join(SCHEDULED_NEWS_DIR, selected_file), 'r', encoding='utf-8') as f:
1163
  articles = json.load(f)
 
1175
 
1176
  # ν‘Έν„°
1177
  st.markdown("---")
1178
+ st.markdown("© λ‰΄μŠ€ 기사 도ꡬ @conanssam")