JUNGU committed
Commit a61c48c · verified · 1 Parent(s): f5b13b1

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +48 -8
src/streamlit_app.py CHANGED
@@ -22,6 +22,17 @@ import kss  # Use KSS instead of KoNLPy
 from PIL import Image
 import base64
 from io import BytesIO
+import logging
+
+# Logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('/tmp/crawler.log')
+    ]
+)
 
 # Add word cloud
 try:
@@ -130,16 +141,21 @@ def crawl_naver_news(keyword, num_articles=5):
     """
     Function that collects Naver news articles
     """
+    logging.info(f"Crawling started: keyword={keyword}, num_articles={num_articles}")
     url = f"https://search.naver.com/search.naver?where=news&query={keyword}"
     results = []
 
     try:
         # Request the page
+        logging.info(f"Request URL: {url}")
         response = requests.get(url)
+        logging.info(f"Response status code: {response.status_code}")
+
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Find news items
         news_items = soup.select('div.sds-comps-base-layout.sds-comps-full-layout')
+        logging.info(f"News items found: {len(news_items)}")
 
         # Extract information from each news item
         for i, item in enumerate(news_items):
@@ -174,40 +190,50 @@ def crawl_naver_news(keyword, num_articles=5):
                     'description': description,
                     'source': source,
                     'date': date,
-                    'content': ""  # Field that will later hold the full article text
+                    'content': ""
                 })
 
+                logging.info(f"Article extracted: {title}")
+
             except Exception as e:
-                st.error(f"Error while extracting article info: {str(e)}")
+                logging.error(f"Error while extracting article info: {str(e)}", exc_info=True)
                 continue
 
     except Exception as e:
-        st.error(f"Error while requesting the page: {str(e)}")
+        logging.error(f"Error while requesting the page: {str(e)}", exc_info=True)
 
+    logging.info(f"Crawling finished: {len(results)} articles collected")
     return results
 
 # Fetch the full article text
 def get_article_content(url):
+    logging.info(f"Fetching article text: {url}")
     try:
         response = requests.get(url, timeout=5)
+        logging.info(f"Article request status code: {response.status_code}")
+
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Find the Naver News article body
         content = soup.select_one('#dic_area')
         if content:
             text = content.text.strip()
-            text = re.sub(r'\s+', ' ', text)  # Collapse repeated whitespace
+            text = re.sub(r'\s+', ' ', text)
+            logging.info("Naver News body extracted")
             return text
 
-        # Find the article body on other news sites (more sites still need handling)
+        # Find the article body on other news sites
         content = soup.select_one('.article_body, .article-body, .article-content, .news-content-inner')
         if content:
             text = content.text.strip()
             text = re.sub(r'\s+', ' ', text)
+            logging.info("Generic news body extracted")
             return text
 
+        logging.warning("Article body not found")
         return "Could not retrieve the article body."
     except Exception as e:
+        logging.error(f"Error while fetching the article text: {str(e)}", exc_info=True)
         return f"Error occurred: {str(e)}"
 
 # Keyword analysis with NLTK (using KSS)
@@ -423,11 +449,14 @@ def run_scheduled_task():
         traceback.print_exc()
 
 def perform_news_task(task_type, keyword, num_articles, file_prefix):
+    logging.info(f"Scheduler task started: {task_type}, keyword={keyword}")
     try:
         articles = crawl_naver_news(keyword, num_articles)
+        logging.info(f"Articles collected: {len(articles)}")
 
         # Fetch article content
-        for article in articles:
+        for i, article in enumerate(articles):
+            logging.info(f"Fetching article {i+1}/{len(articles)}: {article['title']}")
             article['content'] = get_article_content(article['link'])
             time.sleep(0.5)  # Avoid overloading the server
 
@@ -439,10 +468,12 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
         with open(filename, 'w', encoding='utf-8') as f:
             json.dump(articles, f, ensure_ascii=False, indent=2)
 
+        logging.info(f"Results saved: {filename}")
+
         global_scheduler_state.last_run = datetime.now()
         print(f"{datetime.now()} - {task_type} news article collection finished: {keyword}")
 
-        # Store the collected results in the global state (for UI updates)
+        # Store the collected results in the global state
         result_item = {
             'task_type': task_type,
             'keyword': keyword,
@@ -453,7 +484,7 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
         global_scheduler_state.scheduled_results.append(result_item)
 
     except Exception as e:
-        print(f"Error while running the task: {e}")
+        logging.error(f"Error while running the task: {str(e)}", exc_info=True)
         traceback.print_exc()
 
 def start_scheduler(daily_tasks, interval_tasks):
@@ -1015,6 +1046,15 @@ elif menu == "Schedule News Articles":
     with tab3:
         st.subheader("Scheduler control and status")
 
+        # Add log viewer
+        if st.checkbox("View logs"):
+            try:
+                with open('/tmp/crawler.log', 'r') as f:
+                    logs = f.readlines()
+                st.text_area("Recent logs", value=''.join(logs[-100:]), height=400)
+            except Exception as e:
+                st.error(f"Could not read the log file: {str(e)}")
+
        col1, col2 = st.columns(2)
 
        with col1:
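
Reviewer note: the snippet below is a minimal, standalone sketch of the logging pattern this commit introduces, assuming only the Python standard library and a POSIX-style /tmp directory. The read_recent_logs helper and the __main__ demo are illustrative additions, not code from the app; they mirror the basicConfig block added at the top of src/streamlit_app.py and the tail-read that the new "View logs" checkbox performs via st.text_area.

# Standalone sketch of the commit's logging setup (illustrative, not part of the app).
import logging

LOG_PATH = '/tmp/crawler.log'  # same path the commit writes to; assumes a POSIX environment

# Same handler configuration as the commit: console output plus a log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(LOG_PATH)
    ]
)

def read_recent_logs(path=LOG_PATH, max_lines=100):
    # Return the last max_lines lines of the log file, as the new
    # "View logs" checkbox does before handing them to st.text_area.
    try:
        with open(path, 'r') as f:
            return ''.join(f.readlines()[-max_lines:])
    except OSError as e:
        return f"Could not read the log file: {e}"

if __name__ == '__main__':
    # Emit a couple of records in the same style as the crawler, then read them back.
    logging.info("Crawling started: keyword=test, num_articles=5")
    logging.warning("Article body not found")
    print(read_recent_logs())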