Update src/streamlit_app.py

src/streamlit_app.py  (+48 -8)  CHANGED
@@ -22,6 +22,17 @@ import kss  # use KSS instead of KoNLPy
from PIL import Image
import base64
from io import BytesIO
+import logging
+
+# Logging configuration
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('/tmp/crawler.log')
+    ]
+)

# Add word cloud
try:
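A note on this configuration: `logging.basicConfig` is a no-op once the root logger already has handlers, so Streamlit reruns within the same process should not stack duplicate handlers. A minimal standalone sketch of how the two handlers behave (the test messages are illustrative):

```python
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),                  # terminal / container logs
        logging.FileHandler('/tmp/crawler.log'),  # file read by the log viewer added below
    ],
)

logging.info("logger ready")               # written to both handlers
logging.basicConfig(level=logging.DEBUG)   # ignored: the root logger is already configured
```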
@@ -130,16 +141,21 @@ def crawl_naver_news(keyword, num_articles=5):
    """
    Collects Naver news articles for the given keyword.
    """
+    logging.info(f"Crawling started: keyword={keyword}, num_articles={num_articles}")
    url = f"https://search.naver.com/search.naver?where=news&query={keyword}"
    results = []

    try:
        # Request the search results page
+        logging.info(f"Request URL: {url}")
        response = requests.get(url)
+        logging.info(f"Response status code: {response.status_code}")
+
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find news items
        news_items = soup.select('div.sds-comps-base-layout.sds-comps-full-layout')
+        logging.info(f"Number of news items found: {len(news_items)}")

        # Extract information from each news item
        for i, item in enumerate(news_items):
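One caveat in the hunk above: the search request is sent with no timeout and the default `python-requests` User-Agent, which some search pages treat differently from a browser. A hedged sketch of a hardened request, not part of this commit; the header value, timeout, and example keyword are illustrative:

```python
import requests

url = "https://search.naver.com/search.naver?where=news&query=AI"        # example keyword
headers = {"User-Agent": "Mozilla/5.0 (compatible; news-crawler-demo)"}  # illustrative UA string

response = requests.get(url, headers=headers, timeout=10)  # timeout keeps a stalled page from hanging the task
response.raise_for_status()                                # turn HTTP errors into exceptions the except block can log
print(response.status_code, len(response.text))
```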
@@ -174,40 +190,50 @@ def crawl_naver_news(keyword, num_articles=5):
                    'description': description,
                    'source': source,
                    'date': date,
                    'content': ""
                })

+                logging.info(f"Article extracted: {title}")
+
            except Exception as e:
+                logging.error(f"Error while extracting article info: {str(e)}", exc_info=True)
                continue

    except Exception as e:
+        logging.error(f"Error while requesting the page: {str(e)}", exc_info=True)

+    logging.info(f"Crawling finished: collected {len(results)} articles")
    return results

# Fetch the full text of an article
def get_article_content(url):
+    logging.info(f"Fetching article body: {url}")
    try:
        response = requests.get(url, timeout=5)
+        logging.info(f"Article response status code: {response.status_code}")
+
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the Naver news article body
        content = soup.select_one('#dic_area')
        if content:
            text = content.text.strip()
            text = re.sub(r'\s+', ' ', text)
+            logging.info("Naver news body extracted")
            return text

        # Find the article body on other news sites
        content = soup.select_one('.article_body, .article-body, .article-content, .news-content-inner')
        if content:
            text = content.text.strip()
            text = re.sub(r'\s+', ' ', text)
+            logging.info("Generic news body extracted")
            return text

+        logging.warning("Article body not found")
        return "Could not fetch the article body."
    except Exception as e:
+        logging.error(f"Error fetching article body: {str(e)}", exc_info=True)
        return f"Error: {str(e)}"

# Keyword analysis using NLTK (with KSS)
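get_article_content tries the Naver-specific `#dic_area` container first and then falls back to a list of generic body selectors. A condensed sketch of the same fallback pattern as a standalone helper; the function name and sample HTML are illustrative:

```python
import re
from bs4 import BeautifulSoup

SELECTORS = ['#dic_area', '.article_body', '.article-body', '.article-content', '.news-content-inner']

def extract_body(html):
    """Return the first matching article body, whitespace-normalised, or None."""
    soup = BeautifulSoup(html, 'html.parser')
    for selector in SELECTORS:
        node = soup.select_one(selector)
        if node:
            return re.sub(r'\s+', ' ', node.text.strip())
    return None

print(extract_body('<div id="dic_area">  Sample\n body  </div>'))  # -> "Sample body"
```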
@@ -423,11 +449,14 @@ def run_scheduled_task():
        traceback.print_exc()

def perform_news_task(task_type, keyword, num_articles, file_prefix):
+    logging.info(f"Scheduler task started: {task_type}, keyword={keyword}")
    try:
        articles = crawl_naver_news(keyword, num_articles)
+        logging.info(f"Number of articles collected: {len(articles)}")

        # Fetch the content of each article
-        for article in articles:
+        for i, article in enumerate(articles):
+            logging.info(f"Fetching article {i+1}/{len(articles)}: {article['title']}")
            article['content'] = get_article_content(article['link'])
            time.sleep(0.5)  # avoid overloading the server
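For a one-off smoke test outside the scheduler, perform_news_task can be called directly; a minimal sketch, assuming the functions above are already defined in the running session, with illustrative arguments:

```python
# One-off run without the scheduler; keyword, count, and prefix are illustrative.
perform_news_task(
    task_type="daily",
    keyword="AI",
    num_articles=3,
    file_prefix="test_run",
)
```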
@@ -439,10 +468,12 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

+        logging.info(f"Results saved: {filename}")
+
        global_scheduler_state.last_run = datetime.now()
        print(f"{datetime.now()} - {task_type} news article collection complete: {keyword}")

        # Store the collection result in the global state
        result_item = {
            'task_type': task_type,
            'keyword': keyword,
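The articles are written as a UTF-8 JSON array; a short sketch of reading one of these files back for inspection (the filename is illustrative, the real one is built from file_prefix and a timestamp):

```python
import json

with open("test_run_20240101_0900.json", encoding="utf-8") as f:  # illustrative filename
    articles = json.load(f)

for article in articles:
    print(article["title"], "-", article["source"], "-", len(article["content"]), "chars")
```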
@@ -453,7 +484,7 @@ def perform_news_task(task_type, keyword, num_articles, file_prefix):
        global_scheduler_state.scheduled_results.append(result_item)

    except Exception as e:
+        logging.error(f"Error while running the task: {str(e)}", exc_info=True)
        traceback.print_exc()

def start_scheduler(daily_tasks, interval_tasks):
@@ -1015,6 +1046,15 @@ elif menu == "Summarize News Articles":
    with tab3:
        st.subheader("Scheduler control and status")

+        # Log viewer
+        if st.checkbox("Show logs"):
+            try:
+                with open('/tmp/crawler.log', 'r') as f:
+                    logs = f.readlines()
+                st.text_area("Recent logs", value=''.join(logs[-100:]), height=400)
+            except Exception as e:
+                st.error(f"Could not read the log file: {str(e)}")
+
        col1, col2 = st.columns(2)

        with col1:
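Since /tmp/crawler.log grows without bound while the viewer only shows the last 100 lines, a rotating handler is one possible refinement, sketched here as an assumption rather than part of this commit (size and backup count are illustrative):

```python
import logging
from logging.handlers import RotatingFileHandler

file_handler = RotatingFileHandler('/tmp/crawler.log', maxBytes=1_000_000, backupCount=2)  # ~1 MB per file
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))

logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(), file_handler])
```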