JUNGU committed on
Commit 4cf5f75 · verified · 1 Parent(s): 31658d4

Update src/streamlit_app.py

Files changed (1): src/streamlit_app.py +93 -160
src/streamlit_app.py CHANGED
@@ -20,7 +20,6 @@ from dotenv import load_dotenv
  # The /tmp folder may exist but can have permission problems, so switch to the current working directory
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
  DATA_DIR = os.path.join(CURRENT_DIR, "data")
- NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
  SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
  SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
@@ -35,60 +34,34 @@ def ensure_directory(directory):
 
  # Create all required directories
  ensure_directory(DATA_DIR)
- ensure_directory(NLTK_DATA_DIR)
  ensure_directory(SCHEDULED_NEWS_DIR)
 
- # NLTK setup - use a custom directory to work around permission issues
- import nltk
- nltk.data.path.append(NLTK_DATA_DIR)
-
- # Download the required NLTK data (permission workaround)
  try:
- # Download the data into the user-specified directory
- try:
- nltk.data.find('tokenizers/punkt')
- except LookupError:
- nltk.download('punkt', download_dir=NLTK_DATA_DIR)
-
- try:
- nltk.data.find('corpora/stopwords')
- except LookupError:
- nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
- except Exception as e:
- st.warning(f"NLTK 데이터 다운로드 중 오류 발생: {str(e)}. 기본 토크나이징 방식을 사용합니다.")
 
- # Fallback function for Korean tokenization (used instead of KoNLPy)
  def tokenize_korean(text):
  try:
- # 1. First check whether the transformers library is installed
- try:
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
- return tokenizer.tokenize(text)
- except (ImportError, Exception) as e:
- st.debug(f"Transformers 토크나이저 로드 실패: {str(e)}")
-
- # 2. Try soynlp
- try:
- from soynlp.tokenizer import LTokenizer
- tokenizer = LTokenizer()
- return tokenizer.tokenize(text)
- except (ImportError, Exception) as e:
- st.debug(f"soynlp 토크나이저 로드 실패: {str(e)}")
-
- # 3. Try kss
- try:
- import kss
  tokens = []
  for sentence in kss.split_sentences(text):
- tokens.extend(sentence.split())
  return tokens
- except (ImportError, Exception) as e:
- st.debug(f"kss 토크나이저 로드 실패: {str(e)}")
  except Exception as e:
- st.debug(f"한국어 토크나이징 실패: {str(e)}")
 
- # 4. Basic regex tokenizer - fallback when every other method fails
  return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
 
  # Word cloud support (optional)
  # ์›Œ๋“œํด๋ผ์šฐ๋“œ ์ถ”๊ฐ€ (์„ ํƒ์  ์‚ฌ์šฉ)
@@ -252,31 +225,43 @@ def get_article_content(url):
  except Exception as e:
  return f"오류 발생: {str(e)}"
 
- # Keyword analysis using NLTK (with added Korean support)
  def analyze_keywords(text, top_n=10):
- # Korean stopword list
  korean_stopwords = [
  '이', '그', '저', '것', '및', '등', '를', '을', '에', '에서', '의', '으로', '로',
- '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지'
  ]
 
  # Language detection (a simple check for Hangul characters)
  is_korean = bool(re.search(r'[가-힣]', text))
 
  if is_korean:
- # For Korean text, use the Korean tokenizer
  tokens = tokenize_korean(text)
  else:
- # If there is no Hangul, use the NLTK tokenizer
- try:
- from nltk.tokenize import word_tokenize
- tokens = word_tokenize(text)
- except Exception:
- # If NLTK fails, fall back to a simple tokenizer
- tokens = re.findall(r'\b\w+\b', text.lower())
 
- # Stopword filtering
- tokens = [word for word in tokens if len(word) > 1 and word.lower() not in korean_stopwords]
 
  # Frequency counting
  from collections import Counter
@@ -294,38 +279,22 @@ def extract_keywords_for_wordcloud(text, top_n=50):
  # Language detection (a simple check for Hangul characters)
  is_korean = bool(re.search(r'[가-힣]', text))
 
- if is_korean:
- # For Korean text, use the Korean tokenizer
- tokens = tokenize_korean(text.lower())
- else:
- # For English or other languages, try NLTK
- try:
- from nltk.tokenize import word_tokenize
- tokens = word_tokenize(text.lower())
- except Exception:
- # If that fails, simple tokenization
- tokens = text.lower().split()
 
  # Stopword setup
- stop_words = set()
-
- # English stopwords (use NLTK if available)
- try:
- from nltk.corpus import stopwords
- stop_words = set(stopwords.words('english'))
- except Exception:
- # Default English stopwords
- stop_words = {
- 'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
- 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
- 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
- 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
- 'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
- 'will', 'shall', 'can', 'may', 'must', 'ought'
- }
 
  # Korean stopwords
- korea_stop_words = {
  '및', '등', '를', '이', '의', '가', '에', '는', '으로', '에서', '그', '또', '또는', '하는', '할', '하고',
  '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
  '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '도', '은', '수', '것', '들', '제', '저',
@@ -336,7 +305,9 @@ def extract_keywords_for_wordcloud(text, top_n=50):
  '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
  '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
  }
- stop_words.update(korea_stop_words)
 
  # Keep only tokens that are longer than one character and not stopwords
  filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
@@ -718,7 +689,18 @@ elif menu == "기사 분석하기":
  # Compute text statistics
  word_count = len(re.findall(r'\b\w+\b', content))
  char_count = len(content)
- sentence_count = len(re.split(r'[.!?]+', content))
  avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
  avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
@@ -750,81 +732,32 @@ elif menu == "기사 분석하기":
  is_korean = bool(re.search(r'[가-힣]', content))
 
  try:
- # English/Korean tokenization and part-of-speech analysis
  if is_korean:
- # Korean text (simple morpheme-like analysis)
- try:
- # Try the transformers tokenizer
- try:
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
- tokens = tokenizer.tokenize(content[:5000]) # truncate very long text before analysis
-
- # Estimate parts of speech with simple pattern matching
- pos_counts = {'명사': 0, '동사': 0, '형용사': 0, '부사': 0, '기타': 0}
-
- for token in tokens:
- if token.endswith("다") or token.endswith("요"):
- pos_counts['동사'] += 1
- elif token.endswith("게") or token.endswith("히"):
- pos_counts['부사'] += 1
- elif token.endswith("은") or token.endswith("는") or token.endswith("이") or token.endswith("가"):
- pos_counts['명사'] += 1
- else:
- if len(token) > 1:
- pos_counts['명사'] += 1
- else:
- pos_counts['기타'] += 1
-
- except Exception:
- # If that fails, fall back to simple tokenization
- tokens = tokenize_korean(content[:5000])
- pos_counts = {
- '명사류': len([t for t in tokens if len(t) > 1 and not any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])]),
- '기타': len([t for t in tokens if len(t) <= 1 or any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])])
- }
- except Exception as e:
- st.error(f"한국어 품사 분석 실패: {str(e)}")
- pos_counts = {'데이터': len(content) // 10, '분석': len(content) // 15, '오류': len(content) // 20}
  else:
- # English documents (try NLTK)
- try:
- from nltk import pos_tag
- from nltk.tokenize import word_tokenize
-
- # Download the required data
- try:
- nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
- except Exception:
- pass
-
- tokens = word_tokenize(content[:5000])
- tagged = pos_tag(tokens)
-
- # English POS tag mapping
- pos_dict = {
- 'NN': '명사', 'NNS': '명사', 'NNP': '고유명사', 'NNPS': '고유명사',
- 'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
- 'JJ': '형용사', 'JJR': '형용사', 'JJS': '형용사',
- 'RB': '부사', 'RBR': '부사', 'RBS': '부사'
- }
-
- pos_counts = {'명사': 0, '동사': 0, '형용사': 0, '부사': 0, '기타': 0}
-
- for _, pos in tagged:
- if pos in pos_dict:
- pos_counts[pos_dict[pos]] += 1
- else:
- pos_counts['기타'] += 1
- except Exception:
- # If that fails, infer parts of speech with simple rules
- tokens = re.findall(r'\b\w+\b', content.lower())
- pos_counts = {
- '명사': len([t for t in tokens if not t.endswith(('ly', 'ing', 'ed'))]),
- '동사': len([t for t in tokens if t.endswith(('ing', 'ed', 's'))]),
- '부사': len([t for t in tokens if t.endswith('ly')]),
- '기타': len([t for t in tokens if len(t) <= 2])
- }
 
  # Visualize the results
  pos_df = pd.DataFrame({
 
  # The /tmp folder may exist but can have permission problems, so switch to the current working directory
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
  DATA_DIR = os.path.join(CURRENT_DIR, "data")
  SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
  SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
 
  # Create all required directories
  ensure_directory(DATA_DIR)
  ensure_directory(SCHEDULED_NEWS_DIR)
 
+ # KSS setup for Korean tokenization
  try:
+ import kss
+ kss_available = True
+ except ImportError:
+ st.warning("KSS 라이브러리가 설치되어 있지 않습니다. 'pip install kss'로 설치하세요.")
+ kss_available = False
 
+ # Korean tokenization function (uses KSS)
  def tokenize_korean(text):
  try:
+ if kss_available:
  tokens = []
+ # Split into sentences, then extract words from each sentence
  for sentence in kss.split_sentences(text):
+ # Refine the basic whitespace tokenization with a regex pattern
+ raw_tokens = sentence.split()
+ for token in raw_tokens:
+ # Split off particles, special characters, and the like
+ sub_tokens = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', token)
+ tokens.extend(sub_tokens)
  return tokens
  except Exception as e:
+ st.debug(f"KSS 토크나이징 실패: {str(e)}")
 
+ # If KSS is unavailable or fails, use the basic regex tokenizer
  return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
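For reference, the same fallback idea can be exercised outside Streamlit; this is a minimal sketch assuming only an optional kss install (the helper name and sample sentence are illustrative, not part of the commit):

import re

try:
    import kss
    KSS_OK = True
except ImportError:
    KSS_OK = False

# Runs of Hangul, Latin letters, digits, or other symbols, mirroring the fallback pattern above.
TOKEN_RE = re.compile(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+')

def simple_tokenize(text):
    # Prefer sentence-aware splitting when kss is importable.
    if KSS_OK:
        try:
            tokens = []
            for sentence in kss.split_sentences(text):
                for word in sentence.split():
                    tokens.extend(TOKEN_RE.findall(word))
            return tokens
        except Exception:
            pass
    # Regex-only fallback when kss is missing or fails.
    return TOKEN_RE.findall(text)

print(simple_tokenize("오늘 날씨가 좋다. Streamlit 앱을 테스트한다."))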
 
  # Word cloud support (optional)
 
  except Exception as e:
  return f"오류 발생: {str(e)}"
 
+ # Keyword analysis using KSS
  def analyze_keywords(text, top_n=10):
+ # Korean stopword list (extended)
  korean_stopwords = [
  '이', '그', '저', '것', '및', '등', '를', '을', '에', '에서', '의', '으로', '로',
+ '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지',
+ '이런', '저런', '그런', '어떤', '무슨', '이것', '저것', '그것', '이번', '저번', '그번',
+ '이거', '저거', '그거', '하다', '되다', '있다', '없다', '같다', '보다', '이렇다', '그렇다',
+ '하는', '되는', '있는', '없는', '같은', '보는', '이런', '그런', '저런', '했다', '됐다',
+ '있었다', '없었다', '같았다', '봤다', '또', '또한', '그리고', '하지만', '그러나', '그래서',
+ '때문에', '따라서', '하며', '되며', '있으며', '없으며', '같으며', '보며', '하고', '되고',
+ '있고', '없고', '같고', '보고', '통해', '위해', '때', '중', '후'
+ ]
+
+ # English stopword list
+ english_stopwords = [
+ 'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+ 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+ 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+ 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+ 'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+ 'will', 'shall', 'can', 'may', 'must', 'ought'
  ]
 
  # Language detection (a simple check for Hangul characters)
  is_korean = bool(re.search(r'[가-힣]', text))
 
  if is_korean:
+ # For Korean text, use the KSS-based tokenizer
  tokens = tokenize_korean(text)
  else:
+ # For English or other languages, simple regex tokenization
+ tokens = re.findall(r'\b\w+\b', text.lower())
 
+ # Stopword filtering (pick the stopword list by language)
+ stopwords = korean_stopwords if is_korean else english_stopwords
+ tokens = [word for word in tokens if len(word) > 1 and word.lower() not in stopwords]
 
  # Frequency counting
  from collections import Counter
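The keyword step then reduces to counting the filtered tokens with collections.Counter; a small self-contained sketch (the token list and helper name are made up for illustration):

from collections import Counter

def top_keywords(tokens, top_n=10):
    # Count token frequencies and keep the most common ones.
    return Counter(tokens).most_common(top_n)

tokens = ["경제", "성장", "경제", "정책", "성장", "경제"]
print(top_keywords(tokens, top_n=3))  # [('경제', 3), ('성장', 2), ('정책', 1)]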
 
  # Language detection (a simple check for Hangul characters)
  is_korean = bool(re.search(r'[가-힣]', text))
 
+ # Tokenization (via KSS)
+ tokens = tokenize_korean(text.lower())
 
  # Stopword setup
+ # English stopword list
+ english_stopwords = {
+ 'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+ 'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+ 'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+ 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+ 'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+ 'will', 'shall', 'can', 'may', 'must', 'ought'
+ }
 
  # Korean stopwords
+ korean_stopwords = {
  '및', '등', '를', '이', '의', '가', '에', '는', '으로', '에서', '그', '또', '또는', '하는', '할', '하고',
  '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
  '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '도', '은', '수', '것', '들', '제', '저',
 
  '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
  '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
  }
+
+ # Choose the stopword set by language
+ stop_words = korean_stopwords if is_korean else english_stopwords
 
  # Keep only tokens that are longer than one character and not stopwords
  filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
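Downstream, the filtered tokens are typically folded into a frequency mapping for the word cloud; a hedged sketch assuming the optional wordcloud package and a hypothetical Korean font path (neither is pinned down by this hunk):

from collections import Counter
# from wordcloud import WordCloud  # optional dependency, assumed available

filtered_tokens = ["경제", "성장", "정책", "경제", "투자", "성장", "경제"]

# Frequency mapping in the shape generate_from_frequencies() expects.
freq = dict(Counter(filtered_tokens).most_common(50))
print(freq)

# wc = WordCloud(font_path="NanumGothic.ttf",  # hypothetical font path with Hangul glyphs
#                width=800, height=400, background_color="white")
# img = wc.generate_from_frequencies(freq)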
 
  # Compute text statistics
  word_count = len(re.findall(r'\b\w+\b', content))
  char_count = len(content)
+
+ # Split sentences with KSS
+ if kss_available:
+ try:
+ sentences = kss.split_sentences(content)
+ sentence_count = len(sentences)
+ except Exception:
+ # If KSS fails, fall back to a simple sentence split
+ sentence_count = len(re.split(r'[.!?]+', content))
+ else:
+ sentence_count = len(re.split(r'[.!?]+', content))
+
  avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
  avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
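Taken together, the statistics block amounts to the following; a standalone sketch under the same guarded-import assumption (function name and sample text are illustrative):

import re

try:
    import kss
    KSS_OK = True
except ImportError:
    KSS_OK = False

def text_stats(content):
    words = re.findall(r'\b\w+\b', content)
    word_count = len(words)
    # Sentence split: kss when it works, otherwise a rough punctuation split.
    if KSS_OK:
        try:
            sentence_count = len(kss.split_sentences(content))
        except Exception:
            sentence_count = len(re.split(r'[.!?]+', content))
    else:
        sentence_count = len(re.split(r'[.!?]+', content))
    return {
        "word_count": word_count,
        "char_count": len(content),
        "sentence_count": sentence_count,
        "avg_word_length": sum(len(w) for w in words) / word_count if word_count else 0,
        "avg_sentence_length": word_count / sentence_count if sentence_count else 0,
    }

print(text_stats("뉴스 기사를 분석한다. 문장이 두 개다."))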
 
  is_korean = bool(re.search(r'[가-힣]', content))
 
  try:
+ # Simple POS-like analysis using KSS
+ tokens = tokenize_korean(content[:5000]) # truncate very long text before analysis
+
  if is_korean:
+ # For Korean, estimate parts of speech with simple pattern matching
+ pos_counts = {'명사/대명사': 0, '동사/형용사': 0, '부사/조사': 0, '기타': 0}
+
+ for token in tokens:
+ if token.endswith(("다", "요", "까", "죠", "네", "군", "니다", "세요")):
+ pos_counts['동사/형용사'] += 1
+ elif token.endswith(("게", "히", "이", "지")):
+ pos_counts['부사/조사'] += 1
+ elif token.endswith(("은", "는", "이", "가", "을", "를", "에", "의")):
+ pos_counts['부사/조사'] += 1
+ elif len(token) > 1:
+ pos_counts['명사/대명사'] += 1
+ else:
+ pos_counts['기타'] += 1
  else:
+ # For English documents, simple pattern matching
+ pos_counts = {
+ '명사/대명사': len([t for t in tokens if not t.lower().endswith(('ly', 'ing', 'ed'))]),
+ '동사': len([t for t in tokens if t.lower().endswith(('ing', 'ed', 's'))]),
+ '부사/형용사': len([t for t in tokens if t.lower().endswith('ly')]),
+ '기타': len([t for t in tokens if len(t) <= 2])
+ }
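The ending-based heuristic is easy to test in isolation; a sketch that mirrors the suffix rules above (function name and sample tokens are illustrative, and this is a rough heuristic rather than real morphological analysis):

def rough_pos_counts(tokens):
    # Very rough part-of-speech buckets based on common Korean endings,
    # in the spirit of the heuristic used in the commit.
    counts = {'명사/대명사': 0, '동사/형용사': 0, '부사/조사': 0, '기타': 0}
    for token in tokens:
        if token.endswith(("다", "요", "까", "죠", "네", "군", "니다", "세요")):
            counts['동사/형용사'] += 1
        elif token.endswith(("게", "히", "이", "지", "은", "는", "가", "을", "를", "에", "의")):
            counts['부사/조사'] += 1
        elif len(token) > 1:
            counts['명사/대명사'] += 1
        else:
            counts['기타'] += 1
    return counts

print(rough_pos_counts(["정부는", "정책을", "발표했다", "크게", "늘", "것"]))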
 
  # Visualize the results
  pos_df = pd.DataFrame({