Update src/streamlit_app.py
src/streamlit_app.py  CHANGED  (+93 -160)
@@ -20,7 +20,6 @@ from dotenv import load_dotenv
 # The /tmp folder may exist but can have permission issues, so base paths on the current working directory instead
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
 DATA_DIR = os.path.join(CURRENT_DIR, "data")
-NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
 SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
 SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
@@ -35,60 +34,34 @@ def ensure_directory(directory):
 
 # Create all required directories
 ensure_directory(DATA_DIR)
-ensure_directory(NLTK_DATA_DIR)
 ensure_directory(SCHEDULED_NEWS_DIR)
 
-# NLTK setup
-import nltk
-nltk.data.path.append(NLTK_DATA_DIR)
-
-# Download required NLTK data (works around permission issues)
 try:
-    ...
-    try:
-        nltk.data.find('corpora/stopwords')
-    except LookupError:
-        nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
-except Exception as e:
-    st.warning(f"NLTK 데이터 다운로드 중 오류 발생: {str(e)}. 기본 토크나이징 방식을 사용합니다.")
 
-# Korean tokenization
 def tokenize_korean(text):
     try:
-        # 1. Try transformers
-        try:
-            from transformers import AutoTokenizer
-            tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
-            return tokenizer.tokenize(text)
-        except (ImportError, Exception) as e:
-            st.debug(f"Transformers 토크나이저 로드 실패: {str(e)}")
-
-        # 2. Try soynlp
-        try:
-            from soynlp.tokenizer import LTokenizer
-            tokenizer = LTokenizer()
-            return tokenizer.tokenize(text)
-        except (ImportError, Exception) as e:
-            st.debug(f"soynlp 토크나이저 로드 실패: {str(e)}")
-
-        # 3. Try kss
-        try:
-            import kss
             tokens = []
             for sentence in kss.split_sentences(text):
-                ...
             return tokens
-        except (ImportError, Exception) as e:
-            st.debug(f"kss 토크나이저 로드 실패: {str(e)}")
     except Exception as e:
-        st.debug(f"
 
-    # Fall back to a simple regex-based tokenizer
     return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
 
 # Add word cloud (optional)
@@ -252,31 +225,43 @@ def get_article_content(url):
     except Exception as e:
         return f"오류 발생: {str(e)}"
 
-# Keyword analysis
 def analyze_keywords(text, top_n=10):
-    # Korean stopword list
     korean_stopwords = [
         '이', '그', '저', '것', '및', '등', '를', '을', '은', '에서', '의', '으로', '로',
-        '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지'
     ]
 
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
     if is_korean:
-        # Korean text
         tokens = tokenize_korean(text)
     else:
-        # For English and other languages, try NLTK
-        try:
-            from nltk.tokenize import word_tokenize
-            tokens = word_tokenize(text)
-        except Exception:
-            # Fall back to a simple tokenizer if NLTK fails
-            tokens = re.findall(r'\b\w+\b', text.lower())
 
-    # Stopword filtering
-    ...
 
     # Frequency counting
     from collections import Counter
@@ -294,38 +279,22 @@ def extract_keywords_for_wordcloud(text, top_n=50):
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
-    # Tokenization
-    if is_korean:
-        tokens = tokenize_korean(text.lower())
-    else:
-        # For English and other languages, try NLTK
-        try:
-            from nltk.tokenize import word_tokenize
-            tokens = word_tokenize(text.lower())
-        except Exception:
-            # Simple tokenization if that fails
-            tokens = text.lower().split()
 
     # Stopword setup
-    ...
-        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
-        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
-        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
-        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
-        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
-        'will', 'shall', 'can', 'may', 'must', 'ought'
-    }
 
     # Korean stopwords
-    ...
         '및', '등', '를', '이', '의', '가', '은', '는', '으로', '에서', '그', '더', '또는', '하는', '할', '하고',
         '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
         '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '도', '은', '을', '것', '들', '저', '제',
@@ -336,7 +305,9 @@ def extract_keywords_for_wordcloud(text, top_n=50):
         '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
         '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
     }
-    ...
 
     # Keep only tokens longer than one character that are not stopwords
     filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
@@ -718,7 +689,18 @@ elif menu == "기사 분석하기":
         # Compute text statistics
         word_count = len(re.findall(r'\b\w+\b', content))
         char_count = len(content)
-        ...
         avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
         avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
@@ -750,81 +732,32 @@ elif menu == "기사 분석하기":
         is_korean = bool(re.search(r'[가-힣]', content))
 
         try:
-            # Part-of-speech analysis
             if is_korean:
-                # Korean text
-                ...
-                pos_counts
-                ...
-                    elif token.endswith("게") or token.endswith("히"):
-                        pos_counts['부사'] += 1
-                    elif token.endswith("은") or token.endswith("는") or token.endswith("이") or token.endswith("가"):
-                        pos_counts['명사'] += 1
-                    else:
-                        if len(token) > 1:
-                            pos_counts['명사'] += 1
-                        else:
-                            pos_counts['기타'] += 1
-
-                except Exception:
-                    # Fall back to simple tokenization on failure
-                    tokens = tokenize_korean(content[:5000])
-                    pos_counts = {
-                        '명사류': len([t for t in tokens if len(t) > 1 and not any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])]),
-                        '기타': len([t for t in tokens if len(t) <= 1 or any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])])
-                    }
-            except Exception as e:
-                st.error(f"한국어 품사 분석 실패: {str(e)}")
-                pos_counts = {'데이터': len(content) // 10, '분석': len(content) // 15, '오류': len(content) // 20}
             else:
-                # English document
-                ...
-                        nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
-                except Exception:
-                    pass
-
-                tokens = word_tokenize(content[:5000])
-                tagged = pos_tag(tokens)
-
-                # English POS mapping
-                pos_dict = {
-                    'NN': '명사', 'NNS': '명사', 'NNP': '고유명사', 'NNPS': '고유명사',
-                    'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
-                    'JJ': '형용사', 'JJR': '형용사', 'JJS': '형용사',
-                    'RB': '부사', 'RBR': '부사', 'RBS': '부사'
-                }
-
-                pos_counts = {'명사': 0, '동사': 0, '형용사': 0, '부사': 0, '기타': 0}
-
-                for _, pos in tagged:
-                    if pos in pos_dict:
-                        pos_counts[pos_dict[pos]] += 1
-                    else:
-                        pos_counts['기타'] += 1
-        except Exception:
-            # Fall back to simple rules for estimating part of speech
-            tokens = re.findall(r'\b\w+\b', content.lower())
-            pos_counts = {
-                '명사': len([t for t in tokens if not t.endswith(('ly', 'ing', 'ed'))]),
-                '동사': len([t for t in tokens if t.endswith(('ing', 'ed', 's'))]),
-                '부사': len([t for t in tokens if t.endswith('ly')]),
-                '기타': len([t for t in tokens if len(t) <= 2])
-            }
 
         # Visualize results
         pos_df = pd.DataFrame({
 # The /tmp folder may exist but can have permission issues, so base paths on the current working directory instead
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
 DATA_DIR = os.path.join(CURRENT_DIR, "data")
 SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
 SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
 
 # Create all required directories
 ensure_directory(DATA_DIR)
 ensure_directory(SCHEDULED_NEWS_DIR)
 
+# KSS setup for Korean tokenization
 try:
+    import kss
+    kss_available = True
+except ImportError:
+    st.warning("KSS 라이브러리가 설치되어 있지 않습니다. 'pip install kss'로 설치하세요.")
+    kss_available = False
 
+# Korean tokenization function (uses KSS)
 def tokenize_korean(text):
     try:
+        if kss_available:
             tokens = []
+            # Split into sentences, then extract words from each sentence
             for sentence in kss.split_sentences(text):
+                # Add a regex pattern on top of basic whitespace tokenization for finer splitting
+                raw_tokens = sentence.split()
+                for token in raw_tokens:
+                    # Separate particles, punctuation, and other attached characters
+                    sub_tokens = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', token)
+                    tokens.extend(sub_tokens)
             return tokens
     except Exception as e:
+        st.debug(f"KSS 토크나이징 실패: {str(e)}")
 
+    # Fall back to the basic regex tokenizer when KSS is unavailable or fails
     return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
 
 # Add word cloud (optional)
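For reference, a minimal standalone sketch of the same KSS-plus-regex tokenization outside Streamlit (the sample sentence is invented for illustration). Note that st.debug is not part of the Streamlit API, so the failure path here logs through the standard logging module instead.

import logging
import re

try:
    import kss
    kss_available = True
except ImportError:
    kss_available = False

def tokenize_korean_standalone(text):
    """KSS sentence split followed by a regex word split; regex-only fallback."""
    pattern = r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+'
    try:
        if kss_available:
            tokens = []
            for sentence in kss.split_sentences(text):
                for token in sentence.split():
                    tokens.extend(re.findall(pattern, token))
            return tokens
    except Exception as exc:
        logging.warning("KSS tokenization failed: %s", exc)
    return re.findall(pattern, text)

print(tokenize_korean_standalone("오늘 날씨가 좋다. Streamlit 앱 테스트 중!"))
# → ['오늘', '날씨가', '좋다', '.', 'Streamlit', '앱', '테스트', '중', '!']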
 except Exception as e:
     return f"오류 발생: {str(e)}"
 
+# Keyword analysis using KSS
 def analyze_keywords(text, top_n=10):
+    # Korean stopword list (extended)
     korean_stopwords = [
         '이', '그', '저', '것', '및', '등', '를', '을', '은', '에서', '의', '으로', '로',
+        '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지',
+        '이런', '저런', '그런', '어떤', '무슨', '이것', '저것', '그것', '이번', '저번', '그번',
+        '이거', '저거', '그거', '하다', '되다', '있다', '없다', '같다', '보다', '이렇다', '그렇다',
+        '하는', '되는', '있는', '없는', '같은', '보는', '이런', '그런', '저런', '한다', '된다',
+        '있었다', '없었다', '같았다', '봤다', '또', '또한', '그리고', '하지만', '그러나', '그래서',
+        '때문에', '따라서', '하며', '되며', '있으며', '없으며', '같으며', '보며', '하고', '되고',
+        '있고', '없고', '같고', '보고', '통해', '위해', '도', '중', '후'
+    ]
+
+    # English stopword list
+    english_stopwords = [
+        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+        'will', 'shall', 'can', 'may', 'must', 'ought'
     ]
 
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
     if is_korean:
+        # Korean text: use the KSS-based tokenizer
         tokens = tokenize_korean(text)
     else:
+        # English and other languages: simple regex tokenization
+        tokens = re.findall(r'\b\w+\b', text.lower())
 
+    # Stopword filtering (apply a different stopword list per language)
+    stopwords = korean_stopwords if is_korean else english_stopwords
+    tokens = [word for word in tokens if len(word) > 1 and word.lower() not in stopwords]
 
     # Frequency counting
     from collections import Counter
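As a quick illustration of the filter-then-count step that follows, here is a hedged sketch with invented sample text and a tiny stopword list (the function above uses its own, much longer lists):

import re
from collections import Counter

text = "the cat sat on the mat and the cat slept"
stopwords = ['the', 'and', 'on']

tokens = re.findall(r'\b\w+\b', text.lower())
tokens = [w for w in tokens if len(w) > 1 and w.lower() not in stopwords]

# Counter.most_common() yields the top (keyword, frequency) pairs
print(Counter(tokens).most_common(3))
# [('cat', 2), ('sat', 1), ('mat', 1)]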
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
+    # Tokenization (using KSS)
+    tokens = tokenize_korean(text.lower())
 
     # Stopword setup
+    # English stopword list
+    english_stopwords = {
+        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+        'will', 'shall', 'can', 'may', 'must', 'ought'
+    }
 
     # Korean stopwords
+    korean_stopwords = {
         '및', '등', '를', '이', '의', '가', '은', '는', '으로', '에서', '그', '더', '또는', '하는', '할', '하고',
         '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
         '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '도', '은', '을', '것', '들', '저', '제',
         ...
         '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
         '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
     }
+
+    # Select stopwords by language
+    stop_words = korean_stopwords if is_korean else english_stopwords
 
     # Keep only tokens longer than one character that are not stopwords
     filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
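A small design note: analyze_keywords keeps its stopwords in plain lists, while this function uses sets; set membership checks are constant-time, which matters when filtering thousands of tokens from a long article. A minimal sketch with invented data:

stopwords_list = ['및', '등', '그리고']          # illustrative subset only
stopword_set = set(stopwords_list)               # one-time conversion to a set

tokens = ['경제', '및', '사회', '뉴스', '그리고', '분석']
filtered = [t for t in tokens if len(t) > 1 and t not in stopword_set]
print(filtered)   # ['경제', '사회', '뉴스', '분석']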
         # Compute text statistics
         word_count = len(re.findall(r'\b\w+\b', content))
         char_count = len(content)
+
+        # Split sentences using KSS
+        if kss_available:
+            try:
+                sentences = kss.split_sentences(content)
+                sentence_count = len(sentences)
+            except Exception:
+                # Simple sentence split if KSS fails
+                sentence_count = len(re.split(r'[.!?]+', content))
+        else:
+            sentence_count = len(re.split(r'[.!?]+', content))
+
         avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
         avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
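One caveat about the regex fallback: re.split(r'[.!?]+', ...) leaves a trailing empty string when the text ends with punctuation, so the fallback counts one sentence too many; filtering out empty pieces avoids that skew. A minimal sketch with an invented sample:

import re

content = "First sentence. Second sentence! Third?"

raw = re.split(r'[.!?]+', content)
print(len(raw))                             # 4 — includes the trailing empty string
print(len([s for s in raw if s.strip()]))   # 3 — matches the actual sentence count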
         is_korean = bool(re.search(r'[가-힣]', content))
 
         try:
+            # Simple pseudo part-of-speech analysis using KSS
+            tokens = tokenize_korean(content[:5000])  # truncate very long text before analysis
+
             if is_korean:
+                # Korean: estimate part of speech with simple pattern matching on word endings
+                pos_counts = {'명사/대명사': 0, '동사/형용사': 0, '부사/조사': 0, '기타': 0}
+
+                for token in tokens:
+                    if token.endswith(("다", "요", "까", "죠", "네", "군", "니다", "세요")):
+                        pos_counts['동사/형용사'] += 1
+                    elif token.endswith(("게", "히", "이", "지")):
+                        pos_counts['부사/조사'] += 1
+                    elif token.endswith(("은", "는", "이", "가", "을", "를", "에", "의")):
+                        pos_counts['부사/조사'] += 1
+                    elif len(token) > 1:
+                        pos_counts['명사/대명사'] += 1
+                    else:
+                        pos_counts['기타'] += 1
             else:
+                # English: simple pattern matching on suffixes
+                pos_counts = {
+                    '명사/대명사': len([t for t in tokens if not t.lower().endswith(('ly', 'ing', 'ed'))]),
+                    '동사': len([t for t in tokens if t.lower().endswith(('ing', 'ed', 's'))]),
+                    '부사/형용사': len([t for t in tokens if t.lower().endswith('ly')]),
+                    '기타': len([t for t in tokens if len(t) <= 2])
+                }
 
         # Visualize results
         pos_df = pd.DataFrame({