Update src/streamlit_app.py
src/streamlit_app.py  CHANGED  (+93 -160)
@@ -20,7 +20,6 @@ from dotenv import load_dotenv
 # The /tmp folder may exist but can have permission issues, so base paths on the current working directory instead
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
 DATA_DIR = os.path.join(CURRENT_DIR, "data")
-NLTK_DATA_DIR = os.path.join(DATA_DIR, "nltk_data")
 SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
 SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
@@ -35,60 +34,34 @@ def ensure_directory(directory):
 
 # Create all required directories
 ensure_directory(DATA_DIR)
-ensure_directory(NLTK_DATA_DIR)
 ensure_directory(SCHEDULED_NEWS_DIR)
 
-# NLTK setup
-import nltk
-nltk.data.path.append(NLTK_DATA_DIR)
-
-# Download required NLTK data (works around permission issues)
 try:
-    ...
-    try:
-        nltk.data.find('corpora/stopwords')
-    except LookupError:
-        nltk.download('stopwords', download_dir=NLTK_DATA_DIR)
-except Exception as e:
-    st.warning(f"NLTK 데이터 다운로드 중 오류 발생: {str(e)}. 기본 토크나이징 방식을 사용합니다.")
 
-# Korean tokenization
 def tokenize_korean(text):
     try:
-        # 1. Try transformers
-        try:
-            from transformers import AutoTokenizer
-            tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
-            return tokenizer.tokenize(text)
-        except (ImportError, Exception) as e:
-            st.debug(f"Transformers 토크나이저 로드 실패: {str(e)}")
-
-        # 2. Try soynlp
-        try:
-            from soynlp.tokenizer import LTokenizer
-            tokenizer = LTokenizer()
-            return tokenizer.tokenize(text)
-        except (ImportError, Exception) as e:
-            st.debug(f"soynlp 토크나이저 로드 실패: {str(e)}")
-
-        # 3. Try kss
-        try:
-            import kss
             tokens = []
             for sentence in kss.split_sentences(text):
-                ...
             return tokens
-        except (ImportError, Exception) as e:
-            st.debug(f"kss 토크나이저 로드 실패: {str(e)}")
     except Exception as e:
-        st.debug(f"
 
-    # Fall back to a simple regex-based tokenizer
     return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
 
 # Add word cloud (optional)
@@ -252,31 +225,43 @@ def get_article_content(url):
     except Exception as e:
         return f"오류 발생: {str(e)}"
 
-# Keyword analysis
 def analyze_keywords(text, top_n=10):
-    # Korean stopword list
     korean_stopwords = [
         '이', '그', '저', '것', '및', '등', '를', '을', '은', '에서', '의', '으로', '로',
-        '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지'
     ]
 
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
     if is_korean:
-        # Korean text
         tokens = tokenize_korean(text)
     else:
-        # For English and other languages, try NLTK
-        try:
-            from nltk.tokenize import word_tokenize
-            tokens = word_tokenize(text)
-        except Exception:
-            # Fall back to a simple tokenizer if NLTK fails
-            tokens = re.findall(r'\b\w+\b', text.lower())
 
-    # Stopword filtering
-    ...
 
     # Frequency counting
     from collections import Counter
@@ -294,38 +279,22 @@ def extract_keywords_for_wordcloud(text, top_n=50):
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
-    # Tokenization
-    if is_korean:
-        tokens = tokenize_korean(text.lower())
-    else:
-        # For English and other languages, try NLTK
-        try:
-            from nltk.tokenize import word_tokenize
-            tokens = word_tokenize(text.lower())
-        except Exception:
-            # Simple tokenization if that fails
-            tokens = text.lower().split()
 
     # Stopword setup
-    ...
-        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
-        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
-        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
-        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
-        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
-        'will', 'shall', 'can', 'may', 'must', 'ought'
-    }
 
     # Korean stopwords
-    ...
         '및', '등', '를', '이', '의', '가', '은', '는', '으로', '에서', '그', '더', '또는', '하는', '할', '하고',
         '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
         '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '도', '은', '을', '것', '들', '저', '제',
@@ -336,7 +305,9 @@ def extract_keywords_for_wordcloud(text, top_n=50):
         '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
         '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
     }
-    ...
 
     # Keep only tokens longer than one character that are not stopwords
     filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
@@ -718,7 +689,18 @@ elif menu == "기사 분석하기":
         # Compute text statistics
         word_count = len(re.findall(r'\b\w+\b', content))
         char_count = len(content)
-        ...
         avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
         avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
@@ -750,81 +732,32 @@ elif menu == "기사 분석하기":
         is_korean = bool(re.search(r'[가-힣]', content))
 
         try:
-            # Part-of-speech analysis
             if is_korean:
-                # Korean text
-                ...
-                pos_counts
-                ...
-                    elif token.endswith("게") or token.endswith("히"):
-                        pos_counts['부사'] += 1
-                    elif token.endswith("은") or token.endswith("는") or token.endswith("이") or token.endswith("가"):
-                        pos_counts['명사'] += 1
-                    else:
-                        if len(token) > 1:
-                            pos_counts['명사'] += 1
-                        else:
-                            pos_counts['기타'] += 1
-
-                except Exception:
-                    # Fall back to simple tokenization on failure
-                    tokens = tokenize_korean(content[:5000])
-                    pos_counts = {
-                        '명사류': len([t for t in tokens if len(t) > 1 and not any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])]),
-                        '기타': len([t for t in tokens if len(t) <= 1 or any(t.endswith(s) for s in ["다", "요", "게", "히", "은", "는"])])
-                    }
-            except Exception as e:
-                st.error(f"한국어 품사 분석 실패: {str(e)}")
-                pos_counts = {'데이터': len(content) // 10, '분석': len(content) // 15, '오류': len(content) // 20}
             else:
-                # English document
-                ...
-                        nltk.download('averaged_perceptron_tagger', download_dir=NLTK_DATA_DIR)
-                except Exception:
-                    pass
-
-                tokens = word_tokenize(content[:5000])
-                tagged = pos_tag(tokens)
-
-                # English POS mapping
-                pos_dict = {
-                    'NN': '명사', 'NNS': '명사', 'NNP': '고유명사', 'NNPS': '고유명사',
-                    'VB': '동사', 'VBD': '동사', 'VBG': '동사', 'VBN': '동사', 'VBP': '동사', 'VBZ': '동사',
-                    'JJ': '형용사', 'JJR': '형용사', 'JJS': '형용사',
-                    'RB': '부사', 'RBR': '부사', 'RBS': '부사'
-                }
-
-                pos_counts = {'명사': 0, '동사': 0, '형용사': 0, '부사': 0, '기타': 0}
-
-                for _, pos in tagged:
-                    if pos in pos_dict:
-                        pos_counts[pos_dict[pos]] += 1
-                    else:
-                        pos_counts['기타'] += 1
-        except Exception:
-            # Fall back to simple rules for estimating part of speech
-            tokens = re.findall(r'\b\w+\b', content.lower())
-            pos_counts = {
-                '명사': len([t for t in tokens if not t.endswith(('ly', 'ing', 'ed'))]),
-                '동사': len([t for t in tokens if t.endswith(('ing', 'ed', 's'))]),
-                '부사': len([t for t in tokens if t.endswith('ly')]),
-                '기타': len([t for t in tokens if len(t) <= 2])
-            }
 
         # Visualize results
         pos_df = pd.DataFrame({
 # The /tmp folder may exist but can have permission issues, so base paths on the current working directory instead
 CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
 DATA_DIR = os.path.join(CURRENT_DIR, "data")
 SAVED_ARTICLES_PATH = os.path.join(DATA_DIR, "saved_articles.json")
 SCHEDULED_NEWS_DIR = os.path.join(DATA_DIR, "scheduled_news")
 
 
 # Create all required directories
 ensure_directory(DATA_DIR)
 ensure_directory(SCHEDULED_NEWS_DIR)
 
+# KSS setup for Korean tokenization
 try:
+    import kss
+    kss_available = True
+except ImportError:
+    st.warning("KSS 라이브러리가 설치되어 있지 않습니다. 'pip install kss'로 설치하세요.")
+    kss_available = False
 
+# Korean tokenization function (uses KSS)
 def tokenize_korean(text):
     try:
+        if kss_available:
             tokens = []
+            # Split into sentences, then extract words from each sentence
             for sentence in kss.split_sentences(text):
+                # Add a regex pattern on top of basic whitespace tokenization for finer splitting
+                raw_tokens = sentence.split()
+                for token in raw_tokens:
+                    # Separate particles, punctuation, and other attached characters
+                    sub_tokens = re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', token)
+                    tokens.extend(sub_tokens)
             return tokens
     except Exception as e:
+        st.debug(f"KSS 토크나이징 실패: {str(e)}")
 
+    # Fall back to the basic regex tokenizer when KSS is unavailable or fails
     return re.findall(r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+', text)
 
 # Add word cloud (optional)
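For reference, a minimal standalone sketch of the same KSS-plus-regex tokenization outside Streamlit (the sample sentence is invented for illustration). Note that st.debug is not part of the Streamlit API, so the failure path here logs through the standard logging module instead.

import logging
import re

try:
    import kss
    kss_available = True
except ImportError:
    kss_available = False

def tokenize_korean_standalone(text):
    """KSS sentence split followed by a regex word split; regex-only fallback."""
    pattern = r'[가-힣]+|[a-zA-Z]+|[0-9]+|[^\s가-힣a-zA-Z0-9]+'
    try:
        if kss_available:
            tokens = []
            for sentence in kss.split_sentences(text):
                for token in sentence.split():
                    tokens.extend(re.findall(pattern, token))
            return tokens
    except Exception as exc:
        logging.warning("KSS tokenization failed: %s", exc)
    return re.findall(pattern, text)

print(tokenize_korean_standalone("오늘 날씨가 좋다. Streamlit 앱 테스트 중!"))
# → ['오늘', '날씨가', '좋다', '.', 'Streamlit', '앱', '테스트', '중', '!']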
 except Exception as e:
     return f"오류 발생: {str(e)}"
 
+# Keyword analysis using KSS
 def analyze_keywords(text, top_n=10):
+    # Korean stopword list (extended)
     korean_stopwords = [
         '이', '그', '저', '것', '및', '등', '를', '을', '은', '에서', '의', '으로', '로',
+        '에게', '뿐', '다', '는', '가', '이다', '에게서', '께', '께서', '부터', '까지',
+        '이런', '저런', '그런', '어떤', '무슨', '이것', '저것', '그것', '이번', '저번', '그번',
+        '이거', '저거', '그거', '하다', '되다', '있다', '없다', '같다', '보다', '이렇다', '그렇다',
+        '하는', '되는', '있는', '없는', '같은', '보는', '이런', '그런', '저런', '한다', '된다',
+        '있었다', '없었다', '같았다', '봤다', '또', '또한', '그리고', '하지만', '그러나', '그래서',
+        '때문에', '따라서', '하며', '되며', '있으며', '없으며', '같으며', '보며', '하고', '되고',
+        '있고', '없고', '같고', '보고', '통해', '위해', '도', '중', '후'
+    ]
+
+    # English stopword list
+    english_stopwords = [
+        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+        'will', 'shall', 'can', 'may', 'must', 'ought'
     ]
 
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
     if is_korean:
+        # Korean text: use the KSS-based tokenizer
         tokens = tokenize_korean(text)
     else:
+        # English and other languages: simple regex tokenization
+        tokens = re.findall(r'\b\w+\b', text.lower())
 
+    # Stopword filtering (apply a different stopword list per language)
+    stopwords = korean_stopwords if is_korean else english_stopwords
+    tokens = [word for word in tokens if len(word) > 1 and word.lower() not in stopwords]
 
     # Frequency counting
     from collections import Counter
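As a quick illustration of the filter-then-count step that follows, here is a hedged sketch with invented sample text and a tiny stopword list (the function above uses its own, much longer lists):

import re
from collections import Counter

text = "the cat sat on the mat and the cat slept"
stopwords = ['the', 'and', 'on']

tokens = re.findall(r'\b\w+\b', text.lower())
tokens = [w for w in tokens if len(w) > 1 and w.lower() not in stopwords]

# Counter.most_common() yields the top (keyword, frequency) pairs
print(Counter(tokens).most_common(3))
# [('cat', 2), ('sat', 1), ('mat', 1)]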
     # Language detection (simple check for Hangul)
     is_korean = bool(re.search(r'[가-힣]', text))
 
+    # Tokenization (using KSS)
+    tokens = tokenize_korean(text.lower())
 
     # Stopword setup
+    # English stopword list
+    english_stopwords = {
+        'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
+        'when', 'where', 'how', 'who', 'which', 'this', 'that', 'these', 'those',
+        'then', 'just', 'so', 'than', 'such', 'both', 'through', 'about', 'for',
+        'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+        'having', 'do', 'does', 'did', 'doing', 'would', 'should', 'could', 'might',
+        'will', 'shall', 'can', 'may', 'must', 'ought'
+    }
 
     # Korean stopwords
+    korean_stopwords = {
         '및', '등', '를', '이', '의', '가', '은', '는', '으로', '에서', '그', '더', '또는', '하는', '할', '하고',
         '있다', '이다', '위해', '것이다', '것은', '대한', '때문', '그리고', '하지만', '그러나', '그래서',
         '입니다', '합니다', '습니다', '요', '죠', '고', '과', '와', '도', '은', '을', '것', '들', '저', '제',
         ...
         '기자', '뉴스', '사진', '연합뉴스', '뉴시스', '제공', '무단', '전재', '재배포', '금지', '앵커', '멘트',
         '일보', '데일리', '경제', '사회', '정치', '세계', '과학', '아이티', '닷컴', '씨넷', '블로터', '전자신문'
     }
+
+    # Select stopwords by language
+    stop_words = korean_stopwords if is_korean else english_stopwords
 
     # Keep only tokens longer than one character that are not stopwords
     filtered_tokens = [word for word in tokens if len(word) > 1 and word not in stop_words]
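A small design note: analyze_keywords keeps its stopwords in plain lists, while this function uses sets; set membership checks are constant-time, which matters when filtering thousands of tokens from a long article. A minimal sketch with invented data:

stopwords_list = ['및', '등', '그리고']          # illustrative subset only
stopword_set = set(stopwords_list)               # one-time conversion to a set

tokens = ['경제', '및', '사회', '뉴스', '그리고', '분석']
filtered = [t for t in tokens if len(t) > 1 and t not in stopword_set]
print(filtered)   # ['경제', '사회', '뉴스', '분석']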
         # Compute text statistics
         word_count = len(re.findall(r'\b\w+\b', content))
         char_count = len(content)
+
+        # Split sentences using KSS
+        if kss_available:
+            try:
+                sentences = kss.split_sentences(content)
+                sentence_count = len(sentences)
+            except Exception:
+                # Simple sentence split if KSS fails
+                sentence_count = len(re.split(r'[.!?]+', content))
+        else:
+            sentence_count = len(re.split(r'[.!?]+', content))
+
         avg_word_length = sum(len(word) for word in re.findall(r'\b\w+\b', content)) / word_count if word_count > 0 else 0
         avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
 
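One caveat about the regex fallback: re.split(r'[.!?]+', ...) leaves a trailing empty string when the text ends with punctuation, so the fallback counts one sentence too many; filtering out empty pieces avoids that skew. A minimal sketch with an invented sample:

import re

content = "First sentence. Second sentence! Third?"

raw = re.split(r'[.!?]+', content)
print(len(raw))                             # 4 — includes the trailing empty string
print(len([s for s in raw if s.strip()]))   # 3 — matches the actual sentence count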
         is_korean = bool(re.search(r'[가-힣]', content))
 
         try:
+            # Simple pseudo part-of-speech analysis using KSS
+            tokens = tokenize_korean(content[:5000])  # truncate very long text before analysis
+
             if is_korean:
+                # Korean: estimate part of speech with simple pattern matching on word endings
+                pos_counts = {'명사/대명사': 0, '동사/형용사': 0, '부사/조사': 0, '기타': 0}
+
+                for token in tokens:
+                    if token.endswith(("다", "요", "까", "죠", "네", "군", "니다", "세요")):
+                        pos_counts['동사/형용사'] += 1
+                    elif token.endswith(("게", "히", "이", "지")):
+                        pos_counts['부사/조사'] += 1
+                    elif token.endswith(("은", "는", "이", "가", "을", "를", "에", "의")):
+                        pos_counts['부사/조사'] += 1
+                    elif len(token) > 1:
+                        pos_counts['명사/대명사'] += 1
+                    else:
+                        pos_counts['기타'] += 1
             else:
+                # English: simple pattern matching on suffixes
+                pos_counts = {
+                    '명사/대명사': len([t for t in tokens if not t.lower().endswith(('ly', 'ing', 'ed'))]),
+                    '동사': len([t for t in tokens if t.lower().endswith(('ing', 'ed', 's'))]),
+                    '부사/형용사': len([t for t in tokens if t.lower().endswith('ly')]),
+                    '기타': len([t for t in tokens if len(t) <= 2])
+                }
 
         # Visualize results
         pos_df = pd.DataFrame({