Spaces:

Ocillus
/

Arcana

Sleeping

App Files Files Community

Ocillus committed on Jul 20, 2024

Commit

b6d4bd1

verified ·

1 Parent(s): f7f0bde

Update fiber.py

Browse files

Files changed (1) hide show

fiber.py +20 -18

fiber.py CHANGED Viewed

@@ -2,6 +2,7 @@ import re
 from typing import List, Dict
 from datetime import datetime
 from collections import Counter
 class FiberDBMS:
     def __init__(self):
@@ -27,16 +28,13 @@ class FiberDBMS:
             self.content_index[word].append(entry_index)
     def load_or_create(self, filename: str) -> None:
-        """Load the database from a file or create a new one if the file does not exist."""
         try:
             self.load_from_file(filename)
             print(f"Loaded {len(self.database)} entries from {filename}.")
         except FileNotFoundError:
             print(f"{filename} not found. Creating a new database.")
-            # Optionally, you can add default entries here if needed.
     def query(self, query: str, top_n: int) -> List[Dict[str, str]]:
-        """Query the database for entries matching the query."""
         query_words = self._tokenize(query)
         matching_indices = set()
         for word in query_words:
@@ -64,7 +62,6 @@ class FiberDBMS:
         return results
     def save(self, filename: str) -> None:
-        """Save the current database to a file."""
         with open(filename, 'w', encoding='utf-8') as f:
             for entry in self.database:
                 line = f"{entry['name']}\t{entry['timestamp']}\t{entry['content']}\t{entry['tags']}\n"
@@ -72,39 +69,42 @@ class FiberDBMS:
         print(f"Updated database saved to {filename}.")
     def _rate_result(self, entry: Dict[str, str], query_words: List[str]) -> float:
-        content_lower = entry['content'].lower()
-        name_lower = entry['name'].lower()
         tags = entry['tags'].split(',')
-        unique_matches = sum(1 for word in set(query_words) if word in content_lower)
-        content_score = sum(content_lower.count(word) for word in query_words)
-        name_score = sum(3 for word in query_words if word in name_lower)
-        phrase_score = 5 if ' '.join(query_words) in content_lower else 0
         unique_match_score = unique_matches * 10
-        # Include all tags in weighting
-        tag_score = sum(2 for tag in tags if any(word in tag.lower() for word in query_words))
-        length_penalty = min(1, len(content_lower) / 100)
         return (content_score + name_score + phrase_score + unique_match_score + tag_score) * length_penalty
     def _tokenize(self, text: str) -> List[str]:
-        return re.findall(r'\w+', text.lower())
     def _get_snippet(self, content: str, query_words: List[str], max_length: int = 200) -> str:
-        lower_content = content.lower()
         best_start = 0
         max_score = 0
-        for i in range(len(lower_content) - max_length):
-            snippet = lower_content[i:i+max_length]
             score = sum(snippet.count(word) * (len(word) ** 0.5) for word in query_words)
             if score > max_score:
                 max_score = score
                 best_start = i
-        snippet = content[best_start:best_start+max_length]
         return snippet + "..." if len(content) > max_length else snippet
     def _update_tags(self, original_tags: str, content: str, query_words: List[str]) -> str:
@@ -165,3 +165,5 @@ def main():
     # Save updated database with new tags
     dbms.save("Celsiaaa.txt")

 from typing import List, Dict
 from datetime import datetime
 from collections import Counter
+import jieba  # For Chinese word segmentation
 class FiberDBMS:
     def __init__(self):
             self.content_index[word].append(entry_index)
     def load_or_create(self, filename: str) -> None:
         """Load the database from ``filename``, tolerating a missing file.

         When the file does not exist, only a notice is printed; no file is
         created at this point.

         Args:
             filename: Path of the database file to read.
         """
         try:
             self.load_from_file(filename)
         except FileNotFoundError:
             print(f"{filename} not found. Creating a new database.")
         else:
             # Report the entry count only after a successful load.
             entry_count = len(self.database)
             print(f"Loaded {entry_count} entries from {filename}.")
     def query(self, query: str, top_n: int) -> List[Dict[str, str]]:
         query_words = self._tokenize(query)
         matching_indices = set()
         for word in query_words:
         return results
     def save(self, filename: str) -> None:
         with open(filename, 'w', encoding='utf-8') as f:
             for entry in self.database:
                 line = f"{entry['name']}\t{entry['timestamp']}\t{entry['content']}\t{entry['tags']}\n"
         print(f"Updated database saved to {filename}.")
     def _rate_result(self, entry: Dict[str, str], query_words: List[str]) -> float:
+        content_tokens = self._tokenize(entry['content'])
+        name_tokens = self._tokenize(entry['name'])
         tags = entry['tags'].split(',')
+        unique_matches = sum(1 for word in set(query_words) if word in content_tokens)
+        content_score = sum(content_tokens.count(word) for word in query_words)
+        name_score = sum(3 for word in query_words if word in name_tokens)
+        phrase_score = 5 if all(word in content_tokens for word in query_words) else 0
         unique_match_score = unique_matches * 10
+        tag_score = sum(2 for tag in tags if any(word in self._tokenize(tag) for word in query_words))
+        length_penalty = min(1, len(content_tokens) / 100)
         return (content_score + name_score + phrase_score + unique_match_score + tag_score) * length_penalty
     def _tokenize(self, text: str) -> List[str]:
+        # Check if the text contains Chinese characters
+        if re.search(r'[\u4e00-\u9fff]', text):
+            return list(jieba.cut(text))
+        else:
+            return re.findall(r'\w+', text.lower())
     def _get_snippet(self, content: str, query_words: List[str], max_length: int = 200) -> str:
+        content_tokens = self._tokenize(content)
         best_start = 0
         max_score = 0
+        for i in range(len(content_tokens) - max_length):
+            snippet = content_tokens[i:i+max_length]
             score = sum(snippet.count(word) * (len(word) ** 0.5) for word in query_words)
             if score > max_score:
                 max_score = score
                 best_start = i
+        snippet = ''.join(content_tokens[best_start:best_start+max_length])
         return snippet + "..." if len(content) > max_length else snippet
     def _update_tags(self, original_tags: str, content: str, query_words: List[str]) -> str:
     # Save updated database with new tags
     dbms.save("Celsiaaa.txt")
+if __name__ == "__main__":  # call main() only when run as a script, not when imported
+    main()