Spaces:
Sleeping
Sleeping
Delete whalecore
Browse files- whalecore/__init__.py +0 -0
- whalecore/agents.py +0 -36
- whalecore/parser.py +0 -49
- whalecore/rag.py +0 -37
whalecore/__init__.py
DELETED
File without changes
|
whalecore/agents.py
DELETED
@@ -1,36 +0,0 @@
|
|
1 |
-
import yaml
|
2 |
-
|
3 |
-
class Agent:
    """A named chat agent with a persona and standing instructions."""

    def __init__(self, name, persona, instructions):
        self.name = name                  # display name used in replies
        self.persona = persona            # free-text persona description
        self.instructions = instructions  # standing instructions echoed in replies

    def chat(self, message):
        """Return a canned reply echoing up to 260 chars of *message*.

        Placeholder logic — replace with a real LLM call later.
        """
        # Only append an ellipsis when the message was actually truncated;
        # the original added "..." unconditionally, even to short messages.
        snippet = message[:260]
        if len(message) > 260:
            snippet += "..."
        return f"🧠 {self.name} says:\n{self.instructions}\n\n{self.persona}\n\nYou said: {snippet}"
|
12 |
-
|
13 |
-
def load_agents(config_path="config.yaml"):
    """Load Agent instances from a YAML config file.

    The file must contain a top-level ``agents:`` list whose entries each
    provide ``name``, ``persona`` and ``instructions`` keys.

    Args:
        config_path: path to the YAML configuration file.

    Returns:
        A list of Agent objects, one per entry under ``agents``.

    Raises:
        ValueError: if the YAML is not a mapping or lacks an 'agents' key.
        KeyError: if an agent entry is missing a required field.
    """
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    # Validate with explicit raises instead of `assert`, which is silently
    # stripped when Python runs with -O and would skip validation entirely.
    if not isinstance(config, dict):
        raise ValueError("YAML must contain a top-level 'agents:' key")
    if 'agents' not in config:
        raise ValueError("Missing 'agents' key in YAML file")

    print("🧠 YAML loaded successfully:", config)

    return [
        Agent(
            name=agent_conf['name'],
            persona=agent_conf['persona'],
            instructions=agent_conf['instructions'],
        )
        for agent_conf in config['agents']
    ]
|
31 |
-
|
32 |
-
def run_agents_on_text(agent_list, text):
    """Fan *text* out to every agent and collect the replies.

    Args:
        agent_list: iterable of Agent-like objects (need ``.name`` and ``.chat``).
        text: the message passed verbatim to each agent.

    Returns:
        A dict mapping each agent's name to its chat reply.
    """
    return {agent.name: agent.chat(text) for agent in agent_list}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
whalecore/parser.py
DELETED
@@ -1,49 +0,0 @@
|
|
1 |
-
import os
import PyPDF2
import whisper
from pydub import AudioSegment
from sentence_transformers import SentenceTransformer

import warnings
# Silence a noisy transformers/tokenizers deprecation notice that would
# otherwise be printed whenever the embedding model is loaded.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="`clean_up_tokenization_spaces` was not set.*"
)
# Shared sentence-embedding model, loaded once at import time (used by
# chunk_and_embed below).
# NOTE(review): `os` and `AudioSegment` are imported but not referenced in
# the code visible here — confirm before removing.
model = SentenceTransformer('all-MiniLM-L6-v2')
|
14 |
-
|
15 |
-
def parse_pdf(filepath):
    """Extract text from every page of a PDF file.

    Args:
        filepath: path to a .pdf file.

    Returns:
        The concatenated page texts, each page followed by a newline.
    """
    with open(filepath, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # extract_text() returns None for pages with no extractable text
        # (e.g. scanned images); the original crashed with a TypeError there.
        # "".join also avoids the quadratic `text +=` in a loop.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
|
22 |
-
|
23 |
-
def parse_audio(filepath):
    """Transcribe an audio file to text with OpenAI Whisper.

    The "base" Whisper model is loaded lazily on first use and cached on the
    function object — the original reloaded it on every call, which is very
    slow. This also stops the local name from shadowing the module-level
    sentence-transformer ``model``.

    Args:
        filepath: path to an audio file Whisper can read (.mp3/.wav/.m4a…).

    Returns:
        The transcribed text.
    """
    asr = getattr(parse_audio, "_model", None)
    if asr is None:
        asr = parse_audio._model = whisper.load_model("base")
    result = asr.transcribe(filepath)
    return result['text']
|
27 |
-
|
28 |
-
def parse_text(filepath):
    """Read a text file and return its full contents.

    UTF-8 is requested explicitly; the original relied on the platform
    default encoding, which breaks on non-UTF-8 locales (e.g. cp1252 on
    Windows) for files written elsewhere in this pipeline.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()
|
31 |
-
|
32 |
-
def parse_file(filepath):
    """Dispatch to the right parser based on the file extension.

    Extension matching is case-insensitive ("REPORT.PDF" works); the
    original matched only lowercase extensions.

    Args:
        filepath: path to a .pdf, .mp3/.wav/.m4a, or .txt file.

    Returns:
        The extracted text.

    Raises:
        ValueError: for unsupported file types.
    """
    lowered = filepath.lower()
    if lowered.endswith('.pdf'):
        return parse_pdf(filepath)
    elif lowered.endswith(('.mp3', '.wav', '.m4a')):
        return parse_audio(filepath)
    elif lowered.endswith('.txt'):
        return parse_text(filepath)
    else:
        raise ValueError(f"Unsupported file type: {filepath}")
|
41 |
-
|
42 |
-
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Returns an empty list for empty or whitespace-only input.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[start:start + chunk_size]))
    return chunks
|
45 |
-
|
46 |
-
def chunk_and_embed(text):
    """Chunk *text* and pair each chunk with its sentence embedding.

    Returns:
        A list of ``(chunk, embedding)`` tuples; embeddings are plain
        Python lists (BSON/JSON friendly).
    """
    pieces = chunk_text(text)
    vectors = model.encode(pieces).tolist()
    return list(zip(pieces, vectors))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
whalecore/rag.py
DELETED
@@ -1,37 +0,0 @@
|
|
1 |
-
from sentence_transformers import SentenceTransformer
from pymongo import MongoClient
import numpy as np

# Embedding model shared by every function in this module.
model = SentenceTransformer('all-MiniLM-L6-v2')
# NOTE(review): connecting at import time means merely importing this module
# requires a reachable MongoDB on the default localhost:27017 — consider a
# lazy connection. `np` is imported but unused in the code visible here.
client = MongoClient()
db = client['huggingwhale']
collection = db['docs']
|
9 |
-
|
10 |
-
def chunk_text(text, chunk_size=300):
    """Break *text* into chunks of up to *chunk_size* whitespace-separated words."""
    words = text.split()
    offsets = range(0, len(words), chunk_size)
    return [' '.join(words[i:i + chunk_size]) for i in offsets]
|
13 |
-
|
14 |
-
def embed_chunks(chunks):
    """Embed each chunk with the module-level model.

    Returns embeddings as plain Python lists so they are BSON-serializable.
    """
    vectors = model.encode(chunks)
    return vectors.tolist()
|
16 |
-
|
17 |
-
def store_embeddings(chunks, embeddings):
    """Persist (chunk, embedding) pairs into the MongoDB ``docs`` collection.

    Args:
        chunks: list of text chunks.
        embeddings: parallel list of embedding vectors (plain lists).
    """
    docs = [
        {"chunk": chunk, "embedding": emb}
        for chunk, emb in zip(chunks, embeddings)
    ]
    # pymongo's insert_many raises InvalidOperation on an empty list;
    # storing nothing (e.g. an empty source document) should be a no-op.
    if docs:
        collection.insert_many(docs)
|
23 |
-
|
24 |
-
def query_rag(question, top_k=3):
    """Return the *top_k* stored chunks most similar to *question*.

    Uses MongoDB Atlas Vector Search ($vectorSearch) over the ``embedding``
    field of the ``docs`` collection.

    Args:
        question: free-text query.
        top_k: number of chunks to return.

    Returns:
        A list of chunk strings, most similar first.
    """
    # model.encode returns a numpy array; BSON cannot encode ndarrays, so
    # convert to a plain list — the writers in this module store .tolist()
    # vectors for the same reason.
    question_vec = model.encode([question])[0].tolist()
    results = collection.aggregate([
        {
            "$vectorSearch": {
                "index": "default",
                "path": "embedding",
                "queryVector": question_vec,
                "numCandidates": 100,
                "limit": top_k
            }
        }
    ])
    return [doc['chunk'] for doc in results]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|