Spaces:

TeamSAS
/

UB_VSA

Running

App Files Community

sivakum4 committed on Apr 29

Commit

7017d8a

1 Parent(s): 2e0cda6

Feat: HF Inference API

Browse files

Files changed (1) hide show

buffalo_rag/model/rag.py +53 -67

buffalo_rag/model/rag.py CHANGED Viewed

@@ -2,49 +2,30 @@ import os
 import json
 from typing import List, Dict, Any, Optional, Tuple
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from buffalo_rag.vector_store.db import VectorStore
 class BuffaloRAG:
-    def __init__(self,
-                 model_name: str = "Qwen/Qwen1.5-1.8B-Chat",
-                 vector_store: Optional[VectorStore] = None):
         self.vector_store = vector_store or VectorStore()
-        try:
-            # Load model and tokenizer
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,
-                device_map="auto",
-                trust_remote_code=True,
-                low_cpu_mem_usage=True
-            )
-            # More conservative generation parameters for stability
-            self.pipe = pipeline(
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                max_new_tokens=256,  # Shorter outputs for stability
-                do_sample=False,     # Use greedy decoding instead of sampling
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-        except Exception as e:
-            print(f"Error loading main model: {str(e)}")
-            print("Falling back to smaller model...")
-            # Fallback to a smaller, more stable model
-            self.tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-            self.model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-            self.pipe = pipeline(
-                "text-generation",
-                model=self.model,
-                tokenizer=self.tokenizer,
-                max_new_tokens=256
             )
     def retrieve(self,
                 query: str,
@@ -54,46 +35,51 @@ class BuffaloRAG:
         return self.vector_store.hybrid_search(query, k=k, filter_categories=filter_categories)
     def format_context(self, results: List[Dict[str, Any]]) -> str:
-        """Format retrieved results into context."""
-        context = ""
-        for i, result in enumerate(results):
-            chunk = result['chunk']
-            context += f"Source {i+1}: {chunk['title']}\n"
-            context += f"URL: {chunk['url']}\n"
-            context += f"Content: {chunk['content'][:500]}...\n\n"
-        return context
     def generate_response(self, query: str, context: str) -> str:
         """Generate response using the language model with error handling."""
         prompt = f"""You are a friendly and professional counselor for international students at the University at Buffalo. Respond to the student's query in a supportive, detailed, and well-structured manner.
-For your responses:
-1. Address the student respectfully and empathetically
-2. Provide clear, accurate information with specific details and steps when applicable
-3. Organize your answer with appropriate headings, bullet points, or numbered lists when helpful
-4. If the student's question is unclear or lacks essential details, ask 1-2 specific clarifying questions to better understand their situation
-5. Include relevant deadlines, contacts, or resources when appropriate
-6. Conclude with a brief encouraging statement
-7. Only answer related to international students at UB, if it's not related to international students at UB, just say "I'm sorry, I don't have information about that."
-8. Do not entertain any questions that are not related to students at UB.
-Question: {query}
-Relevant Information:
-{context}
-Answer:"""
         try:
-            # Generate response
-            response = self.pipe(prompt)[0]['generated_text']
-            # Extract only the generated part (after the prompt)
-            generated = response[len(prompt):].strip()
-            return generated
         except Exception as e:
             print(f"Error during generation: {str(e)}")
             # Fallback response

 import json
 from typing import List, Dict, Any, Optional, Tuple
+# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+from huggingface_hub import InferenceClient
 from buffalo_rag.vector_store.db import VectorStore
 class BuffaloRAG:
+    def __init__(
+        self,
+        model_name: str = "meta-llama/Llama-2-7b-chat-hf",
+        vector_store: Optional[VectorStore] = None
+    ):
+        # 1. Vector store
         self.vector_store = vector_store or VectorStore()
+        # 2. Hugging Face Inference client
+        hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
+        if not hf_token:
+            raise ValueError("Please set HUGGINGFACEHUB_API_TOKEN in your environment.")
+        self.client = InferenceClient(
+                provider="cerebras",
+                api_key=hf_token,
             )
     def retrieve(self,
                 query: str,
         return self.vector_store.hybrid_search(query, k=k, filter_categories=filter_categories)
     def format_context(self, results: List[Dict[str, Any]]) -> str:
+        """Concatenate retrieved passages into context."""
+        ctx = []
+        for i, r in enumerate(results, start=1):
+            c = r["chunk"]
+            ctx.append(
+                f"Source {i}: {c['title']}\n"
+                f"URL: {c['url']}\n"
+                f"Content: {c['content'][:500]}...\n"
+            )
+        return "\n".join(ctx)
     def generate_response(self, query: str, context: str) -> str:
         """Generate response using the language model with error handling."""
         prompt = f"""You are a friendly and professional counselor for international students at the University at Buffalo. Respond to the student's query in a supportive, detailed, and well-structured manner.
+                For your responses:
+                1. Address the student respectfully and empathetically
+                2. Provide clear, accurate information with specific details and steps when applicable
+                3. Organize your answer with appropriate headings, bullet points, or numbered lists when helpful
+                4. If the student's question is unclear or lacks essential details, ask 1-2 specific clarifying questions to better understand their situation
+                5. Include relevant deadlines, contacts, or resources when appropriate
+                6. Conclude with a brief encouraging statement
+                7. Only answer related to international students at UB, if it's not related to international students at UB, just say "I'm sorry, I don't have information about that."
+                8. Do not entertain any questions that are not related to students at UB.
+                Question: {query}
+                Relevant Information:
+                {context}
+                Answer:"""
         try:
+            completion = self.client.chat.completions.create(
+                model="meta-llama/Llama-3.3-70B-Instruct",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": prompt
+                    }
+                ],
+                max_tokens=512,
+            )
+            return completion.choices[0].message.content
         except Exception as e:
             print(f"Error during generation: {str(e)}")
             # Fallback response