priyanshu23456 committed
Commit 756c987 · verified · 1 Parent(s): 5113509

Update app.py

Files changed (1):
  1. app.py +95 -4
app.py CHANGED
@@ -12,6 +12,7 @@ import faiss
 import numpy as np
 import tempfile
 from PIL import Image
+from transformers import BitsAndBytesConfig
 import logging

 # Set up logging
@@ -50,15 +51,24 @@ def initialize_models():
             "question-answering",
             model="distilbert-base-cased-distilled-squad",
             tokenizer="distilbert-base-cased",
-            device=0 if device == "cuda" else -1
+            device=-1  # Force CPU for free tier
         )

         logger.info("Loading language model...")
-        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+        model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # Replace distilgpt2
+        # Configure 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
         model = AutoModelForCausalLM.from_pretrained(
-            "distilgpt2",
+            model_name,
+            quantization_config=quantization_config,  # Use 4-bit
             device_map="auto",
-            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+            torch_dtype=torch.float16  # Optimize for CPU fallback
         )

         if tokenizer.pad_token is None:
@@ -70,6 +80,87 @@ def initialize_models():
         logger.error(f"Error initializing models: {str(e)}")
         raise

+# Generation-based answering
+def answer_with_generation(index, embeddings, chunks, question):
+    try:
+        logger.info(f"Answering with generation model: '{question}'")
+        global tokenizer, model
+
+        if tokenizer is None or model is None:
+            logger.info("Generation models not initialized, creating now...")
+            model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True
+            )
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                quantization_config=quantization_config,
+                device_map="auto",
+                torch_dtype=torch.float16
+            )
+
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+            model.config.pad_token_id = model.config.eos_token_id
+
+        # Get embeddings for question
+        q_embedding = embedder.encode([question])
+
+        # Find relevant chunks
+        _, top_k_indices = index.search(q_embedding, k=3)
+        relevant_chunks = [chunks[i] for i in top_k_indices[0]]
+        context = " ".join(relevant_chunks)
+
+        # Limit context size for efficiency
+        if len(context) > 2000:  # Reduced for Qwen's efficiency
+            context = context[:2000]
+
+        # Create prompt (optimized for Qwen's instruction format)
+        prompt = f"""<|im_start|>system
+You are a helpful assistant answering questions based on provided PDF content. Use the information below to give a clear, concise, and accurate answer. Avoid speculation and focus on the context.
+<|im_end|>
+<|im_start|>user
+**Context**: {context}
+
+**Question**: {question}
+
+**Instruction**: Provide a detailed and accurate answer based on the context. If the context doesn't contain enough information, say so clearly. <|im_end|>"""
+
+        # Handle inputs
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)  # Increased for Qwen
+
+        # Move inputs to CPU (free tier)
+        inputs = {k: v.to('cpu') for k, v in inputs.items()}
+
+        # Generate answer
+        output = model.generate(
+            **inputs,
+            max_new_tokens=300,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True,
+            num_beams=2,  # Reduced for speed
+            no_repeat_ngram_size=2
+        )
+
+        # Decode and format answer
+        answer = tokenizer.decode(output[0], skip_special_tokens=True)
+        # Extract the answer after the instruction
+        if "<|im_end|>" in answer:
+            answer = answer.split("<|im_end|>")[1].strip()
+        elif "Instruction" in answer:
+            answer = answer.split("Instruction")[1].strip()
+
+        logger.info(f"Generation answer: '{answer[:50]}...' (length: {len(answer)})")
+        return answer.strip()
+    except Exception as e:
+        logger.error(f"Generation error: {str(e)}")
+        return "I couldn't generate a good answer based on the PDF content."
+
 # Cleanup function for temporary files
 def cleanup_temp_files(filepath):
     try:
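A note on the quantized load above: bitsandbytes 4-bit loading generally expects a CUDA GPU, while the QA pipeline in this commit is pinned to CPU (device=-1) for the free tier. Below is a minimal sketch, illustrative only and not part of the committed code, that applies the 4-bit config only when a GPU is actually available and otherwise falls back to a plain CPU load:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if torch.cuda.is_available():
    # GPU path: NF4 4-bit weights with fp16 compute, matching the commit.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=bnb_config, device_map="auto"
    )
else:
    # CPU-only fallback: skip quantization; float32 is the safest dtype on CPU.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32)

The same guard could back the lazy re-initialization inside answer_with_generation(), which currently repeats the unguarded 4-bit load.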
 
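answer_with_generation() assumes a prebuilt FAISS index over the PDF chunks plus a module-level embedder. As a rough sketch of how those inputs are typically produced with sentence-transformers and faiss (the embedding model name and the chunk texts below are placeholders, not taken from app.py):

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Assumed embedding model; app.py may load a different one as `embedder`.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def build_faiss_index(chunks):
    # Encode each text chunk into a dense float32 vector.
    embeddings = np.asarray(embedder.encode(chunks), dtype="float32")
    # Exact L2 search is fine at single-PDF scale.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings

# Hypothetical usage with the function added in this commit.
chunks = ["First chunk of extracted PDF text...", "Second chunk..."]
index, embeddings = build_faiss_index(chunks)
print(answer_with_generation(index, embeddings, chunks, "What is the PDF about?"))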
 
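Qwen instruct checkpoints ship with a chat template, so the hand-built <|im_start|> prompt and the post-hoc string splitting could instead use tokenizer.apply_chat_template and then slice off the prompt tokens. A sketch of that alternative, not part of this commit:

def generate_with_chat_template(model, tokenizer, context, question):
    # Let the tokenizer render Qwen's ChatML format from structured messages.
    messages = [
        {"role": "system", "content": "Answer questions using only the provided PDF context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    output = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.7, top_p=0.9)
    # Decode only the newly generated tokens, so no string splitting is needed.
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()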