gaur3009 committed
Commit b2ecc50 · verified · 1 Parent(s): 05edc93

Update app.py

Files changed (1)
  1. app.py +60 -21
app.py CHANGED
@@ -5,6 +5,7 @@ import PyPDF2
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
+import json
 
 class PDFAnalyzer:
     def __init__(self):
@@ -19,9 +20,17 @@ class PDFAnalyzer:
             self.text_chunks = self._chunk_text(text)
             self.embeddings = self.model.encode(self.text_chunks)
             self.active_doc = os.path.basename(filepath)
-            return f"✅ Loaded {self.active_doc}"
+            return json.dumps({
+                "status": 200,
+                "message": f"Document {self.active_doc} processed successfully",
+                "document_id": hash(self.active_doc)
+            })
         except Exception as e:
-            return f"❌ Error: {str(e)}"
+            return json.dumps({
+                "status": 500,
+                "error": str(e),
+                "message": "Document processing failed"
+            })
 
     def _extract_text(self, filepath):
         with open(filepath, 'rb') as f:
@@ -31,24 +40,45 @@ class PDFAnalyzer:
         return [text[i:i+500] for i in range(0, len(text), 500)]
 
     def query(self, question):
-        if not self.active_doc:
-            return "Please upload a PDF document first"
+        if not self.active_doc:
+            return json.dumps({
+                "status": 400,
+                "message": "No document uploaded",
+                "results": []
+            })
 
         ques_emb = self.model.encode(question)
-        best_idx = np.argmax(cosine_similarity([ques_emb], self.embeddings)[0])
+        similarities = cosine_similarity([ques_emb], self.embeddings)[0]
+        best_idx = np.argmax(similarities)
+        confidence = float(similarities[best_idx])
+
         full_answer = self.text_chunks[best_idx]
+        sentences = re.split(r'(?<=[.!?]) +', full_answer)
 
-        # Extract 100-word precise answer with context
-        words = full_answer.split()
+        # Generate API-like response
+        return json.dumps({
+            "status": 200,
+            "message": "Success",
+            "results": [{
+                "text": self._format_answer(full_answer, question),
+                "confidence": confidence,
+                "document_id": hash(self.active_doc),
+                "metadata": {
+                    "chunk_index": best_idx,
+                    "document": self.active_doc
+                }
+            }]
+        })
+
+    def _format_answer(self, text, question):
+        # Extract focused answer with 100-word context
+        sentences = re.split(r'(?<=[.!?]) +', text)
         question_words = set(question.lower().split())
 
-        # Find the most relevant sentence
-        sentences = re.split(r'(?<=[.!?]) +', full_answer)
         best_sentence = max(sentences,
-            key=lambda s: len(set(s.lower().split()) & question_words),
-            default="")
+            key=lambda s: len(set(s.lower().split()) & question_words),
+            default="")
 
-        # Get 50 words before and after the best sentence
         all_words = ' '.join(sentences).split()
         try:
             start = max(0, all_words.index(best_sentence.split()[0]) - 50)
@@ -57,35 +87,44 @@ class PDFAnalyzer:
             start = 0
             end = 100
 
-        precise_answer = ' '.join(all_words[start:end]) + ("..." if end < len(all_words) else "")
-
-        return precise_answer
+        return ' '.join(all_words[start:end]) + ("..." if end < len(all_words) else "")
 
 def create_app():
     analyzer = PDFAnalyzer()
 
+    def format_response(response):
+        try:
+            data = json.loads(response)
+            if data['status'] != 200:
+                return f"Error: {data.get('message', 'Unknown error')}"
+
+            result = data['results'][0]
+            return f"**Answer** ({result['confidence']:.2f} confidence):\n{result['text']}"
+        except:
+            return "Error processing response"
+
     with gr.Blocks(theme=gr.themes.Soft()) as app:
-        gr.Markdown("# 📑 PDF QA Assistant")
+        gr.Markdown("# 📑 PDF QA Assistant (Cohere-style API)")
 
         with gr.Row():
             with gr.Column(scale=1):
                 pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
-                status = gr.Markdown("**Status:** Waiting for PDF upload")
+                status = gr.Markdown("**Status:** Idle")
                 gr.Button("Process PDF").click(
-                    lambda f: analyzer.process_pdf(f.name) if f else "❌ Invalid file",
+                    lambda f: analyzer.process_pdf(f.name) if f else json.dumps({"status": 400, "error": "No file"}),
                     inputs=pdf_upload,
                     outputs=status
                 )
 
             with gr.Column(scale=2):
                 chatbot = gr.Chatbot(height=400)
-                question = gr.Textbox(label="Ask about the document", placeholder="Type your question...")
+                question = gr.Textbox(label="Query", placeholder="Enter your question...")
                 question.submit(
-                    lambda q,h: h + [(q, analyzer.query(q))],
+                    lambda q,h: h + [(q, format_response(analyzer.query(q)))],
                     inputs=[question, chatbot],
                     outputs=chatbot
                 )
-                gr.Button("Clear").click(
+                gr.Button("Clear Session").click(
                     lambda: [None, None, "**Status:** Session cleared"],
                     outputs=[chatbot, pdf_upload, status]
                 )
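
For reference: after this commit, `analyzer.query()` returns a JSON envelope (`status`, `message`, `results`) rather than a plain string, and `format_response` inside `create_app()` unpacks it for the chatbot. Below is a minimal, self-contained sketch of that contract; the field names mirror the diff, but the sample payload values (`example.pdf`, the confidence, the answer text) are made up for illustration.

```python
import json

# Illustrative envelope shaped like the one query() builds in this commit.
# The concrete values here are placeholders, not output from a real document.
sample_response = json.dumps({
    "status": 200,
    "message": "Success",
    "results": [{
        "text": "Sample 100-word context extracted from the best-matching chunk.",
        "confidence": 0.73,
        "document_id": 1234567890,
        "metadata": {"chunk_index": 4, "document": "example.pdf"}
    }]
})

def render(response: str) -> str:
    # Same unpacking steps as format_response() in create_app():
    # non-200 statuses become an error string, otherwise the first result
    # is formatted with its confidence score.
    data = json.loads(response)
    if data["status"] != 200:
        return f"Error: {data.get('message', 'Unknown error')}"
    result = data["results"][0]
    return f"**Answer** ({result['confidence']:.2f} confidence):\n{result['text']}"

print(render(sample_response))
```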