sagar008 committed on
Commit
050369b
·
verified ·
1 Parent(s): dc91101

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +41 -16
main.py CHANGED
@@ -5,17 +5,18 @@ from fastapi.responses import JSONResponse
5
  from document_processor import DocumentProcessor
6
  from vector_store import vector_store
7
  from models import *
8
- from pdf_processor import PDFProcessor # New module
9
  import time
10
  import hashlib
11
  import os
12
  import google.generativeai as genai
 
13
  from typing import Optional
14
  import tempfile
15
 
16
  # Initialize processors
17
  processor = DocumentProcessor()
18
- pdf_processor = PDFProcessor() # New PDF processor
19
 
20
  # Initialize Gemini
21
  genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
@@ -47,7 +48,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
47
  if not data.document_text:
48
  return {"error": "No document text provided"}
49
 
50
- # ⭐ Use forced doc_id if provided (from Node.js), otherwise generate from text
51
  if data.force_doc_id:
52
  doc_id = data.force_doc_id
53
  print(f"🔧 Using Node.js provided doc_id: {doc_id}")
@@ -89,7 +90,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
89
  @app.post("/analyze_pdf")
90
  async def analyze_pdf(
91
  file: UploadFile = File(...),
92
- force_doc_id: Optional[str] = None # Accept doc_id from Node.js
93
  ):
94
  """Direct PDF upload and analysis with Node.js doc_id support"""
95
  try:
@@ -102,7 +103,7 @@ async def analyze_pdf(
102
  # Read file content
103
  file_content = await file.read()
104
 
105
- # ⭐ Use Node.js provided doc_id OR generate from file content
106
  if force_doc_id:
107
  doc_id = force_doc_id
108
  print(f"🔧 Using Node.js provided doc_id: {doc_id}")
@@ -157,10 +158,8 @@ async def analyze_pdf(
157
 
158
  @app.post("/analyze_document_url")
159
  async def analyze_document_url(data: AnalyzeDocumentURLInput):
160
- """Analyze document from URL with Node.js doc_id support"""
161
  try:
162
- import httpx
163
-
164
  start_time = time.time()
165
 
166
  if not data.document_url:
@@ -168,13 +167,32 @@ async def analyze_document_url(data: AnalyzeDocumentURLInput):
168
 
169
  print(f"📥 Downloading document from: {data.document_url}")
170
 
171
- # Download the document
172
- async with httpx.AsyncClient(timeout=60.0) as client:
173
- response = await client.get(data.document_url)
174
- response.raise_for_status()
175
- file_content = response.content
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- # ⭐ Use Node.js provided doc_id OR generate from file content
 
 
 
 
 
 
 
178
  if data.force_doc_id:
179
  doc_id = data.force_doc_id
180
  print(f"🔧 Using Node.js provided doc_id: {doc_id}")
@@ -220,8 +238,14 @@ async def analyze_document_url(data: AnalyzeDocumentURLInput):
220
 
221
  return result
222
 
223
- except httpx.HTTPStatusError as e:
224
- raise HTTPException(status_code=400, detail=f"Failed to download document: {e}")
 
 
 
 
 
 
225
  except Exception as e:
226
  print(f"❌ URL analysis error: {e}")
227
  raise HTTPException(status_code=500, detail=f"Document analysis failed: {str(e)}")
@@ -331,6 +355,7 @@ async def chat_with_document(data: ChatInput):
331
  print(f"❌ Chat error: {e}")
332
  return {"error": f"Chat failed: {str(e)}"}
333
 
 
334
  @app.get("/debug_pinecone/{document_id}")
335
  async def debug_pinecone_storage(document_id: str):
336
  """Debug what's actually stored in Pinecone for a document"""
 
5
  from document_processor import DocumentProcessor
6
  from vector_store import vector_store
7
  from models import *
8
+ from pdf_processor import PDFProcessor
9
  import time
10
  import hashlib
11
  import os
12
  import google.generativeai as genai
13
+ import requests # Use requests instead of httpx for better Cloudinary compatibility
14
  from typing import Optional
15
  import tempfile
16
 
17
  # Initialize processors
18
  processor = DocumentProcessor()
19
+ pdf_processor = PDFProcessor()
20
 
21
  # Initialize Gemini
22
  genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
 
48
  if not data.document_text:
49
  return {"error": "No document text provided"}
50
 
51
+ # Use forced doc_id if provided (from Node.js), otherwise generate from text
52
  if data.force_doc_id:
53
  doc_id = data.force_doc_id
54
  print(f"🔧 Using Node.js provided doc_id: {doc_id}")
 
90
  @app.post("/analyze_pdf")
91
  async def analyze_pdf(
92
  file: UploadFile = File(...),
93
+ force_doc_id: Optional[str] = None
94
  ):
95
  """Direct PDF upload and analysis with Node.js doc_id support"""
96
  try:
 
103
  # Read file content
104
  file_content = await file.read()
105
 
106
+ # Use Node.js provided doc_id OR generate from file content
107
  if force_doc_id:
108
  doc_id = force_doc_id
109
  print(f"🔧 Using Node.js provided doc_id: {doc_id}")
 
158
 
159
  @app.post("/analyze_document_url")
160
  async def analyze_document_url(data: AnalyzeDocumentURLInput):
161
+ """Analyze document from URL with FIXED Cloudinary download"""
162
  try:
 
 
163
  start_time = time.time()
164
 
165
  if not data.document_url:
 
167
 
168
  print(f"📥 Downloading document from: {data.document_url}")
169
 
170
+ # ⭐ FIXED: Use requests with proper headers (same as Postman)
171
+ headers = {
172
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
173
+ 'Accept': '*/*',
174
+ 'Accept-Encoding': 'gzip, deflate, br',
175
+ 'Accept-Language': 'en-US,en;q=0.9',
176
+ 'Connection': 'keep-alive'
177
+ }
178
+
179
+ # Test the URL first with HEAD request
180
+ try:
181
+ head_response = requests.head(data.document_url, headers=headers, timeout=10)
182
+ print(f"✅ HEAD request successful: {head_response.status_code}")
183
+ print(f"📊 Content-Type: {head_response.headers.get('content-type', 'unknown')}")
184
+ print(f"📏 Content-Length: {head_response.headers.get('content-length', 'unknown')}")
185
+ except Exception as head_error:
186
+ print(f"⚠️ HEAD request failed: {head_error}")
187
 
188
+ # Download the full content using requests (more reliable than httpx for Cloudinary)
189
+ response = requests.get(data.document_url, headers=headers, timeout=60)
190
+ response.raise_for_status()
191
+ file_content = response.content
192
+
193
+ print(f"✅ Successfully downloaded {len(file_content)} bytes")
194
+
195
+ # Use Node.js provided doc_id OR generate from file content
196
  if data.force_doc_id:
197
  doc_id = data.force_doc_id
198
  print(f"🔧 Using Node.js provided doc_id: {doc_id}")
 
238
 
239
  return result
240
 
241
+ except requests.HTTPError as e:
242
+ error_msg = f"Failed to download document: HTTP {e.response.status_code} - {e.response.reason}"
243
+ print(f"❌ HTTP Error: {error_msg}")
244
+ raise HTTPException(status_code=400, detail=error_msg)
245
+ except requests.RequestException as e:
246
+ error_msg = f"Failed to download document: {str(e)}"
247
+ print(f"❌ Request Error: {error_msg}")
248
+ raise HTTPException(status_code=400, detail=error_msg)
249
  except Exception as e:
250
  print(f"❌ URL analysis error: {e}")
251
  raise HTTPException(status_code=500, detail=f"Document analysis failed: {str(e)}")
 
355
  print(f"❌ Chat error: {e}")
356
  return {"error": f"Chat failed: {str(e)}"}
357
 
358
+
359
  @app.get("/debug_pinecone/{document_id}")
360
  async def debug_pinecone_storage(document_id: str):
361
  """Debug what's actually stored in Pinecone for a document"""