Spaces:

sagar008
/

unified-analysis-for-legal-docs

Sleeping

App Files Community

sagar008 committed on Aug 11

Commit

050369b

verified ·

1 Parent(s): dc91101

Update main.py

Browse files

Files changed (1) hide show

main.py +41 -16

main.py CHANGED Viewed

@@ -5,17 +5,18 @@ from fastapi.responses import JSONResponse
 from document_processor import DocumentProcessor
 from vector_store import vector_store
 from models import *
-from pdf_processor import PDFProcessor  # New module
 import time
 import hashlib
 import os
 import google.generativeai as genai
 from typing import Optional
 import tempfile
 # Initialize processors
 processor = DocumentProcessor()
-pdf_processor = PDFProcessor()  # New PDF processor
 # Initialize Gemini
 genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
@@ -47,7 +48,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
         if not data.document_text:
             return {"error": "No document text provided"}
-        # ⭐ Use forced doc_id if provided (from Node.js), otherwise generate from text
         if data.force_doc_id:
             doc_id = data.force_doc_id
             print(f"🔧 Using Node.js provided doc_id: {doc_id}")
@@ -89,7 +90,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
 @app.post("/analyze_pdf")
 async def analyze_pdf(
     file: UploadFile = File(...),
-    force_doc_id: Optional[str] = None  # Accept doc_id from Node.js
 ):
     """Direct PDF upload and analysis with Node.js doc_id support"""
     try:
@@ -102,7 +103,7 @@ async def analyze_pdf(
         # Read file content
         file_content = await file.read()
-        # ⭐ Use Node.js provided doc_id OR generate from file content
         if force_doc_id:
             doc_id = force_doc_id
             print(f"🔧 Using Node.js provided doc_id: {doc_id}")
@@ -157,10 +158,8 @@ async def analyze_pdf(
 @app.post("/analyze_document_url")
 async def analyze_document_url(data: AnalyzeDocumentURLInput):
-    """Analyze document from URL with Node.js doc_id support"""
     try:
-        import httpx
         start_time = time.time()
         if not data.document_url:
@@ -168,13 +167,32 @@ async def analyze_document_url(data: AnalyzeDocumentURLInput):
         print(f"📥 Downloading document from: {data.document_url}")
-        # Download the document
-        async with httpx.AsyncClient(timeout=60.0) as client:
-            response = await client.get(data.document_url)
-            response.raise_for_status()
-            file_content = response.content
-        # ⭐ Use Node.js provided doc_id OR generate from file content
         if data.force_doc_id:
             doc_id = data.force_doc_id
             print(f"🔧 Using Node.js provided doc_id: {doc_id}")
@@ -220,8 +238,14 @@ async def analyze_document_url(data: AnalyzeDocumentURLInput):
         return result
-    except httpx.HTTPStatusError as e:
-        raise HTTPException(status_code=400, detail=f"Failed to download document: {e}")
     except Exception as e:
         print(f"❌ URL analysis error: {e}")
         raise HTTPException(status_code=500, detail=f"Document analysis failed: {str(e)}")
@@ -331,6 +355,7 @@ async def chat_with_document(data: ChatInput):
         print(f"❌ Chat error: {e}")
         return {"error": f"Chat failed: {str(e)}"}
 @app.get("/debug_pinecone/{document_id}")
 async def debug_pinecone_storage(document_id: str):
     """Debug what's actually stored in Pinecone for a document"""

 from document_processor import DocumentProcessor
 from vector_store import vector_store
 from models import *
+from pdf_processor import PDFProcessor
 import time
 import hashlib
 import os
 import google.generativeai as genai
+import requests  # Use requests instead of httpx for better Cloudinary compatibility
 from typing import Optional
 import tempfile
 # Initialize processors
 processor = DocumentProcessor()
+pdf_processor = PDFProcessor()
 # Initialize Gemini
 genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
         if not data.document_text:
             return {"error": "No document text provided"}
+        # Use forced doc_id if provided (from Node.js), otherwise generate from text
         if data.force_doc_id:
             doc_id = data.force_doc_id
             print(f"🔧 Using Node.js provided doc_id: {doc_id}")
 @app.post("/analyze_pdf")
 async def analyze_pdf(
     file: UploadFile = File(...),
+    force_doc_id: Optional[str] = None
 ):
     """Direct PDF upload and analysis with Node.js doc_id support"""
     try:
         # Read file content
         file_content = await file.read()
+        # Use Node.js provided doc_id OR generate from file content
         if force_doc_id:
             doc_id = force_doc_id
             print(f"🔧 Using Node.js provided doc_id: {doc_id}")
 @app.post("/analyze_document_url")
 async def analyze_document_url(data: AnalyzeDocumentURLInput):
+    """Analyze document from URL with FIXED Cloudinary download"""
     try:
         start_time = time.time()
         if not data.document_url:
         print(f"📥 Downloading document from: {data.document_url}")
+        # ⭐ FIXED: Use requests with proper headers (same as Postman)
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': '*/*',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Connection': 'keep-alive'
+        }
+        # Test the URL first with HEAD request
+        try:
+            head_response = requests.head(data.document_url, headers=headers, timeout=10)
+            print(f"✅ HEAD request successful: {head_response.status_code}")
+            print(f"📊 Content-Type: {head_response.headers.get('content-type', 'unknown')}")
+            print(f"📏 Content-Length: {head_response.headers.get('content-length', 'unknown')}")
+        except Exception as head_error:
+            print(f"⚠️ HEAD request failed: {head_error}")
+        # Download the full content using requests (more reliable than httpx for Cloudinary)
+        response = requests.get(data.document_url, headers=headers, timeout=60)
+        response.raise_for_status()
+        file_content = response.content
+        print(f"✅ Successfully downloaded {len(file_content)} bytes")
+        # Use Node.js provided doc_id OR generate from file content
         if data.force_doc_id:
             doc_id = data.force_doc_id
             print(f"🔧 Using Node.js provided doc_id: {doc_id}")
         return result
+    except requests.HTTPError as e:
+        error_msg = f"Failed to download document: HTTP {e.response.status_code} - {e.response.reason}"
+        print(f"❌ HTTP Error: {error_msg}")
+        raise HTTPException(status_code=400, detail=error_msg)
+    except requests.RequestException as e:
+        error_msg = f"Failed to download document: {str(e)}"
+        print(f"❌ Request Error: {error_msg}")
+        raise HTTPException(status_code=400, detail=error_msg)
     except Exception as e:
         print(f"❌ URL analysis error: {e}")
         raise HTTPException(status_code=500, detail=f"Document analysis failed: {str(e)}")
         print(f"❌ Chat error: {e}")
         return {"error": f"Chat failed: {str(e)}"}
 @app.get("/debug_pinecone/{document_id}")
 async def debug_pinecone_storage(document_id: str):
     """Debug what's actually stored in Pinecone for a document"""