Update main.py
Browse files
main.py
CHANGED
@@ -5,17 +5,18 @@ from fastapi.responses import JSONResponse
|
|
5 |
from document_processor import DocumentProcessor
|
6 |
from vector_store import vector_store
|
7 |
from models import *
|
8 |
-
from pdf_processor import PDFProcessor
|
9 |
import time
|
10 |
import hashlib
|
11 |
import os
|
12 |
import google.generativeai as genai
|
|
|
13 |
from typing import Optional
|
14 |
import tempfile
|
15 |
|
16 |
# Initialize processors
|
17 |
processor = DocumentProcessor()
|
18 |
-
pdf_processor = PDFProcessor()
|
19 |
|
20 |
# Initialize Gemini
|
21 |
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
|
@@ -47,7 +48,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
|
|
47 |
if not data.document_text:
|
48 |
return {"error": "No document text provided"}
|
49 |
|
50 |
-
#
|
51 |
if data.force_doc_id:
|
52 |
doc_id = data.force_doc_id
|
53 |
print(f"🔧 Using Node.js provided doc_id: {doc_id}")
|
@@ -89,7 +90,7 @@ async def analyze_document(data: AnalyzeDocumentInput):
|
|
89 |
@app.post("/analyze_pdf")
|
90 |
async def analyze_pdf(
|
91 |
file: UploadFile = File(...),
|
92 |
-
force_doc_id: Optional[str] = None
|
93 |
):
|
94 |
"""Direct PDF upload and analysis with Node.js doc_id support"""
|
95 |
try:
|
@@ -102,7 +103,7 @@ async def analyze_pdf(
|
|
102 |
# Read file content
|
103 |
file_content = await file.read()
|
104 |
|
105 |
-
#
|
106 |
if force_doc_id:
|
107 |
doc_id = force_doc_id
|
108 |
print(f"🔧 Using Node.js provided doc_id: {doc_id}")
|
@@ -157,10 +158,8 @@ async def analyze_pdf(
|
|
157 |
|
158 |
@app.post("/analyze_document_url")
|
159 |
async def analyze_document_url(data: AnalyzeDocumentURLInput):
|
160 |
-
"""Analyze document from URL with
|
161 |
try:
|
162 |
-
import httpx
|
163 |
-
|
164 |
start_time = time.time()
|
165 |
|
166 |
if not data.document_url:
|
@@ -168,13 +167,32 @@ async def analyze_document_url(data: AnalyzeDocumentURLInput):
|
|
168 |
|
169 |
print(f"📥 Downloading document from: {data.document_url}")
|
170 |
|
171 |
-
#
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
|
177 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
if data.force_doc_id:
|
179 |
doc_id = data.force_doc_id
|
180 |
print(f"🔧 Using Node.js provided doc_id: {doc_id}")
|
@@ -220,8 +238,14 @@ async def analyze_document_url(data: AnalyzeDocumentURLInput):
|
|
220 |
|
221 |
return result
|
222 |
|
223 |
-
except
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
except Exception as e:
|
226 |
print(f"❌ URL analysis error: {e}")
|
227 |
raise HTTPException(status_code=500, detail=f"Document analysis failed: {str(e)}")
|
@@ -331,6 +355,7 @@ async def chat_with_document(data: ChatInput):
|
|
331 |
print(f"❌ Chat error: {e}")
|
332 |
return {"error": f"Chat failed: {str(e)}"}
|
333 |
|
|
|
334 |
@app.get("/debug_pinecone/{document_id}")
|
335 |
async def debug_pinecone_storage(document_id: str):
|
336 |
"""Debug what's actually stored in Pinecone for a document"""
|
|
|
5 |
from document_processor import DocumentProcessor
|
6 |
from vector_store import vector_store
|
7 |
from models import *
|
8 |
+
from pdf_processor import PDFProcessor
|
9 |
import time
|
10 |
import hashlib
|
11 |
import os
|
12 |
import google.generativeai as genai
|
13 |
+
import requests # Use requests instead of httpx for better Cloudinary compatibility
|
14 |
from typing import Optional
|
15 |
import tempfile
|
16 |
|
17 |
# Initialize processors
|
18 |
processor = DocumentProcessor()
|
19 |
+
pdf_processor = PDFProcessor()
|
20 |
|
21 |
# Initialize Gemini
|
22 |
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
|
|
|
48 |
if not data.document_text:
|
49 |
return {"error": "No document text provided"}
|
50 |
|
51 |
+
# Use forced doc_id if provided (from Node.js), otherwise generate from text
|
52 |
if data.force_doc_id:
|
53 |
doc_id = data.force_doc_id
|
54 |
print(f"🔧 Using Node.js provided doc_id: {doc_id}")
|
|
|
90 |
@app.post("/analyze_pdf")
|
91 |
async def analyze_pdf(
|
92 |
file: UploadFile = File(...),
|
93 |
+
force_doc_id: Optional[str] = None
|
94 |
):
|
95 |
"""Direct PDF upload and analysis with Node.js doc_id support"""
|
96 |
try:
|
|
|
103 |
# Read file content
|
104 |
file_content = await file.read()
|
105 |
|
106 |
+
# Use Node.js provided doc_id OR generate from file content
|
107 |
if force_doc_id:
|
108 |
doc_id = force_doc_id
|
109 |
print(f"🔧 Using Node.js provided doc_id: {doc_id}")
|
|
|
158 |
|
159 |
@app.post("/analyze_document_url")
|
160 |
async def analyze_document_url(data: AnalyzeDocumentURLInput):
|
161 |
+
"""Analyze document from URL with FIXED Cloudinary download"""
|
162 |
try:
|
|
|
|
|
163 |
start_time = time.time()
|
164 |
|
165 |
if not data.document_url:
|
|
|
167 |
|
168 |
print(f"📥 Downloading document from: {data.document_url}")
|
169 |
|
170 |
+
# FIXED: Use requests with proper headers (same as Postman)
|
171 |
+
headers = {
|
172 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
173 |
+
'Accept': '*/*',
|
174 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
175 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
176 |
+
'Connection': 'keep-alive'
|
177 |
+
}
|
178 |
+
|
179 |
+
# Test the URL first with HEAD request
|
180 |
+
try:
|
181 |
+
head_response = requests.head(data.document_url, headers=headers, timeout=10)
|
182 |
+
print(f"✅ HEAD request successful: {head_response.status_code}")
|
183 |
+
print(f"π Content-Type: {head_response.headers.get('content-type', 'unknown')}")
|
184 |
+
print(f"π Content-Length: {head_response.headers.get('content-length', 'unknown')}")
|
185 |
+
except Exception as head_error:
|
186 |
+
print(f"⚠️ HEAD request failed: {head_error}")
|
187 |
|
188 |
+
# Download the full content using requests (more reliable than httpx for Cloudinary)
|
189 |
+
response = requests.get(data.document_url, headers=headers, timeout=60)
|
190 |
+
response.raise_for_status()
|
191 |
+
file_content = response.content
|
192 |
+
|
193 |
+
print(f"✅ Successfully downloaded {len(file_content)} bytes")
|
194 |
+
|
195 |
+
# Use Node.js provided doc_id OR generate from file content
|
196 |
if data.force_doc_id:
|
197 |
doc_id = data.force_doc_id
|
198 |
print(f"🔧 Using Node.js provided doc_id: {doc_id}")
|
|
|
238 |
|
239 |
return result
|
240 |
|
241 |
+
except requests.HTTPError as e:
|
242 |
+
error_msg = f"Failed to download document: HTTP {e.response.status_code} - {e.response.reason}"
|
243 |
+
print(f"❌ HTTP Error: {error_msg}")
|
244 |
+
raise HTTPException(status_code=400, detail=error_msg)
|
245 |
+
except requests.RequestException as e:
|
246 |
+
error_msg = f"Failed to download document: {str(e)}"
|
247 |
+
print(f"❌ Request Error: {error_msg}")
|
248 |
+
raise HTTPException(status_code=400, detail=error_msg)
|
249 |
except Exception as e:
|
250 |
print(f"❌ URL analysis error: {e}")
|
251 |
raise HTTPException(status_code=500, detail=f"Document analysis failed: {str(e)}")
|
|
|
355 |
print(f"❌ Chat error: {e}")
|
356 |
return {"error": f"Chat failed: {str(e)}"}
|
357 |
|
358 |
+
|
359 |
@app.get("/debug_pinecone/{document_id}")
|
360 |
async def debug_pinecone_storage(document_id: str):
|
361 |
"""Debug what's actually stored in Pinecone for a document"""
|