IZERE HIRWA Roger committed · dd5d745
1 parent: 82e2f24

po

Files changed:
- Dockerfile +14 -11
- app.py +311 -267
- clip_cache/text.txt +0 -0
- data/text.txt +0 -0
- requirements.txt +5 -14
Dockerfile
CHANGED
@@ -2,26 +2,29 @@ FROM python:3.11
 
 WORKDIR /app
 
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    tesseract-ocr \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 
 COPY . .
 
-# Create writable directories
-RUN mkdir -p /app/instance && chmod -R 777 /app/instance
-ENV HF_HOME=/app/transformers_cache
-RUN mkdir -p /app/transformers_cache && chmod -R 777 /app/transformers_cache
-
-# Create ../data directory for vector store
+# Create writable directories with proper permissions
 RUN mkdir -p /app/data && chmod -R 777 /app/data
-RUN mkdir -p /
-
-# Create uploads directory
+RUN mkdir -p /app/static && chmod -R 777 /app/static
 RUN mkdir -p /app/uploads && chmod -R 777 /app/uploads
-
-# Create logs directory
+RUN mkdir -p /app/clip_cache && chmod -R 777 /app/clip_cache
 RUN mkdir -p /app/logs && chmod -R 777 /app/logs
 
+# Set environment variables for cache directories
+ENV CLIP_CACHE=/app/clip_cache
+ENV HF_HOME=/app/clip_cache
+ENV TORCH_HOME=/app/clip_cache
+
 EXPOSE 7860
 
 CMD ["python", "app.py"]
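For context, a minimal sketch (not part of this commit) of how the cache variables set above are typically consumed at runtime. The fallback path and the use of os.environ.get are assumptions; the download_root argument mirrors what the new app.py below passes to clip.load:

import os
import torch
import clip  # openai/CLIP, installed from GitHub via requirements.txt

# CLIP_CACHE is set by the Dockerfile; assume a local folder when running outside the container.
cache_dir = os.environ.get("CLIP_CACHE", "./clip_cache")
os.makedirs(cache_dir, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
# download_root keeps the ViT-B/32 weights inside the writable cache directory.
model, preprocess = clip.load("ViT-B/32", device=device, download_root=cache_dir)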
app.py
CHANGED
@@ -1,7 +1,6 @@
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 import pytesseract
 from PIL import Image
 import numpy as np
@@ -15,51 +14,76 @@ import io
 import json
 import uuid
 from datetime import datetime, timedelta
-from typing import List, Dict, Any, Optional
-import base64
 import jwt
 
 # Security configuration
 SECRET_KEY = "your-secret-key-change-this-in-production"
 ALGORITHM = "HS256"
 ACCESS_TOKEN_EXPIRE_MINUTES = 30
 
-# Default admin user (change in production)
-USERS_DB = {
-    "admin": {
-        "username": "admin",
-        "hashed_password": pwd_context.hash("admin123"),
-        "is_active": True
-    }
-}
-
-# Mount static files
-app.mount("/static", StaticFiles(directory="static"), name="static")
-
-# --- Load or Initialize Model/Index ---
-device = "cuda" if torch.cuda.is_available() else "cpu"
-clip_model, preprocess = clip.load("ViT-B/32", device=device)
 
 INDEX_PATH = "data/index.faiss"
 LABELS_PATH = "data/labels.pkl"
 UPLOADS_DIR = "data/uploads"
 
-# Ensure directories exist
 os.makedirs("data", exist_ok=True)
 os.makedirs("static", exist_ok=True)
 os.makedirs(UPLOADS_DIR, exist_ok=True)
 
 index = faiss.IndexFlatL2(512)
 labels = []
-documents = []
 
 if os.path.exists(INDEX_PATH) and os.path.exists(LABELS_PATH):
     try:
@@ -67,80 +91,57 @@ if os.path.exists(INDEX_PATH) and os.path.exists(LABELS_PATH):
         with open(LABELS_PATH, "rb") as f:
             labels = pickle.load(f)
         print(f"✅ Loaded existing index with {len(labels)} labels")
         print(f"⚠️ Failed to load existing index: {e}")
-        print("🔄 Starting with fresh index")
         if os.path.exists(INDEX_PATH):
             os.remove(INDEX_PATH)
         if os.path.exists(LABELS_PATH):
             os.remove(LABELS_PATH)
 
-def get_password_hash(password):
-    return pwd_context.hash(password)
-
-def authenticate_user(username: str, password: str):
-    user = USERS_DB.get(username)
-    if not user or not verify_password(password, user["hashed_password"]):
-        return False
-    return user
-
-def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
-    to_encode = data.copy()
-    if expires_delta:
-        expire = datetime.utcnow() + expires_delta
-    else:
-        expire = datetime.utcnow() + timedelta(minutes=15)
-    to_encode.update({"exp": expire})
-    encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
-    return encoded_jwt
-
-async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)):
-    credentials_exception = HTTPException(
-        status_code=status.HTTP_401_UNAUTHORIZED,
-        detail="Could not validate credentials",
-        headers={"WWW-Authenticate": "Bearer"},
-    )
-    try:
-        payload = jwt.decode(credentials.credentials, SECRET_KEY, algorithms=[ALGORITHM])
-        username: str = payload.get("sub")
-        if username is None:
-            raise credentials_exception
-    except jwt.PyJWTError:
-        raise credentials_exception
-
-    user = USERS_DB.get(username)
-    if user is None:
-        raise credentials_exception
-    return user
-
-# --- Utilities ---
 def save_index():
     try:
-        os.makedirs("data", exist_ok=True)
         faiss.write_index(index, INDEX_PATH)
         with open(LABELS_PATH, "wb") as f:
             pickle.dump(labels, f)
     except Exception as e:
         print(f"❌ Failed to save index: {e}")
 
@@ -152,17 +153,8 @@ def image_from_pdf(pdf_bytes):
 
 def extract_text(image):
     try:
-        if image is None:
-            return "❌ No image provided"
-
-        if isinstance(image, bytes):
-            image = Image.open(io.BytesIO(image))
-        elif not isinstance(image, Image.Image):
-            image = Image.fromarray(image)
-
         if image.mode != 'RGB':
             image = image.convert('RGB')
-
         custom_config = r'--oem 3 --psm 6'
         text = pytesseract.image_to_string(image, config=custom_config)
         return text.strip() if text.strip() else "❓ No text detected"
@@ -171,17 +163,10 @@
 
 def get_clip_embedding(image):
     try:
             return None
-
-        if isinstance(image, bytes):
-            image = Image.open(io.BytesIO(image))
-        elif not isinstance(image, Image.Image):
-            image = Image.fromarray(image)
-
         if image.mode != 'RGB':
             image = image.convert('RGB')
-
         image_input = preprocess(image).unsqueeze(0).to(device)
         with torch.no_grad():
             image_features = clip_model.encode_image(image_input)
@@ -202,84 +187,205 @@ def save_uploaded_file(file_content: bytes, filename: str) -> str:
 
     return saved_filename
 
         user = authenticate_user(username, password)
         if not user:
     try:
         if file.content_type and file.content_type.startswith('application/pdf'):
             image = image_from_pdf(file_content)
         else:
             image = Image.open(io.BytesIO(file_content))
 
         if image is None:
 
         embedding = get_clip_embedding(image)
         if embedding is None:
 
         index.add(np.array([embedding]))
-        labels.append(label)
         save_index()
 
-        return {"message": f"✅ Added category '{label}' (Total: {len(labels)} categories)", "status": "success"}
     except Exception as e:
 
     try:
         if len(labels) == 0:
 
-        file_content = await file.read()
 
         if file.content_type and file.content_type.startswith('application/pdf'):
             image = image_from_pdf(file_content)
         else:
             image = Image.open(io.BytesIO(file_content))
 
         if image is None:
 
         embedding = get_clip_embedding(image)
         if embedding is None:
 
-        # Search for top 3 matches
         k = min(3, len(labels))
         D, I = index.search(np.array([embedding]), k=k)
@@ -295,137 +401,75 @@ async def classify_document(
             sim = 1 - D[0][i]
             matches.append({"category": labels[I[0][i]], "similarity": round(sim, 3)})
 
-        # Save classified document
         if similarity >= confidence_threshold:
             saved_filename = save_uploaded_file(file_content, file.filename)
             ocr_text = extract_text(image)
 
-            documents.append(document)
-            save_documents()
 
-            return {
                 "status": "success",
                 "category": best_match,
                 "similarity": round(similarity, 3),
-                "confidence": "high"
                 "matches": matches,
                 "document_saved": True,
-            }
         else:
-            return {
                 "status": "low_confidence",
                 "category": best_match,
                 "similarity": round(similarity, 3),
                 "confidence": "low",
                 "matches": matches,
                 "document_saved": False
-            }
 
     except Exception as e:
 
-        document_index = None
-        document_to_delete = None
-
-        for i, doc in enumerate(documents):
-            if doc["id"] == document_id:
-                document_index = i
-                document_to_delete = doc
-                break
-
-        if document_to_delete is None:
-            raise HTTPException(status_code=404, detail="Document not found")
-
-        # Delete physical file
-        file_path = document_to_delete.get("file_path")
-        if file_path and os.path.exists(file_path):
-            os.remove(file_path)
-
-        # Remove from documents list
-        documents.pop(document_index)
-        save_documents()
-
-        return {"message": "Document deleted successfully", "status": "success"}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.post("/api/ocr")
-async def ocr_document(
-    file: UploadFile = File(...),
-    current_user: dict = Depends(get_current_user)
-):
-    try:
-        file_content = await file.read()
-
-        if file.content_type and file.content_type.startswith('application/pdf'):
-            image = image_from_pdf(file_content)
-        else:
-            image = Image.open(io.BytesIO(file_content))
-
-        if image is None:
-            raise HTTPException(status_code=400, detail="Failed to process image")
-
-        text = extract_text(image)
-        return {"text": text, "status": "success"}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/api/stats")
-async def get_stats(current_user: dict = Depends(get_current_user)):
-    category_stats = {}
-    for doc in documents:
-        category = doc["category"]
-        if category not in category_stats:
-            category_stats[category] = 0
-        category_stats[category] += 1
 
-    return {
-        "total_categories": len(set(labels)),
-        "total_documents": len(documents),
-        "category_distribution": category_stats
-    }
 
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+from flask import Flask, request, jsonify, render_template_string, send_from_directory
+from werkzeug.utils import secure_filename
+from werkzeug.security import generate_password_hash, check_password_hash
 import pytesseract
 from PIL import Image
 import numpy as np
 import json
 import uuid
 from datetime import datetime, timedelta
 import jwt
+import sqlite3
+import tempfile
 
+app = Flask(__name__)
+app.config['SECRET_KEY'] = 'your-secret-key-change-this-in-production'
 
 # Security configuration
 SECRET_KEY = "your-secret-key-change-this-in-production"
 ALGORITHM = "HS256"
 ACCESS_TOKEN_EXPIRE_MINUTES = 30
 
+# Set CLIP cache to writable directory
+os.environ['CLIP_CACHE'] = '/app/clip_cache'
+os.makedirs('/app/clip_cache', exist_ok=True)
 
+# Directories
 INDEX_PATH = "data/index.faiss"
 LABELS_PATH = "data/labels.pkl"
+DATABASE_PATH = "data/documents.db"
 UPLOADS_DIR = "data/uploads"
 
 os.makedirs("data", exist_ok=True)
 os.makedirs("static", exist_ok=True)
 os.makedirs(UPLOADS_DIR, exist_ok=True)
 
+# Initialize database
+def init_db():
+    conn = sqlite3.connect(DATABASE_PATH)
+    cursor = conn.cursor()
+
+    # Users table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS users (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            username TEXT UNIQUE NOT NULL,
+            password_hash TEXT NOT NULL,
+            is_active BOOLEAN DEFAULT TRUE
+        )
+    ''')
+
+    # Documents table
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS documents (
+            id TEXT PRIMARY KEY,
+            filename TEXT NOT NULL,
+            original_filename TEXT NOT NULL,
+            category TEXT NOT NULL,
+            similarity REAL NOT NULL,
+            ocr_text TEXT,
+            upload_date TEXT NOT NULL,
+            file_path TEXT NOT NULL
+        )
+    ''')
+
+    # Insert default admin user if not exists
+    cursor.execute('SELECT * FROM users WHERE username = ?', ('admin',))
+    if not cursor.fetchone():
+        admin_hash = generate_password_hash('admin123')
+        cursor.execute('INSERT INTO users (username, password_hash) VALUES (?, ?)',
+                       ('admin', admin_hash))
+
+    conn.commit()
+    conn.close()
+
+init_db()
+
+# Initialize index and labels
 index = faiss.IndexFlatL2(512)
 labels = []
 
 if os.path.exists(INDEX_PATH) and os.path.exists(LABELS_PATH):
     try:
         with open(LABELS_PATH, "rb") as f:
             labels = pickle.load(f)
         print(f"✅ Loaded existing index with {len(labels)} labels")
+    except Exception as e:
         print(f"⚠️ Failed to load existing index: {e}")
         if os.path.exists(INDEX_PATH):
             os.remove(INDEX_PATH)
         if os.path.exists(LABELS_PATH):
             os.remove(LABELS_PATH)
 
+# Initialize CLIP model with custom cache
+device = "cuda" if torch.cuda.is_available() else "cpu"
+try:
+    clip_model, preprocess = clip.load("ViT-B/32", device=device, download_root='/app/clip_cache')
+    print("✅ CLIP model loaded successfully")
+except Exception as e:
+    print(f"❌ Failed to load CLIP model: {e}")
+    # Fallback initialization
+    clip_model = None
+    preprocess = None
+
+# Helper functions
 def save_index():
     try:
         faiss.write_index(index, INDEX_PATH)
         with open(LABELS_PATH, "wb") as f:
             pickle.dump(labels, f)
     except Exception as e:
         print(f"❌ Failed to save index: {e}")
 
+def authenticate_user(username: str, password: str):
+    conn = sqlite3.connect(DATABASE_PATH)
+    cursor = conn.cursor()
+    cursor.execute('SELECT password_hash FROM users WHERE username = ? AND is_active = TRUE', (username,))
+    result = cursor.fetchone()
+    conn.close()
+
+    if result and check_password_hash(result[0], password):
+        return {"username": username}
+    return None
+
+def create_access_token(data: dict):
+    expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
+    to_encode = data.copy()
+    to_encode.update({"exp": expire})
+    return jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
+
+def verify_token(token: str):
     try:
+        payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
+        username = payload.get("sub")
+        return username if username else None
+    except jwt.PyJWTError:
+        return None
 
 def image_from_pdf(pdf_bytes):
     try:
 
 def extract_text(image):
     try:
         if image.mode != 'RGB':
             image = image.convert('RGB')
         custom_config = r'--oem 3 --psm 6'
         text = pytesseract.image_to_string(image, config=custom_config)
         return text.strip() if text.strip() else "❓ No text detected"
 
 def get_clip_embedding(image):
     try:
+        if clip_model is None:
             return None
         if image.mode != 'RGB':
             image = image.convert('RGB')
         image_input = preprocess(image).unsqueeze(0).to(device)
         with torch.no_grad():
             image_features = clip_model.encode_image(image_input)
 
     return saved_filename
 
+# Routes
+@app.route("/")
+def dashboard():
+    return render_template_string('''
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <title>Document Classification System</title>
+        <style>
+            body { font-family: Arial, sans-serif; margin: 40px; }
+            .container { max-width: 800px; margin: 0 auto; }
+            .form-group { margin: 20px 0; }
+            input, button { padding: 10px; margin: 5px; }
+            button { background: #007bff; color: white; border: none; cursor: pointer; }
+            .result { margin: 20px 0; padding: 10px; background: #f8f9fa; border: 1px solid #dee2e6; }
+        </style>
+    </head>
+    <body>
+        <div class="container">
+            <h1>Document Classification System</h1>
+
+            <div class="form-group">
+                <h3>Login</h3>
+                <form id="loginForm">
+                    <input type="text" id="username" placeholder="Username" required>
+                    <input type="password" id="password" placeholder="Password" required>
+                    <button type="submit">Login</button>
+                </form>
+            </div>
+
+            <div id="mainContent" style="display:none;">
+                <div class="form-group">
+                    <h3>Upload Category</h3>
+                    <form id="categoryForm" enctype="multipart/form-data">
+                        <input type="file" id="categoryFile" accept="image/*,.pdf" required>
+                        <input type="text" id="categoryLabel" placeholder="Category Label" required>
+                        <button type="submit">Add Category</button>
+                    </form>
+                </div>
+
+                <div class="form-group">
+                    <h3>Classify Document</h3>
+                    <form id="classifyForm" enctype="multipart/form-data">
+                        <input type="file" id="classifyFile" accept="image/*,.pdf" required>
+                        <button type="submit">Classify</button>
+                    </form>
+                </div>
+
+                <div id="result" class="result" style="display:none;"></div>
+            </div>
+        </div>
+
+        <script>
+            let token = null;
+
+            document.getElementById('loginForm').onsubmit = async (e) => {
+                e.preventDefault();
+                const formData = new FormData();
+                formData.append('username', document.getElementById('username').value);
+                formData.append('password', document.getElementById('password').value);
+
+                const response = await fetch('/api/login', {
+                    method: 'POST',
+                    body: formData
+                });
+
+                const result = await response.json();
+                if (response.ok) {
+                    token = result.access_token;
+                    document.getElementById('mainContent').style.display = 'block';
+                    document.getElementById('result').innerHTML = 'Login successful!';
+                    document.getElementById('result').style.display = 'block';
+                } else {
+                    document.getElementById('result').innerHTML = 'Login failed: ' + result.detail;
+                    document.getElementById('result').style.display = 'block';
+                }
+            };
+
+            document.getElementById('categoryForm').onsubmit = async (e) => {
+                e.preventDefault();
+                const formData = new FormData();
+                formData.append('file', document.getElementById('categoryFile').files[0]);
+                formData.append('label', document.getElementById('categoryLabel').value);
+
+                const response = await fetch('/api/upload-category', {
+                    method: 'POST',
+                    body: formData,
+                    headers: {'Authorization': 'Bearer ' + token}
+                });
+
+                const result = await response.json();
+                document.getElementById('result').innerHTML = JSON.stringify(result, null, 2);
+                document.getElementById('result').style.display = 'block';
+            };
+
+            document.getElementById('classifyForm').onsubmit = async (e) => {
+                e.preventDefault();
+                const formData = new FormData();
+                formData.append('file', document.getElementById('classifyFile').files[0]);
+
+                const response = await fetch('/api/classify-document', {
+                    method: 'POST',
+                    body: formData,
+                    headers: {'Authorization': 'Bearer ' + token}
+                });
+
+                const result = await response.json();
+                document.getElementById('result').innerHTML = JSON.stringify(result, null, 2);
+                document.getElementById('result').style.display = 'block';
+            };
+        </script>
+    </body>
+    </html>
+    ''')
+
+@app.route("/api/login", methods=["POST"])
+def login():
+    username = request.form.get("username")
+    password = request.form.get("password")
+
     user = authenticate_user(username, password)
     if not user:
+        return jsonify({"detail": "Incorrect username or password"}), 401
+
+    access_token = create_access_token(data={"sub": user["username"]})
+    return jsonify({"access_token": access_token, "token_type": "bearer", "username": user["username"]})
+
+@app.route("/api/upload-category", methods=["POST"])
+def upload_category():
+    # Verify token
+    auth_header = request.headers.get('Authorization')
+    if not auth_header or not auth_header.startswith('Bearer '):
+        return jsonify({"error": "Missing or invalid token"}), 401
+
+    token = auth_header.split(' ')[1]
+    username = verify_token(token)
+    if not username:
+        return jsonify({"error": "Invalid token"}), 401
+
     try:
+        label = request.form.get("label")
+        file = request.files.get("file")
+        if not label or not file:
+            return jsonify({"error": "Missing label or file"}), 400
+
+        file_content = file.read()
         if file.content_type and file.content_type.startswith('application/pdf'):
             image = image_from_pdf(file_content)
         else:
             image = Image.open(io.BytesIO(file_content))
 
         if image is None:
+            return jsonify({"error": "Failed to process image"}), 400
 
         embedding = get_clip_embedding(image)
         if embedding is None:
+            return jsonify({"error": "Failed to generate embedding"}), 400
+
         index.add(np.array([embedding]))
+        labels.append(label.strip())
         save_index()
 
+        return jsonify({"message": f"✅ Added category '{label}' (Total: {len(labels)} categories)", "status": "success"})
     except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+@app.route("/api/classify-document", methods=["POST"])
+def classify_document():
+    # Verify token
+    auth_header = request.headers.get('Authorization')
+    if not auth_header or not auth_header.startswith('Bearer '):
+        return jsonify({"error": "Missing or invalid token"}), 401
+
+    token = auth_header.split(' ')[1]
+    username = verify_token(token)
+    if not username:
+        return jsonify({"error": "Invalid token"}), 401
+
     try:
         if len(labels) == 0:
+            return jsonify({"error": "No categories in database. Please add some first."}), 400
 
+        file = request.files.get("file")
+        if not file:
+            return jsonify({"error": "Missing file"}), 400
+
+        file_content = file.read()
         if file.content_type and file.content_type.startswith('application/pdf'):
             image = image_from_pdf(file_content)
         else:
             image = Image.open(io.BytesIO(file_content))
 
         if image is None:
+            return jsonify({"error": "Failed to process image"}), 400
 
         embedding = get_clip_embedding(image)
         if embedding is None:
+            return jsonify({"error": "Failed to generate embedding"}), 400
+
         k = min(3, len(labels))
         D, I = index.search(np.array([embedding]), k=k)
 
             sim = 1 - D[0][i]
             matches.append({"category": labels[I[0][i]], "similarity": round(sim, 3)})
 
+        # Save classified document to SQLite
        if similarity >= confidence_threshold:
             saved_filename = save_uploaded_file(file_content, file.filename)
             ocr_text = extract_text(image)
 
+            document_id = str(uuid.uuid4())
+            conn = sqlite3.connect(DATABASE_PATH)
+            cursor = conn.cursor()
+            cursor.execute('''
+                INSERT INTO documents (id, filename, original_filename, category, similarity, ocr_text, upload_date, file_path)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+            ''', (document_id, saved_filename, file.filename, best_match, round(similarity, 3),
+                  ocr_text, datetime.now().isoformat(), os.path.join(UPLOADS_DIR, saved_filename)))
+            conn.commit()
+            conn.close()
 
+            return jsonify({
                 "status": "success",
                 "category": best_match,
                 "similarity": round(similarity, 3),
+                "confidence": "high",
                 "matches": matches,
                 "document_saved": True,
+                "document_id": document_id
+            })
         else:
+            return jsonify({
                 "status": "low_confidence",
                 "category": best_match,
                 "similarity": round(similarity, 3),
                 "confidence": "low",
                 "matches": matches,
                 "document_saved": False
+            })
 
+        return jsonify({"error": "Document not recognized"}), 400
     except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+@app.route("/api/documents", methods=["GET"])
+def get_all_documents():
+    # Verify token
+    auth_header = request.headers.get('Authorization')
+    if not auth_header or not auth_header.startswith('Bearer '):
+        return jsonify({"error": "Missing or invalid token"}), 401
 
+    token = auth_header.split(' ')[1]
+    username = verify_token(token)
+    if not username:
+        return jsonify({"error": "Invalid token"}), 401
+
+    conn = sqlite3.connect(DATABASE_PATH)
+    cursor = conn.cursor()
+    cursor.execute('SELECT * FROM documents ORDER BY upload_date DESC')
+    documents = []
+    for row in cursor.fetchall():
+        documents.append({
+            "id": row[0],
+            "filename": row[1],
+            "original_filename": row[2],
+            "category": row[3],
+            "similarity": row[4],
+            "ocr_text": row[5],
+            "upload_date": row[6],
+            "file_path": row[7]
+        })
+    conn.close()
 
+    return jsonify({"documents": documents, "count": len(documents)})
 
 if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=7860, debug=True)
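For illustration only (not part of the commit): a hedged sketch of exercising the new Flask endpoints from a client. The base URL and file names are placeholders, requests is an extra dependency not listed in requirements.txt, and the credentials are the defaults seeded by init_db().

import requests

BASE_URL = "http://localhost:7860"  # placeholder; point at the running Space

# Log in with the seeded admin account to obtain a bearer token.
resp = requests.post(f"{BASE_URL}/api/login",
                     data={"username": "admin", "password": "admin123"})
token = resp.json()["access_token"]
headers = {"Authorization": f"Bearer {token}"}

# Register a reference category, then classify another file against it.
with open("invoice_example.png", "rb") as f:   # placeholder file
    requests.post(f"{BASE_URL}/api/upload-category",
                  headers=headers, files={"file": f}, data={"label": "invoice"})

with open("unknown_document.pdf", "rb") as f:  # placeholder file
    result = requests.post(f"{BASE_URL}/api/classify-document",
                           headers=headers, files={"file": f}).json()
print(result.get("category"), result.get("similarity"))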
clip_cache/text.txt
ADDED
File without changes

data/text.txt
ADDED
File without changes

requirements.txt
CHANGED
@@ -1,20 +1,11 @@
-python-multipart
-python-jose[cryptography]
-passlib[bcrypt]
-bcrypt
-gradio
-faiss-cpu
+flask
+werkzeug
 pytesseract
 pdf2image
+faiss-cpu
 torch
 torchvision
 Pillow
-regex
-tqdm
+PyJWT
 git+https://github.com/openai/CLIP.git
-poppler-utils
+poppler-utils
-jwt
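As an optional sanity check (not part of this commit), a small Python snippet that confirms the trimmed dependency set resolves after running pip install -r requirements.txt inside the image; module names that differ from their package names are noted in comments.

# Import smoke test for the new requirements set (assumed workflow).
import importlib

modules = [
    "flask",        # flask
    "werkzeug",     # werkzeug
    "pytesseract",  # pytesseract (needs the tesseract-ocr binary from the Dockerfile)
    "pdf2image",    # pdf2image (needs poppler utilities from the Dockerfile)
    "faiss",        # faiss-cpu
    "torch",        # torch
    "torchvision",  # torchvision
    "PIL",          # Pillow
    "jwt",          # PyJWT
    "clip",         # git+https://github.com/openai/CLIP.git
]

for name in modules:
    importlib.import_module(name)
    print(f"OK: {name}")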