IZERE HIRWA Roger committed on
Commit
dd5d745
·
1 Parent(s): 82e2f24
Files changed (5) hide show
  1. Dockerfile +14 -11
  2. app.py +311 -267
  3. clip_cache/text.txt +0 -0
  4. data/text.txt +0 -0
  5. requirements.txt +5 -14
Dockerfile CHANGED
@@ -2,26 +2,29 @@ FROM python:3.11
2
 
3
  WORKDIR /app
4
 
 
 
 
 
 
 
5
  COPY requirements.txt .
6
  RUN pip install -r requirements.txt
7
 
8
  COPY . .
9
 
10
- # Create writable directories
11
- RUN mkdir -p /app/instance && chmod -R 777 /app/instance
12
- ENV HF_HOME=/app/transformers_cache
13
- RUN mkdir -p /app/transformers_cache && chmod -R 777 /app/transformers_cache
14
-
15
- # Create ../data directory for vector store
16
  RUN mkdir -p /app/data && chmod -R 777 /app/data
17
- RUN mkdir -p /data && chmod -R 777 /data
18
-
19
- # Create uploads directory
20
  RUN mkdir -p /app/uploads && chmod -R 777 /app/uploads
21
-
22
- # Create logs directory
23
  RUN mkdir -p /app/logs && chmod -R 777 /app/logs
24
 
 
 
 
 
 
25
  EXPOSE 7860
26
 
27
  CMD ["python", "app.py"]
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies
6
+ RUN apt-get update && apt-get install -y \
7
+ tesseract-ocr \
8
+ poppler-utils \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
  COPY requirements.txt .
12
  RUN pip install -r requirements.txt
13
 
14
  COPY . .
15
 
16
+ # Create writable directories with proper permissions
 
 
 
 
 
17
  RUN mkdir -p /app/data && chmod -R 777 /app/data
18
+ RUN mkdir -p /app/static && chmod -R 777 /app/static
 
 
19
  RUN mkdir -p /app/uploads && chmod -R 777 /app/uploads
20
+ RUN mkdir -p /app/clip_cache && chmod -R 777 /app/clip_cache
 
21
  RUN mkdir -p /app/logs && chmod -R 777 /app/logs
22
 
23
+ # Set environment variables for cache directories
24
+ ENV CLIP_CACHE=/app/clip_cache
25
+ ENV HF_HOME=/app/clip_cache
26
+ ENV TORCH_HOME=/app/clip_cache
27
+
28
  EXPOSE 7860
29
 
30
  CMD ["python", "app.py"]
app.py CHANGED
@@ -1,7 +1,6 @@
1
- from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Depends, status
2
- from fastapi.responses import HTMLResponse, JSONResponse
3
- from fastapi.staticfiles import StaticFiles
4
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
5
  import pytesseract
6
  from PIL import Image
7
  import numpy as np
@@ -15,51 +14,76 @@ import io
15
  import json
16
  import uuid
17
  from datetime import datetime, timedelta
18
- from typing import List, Dict, Any, Optional
19
- import base64
20
  import jwt
21
- from passlib.context import CryptContext
 
22
 
23
- app = FastAPI(title="Handwritten Archive Document Digitalization System")
 
24
 
25
  # Security configuration
26
  SECRET_KEY = "your-secret-key-change-this-in-production"
27
  ALGORITHM = "HS256"
28
  ACCESS_TOKEN_EXPIRE_MINUTES = 30
29
 
30
- pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
31
- security = HTTPBearer()
32
-
33
- # Default admin user (change in production)
34
- USERS_DB = {
35
- "admin": {
36
- "username": "admin",
37
- "hashed_password": pwd_context.hash("admin123"),
38
- "is_active": True
39
- }
40
- }
41
-
42
- # Mount static files
43
- app.mount("/static", StaticFiles(directory="static"), name="static")
44
-
45
- # --- Load or Initialize Model/Index ---
46
- device = "cuda" if torch.cuda.is_available() else "cpu"
47
- clip_model, preprocess = clip.load("ViT-B/32", device=device)
48
 
 
49
  INDEX_PATH = "data/index.faiss"
50
  LABELS_PATH = "data/labels.pkl"
51
- DOCUMENTS_PATH = "data/documents.json"
52
  UPLOADS_DIR = "data/uploads"
53
 
54
- # Ensure directories exist
55
  os.makedirs("data", exist_ok=True)
56
  os.makedirs("static", exist_ok=True)
57
  os.makedirs(UPLOADS_DIR, exist_ok=True)
58
 
59
- # Initialize index and labels with error handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  index = faiss.IndexFlatL2(512)
61
  labels = []
62
- documents = []
63
 
64
  if os.path.exists(INDEX_PATH) and os.path.exists(LABELS_PATH):
65
  try:
@@ -67,80 +91,57 @@ if os.path.exists(INDEX_PATH) and os.path.exists(LABELS_PATH):
67
  with open(LABELS_PATH, "rb") as f:
68
  labels = pickle.load(f)
69
  print(f"✅ Loaded existing index with {len(labels)} labels")
70
- except (RuntimeError, EOFError, pickle.UnpicklingError) as e:
71
  print(f"⚠️ Failed to load existing index: {e}")
72
- print("🔄 Starting with fresh index")
73
  if os.path.exists(INDEX_PATH):
74
  os.remove(INDEX_PATH)
75
  if os.path.exists(LABELS_PATH):
76
  os.remove(LABELS_PATH)
77
 
78
- # Load documents database
79
- if os.path.exists(DOCUMENTS_PATH):
80
- try:
81
- with open(DOCUMENTS_PATH, 'r') as f:
82
- documents = json.load(f)
83
- except:
84
- documents = []
85
-
86
- # Authentication functions
87
- def verify_password(plain_password, hashed_password):
88
- return pwd_context.verify(plain_password, hashed_password)
89
-
90
- def get_password_hash(password):
91
- return pwd_context.hash(password)
92
-
93
- def authenticate_user(username: str, password: str):
94
- user = USERS_DB.get(username)
95
- if not user or not verify_password(password, user["hashed_password"]):
96
- return False
97
- return user
98
-
99
- def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
100
- to_encode = data.copy()
101
- if expires_delta:
102
- expire = datetime.utcnow() + expires_delta
103
- else:
104
- expire = datetime.utcnow() + timedelta(minutes=15)
105
- to_encode.update({"exp": expire})
106
- encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
107
- return encoded_jwt
108
-
109
- async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)):
110
- credentials_exception = HTTPException(
111
- status_code=status.HTTP_401_UNAUTHORIZED,
112
- detail="Could not validate credentials",
113
- headers={"WWW-Authenticate": "Bearer"},
114
- )
115
- try:
116
- payload = jwt.decode(credentials.credentials, SECRET_KEY, algorithms=[ALGORITHM])
117
- username: str = payload.get("sub")
118
- if username is None:
119
- raise credentials_exception
120
- except jwt.PyJWTError:
121
- raise credentials_exception
122
-
123
- user = USERS_DB.get(username)
124
- if user is None:
125
- raise credentials_exception
126
- return user
127
-
128
- # --- Utilities ---
129
  def save_index():
130
  try:
131
- os.makedirs("data", exist_ok=True)
132
  faiss.write_index(index, INDEX_PATH)
133
  with open(LABELS_PATH, "wb") as f:
134
  pickle.dump(labels, f)
135
  except Exception as e:
136
  print(f"❌ Failed to save index: {e}")
137
 
138
- def save_documents():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  try:
140
- with open(DOCUMENTS_PATH, 'w') as f:
141
- json.dump(documents, f, indent=2)
142
- except Exception as e:
143
- print(f"❌ Failed to save documents: {e}")
 
144
 
145
  def image_from_pdf(pdf_bytes):
146
  try:
@@ -152,17 +153,8 @@ def image_from_pdf(pdf_bytes):
152
 
153
  def extract_text(image):
154
  try:
155
- if image is None:
156
- return "❌ No image provided"
157
-
158
- if isinstance(image, bytes):
159
- image = Image.open(io.BytesIO(image))
160
- elif not isinstance(image, Image.Image):
161
- image = Image.fromarray(image)
162
-
163
  if image.mode != 'RGB':
164
  image = image.convert('RGB')
165
-
166
  custom_config = r'--oem 3 --psm 6'
167
  text = pytesseract.image_to_string(image, config=custom_config)
168
  return text.strip() if text.strip() else "❓ No text detected"
@@ -171,17 +163,10 @@ def extract_text(image):
171
 
172
  def get_clip_embedding(image):
173
  try:
174
- if image is None:
175
  return None
176
-
177
- if isinstance(image, bytes):
178
- image = Image.open(io.BytesIO(image))
179
- elif not isinstance(image, Image.Image):
180
- image = Image.fromarray(image)
181
-
182
  if image.mode != 'RGB':
183
  image = image.convert('RGB')
184
-
185
  image_input = preprocess(image).unsqueeze(0).to(device)
186
  with torch.no_grad():
187
  image_features = clip_model.encode_image(image_input)
@@ -202,84 +187,205 @@ def save_uploaded_file(file_content: bytes, filename: str) -> str:
202
 
203
  return saved_filename
204
 
205
- # --- API Endpoints ---
206
-
207
- @app.get("/", response_class=HTMLResponse)
208
- async def dashboard():
209
- with open("static/index.html", "r") as f:
210
- return HTMLResponse(content=f.read())
211
-
212
- @app.post("/api/login")
213
- async def login(username: str = Form(...), password: str = Form(...)):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  user = authenticate_user(username, password)
215
  if not user:
216
- raise HTTPException(
217
- status_code=status.HTTP_401_UNAUTHORIZED,
218
- detail="Incorrect username or password"
219
- )
220
- access_token_expires = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
221
- access_token = create_access_token(
222
- data={"sub": user["username"]}, expires_delta=access_token_expires
223
- )
224
- return {"access_token": access_token, "token_type": "bearer", "username": user["username"]}
225
-
226
- @app.post("/api/upload-category")
227
- async def upload_category(
228
- file: UploadFile = File(...),
229
- label: str = Form(...),
230
- current_user: dict = Depends(get_current_user)
231
- ):
 
232
  try:
233
- if not label or not label.strip():
234
- raise HTTPException(status_code=400, detail="Please provide a label")
235
-
236
- label = label.strip()
237
- file_content = await file.read()
238
-
239
  if file.content_type and file.content_type.startswith('application/pdf'):
240
  image = image_from_pdf(file_content)
241
  else:
242
  image = Image.open(io.BytesIO(file_content))
243
 
244
  if image is None:
245
- raise HTTPException(status_code=400, detail="Failed to process image")
246
 
247
  embedding = get_clip_embedding(image)
248
  if embedding is None:
249
- raise HTTPException(status_code=400, detail="Failed to generate embedding")
250
-
251
  index.add(np.array([embedding]))
252
- labels.append(label)
253
  save_index()
254
 
255
- return {"message": f"✅ Added category '{label}' (Total: {len(labels)} categories)", "status": "success"}
256
  except Exception as e:
257
- raise HTTPException(status_code=500, detail=str(e))
258
-
259
- @app.post("/api/classify-document")
260
- async def classify_document(
261
- file: UploadFile = File(...),
262
- current_user: dict = Depends(get_current_user)
263
- ):
 
 
 
 
 
 
 
264
  try:
265
  if len(labels) == 0:
266
- raise HTTPException(status_code=400, detail="No categories in database. Please add some first.")
267
-
268
- file_content = await file.read()
269
 
 
 
 
 
 
270
  if file.content_type and file.content_type.startswith('application/pdf'):
271
  image = image_from_pdf(file_content)
272
  else:
273
  image = Image.open(io.BytesIO(file_content))
274
 
275
  if image is None:
276
- raise HTTPException(status_code=400, detail="Failed to process image")
277
 
278
  embedding = get_clip_embedding(image)
279
  if embedding is None:
280
- raise HTTPException(status_code=400, detail="Failed to generate embedding")
281
-
282
- # Search for top 3 matches
283
  k = min(3, len(labels))
284
  D, I = index.search(np.array([embedding]), k=k)
285
 
@@ -295,137 +401,75 @@ async def classify_document(
295
  sim = 1 - D[0][i]
296
  matches.append({"category": labels[I[0][i]], "similarity": round(sim, 3)})
297
 
298
- # Save classified document
299
  if similarity >= confidence_threshold:
300
  saved_filename = save_uploaded_file(file_content, file.filename)
301
  ocr_text = extract_text(image)
302
 
303
- document = {
304
- "id": str(uuid.uuid4()),
305
- "filename": saved_filename,
306
- "original_filename": file.filename,
307
- "category": best_match,
308
- "similarity": round(similarity, 3),
309
- "ocr_text": ocr_text,
310
- "upload_date": datetime.now().isoformat(),
311
- "file_path": os.path.join(UPLOADS_DIR, saved_filename)
312
- }
313
-
314
- documents.append(document)
315
- save_documents()
316
 
317
- return {
318
  "status": "success",
319
  "category": best_match,
320
  "similarity": round(similarity, 3),
321
- "confidence": "high" if similarity >= confidence_threshold else "low",
322
  "matches": matches,
323
  "document_saved": True,
324
- "document_id": document["id"]
325
- }
326
  else:
327
- return {
328
  "status": "low_confidence",
329
  "category": best_match,
330
  "similarity": round(similarity, 3),
331
  "confidence": "low",
332
  "matches": matches,
333
  "document_saved": False
334
- }
335
 
336
- raise HTTPException(status_code=400, detail="Document not recognized")
337
  except Exception as e:
338
- raise HTTPException(status_code=500, detail=str(e))
339
-
340
- @app.get("/api/categories")
341
- async def get_categories(current_user: dict = Depends(get_current_user)):
342
- categories = list(set(labels)) # Remove duplicates
343
- category_counts = {}
344
- for label in labels:
345
- category_counts[label] = category_counts.get(label, 0) + 1
346
 
347
- return {"categories": categories, "counts": category_counts}
348
-
349
- @app.get("/api/documents/{category}")
350
- async def get_documents_by_category(
351
- category: str,
352
- current_user: dict = Depends(get_current_user)
353
- ):
354
- category_documents = [doc for doc in documents if doc["category"] == category]
355
- return {"documents": category_documents, "count": len(category_documents)}
356
-
357
- @app.get("/api/documents")
358
- async def get_all_documents(current_user: dict = Depends(get_current_user)):
359
- return {"documents": documents, "count": len(documents)}
360
-
361
- @app.delete("/api/documents/{document_id}")
362
- async def delete_document(
363
- document_id: str,
364
- current_user: dict = Depends(get_current_user)
365
- ):
366
- try:
367
- # Find document
368
- document_index = None
369
- document_to_delete = None
370
-
371
- for i, doc in enumerate(documents):
372
- if doc["id"] == document_id:
373
- document_index = i
374
- document_to_delete = doc
375
- break
376
-
377
- if document_to_delete is None:
378
- raise HTTPException(status_code=404, detail="Document not found")
379
-
380
- # Delete physical file
381
- file_path = document_to_delete.get("file_path")
382
- if file_path and os.path.exists(file_path):
383
- os.remove(file_path)
384
-
385
- # Remove from documents list
386
- documents.pop(document_index)
387
- save_documents()
388
-
389
- return {"message": "Document deleted successfully", "status": "success"}
390
- except Exception as e:
391
- raise HTTPException(status_code=500, detail=str(e))
392
-
393
- @app.post("/api/ocr")
394
- async def ocr_document(
395
- file: UploadFile = File(...),
396
- current_user: dict = Depends(get_current_user)
397
- ):
398
- try:
399
- file_content = await file.read()
400
-
401
- if file.content_type and file.content_type.startswith('application/pdf'):
402
- image = image_from_pdf(file_content)
403
- else:
404
- image = Image.open(io.BytesIO(file_content))
405
-
406
- if image is None:
407
- raise HTTPException(status_code=400, detail="Failed to process image")
408
-
409
- text = extract_text(image)
410
- return {"text": text, "status": "success"}
411
- except Exception as e:
412
- raise HTTPException(status_code=500, detail=str(e))
413
-
414
- @app.get("/api/stats")
415
- async def get_stats(current_user: dict = Depends(get_current_user)):
416
- category_stats = {}
417
- for doc in documents:
418
- category = doc["category"]
419
- if category not in category_stats:
420
- category_stats[category] = 0
421
- category_stats[category] += 1
422
 
423
- return {
424
- "total_categories": len(set(labels)),
425
- "total_documents": len(documents),
426
- "category_distribution": category_stats
427
- }
428
 
429
  if __name__ == "__main__":
430
- import uvicorn
431
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ from flask import Flask, request, jsonify, render_template_string, send_from_directory
2
+ from werkzeug.utils import secure_filename
3
+ from werkzeug.security import generate_password_hash, check_password_hash
 
4
  import pytesseract
5
  from PIL import Image
6
  import numpy as np
 
14
  import json
15
  import uuid
16
  from datetime import datetime, timedelta
 
 
17
  import jwt
18
+ import sqlite3
19
+ import tempfile
20
 
21
# Security configuration
# The signing key is taken from the SECRET_KEY environment variable so that a
# deployment does not have to ship the placeholder below. The placeholder is
# kept only as a fallback and must not be relied on in production.
SECRET_KEY = os.environ.get("SECRET_KEY", "your-secret-key-change-this-in-production")
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 30

app = Flask(__name__)
# Flask and the JWT helpers share one key so there is a single secret to rotate.
app.config['SECRET_KEY'] = SECRET_KEY
28
 
29
+ # Set CLIP cache to writable directory
30
+ os.environ['CLIP_CACHE'] = '/app/clip_cache'
31
+ os.makedirs('/app/clip_cache', exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ # Directories
34
  INDEX_PATH = "data/index.faiss"
35
  LABELS_PATH = "data/labels.pkl"
36
+ DATABASE_PATH = "data/documents.db"
37
  UPLOADS_DIR = "data/uploads"
38
 
 
39
  os.makedirs("data", exist_ok=True)
40
  os.makedirs("static", exist_ok=True)
41
  os.makedirs(UPLOADS_DIR, exist_ok=True)
42
 
43
+ # Initialize database
44
def init_db():
    """Create the SQLite schema and seed the default admin account.

    Creates the ``users`` and ``documents`` tables in ``DATABASE_PATH`` when
    they do not exist yet, then inserts an ``admin`` user if no row with that
    username is present. The connection is always closed, even when one of
    the statements raises.
    """
    conn = sqlite3.connect(DATABASE_PATH)
    try:
        cursor = conn.cursor()

        # Users table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS users (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE NOT NULL,
                password_hash TEXT NOT NULL,
                is_active BOOLEAN DEFAULT TRUE
            )
        ''')

        # Documents table
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id TEXT PRIMARY KEY,
                filename TEXT NOT NULL,
                original_filename TEXT NOT NULL,
                category TEXT NOT NULL,
                similarity REAL NOT NULL,
                ocr_text TEXT,
                upload_date TEXT NOT NULL,
                file_path TEXT NOT NULL
            )
        ''')

        # Insert default admin user if not exists.
        # NOTE(review): the seeded password is a well-known default; it should
        # be changed (or supplied from configuration) before any real deployment.
        cursor.execute('SELECT * FROM users WHERE username = ?', ('admin',))
        if not cursor.fetchone():
            admin_hash = generate_password_hash('admin123')
            cursor.execute('INSERT INTO users (username, password_hash) VALUES (?, ?)',
                           ('admin', admin_hash))

        conn.commit()
    finally:
        # Release the database handle on both the success and the error path.
        conn.close()
81
+
82
+ init_db()
83
+
84
+ # Initialize index and labels
85
  index = faiss.IndexFlatL2(512)
86
  labels = []
 
87
 
88
  if os.path.exists(INDEX_PATH) and os.path.exists(LABELS_PATH):
89
  try:
 
91
  with open(LABELS_PATH, "rb") as f:
92
  labels = pickle.load(f)
93
  print(f"✅ Loaded existing index with {len(labels)} labels")
94
+ except Exception as e:
95
  print(f"⚠️ Failed to load existing index: {e}")
 
96
  if os.path.exists(INDEX_PATH):
97
  os.remove(INDEX_PATH)
98
  if os.path.exists(LABELS_PATH):
99
  os.remove(LABELS_PATH)
100
 
101
+ # Initialize CLIP model with custom cache
102
+ device = "cuda" if torch.cuda.is_available() else "cpu"
103
+ try:
104
+ clip_model, preprocess = clip.load("ViT-B/32", device=device, download_root='/app/clip_cache')
105
+ print("✅ CLIP model loaded successfully")
106
+ except Exception as e:
107
+ print(f"❌ Failed to load CLIP model: {e}")
108
+ # Fallback initialization
109
+ clip_model = None
110
+ preprocess = None
111
+
112
+ # Helper functions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
def save_index():
    """Persist the FAISS index and the parallel label list to disk.

    Writes ``index`` to ``INDEX_PATH`` and pickles ``labels`` to
    ``LABELS_PATH``. This is best-effort: any failure is reported on stdout
    and otherwise ignored.
    """
    try:
        faiss.write_index(index, INDEX_PATH)
        with open(LABELS_PATH, "wb") as labels_file:
            labels_file.write(pickle.dumps(labels))
    except Exception as err:
        print(f"❌ Failed to save index: {err}")
120
 
121
def authenticate_user(username: str, password: str):
    """Check a username/password pair against the ``users`` table.

    Args:
        username: Login name submitted by the client; may be ``None`` when the
            form field was absent.
        password: Plain-text password submitted by the client; may be ``None``
            when the form field was absent.

    Returns:
        ``{"username": username}`` when an active user with a matching
        password hash exists, otherwise ``None``.
    """
    # Absent form fields arrive as None. Reject them here rather than passing
    # None on to the hash comparison, which expects a string.
    if username is None or password is None:
        return None

    conn = sqlite3.connect(DATABASE_PATH)
    try:
        row = conn.execute(
            'SELECT password_hash FROM users WHERE username = ? AND is_active = TRUE',
            (username,),
        ).fetchone()
    finally:
        # Close the handle even if the query raises.
        conn.close()

    if row and check_password_hash(row[0], password):
        return {"username": username}
    return None
131
+
132
def create_access_token(data: dict):
    """Return a signed JWT carrying ``data`` plus an ``exp`` claim.

    The expiry is ``ACCESS_TOKEN_EXPIRE_MINUTES`` after the current UTC time.
    The token is signed with ``SECRET_KEY`` using ``ALGORITHM``. The ``data``
    mapping passed in is not modified.
    """
    lifetime = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    claims = {**data, "exp": datetime.utcnow() + lifetime}
    return jwt.encode(claims, SECRET_KEY, algorithm=ALGORITHM)
137
+
138
def verify_token(token: str):
    """Decode a JWT and return its ``sub`` claim.

    Returns ``None`` when the token cannot be decoded with ``SECRET_KEY`` /
    ``ALGORITHM`` or when the ``sub`` claim is missing or empty.
    """
    try:
        claims = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
    except jwt.PyJWTError:
        return None
    return claims.get("sub") or None
145
 
146
  def image_from_pdf(pdf_bytes):
147
  try:
 
153
 
154
  def extract_text(image):
155
  try:
 
 
 
 
 
 
 
 
156
  if image.mode != 'RGB':
157
  image = image.convert('RGB')
 
158
  custom_config = r'--oem 3 --psm 6'
159
  text = pytesseract.image_to_string(image, config=custom_config)
160
  return text.strip() if text.strip() else "❓ No text detected"
 
163
 
164
  def get_clip_embedding(image):
165
  try:
166
+ if clip_model is None:
167
  return None
 
 
 
 
 
 
168
  if image.mode != 'RGB':
169
  image = image.convert('RGB')
 
170
  image_input = preprocess(image).unsqueeze(0).to(device)
171
  with torch.no_grad():
172
  image_features = clip_model.encode_image(image_input)
 
187
 
188
  return saved_filename
189
 
190
+ # Routes
191
@app.route("/")
def dashboard():
    """Serve the single-page dashboard (login, add category, classify).

    The page is a static inline template. Its script logs in through
    ``/api/login``, keeps the bearer token in memory, and posts files to
    ``/api/upload-category`` and ``/api/classify-document``.
    """
    # Two deliberate points in the markup/script below:
    #  * the #result box sits outside #mainContent, so a failed login (which
    #    leaves #mainContent hidden) can still show its message;
    #  * responses are written with textContent rather than innerHTML because
    #    they echo user-supplied labels and filenames.
    return render_template_string('''
<!DOCTYPE html>
<html>
<head>
    <title>Document Classification System</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; }
        .container { max-width: 800px; margin: 0 auto; }
        .form-group { margin: 20px 0; }
        input, button { padding: 10px; margin: 5px; }
        button { background: #007bff; color: white; border: none; cursor: pointer; }
        .result { margin: 20px 0; padding: 10px; background: #f8f9fa; border: 1px solid #dee2e6; }
    </style>
</head>
<body>
    <div class="container">
        <h1>Document Classification System</h1>

        <div class="form-group">
            <h3>Login</h3>
            <form id="loginForm">
                <input type="text" id="username" placeholder="Username" required>
                <input type="password" id="password" placeholder="Password" required>
                <button type="submit">Login</button>
            </form>
        </div>

        <div id="mainContent" style="display:none;">
            <div class="form-group">
                <h3>Upload Category</h3>
                <form id="categoryForm" enctype="multipart/form-data">
                    <input type="file" id="categoryFile" accept="image/*,.pdf" required>
                    <input type="text" id="categoryLabel" placeholder="Category Label" required>
                    <button type="submit">Add Category</button>
                </form>
            </div>

            <div class="form-group">
                <h3>Classify Document</h3>
                <form id="classifyForm" enctype="multipart/form-data">
                    <input type="file" id="classifyFile" accept="image/*,.pdf" required>
                    <button type="submit">Classify</button>
                </form>
            </div>
        </div>

        <div id="result" class="result" style="display:none;"></div>
    </div>

    <script>
        let token = null;
        const resultBox = document.getElementById('result');

        function showResult(text) {
            resultBox.textContent = text;
            resultBox.style.display = 'block';
        }

        document.getElementById('loginForm').onsubmit = async (e) => {
            e.preventDefault();
            const formData = new FormData();
            formData.append('username', document.getElementById('username').value);
            formData.append('password', document.getElementById('password').value);

            const response = await fetch('/api/login', {
                method: 'POST',
                body: formData
            });

            const result = await response.json();
            if (response.ok) {
                token = result.access_token;
                document.getElementById('mainContent').style.display = 'block';
                showResult('Login successful!');
            } else {
                showResult('Login failed: ' + result.detail);
            }
        };

        document.getElementById('categoryForm').onsubmit = async (e) => {
            e.preventDefault();
            const formData = new FormData();
            formData.append('file', document.getElementById('categoryFile').files[0]);
            formData.append('label', document.getElementById('categoryLabel').value);

            const response = await fetch('/api/upload-category', {
                method: 'POST',
                body: formData,
                headers: {'Authorization': 'Bearer ' + token}
            });

            const result = await response.json();
            showResult(JSON.stringify(result, null, 2));
        };

        document.getElementById('classifyForm').onsubmit = async (e) => {
            e.preventDefault();
            const formData = new FormData();
            formData.append('file', document.getElementById('classifyFile').files[0]);

            const response = await fetch('/api/classify-document', {
                method: 'POST',
                body: formData,
                headers: {'Authorization': 'Bearer ' + token}
            });

            const result = await response.json();
            showResult(JSON.stringify(result, null, 2));
        };
    </script>
</body>
</html>
''')
304
+
305
@app.route("/api/login", methods=["POST"])
def login():
    """Exchange form-encoded credentials for a bearer token.

    Reads ``username`` and ``password`` from the posted form. On success it
    returns the access token, the token type and the username as JSON; on
    failure it returns a 401 with a ``detail`` message.
    """
    form = request.form
    user = authenticate_user(form.get("username"), form.get("password"))
    if user:
        access_token = create_access_token(data={"sub": user["username"]})
        return jsonify({
            "access_token": access_token,
            "token_type": "bearer",
            "username": user["username"],
        })
    return jsonify({"detail": "Incorrect username or password"}), 401
316
+
317
@app.route("/api/upload-category", methods=["POST"])
def upload_category():
    """Register an example image (or PDF) under a category label.

    Requires an ``Authorization: Bearer <token>`` header. Expects a multipart
    form with a ``file`` part and a non-blank ``label`` field. The file is
    turned into an image, embedded with ``get_clip_embedding``, added to the
    FAISS ``index``, and the label is appended to ``labels`` before both are
    saved with ``save_index``.

    Returns:
        JSON with a success message, or a JSON ``error`` with status 401
        (bad or missing token), 400 (bad input) or 500 (unexpected failure).
    """
    # Verify token
    auth_header = request.headers.get('Authorization')
    if not auth_header or not auth_header.startswith('Bearer '):
        return jsonify({"error": "Missing or invalid token"}), 401

    token = auth_header.split(' ')[1]
    if not verify_token(token):
        return jsonify({"error": "Invalid token"}), 401

    try:
        # Strip before validating so a whitespace-only label is rejected
        # instead of being stored as an empty category name.
        label = (request.form.get("label") or "").strip()
        file = request.files.get("file")
        if not label or not file:
            return jsonify({"error": "Missing label or file"}), 400

        file_content = file.read()
        if file.content_type and file.content_type.startswith('application/pdf'):
            image = image_from_pdf(file_content)
        else:
            image = Image.open(io.BytesIO(file_content))

        if image is None:
            return jsonify({"error": "Failed to process image"}), 400

        embedding = get_clip_embedding(image)
        if embedding is None:
            return jsonify({"error": "Failed to generate embedding"}), 400

        # The index row and the labels entry must stay in lockstep: the
        # position of a vector in the index is its position in ``labels``.
        index.add(np.array([embedding]))
        labels.append(label)
        save_index()

        return jsonify({"message": f"✅ Added category '{label}' (Total: {len(labels)} categories)", "status": "success"})
    except Exception as e:
        return jsonify({"error": str(e)}), 500
355
+
356
+ @app.route("/api/classify-document", methods=["POST"])
357
+ def classify_document():
358
+ # Verify token
359
+ auth_header = request.headers.get('Authorization')
360
+ if not auth_header or not auth_header.startswith('Bearer '):
361
+ return jsonify({"error": "Missing or invalid token"}), 401
362
+
363
+ token = auth_header.split(' ')[1]
364
+ username = verify_token(token)
365
+ if not username:
366
+ return jsonify({"error": "Invalid token"}), 401
367
+
368
  try:
369
  if len(labels) == 0:
370
+ return jsonify({"error": "No categories in database. Please add some first."}), 400
 
 
371
 
372
+ file = request.files.get("file")
373
+ if not file:
374
+ return jsonify({"error": "Missing file"}), 400
375
+
376
+ file_content = file.read()
377
  if file.content_type and file.content_type.startswith('application/pdf'):
378
  image = image_from_pdf(file_content)
379
  else:
380
  image = Image.open(io.BytesIO(file_content))
381
 
382
  if image is None:
383
+ return jsonify({"error": "Failed to process image"}), 400
384
 
385
  embedding = get_clip_embedding(image)
386
  if embedding is None:
387
+ return jsonify({"error": "Failed to generate embedding"}), 400
388
+
 
389
  k = min(3, len(labels))
390
  D, I = index.search(np.array([embedding]), k=k)
391
 
 
401
  sim = 1 - D[0][i]
402
  matches.append({"category": labels[I[0][i]], "similarity": round(sim, 3)})
403
 
404
+ # Save classified document to SQLite
405
  if similarity >= confidence_threshold:
406
  saved_filename = save_uploaded_file(file_content, file.filename)
407
  ocr_text = extract_text(image)
408
 
409
+ document_id = str(uuid.uuid4())
410
+ conn = sqlite3.connect(DATABASE_PATH)
411
+ cursor = conn.cursor()
412
+ cursor.execute('''
413
+ INSERT INTO documents (id, filename, original_filename, category, similarity, ocr_text, upload_date, file_path)
414
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
415
+ ''', (document_id, saved_filename, file.filename, best_match, round(similarity, 3),
416
+ ocr_text, datetime.now().isoformat(), os.path.join(UPLOADS_DIR, saved_filename)))
417
+ conn.commit()
418
+ conn.close()
 
 
 
419
 
420
+ return jsonify({
421
  "status": "success",
422
  "category": best_match,
423
  "similarity": round(similarity, 3),
424
+ "confidence": "high",
425
  "matches": matches,
426
  "document_saved": True,
427
+ "document_id": document_id
428
+ })
429
  else:
430
+ return jsonify({
431
  "status": "low_confidence",
432
  "category": best_match,
433
  "similarity": round(similarity, 3),
434
  "confidence": "low",
435
  "matches": matches,
436
  "document_saved": False
437
+ })
438
 
439
+ return jsonify({"error": "Document not recognized"}), 400
440
  except Exception as e:
441
+ return jsonify({"error": str(e)}), 500
442
+
443
@app.route("/api/documents", methods=["GET"])
def get_all_documents():
    """Return every stored document record, newest upload first.

    Requires an ``Authorization: Bearer <token>`` header.

    Returns:
        JSON ``{"documents": [...], "count": n}``, or a JSON ``error`` with
        status 401 when the token is missing or invalid.
    """
    # Verify token
    auth_header = request.headers.get('Authorization')
    if not auth_header or not auth_header.startswith('Bearer '):
        return jsonify({"error": "Missing or invalid token"}), 401

    token = auth_header.split(' ')[1]
    if not verify_token(token):
        return jsonify({"error": "Invalid token"}), 401

    # Columns are listed explicitly (not SELECT *) so the mapping below does
    # not depend on the physical column order of the table.
    columns = (
        "id",
        "filename",
        "original_filename",
        "category",
        "similarity",
        "ocr_text",
        "upload_date",
        "file_path",
    )
    conn = sqlite3.connect(DATABASE_PATH)
    try:
        rows = conn.execute(
            'SELECT id, filename, original_filename, category, similarity, '
            'ocr_text, upload_date, file_path '
            'FROM documents ORDER BY upload_date DESC'
        ).fetchall()
    finally:
        # Close the handle even if the query raises.
        conn.close()

    documents = [dict(zip(columns, row)) for row in rows]
    return jsonify({"documents": documents, "count": len(documents)})
 
 
 
 
473
 
474
if __name__ == "__main__":
    # The Werkzeug interactive debugger lets a browser client execute code on
    # the server, so it must not be on unconditionally for a server bound to
    # 0.0.0.0. Opt in with FLASK_DEBUG=1 for local development only.
    app.run(host="0.0.0.0", port=7860, debug=os.environ.get("FLASK_DEBUG") == "1")
 
clip_cache/text.txt ADDED
File without changes
data/text.txt ADDED
File without changes
requirements.txt CHANGED
@@ -1,20 +1,11 @@
1
- fastapi
2
- uvicorn[standard]
3
- python-multipart
4
- python-jose[cryptography]
5
- passlib[bcrypt]
6
- bcrypt
7
- gradio
8
- faiss-cpu
9
  pytesseract
10
  pdf2image
11
- sentence-transformers
12
  torch
13
  torchvision
14
  Pillow
15
- ftfy
16
- regex
17
- tqdm
18
  git+https://github.com/openai/CLIP.git
19
- poppler-utils
20
- jwt
 
1
+ flask
2
+ werkzeug
 
 
 
 
 
 
3
  pytesseract
4
  pdf2image
5
+ faiss-cpu
6
  torch
7
  torchvision
8
  Pillow
9
+ PyJWT
 
 
10
  git+https://github.com/openai/CLIP.git
11
+ poppler-utils