Spaces:
Runtime error
Runtime error
Harsh Upadhyay
committed on
Commit
·
fca1742
1
Parent(s):
aac0325
removing local file support and adding SQLAlchemy support for all database operations.
Browse files- backend/.gitignore +1 -0
- backend/app/app.py +4 -0
- backend/app/database.py +29 -13
- backend/app/routes/routes.py +119 -56
- backend/app/utils/extract_text.py +20 -4
backend/.gitignore
CHANGED
@@ -13,6 +13,7 @@ env/
|
|
13 |
venv/
|
14 |
instance/
|
15 |
*.db
|
|
|
16 |
|
17 |
# OS/Editor
|
18 |
.DS_Store
|
|
|
13 |
venv/
|
14 |
instance/
|
15 |
*.db
|
16 |
+
*.env
|
17 |
|
18 |
# OS/Editor
|
19 |
.DS_Store
|
backend/app/app.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from dotenv import load_dotenv
|
3 |
+
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
|
4 |
+
# ... existing code ...
|
backend/app/database.py
CHANGED
@@ -1,22 +1,23 @@
|
|
1 |
# All sqlite3 and local DB logic will be removed and replaced with SQLAlchemy/Postgres in the next step.
|
2 |
# This file will be refactored to use SQLAlchemy models and sessions.
|
3 |
|
4 |
-
from sqlalchemy import create_engine, Column, Integer, String, Text, Float, ForeignKey, DateTime
|
5 |
from sqlalchemy.orm import declarative_base, sessionmaker, relationship
|
6 |
from sqlalchemy.sql import func
|
7 |
import os
|
8 |
from sqlalchemy.exc import IntegrityError
|
9 |
from werkzeug.security import check_password_hash, generate_password_hash
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
DATABASE_URL
|
13 |
-
"sqlite:///" + os.path.join(os.path.dirname(os.path.abspath(__file__)), 'legal_docs.db')
|
14 |
-
)
|
15 |
|
16 |
-
|
|
|
17 |
|
18 |
if not DATABASE_URL or DATABASE_URL.strip() == "":
|
19 |
-
raise ValueError("DATABASE_URL is not set or is empty. Please set it as an environment variable or
|
20 |
|
21 |
engine = create_engine(DATABASE_URL)
|
22 |
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
@@ -45,7 +46,8 @@ class Document(Base):
|
|
45 |
clauses = Column(Text)
|
46 |
features = Column(Text)
|
47 |
context_analysis = Column(Text)
|
48 |
-
|
|
|
49 |
upload_time = Column(DateTime(timezone=True), server_default=func.now())
|
50 |
user_id = Column(Integer, ForeignKey('users.id'))
|
51 |
user = relationship('User', back_populates='documents')
|
@@ -71,7 +73,7 @@ def get_db_session():
|
|
71 |
return SessionLocal()
|
72 |
|
73 |
# --- Document CRUD ---
|
74 |
-
def save_document(title, full_text, summary, clauses, features, context_analysis,
|
75 |
session = get_db_session()
|
76 |
try:
|
77 |
doc = Document(
|
@@ -81,7 +83,8 @@ def save_document(title, full_text, summary, clauses, features, context_analysis
|
|
81 |
clauses=str(clauses),
|
82 |
features=str(features),
|
83 |
context_analysis=str(context_analysis),
|
84 |
-
|
|
|
85 |
user_id=user_id
|
86 |
)
|
87 |
session.add(doc)
|
@@ -104,6 +107,9 @@ def get_all_documents(user_id=None):
|
|
104 |
for doc in documents:
|
105 |
d = doc.__dict__.copy()
|
106 |
d.pop('_sa_instance_state', None)
|
|
|
|
|
|
|
107 |
result.append(d)
|
108 |
return result
|
109 |
finally:
|
@@ -119,6 +125,8 @@ def get_document_by_id(doc_id, user_id=None):
|
|
119 |
if doc:
|
120 |
d = doc.__dict__.copy()
|
121 |
d.pop('_sa_instance_state', None)
|
|
|
|
|
122 |
return d
|
123 |
return None
|
124 |
finally:
|
@@ -128,11 +136,10 @@ def delete_document(doc_id):
|
|
128 |
session = get_db_session()
|
129 |
try:
|
130 |
doc = session.query(Document).filter(Document.id == doc_id).first()
|
131 |
-
file_path = doc.file_path if doc else None
|
132 |
if doc:
|
133 |
session.delete(doc)
|
134 |
session.commit()
|
135 |
-
return
|
136 |
finally:
|
137 |
session.close()
|
138 |
|
@@ -172,13 +179,22 @@ def search_questions_answers(query, user_id=None):
|
|
172 |
'document_id': row.document_id,
|
173 |
'question': row.question,
|
174 |
'answer': row.answer,
|
175 |
-
'created_at': row.created_at
|
176 |
})
|
177 |
return results
|
178 |
finally:
|
179 |
session.close()
|
180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
def save_question_answer(document_id, user_id, question, answer, score):
|
|
|
|
|
182 |
session = get_db_session()
|
183 |
try:
|
184 |
qa = QuestionAnswer(
|
|
|
1 |
# All sqlite3 and local DB logic will be removed and replaced with SQLAlchemy/Postgres in the next step.
|
2 |
# This file will be refactored to use SQLAlchemy models and sessions.
|
3 |
|
4 |
+
from sqlalchemy import create_engine, Column, Integer, String, Text, Float, ForeignKey, DateTime, LargeBinary
|
5 |
from sqlalchemy.orm import declarative_base, sessionmaker, relationship
|
6 |
from sqlalchemy.sql import func
|
7 |
import os
|
8 |
from sqlalchemy.exc import IntegrityError
|
9 |
from werkzeug.security import check_password_hash, generate_password_hash
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
import re
|
12 |
|
13 |
+
# Load environment variables from the .env file located one directory above
# this module before anything reads os.environ.
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))

# SQLAlchemy setup
DATABASE_URL = os.environ.get('DATABASE_URL')

# Report only whether the URL is configured. Database URLs typically embed the
# user name and password, so the value itself must never be written to stdout.
print("DEBUG: DATABASE_URL configured:", bool(DATABASE_URL and DATABASE_URL.strip()))

# Fail fast at import time: nothing below can work without a connection URL.
if not DATABASE_URL or not DATABASE_URL.strip():
    raise ValueError("DATABASE_URL is not set or is empty. Please set it as an environment variable or in your .env file for NeonDB.")

engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
|
|
46 |
clauses = Column(Text)
|
47 |
features = Column(Text)
|
48 |
context_analysis = Column(Text)
|
49 |
+
file_data = Column(LargeBinary) # Store file content in DB
|
50 |
+
file_size = Column(Integer) # Add this
|
51 |
upload_time = Column(DateTime(timezone=True), server_default=func.now())
|
52 |
user_id = Column(Integer, ForeignKey('users.id'))
|
53 |
user = relationship('User', back_populates='documents')
|
|
|
73 |
return SessionLocal()
|
74 |
|
75 |
# --- Document CRUD ---
|
76 |
+
def save_document(title, full_text, summary, clauses, features, context_analysis, file_data, user_id):
|
77 |
session = get_db_session()
|
78 |
try:
|
79 |
doc = Document(
|
|
|
83 |
clauses=str(clauses),
|
84 |
features=str(features),
|
85 |
context_analysis=str(context_analysis),
|
86 |
+
file_data=file_data,
|
87 |
+
file_size=len(file_data) if file_data else 0, # Store file size
|
88 |
user_id=user_id
|
89 |
)
|
90 |
session.add(doc)
|
|
|
107 |
for doc in documents:
|
108 |
d = doc.__dict__.copy()
|
109 |
d.pop('_sa_instance_state', None)
|
110 |
+
d.pop('file_data', None) # Don't return file data in list
|
111 |
+
# Do NOT pop 'summary'; keep it in the result
|
112 |
+
# file_size is included
|
113 |
result.append(d)
|
114 |
return result
|
115 |
finally:
|
|
|
125 |
if doc:
|
126 |
d = doc.__dict__.copy()
|
127 |
d.pop('_sa_instance_state', None)
|
128 |
+
# Don't return file_data by default
|
129 |
+
d.pop('file_data', None)
|
130 |
return d
|
131 |
return None
|
132 |
finally:
|
|
|
136 |
session = get_db_session()
|
137 |
try:
|
138 |
doc = session.query(Document).filter(Document.id == doc_id).first()
|
|
|
139 |
if doc:
|
140 |
session.delete(doc)
|
141 |
session.commit()
|
142 |
+
return True
|
143 |
finally:
|
144 |
session.close()
|
145 |
|
|
|
179 |
'document_id': row.document_id,
|
180 |
'question': row.question,
|
181 |
'answer': row.answer,
|
182 |
+
'created_at': row.created_at.isoformat() if row.created_at else None,
|
183 |
})
|
184 |
return results
|
185 |
finally:
|
186 |
session.close()
|
187 |
|
188 |
+
def clean_answer(answer):
    """Normalize a question-answering result before it is stored.

    Removes parenthesised numeric markers such as ``(3)``, collapses every
    run of whitespace into a single space, and strips leading/trailing
    spaces and ``, . ; :`` punctuation.

    Args:
        answer: Raw answer text. ``None`` is treated as an empty answer so
            that a missing answer does not raise ``TypeError`` in ``re.sub``.

    Returns:
        The cleaned answer string (``''`` when ``answer`` is ``None``).
    """
    if answer is None:
        return ''
    # Drop markers like "(3)" first so the gap they leave is collapsed below.
    answer = re.sub(r'\(\d+\)', '', answer)
    answer = re.sub(r'\s+', ' ', answer)
    return answer.strip(' ,.;:')
|
194 |
+
|
195 |
def save_question_answer(document_id, user_id, question, answer, score):
|
196 |
+
score = float(score) # Convert np.float64 to Python float
|
197 |
+
answer = clean_answer(answer) # Clean up answer format
|
198 |
session = get_db_session()
|
199 |
try:
|
200 |
qa = QuestionAnswer(
|
backend/app/routes/routes.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
import os
|
2 |
-
from flask import Blueprint, request, jsonify,
|
3 |
from werkzeug.utils import secure_filename
|
4 |
from app.utils.extract_text import extract_text_from_pdf
|
5 |
from app.utils.summarizer import generate_summary
|
6 |
from app.utils.clause_detector import detect_clauses
|
7 |
-
from app.database import save_document, delete_document
|
8 |
from app.database import get_all_documents, get_document_by_id
|
9 |
from app.database import search_documents, save_question_answer, search_questions_answers
|
10 |
from app.nlp.qa import answer_question
|
@@ -20,7 +20,14 @@ import textract
|
|
20 |
from app.database import get_user_profile, update_user_profile, change_user_password
|
21 |
from app.database import SessionLocal, User
|
22 |
from sqlalchemy.exc import IntegrityError
|
23 |
-
from sqlalchemy import or_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
main = Blueprint("main", __name__)
|
26 |
|
@@ -29,12 +36,7 @@ enhanced_legal_processor = EnhancedLegalProcessor()
|
|
29 |
legal_domain_processor = LegalDomainFeatures()
|
30 |
context_processor = ContextUnderstanding()
|
31 |
|
32 |
-
|
33 |
-
UPLOAD_FOLDER = os.path.join(BASE_DIR, 'uploads')
|
34 |
-
|
35 |
-
# Ensure the upload folder exists
|
36 |
-
if not os.path.exists(UPLOAD_FOLDER):
|
37 |
-
os.makedirs(UPLOAD_FOLDER)
|
38 |
|
39 |
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
|
40 |
|
@@ -74,8 +76,7 @@ def upload_file():
|
|
74 |
if not (file.filename.lower().endswith('.pdf')):
|
75 |
return jsonify({'error': 'File type not allowed. Only PDF files are supported.'}), 400
|
76 |
filename = secure_filename(file.filename)
|
77 |
-
|
78 |
-
file.save(file_path)
|
79 |
identity = get_jwt_identity()
|
80 |
user_id = get_user_id_by_username(identity)
|
81 |
if not user_id:
|
@@ -87,7 +88,7 @@ def upload_file():
|
|
87 |
clauses="[]",
|
88 |
features="{}",
|
89 |
context_analysis="{}",
|
90 |
-
|
91 |
user_id=user_id
|
92 |
)
|
93 |
return jsonify({
|
@@ -103,9 +104,27 @@ def upload_file():
|
|
103 |
@main.route('/documents', methods=['GET'])
|
104 |
@jwt_required()
|
105 |
def list_documents():
|
|
|
|
|
|
|
106 |
try:
|
107 |
-
|
108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
109 |
except Exception as e:
|
110 |
logging.error(f"Error listing documents: {str(e)}", exc_info=True)
|
111 |
return jsonify({"error": str(e)}), 500
|
@@ -123,31 +142,49 @@ def get_document(doc_id):
|
|
123 |
logging.error(f"Error getting document {doc_id}: {str(e)}", exc_info=True)
|
124 |
return jsonify({"error": str(e)}), 500
|
125 |
|
126 |
-
@main.route('/documents/download/<
|
127 |
@jwt_required()
|
128 |
-
def download_document(
|
129 |
try:
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
except Exception as e:
|
132 |
-
logging.error(f"Error downloading file
|
133 |
return jsonify({"error": f"Error downloading file: {str(e)}"}), 500
|
134 |
|
135 |
-
@main.route('/documents/view/<
|
136 |
@jwt_required()
|
137 |
-
def view_document(
|
138 |
try:
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
except Exception as e:
|
141 |
-
logging.error(f"Error viewing file
|
142 |
return jsonify({"error": f"Error viewing file: {str(e)}"}), 500
|
143 |
|
144 |
@main.route('/documents/<int:doc_id>', methods=['DELETE'])
|
145 |
@jwt_required()
|
146 |
def delete_document_route(doc_id):
|
147 |
try:
|
148 |
-
|
149 |
-
if file_path_to_delete and os.path.exists(file_path_to_delete):
|
150 |
-
os.remove(file_path_to_delete)
|
151 |
return jsonify({"success": True, "message": "Document deleted successfully"}), 200
|
152 |
except Exception as e:
|
153 |
logging.error(f"Error deleting document {doc_id}: {str(e)}", exc_info=True)
|
@@ -207,30 +244,31 @@ def login():
|
|
207 |
@jwt_required()
|
208 |
def process_document(doc_id):
|
209 |
try:
|
210 |
-
|
211 |
-
|
|
|
|
|
212 |
return jsonify({'error': 'Document not found'}), 404
|
213 |
-
|
214 |
-
|
|
|
|
|
|
|
215 |
if not text:
|
|
|
216 |
return jsonify({'error': 'Could not extract text from file'}), 400
|
217 |
summary = generate_summary(text)
|
218 |
clauses = detect_clauses(text)
|
219 |
features = legal_domain_processor.process_legal_document(text)
|
220 |
context_analysis = context_processor.analyze_context(text)
|
221 |
# Update the document with processed content
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
doc.features = str(features)
|
230 |
-
doc.context_analysis = str(context_analysis)
|
231 |
-
session.commit()
|
232 |
-
finally:
|
233 |
-
session.close()
|
234 |
return jsonify({
|
235 |
'message': 'Document processed successfully',
|
236 |
'document_id': doc_id,
|
@@ -244,30 +282,41 @@ def process_document(doc_id):
|
|
244 |
@jwt_required()
|
245 |
def generate_document_summary(doc_id):
|
246 |
try:
|
247 |
-
|
|
|
248 |
if not doc:
|
|
|
249 |
return jsonify({"error": "Document not found"}), 404
|
250 |
-
summary = doc.
|
251 |
if summary and summary.strip() and summary != 'Processing...':
|
|
|
252 |
return jsonify({"summary": summary}), 200
|
253 |
-
|
254 |
-
|
255 |
return jsonify({"error": "File not found for this document"}), 404
|
256 |
-
text
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
if not text.strip():
|
|
|
258 |
return jsonify({"error": "No text available for summarization"}), 400
|
259 |
-
summary = generate_summary(text)
|
260 |
-
# Save the summary to the database
|
261 |
-
session = SessionLocal()
|
262 |
try:
|
263 |
-
|
264 |
-
|
265 |
-
document.summary = summary
|
266 |
-
session.commit()
|
267 |
-
finally:
|
268 |
session.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
269 |
return jsonify({"summary": summary}), 200
|
270 |
except Exception as e:
|
|
|
271 |
return jsonify({"error": f"Error generating summary: {str(e)}"}), 500
|
272 |
|
273 |
@main.route('/ask-question', methods=['POST', 'OPTIONS'])
|
@@ -390,10 +439,24 @@ def dashboard_stats():
|
|
390 |
processed_documents = sum(1 for doc in documents if doc.get('summary') and doc.get('summary') != 'Processing...')
|
391 |
pending_analysis = total_documents - processed_documents
|
392 |
qa_results = search_questions_answers('', user_id=user_id)
|
393 |
-
from datetime import datetime, timedelta
|
394 |
now = datetime.utcnow()
|
395 |
last_30_days = now - timedelta(days=30)
|
396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
397 |
return jsonify({
|
398 |
'total_documents': total_documents,
|
399 |
'processed_documents': processed_documents,
|
|
|
1 |
import os
|
2 |
+
from flask import Blueprint, request, jsonify, send_file
|
3 |
from werkzeug.utils import secure_filename
|
4 |
from app.utils.extract_text import extract_text_from_pdf
|
5 |
from app.utils.summarizer import generate_summary
|
6 |
from app.utils.clause_detector import detect_clauses
|
7 |
+
from app.database import save_document, delete_document, Document
|
8 |
from app.database import get_all_documents, get_document_by_id
|
9 |
from app.database import search_documents, save_question_answer, search_questions_answers
|
10 |
from app.nlp.qa import answer_question
|
|
|
20 |
from app.database import get_user_profile, update_user_profile, change_user_password
|
21 |
from app.database import SessionLocal, User
|
22 |
from sqlalchemy.exc import IntegrityError
|
23 |
+
from sqlalchemy import or_, Index
|
24 |
+
import io
|
25 |
+
from datetime import datetime, timedelta, timezone
|
26 |
+
from sqlalchemy import Column, Integer, String, Text, DateTime, LargeBinary, func
|
27 |
+
from sqlalchemy.orm import relationship
|
28 |
+
from sqlalchemy.ext.declarative import declarative_base
|
29 |
+
from sqlalchemy import create_engine
|
30 |
+
from sqlalchemy.pool import NullPool
|
31 |
|
32 |
main = Blueprint("main", __name__)
|
33 |
|
|
|
36 |
legal_domain_processor = LegalDomainFeatures()
|
37 |
context_processor = ContextUnderstanding()
|
38 |
|
39 |
+
# Uploaded files are stored in the database (Document.file_data); there is no UPLOAD_FOLDER, file_path, or local file logic.
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
|
42 |
|
|
|
76 |
if not (file.filename.lower().endswith('.pdf')):
|
77 |
return jsonify({'error': 'File type not allowed. Only PDF files are supported.'}), 400
|
78 |
filename = secure_filename(file.filename)
|
79 |
+
file_content = file.read() # Read file content as bytes
|
|
|
80 |
identity = get_jwt_identity()
|
81 |
user_id = get_user_id_by_username(identity)
|
82 |
if not user_id:
|
|
|
88 |
clauses="[]",
|
89 |
features="{}",
|
90 |
context_analysis="{}",
|
91 |
+
file_data=file_content, # Store file in DB
|
92 |
user_id=user_id
|
93 |
)
|
94 |
return jsonify({
|
|
|
104 |
@main.route('/documents', methods=['GET'])
@jwt_required()
def list_documents():
    """Return one page of the authenticated user's documents, newest first.

    Query parameters:
        page: 1-based page number (default 1). Non-integer values fall back
            to the default; values below 1 are clamped to 1.
        limit: Maximum number of documents to return (default 20).
            Non-integer values fall back to the default; negative values are
            clamped to 0.

    Returns:
        A JSON list of objects with ``id``, ``title``, ``summary``,
        ``file_size``, ``upload_time`` (ISO 8601 or ``None``) and ``type``
        (upper-cased title extension or ``'UNKNOWN'``) with status 200, or
        ``{"error": ...}`` with status 500 on failure.
    """
    # type=int makes the lookup return the default instead of raising when the
    # client sends a non-numeric value.
    page = max(request.args.get('page', 1, type=int), 1)
    limit = max(request.args.get('limit', 20, type=int), 0)
    offset = (page - 1) * limit
    try:
        identity = get_jwt_identity()
        user_id = get_user_id_by_username(identity)
        session = SessionLocal()
        try:
            documents = (
                session.query(Document)
                .filter(Document.user_id == user_id)
                .order_by(Document.upload_time.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )
            result = []
            for doc in documents:
                # A missing title must not break the extension lookup.
                title = doc.title or ''
                result.append({
                    'id': doc.id,
                    'title': doc.title,
                    'summary': doc.summary,
                    'file_size': doc.file_size,
                    'upload_time': doc.upload_time.isoformat() if doc.upload_time else None,
                    'type': title.split('.')[-1].upper() if '.' in title else 'UNKNOWN',
                })
        finally:
            # Always release the connection, including when the query fails.
            session.close()
        return jsonify(result), 200
    except Exception as e:
        logging.error(f"Error listing documents: {str(e)}", exc_info=True)
        return jsonify({"error": str(e)}), 500
|
|
|
142 |
logging.error(f"Error getting document {doc_id}: {str(e)}", exc_info=True)
|
143 |
return jsonify({"error": str(e)}), 500
|
144 |
|
145 |
+
@main.route('/documents/download/<int:doc_id>', methods=['GET'])
@jwt_required()
def download_document(doc_id):
    """Send the stored file of a document as an attachment download.

    The lookup is restricted to documents owned by the authenticated user so
    that one user cannot fetch another user's file by guessing its id.

    Args:
        doc_id: Primary key of the document, taken from the URL.

    Returns:
        The file bytes as an ``application/pdf`` attachment named after the
        document title; ``{"error": "File not found"}`` with status 404 when
        the document does not exist, is not owned by the caller, or has no
        stored file; ``{"error": ...}`` with status 500 on any other failure.
    """
    try:
        user_id = get_user_id_by_username(get_jwt_identity())
        if not user_id:
            return jsonify({"error": "File not found"}), 404
        session = SessionLocal()
        try:
            doc = (
                session.query(Document)
                .filter(Document.id == doc_id, Document.user_id == user_id)
                .first()
            )
            # Copy the needed values while the session is still open.
            file_data = doc.file_data if doc else None
            title = doc.title if doc else None
        finally:
            # Release the connection even when the query raises.
            session.close()
        if not file_data:
            return jsonify({"error": "File not found"}), 404
        return send_file(
            io.BytesIO(file_data),
            as_attachment=True,
            download_name=title,
            mimetype='application/pdf'
        )
    except Exception as e:
        logging.error(f"Error downloading file: {str(e)}", exc_info=True)
        return jsonify({"error": f"Error downloading file: {str(e)}"}), 500
|
163 |
|
164 |
+
@main.route('/documents/view/<int:doc_id>', methods=['GET'])
@jwt_required()
def view_document(doc_id):
    """Send the stored file of a document for inline viewing.

    The lookup is restricted to documents owned by the authenticated user so
    that one user cannot view another user's file by guessing its id.

    Args:
        doc_id: Primary key of the document, taken from the URL.

    Returns:
        The file bytes as an inline ``application/pdf`` response;
        ``{"error": "File not found"}`` with status 404 when the document
        does not exist, is not owned by the caller, or has no stored file;
        ``{"error": ...}`` with status 500 on any other failure.
    """
    try:
        user_id = get_user_id_by_username(get_jwt_identity())
        if not user_id:
            return jsonify({"error": "File not found"}), 404
        session = SessionLocal()
        try:
            doc = (
                session.query(Document)
                .filter(Document.id == doc_id, Document.user_id == user_id)
                .first()
            )
            # Copy the needed values while the session is still open.
            file_data = doc.file_data if doc else None
            title = doc.title if doc else None
        finally:
            # Release the connection even when the query raises.
            session.close()
        if not file_data:
            return jsonify({"error": "File not found"}), 404
        return send_file(
            io.BytesIO(file_data),
            as_attachment=False,
            download_name=title,
            mimetype='application/pdf'
        )
    except Exception as e:
        logging.error(f"Error viewing file: {str(e)}", exc_info=True)
        return jsonify({"error": f"Error viewing file: {str(e)}"}), 500
|
182 |
|
183 |
@main.route('/documents/<int:doc_id>', methods=['DELETE'])
|
184 |
@jwt_required()
|
185 |
def delete_document_route(doc_id):
|
186 |
try:
|
187 |
+
delete_document(doc_id)
|
|
|
|
|
188 |
return jsonify({"success": True, "message": "Document deleted successfully"}), 200
|
189 |
except Exception as e:
|
190 |
logging.error(f"Error deleting document {doc_id}: {str(e)}", exc_info=True)
|
|
|
244 |
@jwt_required()
|
245 |
def process_document(doc_id):
|
246 |
try:
|
247 |
+
session = SessionLocal()
|
248 |
+
doc = session.query(Document).filter(Document.id == doc_id).first()
|
249 |
+
if not doc:
|
250 |
+
session.close()
|
251 |
return jsonify({'error': 'Document not found'}), 404
|
252 |
+
if not doc.file_data:
|
253 |
+
session.close()
|
254 |
+
return jsonify({'error': 'File not found for this document'}), 404
|
255 |
+
# Extract text from file_data
|
256 |
+
text = extract_text_from_pdf(io.BytesIO(doc.file_data))
|
257 |
if not text:
|
258 |
+
session.close()
|
259 |
return jsonify({'error': 'Could not extract text from file'}), 400
|
260 |
summary = generate_summary(text)
|
261 |
clauses = detect_clauses(text)
|
262 |
features = legal_domain_processor.process_legal_document(text)
|
263 |
context_analysis = context_processor.analyze_context(text)
|
264 |
# Update the document with processed content
|
265 |
+
doc.full_text = text
|
266 |
+
doc.summary = summary
|
267 |
+
doc.clauses = str(clauses)
|
268 |
+
doc.features = str(features)
|
269 |
+
doc.context_analysis = str(context_analysis)
|
270 |
+
session.commit()
|
271 |
+
session.close()
|
|
|
|
|
|
|
|
|
|
|
272 |
return jsonify({
|
273 |
'message': 'Document processed successfully',
|
274 |
'document_id': doc_id,
|
|
|
282 |
@jwt_required()
def generate_document_summary(doc_id):
    """Return the summary of a document, generating and storing it if needed.

    An existing summary is returned as-is unless it is empty or the
    ``'Processing...'`` placeholder. Otherwise the text is extracted from the
    stored file, summarized, saved on the document, and returned. The lookup
    is restricted to documents owned by the authenticated user.

    Args:
        doc_id: Primary key of the document.

    Returns:
        ``{"summary": ...}`` with status 200 on success; ``{"error": ...}``
        with 404 when the document (or its stored file) is missing or not
        owned by the caller, 400 when no text could be extracted, and 500 on
        extraction, summarization, or unexpected errors.
    """
    session = None
    try:
        user_id = get_user_id_by_username(get_jwt_identity())
        if not user_id:
            return jsonify({"error": "Document not found"}), 404
        session = SessionLocal()
        doc = (
            session.query(Document)
            .filter(Document.id == doc_id, Document.user_id == user_id)
            .first()
        )
        if not doc:
            return jsonify({"error": "Document not found"}), 404
        summary = doc.summary
        if summary and summary.strip() and summary != 'Processing...':
            return jsonify({"summary": summary}), 200
        if not doc.file_data:
            return jsonify({"error": "File not found for this document"}), 404
        # Extract text from file_data
        try:
            text = extract_text_from_pdf(io.BytesIO(doc.file_data))
        except Exception as e:
            logging.error(f"Error extracting text from PDF: {e}")
            return jsonify({"error": f"Error extracting text from PDF: {e}"}), 500
        if not (text and text.strip()):
            return jsonify({"error": "No text available for summarization"}), 400
        try:
            summary = generate_summary(text)
        except Exception as e:
            logging.error(f"Error generating summary: {e}")
            return jsonify({"error": f"Error generating summary: {e}"}), 500
        # Save the summary to the database
        doc.summary = summary
        session.commit()
        return jsonify({"summary": summary}), 200
    except Exception as e:
        logging.error(f"Error in generate_document_summary: {e}", exc_info=True)
        return jsonify({"error": f"Error generating summary: {str(e)}"}), 500
    finally:
        # Single cleanup point: runs on every return path and on exceptions.
        if session is not None:
            session.close()
|
321 |
|
322 |
@main.route('/ask-question', methods=['POST', 'OPTIONS'])
|
|
|
439 |
processed_documents = sum(1 for doc in documents if doc.get('summary') and doc.get('summary') != 'Processing...')
|
440 |
pending_analysis = total_documents - processed_documents
|
441 |
qa_results = search_questions_answers('', user_id=user_id)
|
|
|
442 |
now = datetime.utcnow()
|
443 |
last_30_days = now - timedelta(days=30)
|
444 |
+
def parse_dt(val):
    """Convert a datetime or ISO 8601 string to a naive UTC datetime.

    Args:
        val: A ``datetime`` instance or a string accepted by
            ``datetime.fromisoformat``.

    Returns:
        A timezone-naive ``datetime``. Timezone-aware inputs are converted
        to UTC first; naive inputs are returned unchanged. Returns ``None``
        for unparsable strings and for any other input type.
    """
    if isinstance(val, str):
        try:
            val = datetime.fromisoformat(val)
        except ValueError:
            return None
    if not isinstance(val, datetime):
        return None
    # Normalise to naive UTC so the result can be compared with naive datetimes.
    if val.tzinfo is not None:
        return val.astimezone(timezone.utc).replace(tzinfo=None)
    return val
|
459 |
+
recent_questions = sum(1 for q in qa_results if q['created_at'] and parse_dt(q['created_at']) and parse_dt(q['created_at']) >= last_30_days)
|
460 |
return jsonify({
|
461 |
'total_documents': total_documents,
|
462 |
'processed_documents': processed_documents,
|
backend/app/utils/extract_text.py
CHANGED
@@ -1,8 +1,24 @@
|
|
1 |
import tempfile
|
2 |
from pdfminer.high_level import extract_text
|
3 |
import os
|
|
|
4 |
|
5 |
-
def extract_text_from_pdf(
|
6 |
-
|
7 |
-
text
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import tempfile
|
2 |
from pdfminer.high_level import extract_text
|
3 |
import os
|
4 |
+
from PyPDF2 import PdfReader
|
5 |
|
6 |
+
def _join_page_text(reader):
    """Concatenate the extracted text of every page in *reader*."""
    # A falsy extract_text() result contributes an empty string.
    return "".join(page.extract_text() or "" for page in reader.pages)


def extract_text_from_pdf(file_or_path):
    """Extract the text of a PDF.

    Args:
        file_or_path: Either a filesystem path (``str``, ``bytes`` or
            ``os.PathLike``) or a binary file-like object such as
            ``io.BytesIO``.

    Returns:
        The text of all pages concatenated in page order.
    """
    if isinstance(file_or_path, (str, bytes, os.PathLike)):
        # Path input: open the file ourselves so it is closed when done.
        with open(file_or_path, 'rb') as f:
            return _join_page_text(PdfReader(f))
    # Anything else is assumed to be a file-like object.
    return _join_page_text(PdfReader(file_or_path))
|