Harsh Upadhyay committed on
Commit
fca1742
·
1 Parent(s): aac0325

removing local file support and adding SQLAlchemy support for all database operations.

Browse files
backend/.gitignore CHANGED
@@ -13,6 +13,7 @@ env/
13
  venv/
14
  instance/
15
  *.db
 
16
 
17
  # OS/Editor
18
  .DS_Store
 
13
  venv/
14
  instance/
15
  *.db
16
+ *.env
17
 
18
  # OS/Editor
19
  .DS_Store
backend/app/app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))
4
+ # ... existing code ...
backend/app/database.py CHANGED
@@ -1,22 +1,23 @@
1
  # All sqlite3 and local DB logic will be removed and replaced with SQLAlchemy/Postgres in the next step.
2
  # This file will be refactored to use SQLAlchemy models and sessions.
3
 
4
- from sqlalchemy import create_engine, Column, Integer, String, Text, Float, ForeignKey, DateTime
5
  from sqlalchemy.orm import declarative_base, sessionmaker, relationship
6
  from sqlalchemy.sql import func
7
  import os
8
  from sqlalchemy.exc import IntegrityError
9
  from werkzeug.security import check_password_hash, generate_password_hash
 
 
10
 
11
- # SQLAlchemy setup
12
- DATABASE_URL = os.environ.get('DATABASE_URL') or (
13
- "sqlite:///" + os.path.join(os.path.dirname(os.path.abspath(__file__)), 'legal_docs.db')
14
- )
15
 
16
- print("DATABASE_URL:", repr(DATABASE_URL)) # Debug print
 
17
 
18
  if not DATABASE_URL or DATABASE_URL.strip() == "":
19
- raise ValueError("DATABASE_URL is not set or is empty. Please set it as an environment variable or secret.")
20
 
21
  engine = create_engine(DATABASE_URL)
22
  SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
@@ -45,7 +46,8 @@ class Document(Base):
45
  clauses = Column(Text)
46
  features = Column(Text)
47
  context_analysis = Column(Text)
48
- file_path = Column(String)
 
49
  upload_time = Column(DateTime(timezone=True), server_default=func.now())
50
  user_id = Column(Integer, ForeignKey('users.id'))
51
  user = relationship('User', back_populates='documents')
@@ -71,7 +73,7 @@ def get_db_session():
71
  return SessionLocal()
72
 
73
  # --- Document CRUD ---
74
- def save_document(title, full_text, summary, clauses, features, context_analysis, file_path, user_id):
75
  session = get_db_session()
76
  try:
77
  doc = Document(
@@ -81,7 +83,8 @@ def save_document(title, full_text, summary, clauses, features, context_analysis
81
  clauses=str(clauses),
82
  features=str(features),
83
  context_analysis=str(context_analysis),
84
- file_path=file_path,
 
85
  user_id=user_id
86
  )
87
  session.add(doc)
@@ -104,6 +107,9 @@ def get_all_documents(user_id=None):
104
  for doc in documents:
105
  d = doc.__dict__.copy()
106
  d.pop('_sa_instance_state', None)
 
 
 
107
  result.append(d)
108
  return result
109
  finally:
@@ -119,6 +125,8 @@ def get_document_by_id(doc_id, user_id=None):
119
  if doc:
120
  d = doc.__dict__.copy()
121
  d.pop('_sa_instance_state', None)
 
 
122
  return d
123
  return None
124
  finally:
@@ -128,11 +136,10 @@ def delete_document(doc_id):
128
  session = get_db_session()
129
  try:
130
  doc = session.query(Document).filter(Document.id == doc_id).first()
131
- file_path = doc.file_path if doc else None
132
  if doc:
133
  session.delete(doc)
134
  session.commit()
135
- return file_path
136
  finally:
137
  session.close()
138
 
@@ -172,13 +179,22 @@ def search_questions_answers(query, user_id=None):
172
  'document_id': row.document_id,
173
  'question': row.question,
174
  'answer': row.answer,
175
- 'created_at': row.created_at
176
  })
177
  return results
178
  finally:
179
  session.close()
180
 
 
 
 
 
 
 
 
181
  def save_question_answer(document_id, user_id, question, answer, score):
 
 
182
  session = get_db_session()
183
  try:
184
  qa = QuestionAnswer(
 
1
  # All sqlite3 and local DB logic will be removed and replaced with SQLAlchemy/Postgres in the next step.
2
  # This file will be refactored to use SQLAlchemy models and sessions.
3
 
4
+ from sqlalchemy import create_engine, Column, Integer, String, Text, Float, ForeignKey, DateTime, LargeBinary
5
  from sqlalchemy.orm import declarative_base, sessionmaker, relationship
6
  from sqlalchemy.sql import func
7
  import os
8
  from sqlalchemy.exc import IntegrityError
9
  from werkzeug.security import check_password_hash, generate_password_hash
10
+ from dotenv import load_dotenv
11
+ import re
12
 
13
# Load environment variables from the .env file one directory above this
# module (backend/.env) before DATABASE_URL is read below.
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), '..', '.env'))

# SQLAlchemy setup
DATABASE_URL = os.environ.get('DATABASE_URL')

# Report only whether the URL is configured. The value itself typically
# embeds the database username and password, so it must never be printed.
print("DEBUG: DATABASE_URL is set:", bool(DATABASE_URL and DATABASE_URL.strip()))

# Fail fast at import time: there is no local SQLite fallback any more.
if not DATABASE_URL or not DATABASE_URL.strip():
    raise ValueError("DATABASE_URL is not set or is empty. Please set it as an environment variable or in your .env file for NeonDB.")

engine = create_engine(DATABASE_URL)
# Sessions neither autocommit nor autoflush; callers commit explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 
46
  clauses = Column(Text)
47
  features = Column(Text)
48
  context_analysis = Column(Text)
49
+ file_data = Column(LargeBinary) # Store file content in DB
50
+ file_size = Column(Integer) # Add this
51
  upload_time = Column(DateTime(timezone=True), server_default=func.now())
52
  user_id = Column(Integer, ForeignKey('users.id'))
53
  user = relationship('User', back_populates='documents')
 
73
  return SessionLocal()
74
 
75
  # --- Document CRUD ---
76
+ def save_document(title, full_text, summary, clauses, features, context_analysis, file_data, user_id):
77
  session = get_db_session()
78
  try:
79
  doc = Document(
 
83
  clauses=str(clauses),
84
  features=str(features),
85
  context_analysis=str(context_analysis),
86
+ file_data=file_data,
87
+ file_size=len(file_data) if file_data else 0, # Store file size
88
  user_id=user_id
89
  )
90
  session.add(doc)
 
107
  for doc in documents:
108
  d = doc.__dict__.copy()
109
  d.pop('_sa_instance_state', None)
110
+ d.pop('file_data', None) # Don't return file data in list
111
+ # Do NOT pop 'summary'; keep it in the result
112
+ # file_size is included
113
  result.append(d)
114
  return result
115
  finally:
 
125
  if doc:
126
  d = doc.__dict__.copy()
127
  d.pop('_sa_instance_state', None)
128
+ # Don't return file_data by default
129
+ d.pop('file_data', None)
130
  return d
131
  return None
132
  finally:
 
136
  session = get_db_session()
137
  try:
138
  doc = session.query(Document).filter(Document.id == doc_id).first()
 
139
  if doc:
140
  session.delete(doc)
141
  session.commit()
142
+ return True
143
  finally:
144
  session.close()
145
 
 
179
  'document_id': row.document_id,
180
  'question': row.question,
181
  'answer': row.answer,
182
+ 'created_at': row.created_at.isoformat() if row.created_at else None,
183
  })
184
  return results
185
  finally:
186
  session.close()
187
 
188
def clean_answer(answer):
    """Normalise a QA answer string before it is stored.

    Removes parenthesised numeric markers such as ``(3)``, collapses every
    run of whitespace to a single space, and strips leading/trailing spaces
    and the punctuation characters ``, . ; :``.

    Args:
        answer: The raw answer. ``None`` is treated as an empty answer and
            any other non-string value is converted with ``str()`` so the
            regex calls below cannot raise ``TypeError``.

    Returns:
        The cleaned answer string (``""`` for ``None``).
    """
    if answer is None:
        return ""
    if not isinstance(answer, str):
        answer = str(answer)
    # Remove patterns like (3)
    answer = re.sub(r'\(\d+\)', '', answer)
    # Collapse whitespace left behind by the removal above
    answer = re.sub(r'\s+', ' ', answer)
    return answer.strip(' ,.;:')
194
+
195
  def save_question_answer(document_id, user_id, question, answer, score):
196
+ score = float(score) # Convert np.float64 to Python float
197
+ answer = clean_answer(answer) # Clean up answer format
198
  session = get_db_session()
199
  try:
200
  qa = QuestionAnswer(
backend/app/routes/routes.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
2
- from flask import Blueprint, request, jsonify, send_from_directory, current_app
3
  from werkzeug.utils import secure_filename
4
  from app.utils.extract_text import extract_text_from_pdf
5
  from app.utils.summarizer import generate_summary
6
  from app.utils.clause_detector import detect_clauses
7
- from app.database import save_document, delete_document
8
  from app.database import get_all_documents, get_document_by_id
9
  from app.database import search_documents, save_question_answer, search_questions_answers
10
  from app.nlp.qa import answer_question
@@ -20,7 +20,14 @@ import textract
20
  from app.database import get_user_profile, update_user_profile, change_user_password
21
  from app.database import SessionLocal, User
22
  from sqlalchemy.exc import IntegrityError
23
- from sqlalchemy import or_
 
 
 
 
 
 
 
24
 
25
  main = Blueprint("main", __name__)
26
 
@@ -29,12 +36,7 @@ enhanced_legal_processor = EnhancedLegalProcessor()
29
  legal_domain_processor = LegalDomainFeatures()
30
  context_processor = ContextUnderstanding()
31
 
32
- BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
33
- UPLOAD_FOLDER = os.path.join(BASE_DIR, 'uploads')
34
-
35
- # Ensure the upload folder exists
36
- if not os.path.exists(UPLOAD_FOLDER):
37
- os.makedirs(UPLOAD_FOLDER)
38
 
39
  ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
40
 
@@ -74,8 +76,7 @@ def upload_file():
74
  if not (file.filename.lower().endswith('.pdf')):
75
  return jsonify({'error': 'File type not allowed. Only PDF files are supported.'}), 400
76
  filename = secure_filename(file.filename)
77
- file_path = os.path.join(UPLOAD_FOLDER, filename)
78
- file.save(file_path)
79
  identity = get_jwt_identity()
80
  user_id = get_user_id_by_username(identity)
81
  if not user_id:
@@ -87,7 +88,7 @@ def upload_file():
87
  clauses="[]",
88
  features="{}",
89
  context_analysis="{}",
90
- file_path=file_path,
91
  user_id=user_id
92
  )
93
  return jsonify({
@@ -103,9 +104,27 @@ def upload_file():
103
  @main.route('/documents', methods=['GET'])
104
  @jwt_required()
105
  def list_documents():
 
 
 
106
  try:
107
- docs = get_all_documents()
108
- return jsonify(docs), 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  except Exception as e:
110
  logging.error(f"Error listing documents: {str(e)}", exc_info=True)
111
  return jsonify({"error": str(e)}), 500
@@ -123,31 +142,49 @@ def get_document(doc_id):
123
  logging.error(f"Error getting document {doc_id}: {str(e)}", exc_info=True)
124
  return jsonify({"error": str(e)}), 500
125
 
126
- @main.route('/documents/download/<filename>', methods=['GET'])
127
  @jwt_required()
128
- def download_document(filename):
129
  try:
130
- return send_from_directory(UPLOAD_FOLDER, filename, as_attachment=True)
 
 
 
 
 
 
 
 
 
 
131
  except Exception as e:
132
- logging.error(f"Error downloading file {filename}: {str(e)}", exc_info=True)
133
  return jsonify({"error": f"Error downloading file: {str(e)}"}), 500
134
 
135
- @main.route('/documents/view/<filename>', methods=['GET'])
136
  @jwt_required()
137
- def view_document(filename):
138
  try:
139
- return send_from_directory(UPLOAD_FOLDER, filename)
 
 
 
 
 
 
 
 
 
 
140
  except Exception as e:
141
- logging.error(f"Error viewing file {filename}: {str(e)}", exc_info=True)
142
  return jsonify({"error": f"Error viewing file: {str(e)}"}), 500
143
 
144
  @main.route('/documents/<int:doc_id>', methods=['DELETE'])
145
  @jwt_required()
146
  def delete_document_route(doc_id):
147
  try:
148
- file_path_to_delete = delete_document(doc_id)
149
- if file_path_to_delete and os.path.exists(file_path_to_delete):
150
- os.remove(file_path_to_delete)
151
  return jsonify({"success": True, "message": "Document deleted successfully"}), 200
152
  except Exception as e:
153
  logging.error(f"Error deleting document {doc_id}: {str(e)}", exc_info=True)
@@ -207,30 +244,31 @@ def login():
207
  @jwt_required()
208
  def process_document(doc_id):
209
  try:
210
- document = get_document_by_id(doc_id)
211
- if not document:
 
 
212
  return jsonify({'error': 'Document not found'}), 404
213
- file_path = document['file_path']
214
- text = extract_text_from_file(file_path)
 
 
 
215
  if not text:
 
216
  return jsonify({'error': 'Could not extract text from file'}), 400
217
  summary = generate_summary(text)
218
  clauses = detect_clauses(text)
219
  features = legal_domain_processor.process_legal_document(text)
220
  context_analysis = context_processor.analyze_context(text)
221
  # Update the document with processed content
222
- session = SessionLocal()
223
- try:
224
- doc = session.query(User).get(doc_id)
225
- if doc:
226
- doc.full_text = text
227
- doc.summary = summary
228
- doc.clauses = str(clauses)
229
- doc.features = str(features)
230
- doc.context_analysis = str(context_analysis)
231
- session.commit()
232
- finally:
233
- session.close()
234
  return jsonify({
235
  'message': 'Document processed successfully',
236
  'document_id': doc_id,
@@ -244,30 +282,41 @@ def process_document(doc_id):
244
  @jwt_required()
245
  def generate_document_summary(doc_id):
246
  try:
247
- doc = get_document_by_id(doc_id)
 
248
  if not doc:
 
249
  return jsonify({"error": "Document not found"}), 404
250
- summary = doc.get('summary', '')
251
  if summary and summary.strip() and summary != 'Processing...':
 
252
  return jsonify({"summary": summary}), 200
253
- file_path = doc.get('file_path', '')
254
- if not file_path or not os.path.exists(file_path):
255
  return jsonify({"error": "File not found for this document"}), 404
256
- text = extract_text_from_file(file_path)
 
 
 
 
 
 
257
  if not text.strip():
 
258
  return jsonify({"error": "No text available for summarization"}), 400
259
- summary = generate_summary(text)
260
- # Save the summary to the database
261
- session = SessionLocal()
262
  try:
263
- document = session.query(User).get(doc_id)
264
- if document:
265
- document.summary = summary
266
- session.commit()
267
- finally:
268
  session.close()
 
 
 
 
 
 
269
  return jsonify({"summary": summary}), 200
270
  except Exception as e:
 
271
  return jsonify({"error": f"Error generating summary: {str(e)}"}), 500
272
 
273
  @main.route('/ask-question', methods=['POST', 'OPTIONS'])
@@ -390,10 +439,24 @@ def dashboard_stats():
390
  processed_documents = sum(1 for doc in documents if doc.get('summary') and doc.get('summary') != 'Processing...')
391
  pending_analysis = total_documents - processed_documents
392
  qa_results = search_questions_answers('', user_id=user_id)
393
- from datetime import datetime, timedelta
394
  now = datetime.utcnow()
395
  last_30_days = now - timedelta(days=30)
396
- recent_questions = sum(1 for q in qa_results if q['created_at'] and q['created_at'] >= last_30_days)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  return jsonify({
398
  'total_documents': total_documents,
399
  'processed_documents': processed_documents,
 
1
  import os
2
+ from flask import Blueprint, request, jsonify, send_file
3
  from werkzeug.utils import secure_filename
4
  from app.utils.extract_text import extract_text_from_pdf
5
  from app.utils.summarizer import generate_summary
6
  from app.utils.clause_detector import detect_clauses
7
+ from app.database import save_document, delete_document, Document
8
  from app.database import get_all_documents, get_document_by_id
9
  from app.database import search_documents, save_question_answer, search_questions_answers
10
  from app.nlp.qa import answer_question
 
20
  from app.database import get_user_profile, update_user_profile, change_user_password
21
  from app.database import SessionLocal, User
22
  from sqlalchemy.exc import IntegrityError
23
+ from sqlalchemy import or_, Index
24
+ import io
25
+ from datetime import datetime, timedelta, timezone
26
+ from sqlalchemy import Column, Integer, String, Text, DateTime, LargeBinary, func
27
+ from sqlalchemy.orm import relationship
28
+ from sqlalchemy.ext.declarative import declarative_base
29
+ from sqlalchemy import create_engine
30
+ from sqlalchemy.pool import NullPool
31
 
32
  main = Blueprint("main", __name__)
33
 
 
36
  legal_domain_processor = LegalDomainFeatures()
37
  context_processor = ContextUnderstanding()
38
 
39
+ # Uploaded files are stored in the database (Document.file_data); there is no local UPLOAD_FOLDER or file_path.
 
 
 
 
 
40
 
41
  ALLOWED_EXTENSIONS = {'pdf', 'doc', 'docx'}
42
 
 
76
  if not (file.filename.lower().endswith('.pdf')):
77
  return jsonify({'error': 'File type not allowed. Only PDF files are supported.'}), 400
78
  filename = secure_filename(file.filename)
79
+ file_content = file.read() # Read file content as bytes
 
80
  identity = get_jwt_identity()
81
  user_id = get_user_id_by_username(identity)
82
  if not user_id:
 
88
  clauses="[]",
89
  features="{}",
90
  context_analysis="{}",
91
+ file_data=file_content, # Store file in DB
92
  user_id=user_id
93
  )
94
  return jsonify({
 
104
@main.route('/documents', methods=['GET'])
@jwt_required()
def list_documents():
    """Return one page of the authenticated user's documents, newest first.

    Query parameters:
        page:  1-based page number (default 1).
        limit: page size (default 20).

    Non-numeric values fall back to the defaults and values below 1 are
    raised to 1, so a bad query string can never produce a negative
    OFFSET/LIMIT or an unhandled ValueError.

    Returns:
        200 with a JSON list of ``{id, title, summary, file_size,
        upload_time, type}`` objects, or 500 with ``{"error": ...}``.
    """
    # type=int makes request.args.get return the default when conversion fails.
    page = max(request.args.get('page', 1, type=int), 1)
    limit = max(request.args.get('limit', 20, type=int), 1)
    offset = (page - 1) * limit
    try:
        identity = get_jwt_identity()
        user_id = get_user_id_by_username(identity)
        session = SessionLocal()
        try:
            documents = (
                session.query(Document)
                .filter(Document.user_id == user_id)
                .order_by(Document.upload_time.desc())
                .offset(offset)
                .limit(limit)
                .all()
            )
            result = []
            for doc in documents:
                # A missing title must not break the whole listing.
                title = doc.title or ''
                result.append({
                    'id': doc.id,
                    'title': doc.title,
                    'summary': doc.summary,
                    'file_size': doc.file_size,
                    'upload_time': doc.upload_time.isoformat() if doc.upload_time else None,
                    # Extension of the title, e.g. "PDF"; UNKNOWN when there is none.
                    'type': title.split('.')[-1].upper() if '.' in title else 'UNKNOWN',
                })
        finally:
            # Always release the connection, including when the query fails.
            session.close()
        return jsonify(result), 200
    except Exception as e:
        logging.error(f"Error listing documents: {str(e)}", exc_info=True)
        return jsonify({"error": str(e)}), 500
  return jsonify({"error": str(e)}), 500
 
142
  logging.error(f"Error getting document {doc_id}: {str(e)}", exc_info=True)
143
  return jsonify({"error": str(e)}), 500
144
 
145
@main.route('/documents/download/<int:doc_id>', methods=['GET'])
@jwt_required()
def download_document(doc_id):
    """Send a stored document to the caller as a PDF attachment.

    The lookup is restricted to documents owned by the authenticated user,
    matching how ``list_documents`` scopes its query, so one user cannot
    fetch another user's file by guessing ids.

    Args:
        doc_id: Primary key of the document, taken from the URL.

    Returns:
        The file as an ``application/pdf`` attachment, 404 with
        ``{"error": "File not found"}`` when the document does not exist,
        is not owned by the caller, or has no stored content, or 500 with
        an error message on unexpected failure.
    """
    try:
        user_id = get_user_id_by_username(get_jwt_identity())
        if not user_id:
            # Unknown caller: never fall through to an unscoped lookup.
            return jsonify({"error": "File not found"}), 404
        session = SessionLocal()
        try:
            doc = (
                session.query(Document)
                .filter(Document.id == doc_id, Document.user_id == user_id)
                .first()
            )
            # Copy what is needed while the session is still open.
            file_data = doc.file_data if doc else None
            title = doc.title if doc else None
        finally:
            session.close()
        if not file_data:
            return jsonify({"error": "File not found"}), 404
        return send_file(
            io.BytesIO(file_data),
            as_attachment=True,
            download_name=title,
            mimetype='application/pdf'
        )
    except Exception as e:
        logging.error(f"Error downloading file: {str(e)}", exc_info=True)
        return jsonify({"error": f"Error downloading file: {str(e)}"}), 500
163
 
164
@main.route('/documents/view/<int:doc_id>', methods=['GET'])
@jwt_required()
def view_document(doc_id):
    """Send a stored document to the caller for inline viewing as a PDF.

    The lookup is restricted to documents owned by the authenticated user,
    matching how ``list_documents`` scopes its query, so one user cannot
    view another user's file by guessing ids.

    Args:
        doc_id: Primary key of the document, taken from the URL.

    Returns:
        The file as inline ``application/pdf`` content, 404 with
        ``{"error": "File not found"}`` when the document does not exist,
        is not owned by the caller, or has no stored content, or 500 with
        an error message on unexpected failure.
    """
    try:
        user_id = get_user_id_by_username(get_jwt_identity())
        if not user_id:
            # Unknown caller: never fall through to an unscoped lookup.
            return jsonify({"error": "File not found"}), 404
        session = SessionLocal()
        try:
            doc = (
                session.query(Document)
                .filter(Document.id == doc_id, Document.user_id == user_id)
                .first()
            )
            # Copy what is needed while the session is still open.
            file_data = doc.file_data if doc else None
            title = doc.title if doc else None
        finally:
            session.close()
        if not file_data:
            return jsonify({"error": "File not found"}), 404
        return send_file(
            io.BytesIO(file_data),
            as_attachment=False,
            download_name=title,
            mimetype='application/pdf'
        )
    except Exception as e:
        logging.error(f"Error viewing file: {str(e)}", exc_info=True)
        return jsonify({"error": f"Error viewing file: {str(e)}"}), 500
182
 
183
  @main.route('/documents/<int:doc_id>', methods=['DELETE'])
184
  @jwt_required()
185
  def delete_document_route(doc_id):
186
  try:
187
+ delete_document(doc_id)
 
 
188
  return jsonify({"success": True, "message": "Document deleted successfully"}), 200
189
  except Exception as e:
190
  logging.error(f"Error deleting document {doc_id}: {str(e)}", exc_info=True)
 
244
  @jwt_required()
245
  def process_document(doc_id):
246
  try:
247
+ session = SessionLocal()
248
+ doc = session.query(Document).filter(Document.id == doc_id).first()
249
+ if not doc:
250
+ session.close()
251
  return jsonify({'error': 'Document not found'}), 404
252
+ if not doc.file_data:
253
+ session.close()
254
+ return jsonify({'error': 'File not found for this document'}), 404
255
+ # Extract text from file_data
256
+ text = extract_text_from_pdf(io.BytesIO(doc.file_data))
257
  if not text:
258
+ session.close()
259
  return jsonify({'error': 'Could not extract text from file'}), 400
260
  summary = generate_summary(text)
261
  clauses = detect_clauses(text)
262
  features = legal_domain_processor.process_legal_document(text)
263
  context_analysis = context_processor.analyze_context(text)
264
  # Update the document with processed content
265
+ doc.full_text = text
266
+ doc.summary = summary
267
+ doc.clauses = str(clauses)
268
+ doc.features = str(features)
269
+ doc.context_analysis = str(context_analysis)
270
+ session.commit()
271
+ session.close()
 
 
 
 
 
272
  return jsonify({
273
  'message': 'Document processed successfully',
274
  'document_id': doc_id,
 
282
@jwt_required()
def generate_document_summary(doc_id):
    """Return the summary for a document, generating and saving it if needed.

    If the document already has a non-empty summary other than the
    ``'Processing...'`` placeholder, that summary is returned as-is.
    Otherwise text is extracted from the stored PDF bytes, summarised, and
    the new summary is written back to the document row.

    Args:
        doc_id: Primary key of the document.

    Returns:
        200 with ``{"summary": ...}``; 404 when the document or its file
        content is missing; 400 when no text could be extracted; 500 on
        extraction, summarisation, or database errors.
    """
    # NOTE(review): the lookup below is not scoped to the calling user, so any
    # authenticated user can summarise any document id - confirm this is intended.
    try:
        session = SessionLocal()
        try:
            doc = session.query(Document).filter(Document.id == doc_id).first()
            if not doc:
                return jsonify({"error": "Document not found"}), 404
            summary = doc.summary
            # Reuse an existing summary; the placeholder does not count.
            if summary and summary.strip() and summary != 'Processing...':
                return jsonify({"summary": summary}), 200
            if not doc.file_data:
                return jsonify({"error": "File not found for this document"}), 404
            # Extract text from file_data
            try:
                text = extract_text_from_pdf(io.BytesIO(doc.file_data))
            except Exception as e:
                logging.error(f"Error extracting text from PDF: {e}")
                return jsonify({"error": f"Error extracting text from PDF: {e}"}), 500
            # Guard against None as well as whitespace-only output.
            if not text or not text.strip():
                return jsonify({"error": "No text available for summarization"}), 400
            try:
                summary = generate_summary(text)
            except Exception as e:
                logging.error(f"Error generating summary: {e}")
                return jsonify({"error": f"Error generating summary: {e}"}), 500
            # Save the summary to the database
            doc.summary = summary
            try:
                session.commit()
            except Exception:
                # Leave the connection clean before the outer handler reports it.
                session.rollback()
                raise
            return jsonify({"summary": summary}), 200
        finally:
            # Single cleanup point: runs on every return path and on errors.
            session.close()
    except Exception as e:
        logging.error(f"Error in generate_document_summary: {e}", exc_info=True)
        return jsonify({"error": f"Error generating summary: {str(e)}"}), 500
321
 
322
  @main.route('/ask-question', methods=['POST', 'OPTIONS'])
 
439
  processed_documents = sum(1 for doc in documents if doc.get('summary') and doc.get('summary') != 'Processing...')
440
  pending_analysis = total_documents - processed_documents
441
  qa_results = search_questions_answers('', user_id=user_id)
 
442
  now = datetime.utcnow()
443
  last_30_days = now - timedelta(days=30)
444
+ def parse_dt(val):
445
+ if isinstance(val, datetime):
446
+ # Convert to naive UTC
447
+ if val.tzinfo is not None:
448
+ return val.astimezone(timezone.utc).replace(tzinfo=None)
449
+ return val
450
+ if isinstance(val, str):
451
+ try:
452
+ dt = datetime.fromisoformat(val)
453
+ if dt.tzinfo is not None:
454
+ return dt.astimezone(timezone.utc).replace(tzinfo=None)
455
+ return dt
456
+ except Exception:
457
+ return None
458
+ return None
459
+ recent_questions = sum(1 for q in qa_results if q['created_at'] and parse_dt(q['created_at']) and parse_dt(q['created_at']) >= last_30_days)
460
  return jsonify({
461
  'total_documents': total_documents,
462
  'processed_documents': processed_documents,
backend/app/utils/extract_text.py CHANGED
@@ -1,8 +1,24 @@
1
  import tempfile
2
  from pdfminer.high_level import extract_text
3
  import os
 
4
 
5
- def extract_text_from_pdf(file_path):
6
- # Extract text directly from the given file path
7
- text = extract_text(file_path)
8
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import tempfile
2
  from pdfminer.high_level import extract_text
3
  import os
4
+ from PyPDF2 import PdfReader
5
 
6
def _extract_all_pages(stream):
    """Return the concatenated text of every page readable from *stream*."""
    reader = PdfReader(stream)
    # extract_text() may yield nothing for a page; treat that as empty text.
    return "".join(page.extract_text() or "" for page in reader.pages)


def extract_text_from_pdf(file_or_path):
    """Extract text from a PDF.

    Args:
        file_or_path: Either a filesystem path (``str``, ``bytes`` or
            ``os.PathLike``) or a binary file-like object such as
            ``io.BytesIO``.

    Returns:
        The text of all pages concatenated in page order; pages with no
        extractable text contribute an empty string.
    """
    if isinstance(file_or_path, (str, bytes, os.PathLike)):
        # Treat it as a path; pages are read while the file is still open.
        with open(file_or_path, 'rb') as f:
            return _extract_all_pages(f)
    # Otherwise assume it is already a file-like object.
    return _extract_all_pages(file_or_path)