philipk22 committed on
Commit
e3bfbea
·
1 Parent(s): 2729280

Deploying ind_app

Browse files
Files changed (5) hide show
  1. IND-312.pdf +0 -0
  2. ind_app.py +656 -0
  3. preprocessed_docs.json +0 -0
  4. requirements.txt +14 -0
  5. template.md +72 -0
IND-312.pdf ADDED
Binary file (423 kB). View file
 
ind_app.py ADDED
@@ -0,0 +1,656 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Merged Streamlit App: IND Assistant and Submission Assessment
3
+
4
+ This app combines the functionality of the IND Assistant (chat-based Q&A)
5
+ and the Submission Assessment (checklist-based analysis) into a single
6
+ Streamlit interface.
7
+ """
8
+
9
+ import os
10
+ import json
11
+ import tempfile
12
+ from zipfile import ZipFile
13
+ import streamlit as st
14
+ from llama_parse import LlamaParse
15
+ import pickle
16
+ import hashlib
17
+ from typing import List
18
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
19
+ from langchain_community.vectorstores import Qdrant
20
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
21
+ from langchain_openai.chat_models import ChatOpenAI
22
+ from langchain.prompts import ChatPromptTemplate
23
+ from langchain.schema.runnable import RunnablePassthrough
24
+ from langchain_core.output_parsers import StrOutputParser
25
+ from operator import itemgetter
26
+ import nest_asyncio
27
+ from langchain.schema import Document
28
+ import boto3 # Import boto3 for S3 interaction
29
+ import requests
30
+ from io import BytesIO
31
+
32
+ # Prevent Streamlit from auto-reloading on file changes
33
+ os.environ["STREAMLIT_WATCHER_TYPE"] = "none"
34
+
35
+ # Apply nest_asyncio for async operations
36
+ nest_asyncio.apply()
37
+
38
# API credentials are read directly from the process environment.
#
# The previous code re-assigned each variable to itself
# (``os.environ[name] = os.getenv(name)``).  That is a no-op when the variable
# is set and raises ``TypeError`` at import time when it is not, because
# ``os.environ`` only accepts ``str`` values.  It is replaced by an explicit,
# non-fatal check so the app can start and report the problem instead.
REQUIRED_ENV_VARS = (
    "OPENAI_API_KEY",  # OpenAI API Key
    "LLAMA_CLOUD_API_KEY",  # Llama Cloud API Key
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "AWS_REGION",
)

# Names of the variables above that are unset or empty in this process.
MISSING_ENV_VARS = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if MISSING_ENV_VARS:
    print(f"Warning: environment variables not set: {', '.join(MISSING_ENV_VARS)}")
44
+
45
+
46
+ # File paths for IND Assistant
47
+ PDF_FILE = "IND-312.pdf"
48
+ PREPROCESSED_FILE = "preprocessed_docs.json"
49
+
50
+ # --- IND Assistant Functions ---
51
+
52
+ # Load and parse PDF (only for preprocessing)
53
def load_pdf(pdf_path: str) -> List[Document]:
    """Parse a PDF with LlamaParse and split it into LangChain document chunks.

    Args:
        pdf_path: Path of the PDF file on disk.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` (500
        characters per chunk, 50 characters of overlap).  Each chunk's metadata
        carries ``source`` (the PDF path) and ``page`` (the parser's
        ``page_number`` metadata, or 0 when absent).

    Raises:
        KeyError: If ``LLAMA_CLOUD_API_KEY`` is not set in the environment.
        OSError: If ``pdf_path`` cannot be stat'ed.
    """
    # LlamaParse is already imported at module level; the former
    # function-scope re-import was redundant and has been dropped.
    size_mb = os.path.getsize(pdf_path) / (1024 * 1024)  # Size in MB
    num_workers = 2 if size_mb > 2 else 1  # Use 2 workers for PDFs >2MB

    parser = LlamaParse(
        api_key=os.environ["LLAMA_CLOUD_API_KEY"],
        result_type="markdown",
        num_workers=num_workers,
        verbose=True,
    )

    # Parse PDF to LlamaParse documents, then convert to LangChain documents.
    llama_documents = parser.load_data(pdf_path)
    documents = [
        Document(
            page_content=doc.text,
            metadata={"source": pdf_path, "page": doc.metadata.get("page_number", 0)},
        )
        for doc in llama_documents
    ]

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return text_splitter.split_documents(documents)
86
+
87
+ # Preprocess the PDF and save to JSON (Only if it doesn't exist)
88
def preprocess_pdf(pdf_path: str, output_path: str = PREPROCESSED_FILE):
    """Parse ``pdf_path`` once and store the resulting chunks as JSON.

    Nothing is done when ``output_path`` already exists.  Otherwise the PDF is
    processed with ``load_pdf`` and every chunk is written as a
    ``{"content": ..., "metadata": ...}`` record.

    Args:
        pdf_path: PDF to preprocess.
        output_path: Destination JSON file.
    """
    output_exists = os.path.exists(output_path)
    if output_exists:
        print(f"Preprocessed data already exists at {output_path}. Skipping PDF processing.")
        return

    print("Processing PDF for the first time...")

    # Build the JSON records chunk by chunk.
    records = []
    for chunk in load_pdf(pdf_path):
        records.append({"content": chunk.page_content, "metadata": chunk.metadata})

    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(records, json_file, indent=4)

    print(f"Preprocessed PDF saved to {output_path}")
106
+
107
+ # Load preprocessed data instead of parsing PDF
108
def load_preprocessed_data(json_path: str) -> List[Document]:
    """Rebuild LangChain documents from the preprocessed JSON file.

    Args:
        json_path: File holding a list of ``{"content", "metadata"}`` records.

    Returns:
        One ``Document`` per record, in file order.

    Raises:
        FileNotFoundError: If ``json_path`` does not exist.
    """
    file_missing = not os.path.exists(json_path)
    if file_missing:
        raise FileNotFoundError(f"Preprocessed file {json_path} not found. Run preprocessing first.")

    with open(json_path, "r", encoding="utf-8") as json_file:
        records = json.load(json_file)

    documents = []
    for record in records:
        documents.append(
            Document(page_content=record["content"], metadata=record["metadata"])
        )
    return documents
117
+
118
+ # Initialize vector store from preprocessed data
119
def init_vector_store(documents: List[Document]):
    """Embed ``documents`` and load them into an in-memory Qdrant collection.

    Args:
        documents: Chunks to index; each must have non-blank ``page_content``.

    Returns:
        The Qdrant vector store built from the documents.

    Raises:
        ValueError: If ``documents`` is empty or any chunk is blank.
    """
    if not documents or any(not doc.page_content.strip() for doc in documents):
        raise ValueError("No valid documents found for vector storage")

    # BGE embeddings, normalised at encode time.
    embedder = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        encode_kwargs={'normalize_embeddings': True},
    )

    qdrant_options = {
        "documents": documents,
        "embedding": embedder,
        "location": ":memory:",
        "collection_name": "ind312_docs",
        "force_recreate": False,
    }
    return Qdrant.from_documents(**qdrant_options)
137
+
138
+ # Create RAG chain for retrieval-based Q&A
139
def create_rag_chain(retriever):
    """Create a retrieval-augmented generation (RAG) chain for answering questions.

    The chain expects ``{"question": str}`` as input.  It retrieves context for
    the question with ``retriever``, fills the prompt with that context plus the
    checklist structure read from ``template.md``, and returns
    ``{"response": str}`` produced by the ``gpt-4`` chat model.

    Args:
        retriever: Runnable retriever that is piped the question string.

    Returns:
        A runnable chain.

    Raises:
        OSError: If ``template.md`` cannot be read.
    """
    # Read the template with an explicit encoding so the result does not
    # depend on the platform's default locale.
    with open("template.md", encoding="utf-8") as f:
        template_content = f.read()

    prompt = ChatPromptTemplate.from_template("""
You are an FDA regulatory expert. Use this structure for checklists:
{template}

Context from IND-312:
{context}

Question: {question}

Answer in Markdown with checkboxes (- [ ]). If unsure, say "I can only answer IND related questions.".
""")

    return (
        {
            "context": itemgetter("question") | retriever,
            "question": itemgetter("question"),
            "template": lambda _: template_content,  # Inject template content
        }
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | ChatOpenAI(model="gpt-4") | StrOutputParser()}
    )
166
+
167
+ # Caching function to prevent redundant RAG processing
168
@st.cache_data
def _cached_rag_answer(question: str) -> str:
    """Invoke the session's RAG chain for ``question``; memoised per question text."""
    return st.session_state.rag_chain.invoke({"question": question})["response"]


def cached_response(question: str):
    """Return the RAG answer for ``question``, reusing a cached answer when available.

    The "chain not initialized" check is deliberately kept outside the
    ``st.cache_data``-decorated helper.  Previously the error branch lived inside
    the cached function, so its empty result was cached for that question and
    returned again even after the chain had been initialized.

    Args:
        question: The user's question.

    Returns:
        The chain's ``"response"`` string, or ``""`` (after showing an error)
        when no RAG chain is stored in the session state.
    """
    if "rag_chain" not in st.session_state:
        st.error("RAG chain not initialized. Please initialize the IND Assistant first.")
        return ""
    return _cached_rag_answer(question)
176
+
177
+ # --- Submission Assessment Functions ---
178
+
179
+ # Access API key from environment variable
180
+ LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
181
+
182
+ # Check if the API key is available
183
+ if not LLAMA_CLOUD_API_KEY:
184
+ st.error("LLAMA_CLOUD_API_KEY not found in environment variables. Please set it in your Hugging Face Space secrets.")
185
+ st.stop()
186
+
187
+ # Sample Checklist Configuration (this should be adjusted to your actual IND requirements)
188
+ IND_CHECKLIST = {
189
+ "Form FDA-1571": {
190
+ "file_patterns": ["1571", "fda-1571"],
191
+ "required_keywords": [
192
+ # Sponsor Information
193
+ "Name of Sponsor",
194
+ "Date of Submission",
195
+ "Address 1",
196
+ "Sponsor Telephone Number",
197
+ # Drug Information
198
+ "Name of Drug",
199
+ "IND Type",
200
+ "Proposed Indication for Use",
201
+ # Regulatory Information
202
+ "Phase of Clinical Investigation",
203
+ "Serial Number",
204
+ # Application Contents
205
+ "Table of Contents",
206
+ "Investigator's Brochure",
207
+ "Study protocol",
208
+ "Investigator data",
209
+ "Facilities data",
210
+ "Institutional Review Board data",
211
+ "Environmental assessment",
212
+ "Pharmacology and Toxicology",
213
+ # Signatures and Certifications
214
+ #"Person Responsible for Clinical Investigation Monitoring",
215
+ #"Person Responsible for Reviewing Safety Information",
216
+ "Sponsor or Sponsor's Authorized Representative First Name",
217
+ "Sponsor or Sponsor's Authorized Representative Last Name",
218
+ "Sponsor or Sponsor's Authorized Representative Title",
219
+ "Sponsor or Sponsor's Authorized Representative Telephone Number",
220
+ "Date of Sponsor's Signature"
221
+ ]
222
+ },
223
+ "Table of Contents": {
224
+ "file_patterns": ["toc", "table of contents"],
225
+ "required_keywords": ["table of contents", "sections", "appendices"]
226
+ },
227
+ "Introductory Statement": {
228
+ "file_patterns": ["intro", "introductory", "general plan"],
229
+ "required_keywords": ["introduction", "investigational plan", "objectives"]
230
+ },
231
+ "Investigator Brochure": {
232
+ "file_patterns": ["brochure", "ib"],
233
+ "required_keywords": ["pharmacology", "toxicology", "clinical data"]
234
+ },
235
+ "Clinical Protocol": {
236
+ "file_patterns": ["clinical", "protocol"],
237
+ "required_keywords": ["study design", "objectives", "patient population", "dosing regimen", "endpoints"]
238
+ },
239
+ "CMC Information": {
240
+ "file_patterns": ["cmc", "chemistry", "manufacturing"],
241
+ "required_keywords": ["manufacturing", "controls", "specifications", "stability"]
242
+ },
243
+ "Pharmacology and Toxicology": {
244
+ "file_patterns": ["pharm", "tox", "pharmacology", "toxicology"],
245
+ "required_keywords": ["pharmacology studies", "toxicology studies", "animal studies"]
246
+ },
247
+ "Previous Human Experience": {
248
+ "file_patterns": ["human", "experience", "previous"],
249
+ "required_keywords": ["previous studies", "human subjects", "clinical experience"]
250
+ },
251
+ "Additional Information": {
252
+ "file_patterns": ["additional", "other", "supplemental"],
253
+ "required_keywords": ["additional data", "supplementary information"]
254
+ }
255
+ }
256
+
257
+
258
class ChecklistCrossReferenceAgent:
    """
    Agent that cross-references the pre-parsed submission package data
    against a predefined IND checklist.

    Input:
        submission_data: list of dicts representing each file with keys:
            - "filename": Filename of the document.
            - "file_type": e.g., "pdf" or "txt"
            - "content": Extracted text from the document.
            - "metadata": (Optional) Additional metadata.
        checklist: dict mapping a checklist item name to a config dict with
            "file_patterns" and "required_keywords" lists.
    Output:
        A mapping of checklist items to ``{"status", "missing_fields"}`` where
        status is "missing", "incomplete" or "present".
    """

    def __init__(self, checklist):
        self.checklist = checklist

    @staticmethod
    def _pattern_in_filename(pattern, filename):
        """Return True if ``pattern`` occurs in ``filename`` at the start of a token.

        A plain substring test produces false positives for short patterns:
        "toc" matches "protocol" and "ib" matches "contribution".  An occurrence
        therefore only counts when the character in front of it is not of the
        same kind as the pattern's first character (letter after letter, digit
        after digit).  Prefix patterns such as "intro" still match
        "introduction", and "1571" still matches "form1571".

        ``filename`` is expected to be lower-cased already.  Empty patterns
        never match.
        """
        needle = pattern.lower()
        if not needle:
            return False

        if needle[0].isalpha():
            same_kind = str.isalpha
        elif needle[0].isdigit():
            same_kind = str.isdigit
        else:
            same_kind = None  # Pattern starts with punctuation: no boundary needed.

        start = filename.find(needle)
        while start != -1:
            if same_kind is None or start == 0 or not same_kind(filename[start - 1]):
                return True
            start = filename.find(needle, start + 1)
        return False

    def _find_matching_file(self, submission_data, file_patterns):
        """Return the first file whose name matches any pattern, or None."""
        for file_info in submission_data:
            filename = file_info.get("filename", "").lower()
            if any(self._pattern_in_filename(pattern, filename) for pattern in file_patterns):
                return file_info
        return None

    def run(self, submission_data):
        """Check every checklist item against ``submission_data``.

        For each item the first file matching one of its filename patterns is
        selected; its content is then searched (case-insensitively) for every
        required keyword.
        """
        cross_reference_result = {}
        for document_name, config in self.checklist.items():
            file_patterns = config.get("file_patterns", [])
            required_keywords = config.get("required_keywords", [])
            matched_file = self._find_matching_file(submission_data, file_patterns)

            if matched_file is None:
                # File is completely missing.  Copy the list so callers cannot
                # mutate the checklist configuration through the result.
                cross_reference_result[document_name] = {
                    "status": "missing",
                    "missing_fields": list(required_keywords),
                }
                continue

            # File found, check if its content includes the required keywords.
            content = (matched_file.get("content") or "").lower()
            missing_fields = [
                keyword for keyword in required_keywords if keyword.lower() not in content
            ]
            cross_reference_result[document_name] = {
                "status": "incomplete" if missing_fields else "present",
                "missing_fields": missing_fields,
            }
        return cross_reference_result
315
+
316
+
317
class AssessmentRecommendationAgent:
    """
    Agent that analyzes the cross-reference data and produces an
    assessment report with recommendations.

    Input:
        cross_reference_result: dict mapping checklist items to a dict with
            "status" ("missing", "incomplete" or "present") and "missing_fields".
    Output:
        A dict with "overall_compliant" (bool) and "recommendations"
        (checklist item -> recommendation sentence).
    """

    def run(self, cross_reference_result):
        """Build the assessment; only an explicit "present" status counts as complete."""
        recommendations = {}
        overall_compliant = True

        for doc, result in cross_reference_result.items():
            status = result.get("status")
            if status == "present":
                recommendations[doc] = f"{doc} is complete."
                continue

            # Every other status makes the package non-compliant.
            overall_compliant = False
            if status == "missing":
                recommendations[doc] = f"{doc} is missing. Please include the document."
            elif status == "incomplete":
                missing = ", ".join(result.get("missing_fields") or [])
                recommendations[doc] = (f"{doc} is incomplete. Missing required fields: {missing}. "
                                        "Please update accordingly.")
            else:
                # An unknown or absent status used to fall through to
                # "complete", silently hiding malformed results.
                recommendations[doc] = (f"{doc} has an unrecognized status ({status!r}). "
                                        "Please review manually.")

        return {
            "overall_compliant": overall_compliant,
            "recommendations": recommendations,
        }
348
+
349
+
350
class OutputFormatterAgent:
    """
    Agent that formats the assessment report into a user-friendly format.
    This example formats the output as Markdown.

    Input:
        assessment: dict output from AssessmentRecommendationAgent.
    Output:
        A formatted string report.
    """

    # Fragments of the "incomplete" recommendation sentence that surround the
    # comma-separated list of missing fields.
    _FIELDS_MARKER = "Missing required fields: "
    _FIELDS_SUFFIX = ". Please update accordingly."

    def _extract_missing_fields(self, rec):
        """Return the missing-field names embedded in ``rec``, or None if there are none.

        The recommendation only counts as "incomplete" when it contains the
        field marker.  The previous check ("incomplete" anywhere in the text,
        followed by ``split(marker)[1]``) raised ``IndexError`` for texts that
        mention "incomplete" without the marker, and cut the list at the first
        "." even when a field name itself contained one.
        """
        _, marker, tail = rec.partition(self._FIELDS_MARKER)
        if not marker:
            return None
        if tail.endswith(self._FIELDS_SUFFIX):
            tail = tail[: -len(self._FIELDS_SUFFIX)]
        else:
            # Unknown trailing text: fall back to "everything up to the first period".
            tail = tail.split(".")[0]
        return [field for field in tail.split(", ") if field]

    def run(self, assessment):
        """Render ``assessment`` as a Markdown report.

        Each checklist item gets a level-3 heading followed by its status;
        incomplete items list their missing fields as nested bullets.
        """
        overall = "Compliant" if assessment.get("overall_compliant") else "Non-Compliant"
        lines = [
            "# Submission Package Assessment Report",
            f"**Overall Compliance:** {overall}\n",
        ]
        for doc, rec in assessment.get("recommendations", {}).items():
            lines.append(f"### {doc}")
            missing_fields = self._extract_missing_fields(rec)
            if missing_fields is None:
                lines.append(f"- Status: {rec}")
                continue
            # Indent by two and four spaces so Markdown renders nested lists.
            lines.append("- Status: Incomplete")
            lines.append("  - Missing Fields:")
            lines.extend(f"    - {field}" for field in missing_fields)
        return "\n".join(lines)
378
+
379
+
380
class SupervisorAgent:
    """
    Supervisor Agent to orchestrate the agent pipeline in a serial, chained flow:

    1. ChecklistCrossReferenceAgent
    2. AssessmentRecommendationAgent
    3. OutputFormatterAgent

    Input:
        submission_data: Pre-processed submission package data.
    Output:
        A final formatted report and completeness percentage.
    """

    def __init__(self, checklist):
        self.checklist_agent = ChecklistCrossReferenceAgent(checklist)
        self.assessment_agent = AssessmentRecommendationAgent()
        self.formatter_agent = OutputFormatterAgent()
        # Derived from the checklist rather than hard-coded, so the percentage
        # stays correct when checklist items are added or removed.
        self.total_required_files = len(checklist)

    def run(self, submission_data):
        """Run the pipeline and return ``(formatted_report, completeness_percentage)``."""
        # Step 1: Cross-reference the submission data against the checklist
        cross_ref_result = self.checklist_agent.run(submission_data)
        # Step 2: Analyze the cross-reference result to produce assessment and recommendations
        assessment_report = self.assessment_agent.run(cross_ref_result)
        # Step 3: Calculate completeness percentage
        completeness_percentage = self.calculate_completeness(cross_ref_result)
        # Step 4: Format the assessment report for display
        formatted_report = self.formatter_agent.run(assessment_report)
        return formatted_report, completeness_percentage

    def calculate_completeness(self, cross_ref_result):
        """Calculate the completeness percentage of the submission package.

        "present" items count fully, "incomplete" items count as half finished,
        anything else counts as zero.  The result is a float in ``[0, 100]``;
        an empty checklist yields ``0.0`` instead of dividing by zero.
        """
        if not self.total_required_files:
            return 0.0
        weights = {"present": 1.0, "incomplete": 0.5}
        completed_files = sum(
            weights.get(result.get("status"), 0.0) for result in cross_ref_result.values()
        )
        # Clamp so a result with more entries than required files cannot
        # produce a value above 100 (the caller feeds this to a progress bar).
        return min(completed_files / self.total_required_files, 1.0) * 100
419
+
420
+
421
+ # --- Helper Functions for ZIP Processing ---
422
+
423
def _split_s3_url(s3_url):
    """Split an S3 location into ``(bucket, key)``.

    Supported forms:
        * ``s3://bucket/key``
        * ``https://bucket.s3.<region>.amazonaws.com/key`` (virtual-hosted style)
        * ``https://s3.<region>.amazonaws.com/bucket/key`` (path style)

    Raises:
        ValueError: If the URL has another form or lacks a bucket or key.
    """
    from urllib.parse import unquote, urlparse  # stdlib; local to keep the file's import block untouched

    url = s3_url.strip()
    if url.lower().startswith("s3://"):
        # Same result as the former positional split for s3:// URIs.
        bucket, _, key = url[len("s3://"):].partition("/")
    else:
        parsed = urlparse(url)
        host = parsed.netloc.lower()
        path = unquote(parsed.path.lstrip("/"))  # HTTPS object URLs are percent-encoded
        if parsed.scheme not in ("http", "https") or not host.endswith(".amazonaws.com"):
            raise ValueError(f"Unsupported S3 URL: {s3_url!r}")
        if host.startswith(("s3.", "s3-")):
            bucket, _, key = path.partition("/")
        elif ".s3" in host:
            bucket, key = host[: host.rfind(".s3")], path
        else:
            raise ValueError(f"Unsupported S3 URL: {s3_url!r}")

    if not bucket or not key:
        raise ValueError(f"S3 URL must contain both a bucket and a key: {s3_url!r}")
    return bucket, key


def download_zip_from_s3(s3_url: str) -> BytesIO:
    """Downloads a ZIP file from S3 and returns it as a BytesIO object.

    Args:
        s3_url: Object location; see ``_split_s3_url`` for the accepted forms.
            Previously only ``s3://bucket/key`` was parsed correctly; HTTPS
            object URLs yielded the host name as the bucket.

    Returns:
        The object's bytes wrapped in ``BytesIO``, or ``None`` when the URL is
        invalid, credentials are missing, or the download fails (an error is
        shown in the Streamlit UI in that case).
    """
    try:
        # Parse S3 URL first so a malformed URL fails before any AWS call.
        bucket_name, key = _split_s3_url(s3_url)

        s3 = boto3.client(
            's3',
            aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
            aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
            region_name=os.environ["AWS_REGION"],
        )

        # Download the file
        response = s3.get_object(Bucket=bucket_name, Key=key)
        return BytesIO(response['Body'].read())
    except Exception as e:
        # Best-effort boundary: report any failure to the user and return None.
        st.error(f"Error downloading ZIP file from S3: {str(e)}")
        return None
444
+
445
def download_zip_from_url(url: str) -> BytesIO:
    """Downloads a ZIP file from a URL and returns it as a BytesIO object.

    Args:
        url: HTTP(S) location of the ZIP file.

    Returns:
        The response body wrapped in ``BytesIO``, or ``None`` when the request
        fails or returns an error status (an error is shown in the Streamlit UI).
    """
    try:
        # A timeout keeps an unresponsive server from hanging the app.
        # ``stream=True`` was dropped: the whole body is read via ``.content``
        # anyway, and a streamed response is not released until it is
        # consumed or closed.
        response = requests.get(url, timeout=(10, 120))  # (connect, read) seconds
        response.raise_for_status()  # Raise an exception for bad status codes
        return BytesIO(response.content)
    except requests.exceptions.RequestException as e:
        st.error(f"Error downloading ZIP file from URL: {str(e)}")
        return None
454
+
455
def _load_cached_text(cache_file, filename):
    """Return the cached text for a file, or None when there is no usable cache entry."""
    if not os.path.exists(cache_file):
        return None
    print(f"Loading {filename} from cache")
    try:
        # NOTE: pickle is only safe here because the cache is written by this
        # app itself; never point this at files from an untrusted source.
        with open(cache_file, "rb") as f:
            cached = pickle.load(f)
    except Exception as e:
        print(f"Ignoring unreadable cache entry for {filename}: {str(e)}")
        return None
    return cached if isinstance(cached, str) else None


def _save_cached_text(cache_file, content, filename):
    """Best-effort write of extracted text to the cache."""
    try:
        with open(cache_file, "wb") as f:
            pickle.dump(content, f)
    except Exception as e:
        st.error(f"Error saving {filename} to cache: {str(e)}")


def _parse_pdf_bytes(file_bytes, filename):
    """Extract text from PDF bytes with LlamaParse; returns ``(content, succeeded)``."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(file_bytes)
        tmp_path = tmp.name
    try:
        size_mb = len(file_bytes) / (1024 * 1024)
        parser = LlamaParse(
            api_key=LLAMA_CLOUD_API_KEY,
            result_type="markdown",
            num_workers=2 if size_mb > 2 else 1,
            verbose=True,
        )
        llama_documents = parser.load_data(tmp_path)
        return "\n".join(doc.text for doc in llama_documents), True
    except Exception as e:
        st.error(f"Error parsing PDF {filename}: {str(e)}")
        return f"Error parsing PDF: {str(e)}", False
    finally:
        os.remove(tmp_path)


def _decode_text_bytes(file_bytes):
    """Decode text as UTF-8, falling back to Latin-1 (which accepts any byte)."""
    try:
        return file_bytes.decode("utf-8")
    except UnicodeDecodeError:
        return file_bytes.decode("latin1")


def process_uploaded_zip(zip_file: BytesIO) -> list:
    """
    Extracts the text of every PDF and text file in a ZIP archive.

    Extracted text is cached on disk under ``.cache`` so unchanged files are
    not parsed again.  Differences from the previous implementation:

    * The cache file name is derived from the content hash only.  It used to
      include the archive member name, which broke caching for files in
      sub-folders and let a crafted member name ("../...") write outside the
      cache directory.
    * Failed PDF parses are no longer cached, so a transient parser error is
      retried on the next run instead of being stored permanently.
    * An unreadable cache entry triggers re-processing instead of yielding
      empty content.

    Args:
        zip_file: The ZIP archive as a file-like object.

    Returns:
        One dict per supported file with keys ``filename``, ``file_type``
        ("pdf" or "txt"), ``content`` and ``metadata`` (empty dict).
        Directories and other file types are skipped.
    """
    cache_dir = ".cache"
    os.makedirs(cache_dir, exist_ok=True)
    submission_data = []

    with ZipFile(zip_file) as zip_ref:
        for filename in zip_ref.namelist():
            file_ext = os.path.splitext(filename)[1].lower()
            if file_ext not in (".pdf", ".txt"):
                continue  # Skip directories and unsupported file types

            file_bytes = zip_ref.read(filename)
            file_hash = hashlib.sha256(file_bytes).hexdigest()
            cache_file = os.path.join(cache_dir, f"{file_hash}{file_ext}.pkl")

            content = _load_cached_text(cache_file, filename)
            if content is None:
                print(f"Processing {filename} and caching")
                if file_ext == ".pdf":
                    content, succeeded = _parse_pdf_bytes(file_bytes, filename)
                else:
                    content, succeeded = _decode_text_bytes(file_bytes), True
                if succeeded:
                    _save_cached_text(cache_file, content, filename)

            submission_data.append({
                "filename": filename,
                "file_type": file_ext.replace(".", ""),
                "content": content,
                "metadata": {},
            })
    return submission_data
533
+
534
+ # --- Main Streamlit App ---
535
+
536
def _render_ind_assistant():
    """Render the chat page: history, knowledge-base setup and question handling."""
    st.header("IND Assistant")
    st.markdown("Chat about Investigational New Drug Applications")

    # "Clear Chat History" button on the main screen
    if st.button("Clear Chat History"):
        if "messages" in st.session_state:
            del st.session_state["messages"]
        st.rerun()

    # Chat history lives in the session state
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Build the vector store and RAG chain once per session
    knowledge_base_ready = (
        "rag_chain" in st.session_state and "vectorstore" in st.session_state
    )
    if not knowledge_base_ready:
        if not os.path.exists(PREPROCESSED_FILE):
            st.error(f"❌ Preprocessed file '{PREPROCESSED_FILE}' not found. Please run preprocessing first.")
            return  # Nothing to chat about without preprocessed data

        with st.spinner("🔄 Initializing knowledge base..."):
            vectorstore = init_vector_store(load_preprocessed_data(PREPROCESSED_FILE))
            st.session_state.rag_chain = create_rag_chain(vectorstore.as_retriever())
            st.session_state.vectorstore = vectorstore

    # Replay the conversation so far
    for past_message in st.session_state.messages:
        with st.chat_message(past_message["role"]):
            st.markdown(past_message["content"])

    prompt = st.chat_input("Ask about IND requirements")
    if not prompt:
        return

    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Answer the question (cached if it was asked before)
    with st.chat_message("assistant"):
        response = cached_response(prompt)
        st.markdown(response)

    st.session_state.messages.append({"role": "assistant", "content": response})


def _render_submission_assessment():
    """Render the assessment page: obtain a ZIP, analyse it and show the report."""
    st.header("Submission Package Assessment")
    st.write(
        """
Upload a ZIP file containing your submission package, or enter the S3 URL of the ZIP file.
The ZIP file can include PDF and text files.

Required Files:
1. Form FDA-1571
2. Table of Contents
3. Introductory Statement and General Investigational Plan
4. Investigator Brochure
5. Clinical Protocol
6. Chemistry Manufacturing and Control Information (CMC)
7. Pharmacology and Toxicology Data
8. Previous Human Experience
9. Additional Information
"""
    )

    # Two ways to supply the package; an uploaded file takes precedence.
    uploaded_file = st.file_uploader("Choose a ZIP file", type=["zip"])
    s3_url = st.text_input("Or enter S3 URL of the ZIP file:")

    zip_file = None
    if uploaded_file is not None:
        zip_file = BytesIO(uploaded_file.read())
    elif s3_url:
        zip_file = download_zip_from_s3(s3_url)

    if not zip_file:
        return

    try:
        submission_data = process_uploaded_zip(zip_file)
        st.success("File processed successfully!")

        # Summary of the extracted files
        st.subheader("Extracted Files")
        for extracted in submission_data:
            st.write(f"**{extracted['filename']}** - ({extracted['file_type'].upper()})")

        # Run the agent pipeline
        report, completeness = SupervisorAgent(IND_CHECKLIST).run(submission_data)

        st.subheader("Submission Package Completeness")
        st.progress(completeness / 100)
        st.write(f"Overall Completeness: {completeness:.1f}%")

        st.subheader("Assessment Report")
        st.markdown(report)
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")


def main():
    """Streamlit entry point: show the title and dispatch to the selected app mode."""
    st.title("IND Assistant and Submission Assessment")

    # Sidebar for app selection
    mode_options = ["IND Assistant", "Submission Assessment"]
    app_mode = st.sidebar.selectbox("Choose an app mode", mode_options)

    if app_mode == "IND Assistant":
        _render_ind_assistant()
    elif app_mode == "Submission Assessment":
        _render_submission_assessment()


if __name__ == "__main__":
    # Build the preprocessed JSON on first start, then launch the UI.
    needs_preprocessing = not os.path.exists(PREPROCESSED_FILE)
    if needs_preprocessing:
        preprocess_pdf(PDF_FILE)
    main()
preprocessed_docs.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ openai>=1.0.0
2
+ langchain>=0.0.148
3
+ langchain-openai>=0.0.1
4
+ langchain-community>=0.1.0
5
+ streamlit>=1.32.0
6
+ qdrant-client>=0.3.0
7
+ llama-parse>=0.0.1
8
+ nest-asyncio>=1.5.6
9
+ torch>=2.0.0
10
+ sentence-transformers>=2.2.2
11
+ langgraph>=0.1.0
12
+ tiktoken
13
+ boto3
14
+ requests
template.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1. Pre-IND Meeting Preparation
2
+ Request a Pre-IND Meeting: Schedule a meeting with the FDA to discuss your IND submission.
3
+
4
+ Prepare Meeting Package: Include proposed clinical trial design, preclinical data, manufacturing information, and any other relevant data.
5
+
6
+ Submit Questions: Prepare a list of specific questions for the FDA regarding your IND submission.
7
+
8
+ 2. Form FDA 1571
9
+ Complete Form FDA 1571: Ensure all sections are filled out accurately, including sponsor information, drug information, and clinical trial details.
10
+
11
+ Signature: Obtain the required signature from the sponsor or authorized representative.
12
+
13
+ 3. Table of Contents
14
+ Create a Comprehensive Table of Contents: Organize the IND submission with clear sections and page numbers for easy navigation.
15
+
16
+ 4. Introductory Statement and General Investigational Plan
17
+ Introductory Statement: Provide a brief overview of the drug, including its name, structure, and pharmacological class.
18
+
19
+ General Investigational Plan: Outline the clinical development plan, including the objectives and duration of the proposed studies.
20
+
21
+ 5. Investigator's Brochure
22
+ Compile the Investigator's Brochure: Include all relevant information about the drug, such as its formulation, pharmacology, toxicology, and clinical data.
23
+
24
+ Update as Necessary: Ensure the brochure is up-to-date with the latest data.
25
+
26
+ 6. Clinical Protocol
27
+ Develop Clinical Protocol: Detail the study design, including objectives, patient population, dosing regimen, and endpoints.
28
+
29
+ Inclusion/Exclusion Criteria: Clearly define the criteria for patient selection.
30
+
31
+ Safety Monitoring: Outline the procedures for monitoring patient safety.
32
+
33
+ 7. Chemistry, Manufacturing, and Control (CMC) Information
34
+ Drug Substance Information: Provide details on the drug substance, including its manufacture, characterization, and controls.
35
+
36
+ Drug Product Information: Include information on the drug product, such as formulation, manufacturing process, and specifications.
37
+
38
+ Stability Data: Submit stability data to support the proposed shelf life of the drug.
39
+
40
+ Labeling: Provide draft labeling for the investigational drug.
41
+
42
+ 8. Pharmacology and Toxicology Data
43
+ Pharmacology Studies: Submit data from in vitro and in vivo studies that demonstrate the drug's pharmacological effects.
44
+
45
+ Toxicology Studies: Include data from acute, subacute, and chronic toxicity studies, as well as reproductive and genotoxicity studies.
46
+
47
+ Safety Pharmacology: Provide data on the drug's effects on vital organ systems.
48
+
49
+ 9. Previous Human Experience
50
+ Summarize Previous Human Experience: If applicable, include data from previous clinical trials or use in humans.
51
+
52
+ Safety and Efficacy Data: Highlight any relevant safety and efficacy findings from prior studies.
53
+
54
+ 10. Additional Information
55
+ Environmental Assessment: Submit an environmental assessment or claim an exclusion if applicable.
56
+
57
+ Special Considerations: Include any additional information that may be relevant, such as data from pediatric studies or risk management plans.
58
+
59
+ 11. Review and Quality Control
60
+ Internal Review: Conduct a thorough internal review of the IND submission to ensure accuracy and completeness.
61
+
62
+ Quality Control: Verify that all data and documents meet regulatory standards and guidelines.
63
+
64
+ 12. Submission to FDA
65
+ Compile the IND Submission: Assemble all sections into a single, well-organized submission.
66
+
67
+ Submit to FDA: Send the IND submission to the appropriate FDA division via the required submission method (e.g., electronic submission).
68
+
69
+ Confirmation of Receipt: Obtain confirmation from the FDA that the IND has been received and is under review.
70
+
71
+
72
+