Update document_analyzer.py

document_analyzer.py CHANGED (+10 -26)
```diff
@@ -1,5 +1,5 @@
 # document_analyzer.py
-# Enhanced document analysis module for healthcare fraud detection with Llama 4
+# Enhanced document analysis module for healthcare fraud detection with Llama 4 (text-only)
 
 import torch
 import re
@@ -13,9 +13,9 @@ except LookupError:
     nltk.download('punkt')
 
 class HealthcareFraudAnalyzer:
-    def __init__(self, model, processor, device=None):
+    def __init__(self, model, tokenizer, device=None):
         self.model = model
-        self.processor = processor
+        self.tokenizer = tokenizer
         self.device = device if device else "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
         self.model.eval()
@@ -68,32 +68,16 @@ class HealthcareFraudAnalyzer:
         return chunks
 
     def analyze_chunk(self, chunk: str) -> Dict[str, Any]:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"""Analyze the following healthcare document text for evidence of fraud, neglect, abuse, or criminal conduct.
+        prompt = f"""<s>[INST] Analyze the following healthcare document text for evidence of fraud, neglect, abuse, or criminal conduct.
 Focus on: {', '.join(self.fraud_categories)}.
 Provide specific indicators and cite the relevant text.
 
 DOCUMENT TEXT:
 {chunk}
 
-ANALYSIS:"""
-                    }
-                ]
-            }
-        ]
+ANALYSIS: [/INST]"""
 
-        inputs = self.processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt"
-        ).to(self.device)
+        inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(self.device)
 
         with torch.no_grad():
             output = self.model.generate(
@@ -104,8 +88,8 @@ ANALYSIS:"""
                 repetition_penalty=1.2
             )
 
-        response = self.processor.decode(output[0], skip_special_tokens=True)
-        analysis = response.strip()
+        response = self.tokenizer.decode(output[0], skip_special_tokens=True)
+        analysis = response.split("ANALYSIS:")[-1].strip()
 
         term_matches = self._find_key_terms(chunk)
 
@@ -262,7 +246,7 @@ ANALYSIS:"""
 
         print("\n" + "="*80 + "\n")
 
-def analyze_pdf_for_fraud(pdf_path, model, processor):
+def analyze_pdf_for_fraud(pdf_path, model, tokenizer):
     import pdfplumber
 
     with pdfplumber.open(pdf_path) as pdf:
@@ -270,7 +254,7 @@ def analyze_pdf_for_fraud(pdf_path, model, processor):
         for page in pdf.pages:
            text += page.extract_text() or ""
 
-    analyzer = HealthcareFraudAnalyzer(model, processor)
+    analyzer = HealthcareFraudAnalyzer(model, tokenizer)
     results = analyzer.analyze_document(text)
 
     analyzer.print_report(results)
```
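A note on the hard-coded `<s>[INST] ... [/INST]` wrapper: the old code asked the processor's chat template to build the prompt, and plain tokenizers expose the same mechanism. A minimal sketch of that alternative, assuming the checkpoint ships a chat template; `prompt_body`, `tokenizer`, and `model` are stand-ins for the values used inside `analyze_chunk`, not names from this commit:

```python
# Sketch only: let the tokenizer's chat template build the instruct wrapper
# instead of hard-coding "<s>[INST] ... [/INST]" in the f-string.
messages = [{"role": "user", "content": prompt_body}]  # prompt_body: the analysis request text
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant-turn header
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
```

This also sidesteps a BOS-duplication footgun: `add_special_tokens` defaults to True in the plain tokenizer call, and many Llama-family tokenizers additionally parse a literal `<s>` in the text as the BOS special token, so the committed version can end up with two BOS tokens.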
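Similarly, on `response.split("ANALYSIS:")[-1]`: for a decoder-only model, `output[0]` is the prompt tokens followed by the completion, so slicing by prompt length isolates the new text without depending on the "ANALYSIS:" marker surviving in the prompt. A sketch under the same stand-in names:

```python
# Sketch only: strip the echoed prompt by token count rather than by searching
# the decoded string for the "ANALYSIS:" marker.
prompt_len = inputs["input_ids"].shape[-1]
analysis = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
```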
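Finally, an end-to-end usage sketch of the updated text-only entry point. The checkpoint ID and PDF path below are placeholders, not part of this commit:

```python
# Usage sketch: load a causal LM plus tokenizer and run the PDF entry point.
from transformers import AutoModelForCausalLM, AutoTokenizer

from document_analyzer import analyze_pdf_for_fraud

model_id = "your-org/your-llama-checkpoint"  # placeholder checkpoint ID
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")

# HealthcareFraudAnalyzer moves the model to CUDA when available and sets eval mode.
analyze_pdf_for_fraud("records/claims_sample.pdf", model, tokenizer)
```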