File size: 2,071 Bytes
d9cfebf 4d504fd d9cfebf 4d504fd d9cfebf 19103d4 4d504fd d9cfebf 4d504fd d9cfebf 4d504fd d9cfebf 4d504fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# document_analyzer.py
# Analyzer for healthcare fraud detection using Llama 4 Maverick (text-only)
import ast

import nltk
import torch
from nltk.tokenize import sent_tokenize
class HealthcareFraudAnalyzer:
    """Flag potential healthcare-fraud indicators in text with a causal LM.

    Each sentence is sent to the model with a prompt asking for a Python
    dict literal of the form
    ``{'fraud_detected': bool, 'reason': str, 'confidence': float}``.
    Sentences the model flags are collected and returned.
    """

    # Returned whenever the model's response cannot be parsed as the
    # expected dict literal; treated as "nothing detected".
    _INVALID = {"fraud_detected": False, "reason": "Invalid response", "confidence": 0.0}

    def __init__(self, model, tokenizer, accelerator):
        """Store the model/tokenizer and resolve the accelerator's device.

        Args:
            model: A HuggingFace-style causal LM exposing ``generate``.
            tokenizer: Matching tokenizer (callable, with ``decode``).
            accelerator: Object exposing a ``.device`` attribute
                (e.g. an ``accelerate.Accelerator``).
        """
        self.model = model
        self.tokenizer = tokenizer
        self.accelerator = accelerator
        self.device = self.accelerator.device
        # Ensure the NLTK 'punkt' sentence tokenizer is present; download
        # lazily on first use rather than failing at analysis time.
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    @staticmethod
    def _parse_result(response):
        """Safely parse the model's dict-literal response.

        Uses ``ast.literal_eval`` instead of ``eval`` — the response is
        untrusted model output, and ``eval`` would allow arbitrary code
        execution. Returns ``_INVALID`` for anything that is not a
        well-formed dict literal.
        """
        if not response.startswith("{"):
            return dict(HealthcareFraudAnalyzer._INVALID)
        try:
            result = ast.literal_eval(response)
        except (ValueError, SyntaxError):
            # Malformed literal, or a non-literal construct literal_eval refuses.
            return dict(HealthcareFraudAnalyzer._INVALID)
        if not isinstance(result, dict):
            return dict(HealthcareFraudAnalyzer._INVALID)
        return result

    def analyze_document(self, sentences):
        """Analyze each sentence and return the flagged ones.

        Args:
            sentences: Iterable of sentence strings.

        Returns:
            list[dict]: One entry per flagged sentence, with keys
            ``sentence``, ``reason`` and ``confidence``.
        """
        fraud_indicators = []
        for sentence in sentences:
            prompt = (
                f"Analyze the following sentence for potential healthcare fraud indicators, "
                f"such as consent violations, medication misuse, or billing irregularities. "
                f"Provide a reason and confidence score (0-1). "
                f"Sentence: {sentence}\nOutput format: {{'fraud_detected': bool, 'reason': str, 'confidence': float}}"
            )
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True
                )
            # Decode only the newly generated tokens. Decoding outputs[0] in
            # full would prepend the prompt, so the response could never
            # start with "{" and every sentence would be discarded.
            prompt_len = inputs["input_ids"].shape[1]
            response = self.tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
            result = self._parse_result(response)
            try:
                if result["fraud_detected"]:
                    fraud_indicators.append({
                        "sentence": sentence,
                        "reason": result["reason"],
                        "confidence": result["confidence"]
                    })
            except KeyError:
                # Model returned a dict missing an expected key; skip the sentence.
                continue
        return fraud_indicators