Spaces:

Cylanoid
/

Nursing-Home-Fraud-Detection-using-Llama

Paused

App Files Community

Cylanoid committed on Mar 8

Commit

4b6c42c

verified ·

1 Parent(s): 9b8712f

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -157

app.py CHANGED Viewed

@@ -1,163 +1,44 @@
-# app.py
 import gradio as gr
-from transformers import LlamaForCausalLM, LlamaTokenizer
-import datasets
 import torch
-import json
-import os
-import pdfplumber
-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-from accelerate import Accelerator
-import bitsandbytes
-import sentencepiece
-import huggingface_hub
-from transformers import TrainingArguments, Trainer
-# Debug: Print all environment variables to verify 'LLama' is present
-print("Environment variables:", dict(os.environ))
-# Retrieve the token from Hugging Face Space secrets
-# Token placement: LLama:levi put token here
-LLama = os.getenv("LLama")  # Retrieves the value of the 'LLama' environment variable
-if not LLama:
-    raise ValueError("LLama token not found in environment variables. Please set it in Hugging Face Space secrets under 'Settings' > 'Secrets' as 'LLama'.")
-# Debug: Print the token to verify it's being read (remove this in production)
-print(f"Retrieved LLama token: {LLama[:5]}... (first 5 chars for security)")
-# Authenticate with Hugging Face
-huggingface_hub.login(token=LLama)
-# Model setup
-MODEL_ID = "meta-llama/Llama-2-7b-hf"
-tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-# Load model with default attention mechanism (no Flash Attention)
-model = LlamaForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    load_in_8bit=True
-)
-# Add padding token if it doesn't exist and resize embeddings
-if tokenizer.pad_token is None:
-    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-    model.resize_token_embeddings(len(tokenizer))
-# Prepare model for LoRA training
-model = prepare_model_for_kbit_training(model)
-peft_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM",
-    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
-)
-model = get_peft_model(model, peft_config)
-model.print_trainable_parameters()
-# Function to process uploaded files and train
-def train_ui(files):
     try:
-        # Process multiple PDFs or JSON
-        raw_text = ""
-        dataset = None  # Initialize dataset as None
-        for file in files:
-            if file.name.endswith(".pdf"):
-                with pdfplumber.open(file.name) as pdf:
-                    for page in pdf.pages:
-                        raw_text += page.extract_text() or ""
-            elif file.name.endswith(".json"):
-                with open(file.name, "r", encoding="utf-8") as f:
-                    raw_data = json.load(f)
-                    training_data = raw_data.get("training_pairs", raw_data)
-                    with open("temp_fraud_data.json", "w", encoding="utf-8") as f:
-                        json.dump({"training_pairs": training_data}, f)
-                    dataset = datasets.load_dataset("json", data_files="temp_fraud_data.json")
-        if not raw_text and not dataset:
-            return "Error: No valid PDF or JSON data found."
-        # Create training pairs from PDFs if no JSON
-        if raw_text:
-            def create_training_pairs(text):
-                pairs = []
-                if "Haloperidol" in text and "daily" in text.lower():
-                    pairs.append({
-                        "input": "Patient received Haloperidol daily. Is this overmedication?",
-                        "output": "Yes, daily Haloperidol use without documented severe psychosis or failed alternatives may indicate overmedication, violating CMS guidelines."
-                    })
-                if "Lorazepam" in text and "frequent" in text.lower():
-                    pairs.append({
-                        "input": "Care logs show frequent Lorazepam use with a 90-day supply. Is this suspicious?",
-                        "output": "Yes, frequent use with a large supply suggests potential overuse or mismanagement, a fraud indicator."
-                    })
-                return pairs
-            training_data = create_training_pairs(raw_text)
-            with open("temp_fraud_data.json", "w") as f:
-                json.dump({"training_pairs": training_data}, f)
-            dataset = datasets.load_dataset("json", data_files="temp_fraud_data.json")
-        # Tokenization function
-        def tokenize_data(example):
-            formatted_text = f"<s>[INST] {example['input']} [/INST] {example['output']}</s>"
-            inputs = tokenizer(formatted_text, padding="max_length", truncation=True, max_length=4096, return_tensors="pt")
-            inputs["labels"] = inputs["input_ids"].clone()
-            return {k: v.squeeze(0) for k, v in inputs.items()}
-        tokenized_dataset = dataset["train"].map(tokenize_data, batched=True, remove_columns=dataset["train"].column_names)
-        # Training setup
-        training_args = TrainingArguments(
-            output_dir="./fine_tuned_llama_healthcare",
-            per_device_train_batch_size=4,
-            gradient_accumulation_steps=8,
-            eval_strategy="no",
-            save_strategy="epoch",
-            save_total_limit=2,
-            num_train_epochs=5,
-            learning_rate=2e-5,
-            weight_decay=0.01,
-            logging_dir="./logs",
-            logging_steps=10,
-            bf16=True,
-            gradient_checkpointing=True,
-            optim="adamw_torch",
-            warmup_steps=100,
-        )
-        def custom_data_collator(features):
-            return {
-                "input_ids": torch.stack([f["input_ids"] for f in features]),
-                "attention_mask": torch.stack([f["attention_mask"] for f in features]),
-                "labels": torch.stack([f["labels"] for f in features]),
-            }
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-            data_collator=custom_data_collator,
-        )
-        trainer.train()
-        model.save_pretrained("./fine_tuned_llama_healthcare")
-        tokenizer.save_pretrained("./fine_tuned_llama_healthcare")
-        return "Training completed! Model saved to ./fine_tuned_llama_healthcare"
     except Exception as e:
-        return f"Error: {str(e)}. Please check file format, dependencies, or the LLama token."
-# Gradio UI
-with gr.Blocks(title="Healthcare Fraud Detection Fine-Tuning") as demo:
-    gr.Markdown("# Fine-Tune LLaMA 2 for Healthcare Fraud Analysis")
-    gr.Markdown("Upload PDFs (e.g., care logs, medication records) or a JSON file with training pairs.")
-    file_input = gr.File(label="Upload Files (PDF/JSON)", file_count="multiple")
-    train_button = gr.Button("Start Fine-Tuning")
-    output = gr.Textbox(label="Training Status", lines=5)
-    train_button.click(fn=train_ui, inputs=file_input, outputs=output)
-# Launch the Gradio app
-demo.launch()

 import gradio as gr
+from transformers import LlamaTokenizer, LlamaForCausalLM
 import torch
+# Load the fine-tuned model and tokenizer
+try:
+    tokenizer = LlamaTokenizer.from_pretrained("./fine_tuned_llama2")
+    model = LlamaForCausalLM.from_pretrained("./fine_tuned_llama2")
+    model.eval()
+    print("Model and tokenizer loaded successfully.")
+except Exception as e:
+    print(f"Error loading model or tokenizer: {e}")
+# Function to predict fraud based on text input
+def predict(input_text):
+    if not input_text:
+        return "Please enter some text to analyze."
     try:
+        # Tokenize input
+        inputs = tokenizer(input_text, return_tensors="pt", max_length=512, padding="max_length", truncation=True)
+        # Generate output
+        with torch.no_grad():
+            outputs = model.generate(**inputs, max_new_tokens=50)
+        # Decode and return result
+        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return result
     except Exception as e:
+        return f"Error during prediction: {e}"
+# Create Gradio interface with text input
+interface = gr.Interface(
+    fn=predict,
+    inputs=gr.Textbox(
+        lines=2,
+        placeholder="Enter text to analyze (e.g., 'Facility backdates policies. Is this fraudulent?')",
+        label="Input Text"
+    ),
+    outputs=gr.Textbox(label="Prediction"),
+    title="Fine-Tune LLaMA 2 for Healthcare Fraud Analysis",
+    description="Test the fine-tuned LLaMA 2 model to detect healthcare fraud. Enter a description of a facility's behavior to analyze."
+)
+# Launch the interface
+interface.launch()