import gradio as gr
import spaces
import PyPDF2
import docx
import io
import os
from huggingface_hub import InferenceClient
from prompts import SYSTEM_PROMPT, PROMPTS
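# prompts.py is a separate module in this repo (not shown here). It is assumed to
# define SYSTEM_PROMPT (a str) and PROMPTS (a dict mapping the dropdown operation
# names below to instruction strings), roughly along these lines:
#
#   SYSTEM_PROMPT = "You are a study assistant that analyzes documents."
#   PROMPTS = {
#       "Summary": "Summarize the key points of the following document.",
#       "Outline": "Create a structured outline of the following document.",
#       "Analysis": "Analyze the main arguments and evidence in the following document.",
#       "Study Guide": "Create a study guide for the following document.",
#       "Table": "Organize the key information in the following document as a table.",
#       "Questions": "Write practice questions based on the following document.",
#   }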

def extract_text_from_file(file) -> str:
    """Extract text from uploaded files"""
    if file is None:
        return ""
    
    file_path = file.name
    text = ""
    
    try:
        if file_path.lower().endswith('.pdf'):
            with open(file_path, 'rb') as f:
                reader = PyPDF2.PdfReader(f)
                for page in reader.pages:
                    text += page.extract_text() + "\n"
        
        elif file_path.lower().endswith('.docx'):
            doc = docx.Document(file_path)
            for paragraph in doc.paragraphs:
                text += paragraph.text + "\n"
        
        elif file_path.lower().endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        
        else:
            return "Unsupported file type. Please upload a PDF, DOCX, or TXT file."
    except Exception as e:
        return f"Error reading file: {str(e)}"
    
    return text

@spaces.GPU
def process_document(document, operation_type, text_input):
    """Main processing function using Cerebras Llama through HuggingFace"""
    
    # Extract text from file or use text input
    if document is not None:
        text = extract_text_from_file(document)
    else:
        text = text_input
    
    if not text.strip():
        return "Please provide either a document or text input."
    
    # Get the appropriate prompt
    prompt = PROMPTS.get(operation_type, "")
    
    # Create the client with Cerebras provider
    try:
        client = InferenceClient(
            "meta-llama/Llama-3.3-70B-Instruct",
            provider="cerebras",
            token=os.getenv("HF_TOKEN"),
        )
        
        # Create conversation messages
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"{prompt}\n\nDocument content:\n{text}"}
        ]
        
        # Generate response using chat completion
        response = client.chat_completion(
            messages=messages,
            max_tokens=3000,
            temperature=0.1,
            stream=False
        )
        
        return response.choices[0].message.content
        
    except Exception as e:
        return f"Error: {str(e)}\n\nPlease ensure:\n1. HF_TOKEN is set in settings\n2. You have Pro access to use Cerebras inference\n3. The Cerebras/Llama integration is enabled in your account"

# Create the Gradio interface
with gr.Blocks(title="Study Assistant", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ“š Study Assistant - Document Analysis Tool")
    gr.Markdown("Upload a document or paste text, then select the type of analysis you want to perform.")
    gr.Markdown("*Powered by Meta Llama-3.3-70B via Cerebras on HuggingFace*")
    
    with gr.Row():
        with gr.Column():
            document = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt"],
                file_count="single"
            )
            text_input = gr.Textbox(
                label="Or paste text directly",
                lines=5,
                placeholder="Paste your text here if you don't want to upload a file..."
            )
            
        with gr.Column():
            operation_type = gr.Dropdown(
                choices=["Summary", "Outline", "Analysis", "Study Guide", "Table", "Questions"],
                label="Select Operation",
                value="Summary"
            )
            process_btn = gr.Button("πŸš€ Process Document", variant="primary", size="lg")
    
    output = gr.Textbox(
        label="Output",
        lines=20,
        show_copy_button=True
    )
    
    gr.Markdown("---")
    gr.Markdown("### Tips:")
    gr.Markdown("- Supported formats: PDF, DOCX, TXT")
    gr.Markdown("- Maximum file size: 200MB")
    gr.Markdown("- Text can be pasted directly if you don't have a file")
    gr.Markdown("- Uses HuggingFace Pro account with Cerebras access")
    
    process_btn.click(
        fn=process_document,
        inputs=[document, operation_type, text_input],
        outputs=output,
        show_progress="full"
    )

if __name__ == "__main__":
    demo.launch()
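
# Deployment notes (assumed setup):
# - requirements.txt for this Space is assumed to include: gradio, spaces, PyPDF2,
#   python-docx, huggingface_hub (package names inferred from the imports above).
# - HF_TOKEN must be set as a secret in the Space settings, as referenced by
#   os.getenv("HF_TOKEN") and the error message in process_document.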