import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import huggingface_hub
import os
import torch
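# Note (assumption about the environment, not from the original file): device_map="auto"
# below relies on the `accelerate` package; a typical requirements.txt for this Space
# would list at least gradio, transformers, torch, and accelerate
# (plus bitsandbytes if quantization is used).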
# --- Configuration ---
MODEL_ID = "Fastweb/FastwebMIIA-7B"
HF_TOKEN = os.getenv("HF_TOKEN")  # For Hugging Face Spaces, set this as a Secret

# Globals holding the pipeline and the last load error
text_generator_pipeline = None
model_load_error = None
# --- Hugging Face Login and Model Loading ---
def load_model_and_pipeline():
    global text_generator_pipeline, model_load_error
    if text_generator_pipeline is not None:
        return True  # Already loaded
    if not HF_TOKEN:
        model_load_error = "Hugging Face token (HF_TOKEN) not found in Space secrets. Please add it."
        print(f"ERROR: {model_load_error}")
        return False
    try:
        print("Attempting to log in to the Hugging Face Hub with the provided token...")
        huggingface_hub.login(token=HF_TOKEN)
        print("Login successful.")
        print(f"Loading tokenizer for {MODEL_ID}...")
        # trust_remote_code is needed for models that ship custom architecture code
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        print("Tokenizer loaded.")
        print(f"Loading model {MODEL_ID}...")
        # For large models, specify dtype and device_map:
        # - device_map="auto" uses a GPU if available, otherwise falls back to CPU
        # - torch_dtype="auto" (or torch.bfloat16 on supported hardware) saves memory
        # On CPU Spaces (free tier) a 7B model will be VERY slow or may run out of memory.
        # Quantization (e.g., bitsandbytes) can help on constrained hardware, but adds
        # complexity; see the commented-out sketch below the from_pretrained call.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
            torch_dtype="auto",  # or torch.bfloat16 if on an A10G or similar GPU
            device_map="auto"    # "auto" handles single/multi-GPU placement or CPU fallback
        )
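        # Optional, untested alternative (an illustrative sketch, not tuned for FastwebMIIA):
        # loading the model in 4-bit can fit a 7B model into far less memory. This assumes
        # a CUDA GPU and the `bitsandbytes` package in requirements.txt.
        # from transformers import BitsAndBytesConfig
        # quant_config = BitsAndBytesConfig(
        #     load_in_4bit=True,
        #     bnb_4bit_compute_dtype=torch.bfloat16,
        # )
        # model = AutoModelForCausalLM.from_pretrained(
        #     MODEL_ID,
        #     trust_remote_code=True,
        #     quantization_config=quant_config,
        #     device_map="auto",
        # )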
print("Model loaded.") | |
# MIIA is an instruct/chat model, so text-generation is the appropriate task | |
text_generator_pipeline = pipeline( | |
"text-generation", | |
model=model, | |
tokenizer=tokenizer, | |
# device=0 if torch.cuda.is_available() else -1 # device_map handles this | |
) | |
print("Text generation pipeline created successfully.") | |
model_load_error = None | |
return True | |
except Exception as e: | |
model_load_error = f"Error loading model/pipeline: {str(e)}. Check model name, token, and Space resources (RAM/GPU)." | |
print(f"ERROR: {model_load_error}") | |
text_generator_pipeline = None # Ensure it's None on error | |
return False | |
# --- Text Analysis Function ---
def analyze_text(text_input, file_upload, custom_instruction, max_new_tokens, temperature, top_p):
    global text_generator_pipeline, model_load_error
    if text_generator_pipeline is None:
        if model_load_error:
            return f"Model not loaded. Error: {model_load_error}"
        else:
            return "Model is not loaded. Please ensure HF_TOKEN is set and the Space has enough resources."
    content_to_analyze = ""
    if file_upload is not None:
        try:
            # Depending on the Gradio version, gr.File passes either a file path (str)
            # or a tempfile-like object whose .name attribute holds the path.
            file_path = file_upload if isinstance(file_upload, str) else file_upload.name
            with open(file_path, 'r', encoding='utf-8') as f:
                content_to_analyze = f.read()
            if not content_to_analyze.strip() and not text_input.strip():
                # File is empty and there is no direct text input
                return "Uploaded file is empty and no direct text input provided. Please provide some text."
            elif not content_to_analyze.strip() and text_input.strip():
                # File is empty but the text box has content
                content_to_analyze = text_input
            # If the file has content, it is used; if the user also typed text, the file
            # content takes precedence. Concatenating both would be possible, but this is simpler.
        except Exception as e:
            return f"Error reading uploaded file: {str(e)}"
    elif text_input:
        content_to_analyze = text_input
    else:
        return "Please provide text directly or upload a document."
    if not content_to_analyze.strip():
        return "Input text is empty."
    # FastwebMIIA is an instruct model; it expects Alpaca-style prompts.
    # Structure:
    # Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    # ### Instruction:
    # {your instruction}
    # ### Input:
    # {your text}
    # ### Response:
    # {model generates this}
prompt = f"""Di seguito è riportata un'istruzione che descrive un task, abbinata a un input che fornisce un contesto più ampio. Scrivi una risposta che completi la richiesta in modo appropriato. | |
### Istruzione: | |
{custom_instruction} | |
### Input: | |
{content_to_analyze} | |
### Risposta:""" | |
# For English, you might change the preamble: | |
# prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. | |
# ### Instruction: | |
# {custom_instruction} | |
# ### Input: | |
# {content_to_analyze} | |
# ### Response:""" | |
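    # Optional, untested alternative (a sketch only): if the model's tokenizer ships a
    # chat template, the prompt could be built with apply_chat_template instead of the
    # hand-written Alpaca layout; the single-turn message structure here is an assumption.
    # messages = [{"role": "user", "content": f"{custom_instruction}\n\n{content_to_analyze}"}]
    # prompt = text_generator_pipeline.tokenizer.apply_chat_template(
    #     messages, tokenize=False, add_generation_prompt=True
    # )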
print(f"\n--- Sending to Model ---") | |
print(f"Prompt:\n{prompt}") | |
print(f"Max New Tokens: {max_new_tokens}, Temperature: {temperature}, Top P: {top_p}") | |
print("------------------------\n") | |
try: | |
# Note: text-generation pipelines often return the prompt + completion. | |
# We might need to strip the prompt from the output if desired. | |
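        # Untested note: passing return_full_text=False in the pipeline call below should
        # make it return only the newly generated text, which would remove the need to
        # split on the "### Risposta:" marker afterwards.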
        generated_outputs = text_generator_pipeline(
            prompt,
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=float(temperature) if float(temperature) > 0 else 0.7,  # sampling needs temperature > 0
            top_p=float(top_p),
            num_return_sequences=1
        )
        response = generated_outputs[0]['generated_text']
        # The output usually includes the prompt; return only the new text that the
        # model generated after "### Risposta:".
        answer_marker = "### Risposta:"
        if answer_marker in response:
            return response.split(answer_marker, 1)[1].strip()
        else:
            # Fallback if the marker isn't found (shouldn't happen with this prompt format);
            # alternatively, the original prompt string could be removed from the response.
            return response
    except Exception as e:
        return f"Error during text generation: {str(e)}"
# --- Gradio Interface ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"""
# 📝 Text Analysis with {MODEL_ID}
Test the capabilities of the `{MODEL_ID}` model for text analysis tasks on Italian or English texts.
Provide an instruction and your text (directly or via upload).

**Important:** Model loading can take a few minutes, especially on the first run or on CPU.
This app is best run on a Hugging Face Space with GPU resources for this model size.
""")
    with gr.Row():
        status_textbox = gr.Textbox(label="Model Status", value="Attempting to load model...", interactive=False)
    with gr.Tab("Text Input & Analysis"):
        with gr.Row():
            with gr.Column(scale=2):
                instruction_prompt = gr.Textbox(
                    label="Instruction for the Model (e.g., 'Riassumi questo testo', 'Identify main topics', 'Translate to English')",
                    value="Riassumi questo testo in 3 frasi concise.",
                    lines=3
                )
                text_area_input = gr.Textbox(label="Enter Text Directly", lines=10, placeholder="Paste your text here...")
                file_input = gr.File(label="Or Upload a Document (.txt)", file_types=['.txt'])
            with gr.Column(scale=3):
                output_text = gr.Textbox(label="Model Output", lines=20, interactive=False)
        with gr.Accordion("Advanced Generation Parameters", open=False):
            max_new_tokens_slider = gr.Slider(minimum=50, maximum=1024, value=256, step=10, label="Max New Tokens")
            temperature_slider = gr.Slider(minimum=0.1, maximum=1.5, value=0.7, step=0.1, label="Temperature (higher is more creative)")
            top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top P (nucleus sampling)")
        analyze_button = gr.Button("🧠 Analyze Text", variant="primary")
        analyze_button.click(
            fn=analyze_text,
            inputs=[text_area_input, file_input, instruction_prompt, max_new_tokens_slider, temperature_slider, top_p_slider],
            outputs=output_text
        )
    # Load the model when the app starts.
    # This will update the status_textbox after attempting to load.
    def startup_load_model():
        if load_model_and_pipeline():
            return "Model loaded successfully and ready."
        else:
            return f"Failed to load model. Error: {model_load_error or 'Unknown error during startup.'}"

    demo.load(startup_load_model, outputs=status_textbox)
if __name__ == "__main__":
    # For local testing, either set HF_TOKEN as an environment variable, e.g.
    #   HF_TOKEN="your_hf_token_here" python app.py
    # or log in beforehand with `huggingface-cli login`.
    if not HF_TOKEN:
        print("WARNING: HF_TOKEN environment variable not set.")
        print("For local execution, either set HF_TOKEN or ensure you are logged in via 'huggingface-cli login'.")
        # Fall back to a token cached by `huggingface-cli login`, if any
        try:
            HF_TOKEN = huggingface_hub.get_token()
            if HF_TOKEN:
                print("Using token from huggingface-cli login.")
            else:
                print("Could not retrieve token from CLI login. Model access might fail.")
        except Exception as e:
            print(f"Could not check CLI login status: {e}. Model access might fail.")
    demo.queue().launch(debug=True, share=False)  # set share=True for a public link when running locally