# Syllabus Formatter — Gradio app (Hugging Face Space).
# (Removed scraped Space status text that was not part of the program.)
import json
import logging
import tempfile
import time
from pathlib import Path

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Setup logging: configure the root logger once at import time and use a
# module-scoped logger so records are tagged with this module's name.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SyllabusFormatter:
    """Reformats poorly structured syllabus text with a local LLM.

    Wraps a Hugging Face text-generation pipeline (Phi-3 Mini by default)
    and walks a nested syllabus dict (branch -> semester -> subject ->
    "content" -> "Unit ..." entries), rewriting each unit's text for
    readability while preserving the original wording.
    """

    def __init__(self, model_name: str = "microsoft/Phi-3-mini-4k-instruct"):
        # Model handles are created lazily in setup_model(), so constructing
        # the formatter is cheap and never touches the network.
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.pipe = None
        self.processed_count = 0
        self.total_count = 0

    def setup_model(self) -> bool:
        """Download and initialise the tokenizer, model, and pipeline.

        Returns:
            bool: True on success, False on any failure (error is logged).
        """
        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True,
            )
            # Load model with 8-bit quantization for efficiency.
            # NOTE(review): ``load_in_8bit`` is deprecated in recent
            # transformers releases in favour of BitsAndBytesConfig —
            # confirm against the pinned environment before upgrading.
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True,
                load_in_8bit=True,
            )
            # Low temperature + nucleus sampling: mostly deterministic
            # reformatting with a little flexibility.
            self.pipe = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=2048,
                temperature=0.1,
                do_sample=True,
                top_p=0.9,
                repetition_penalty=1.1,
            )
            logger.info("Model setup complete!")
            return True
        except Exception as e:
            logger.error(f"Error setting up model: {str(e)}")
            return False

    def create_formatting_prompt(self, unit_content: str, unit_name: str, subject_name: str = "") -> str:
        """Build the Phi-3 chat-formatted prompt for one unit's content."""
        prompt = f"""<|system|>You are a professional academic syllabus formatter. Your ONLY job is to take badly formatted syllabus content and make it beautifully organized and readable.
RULES:
1. PRESERVE every single word, topic, and concept from the original
2. NEVER add explanations, examples, or new content
3. ONLY restructure and format the existing text
4. Use clear headings, bullet points, and logical grouping
5. Separate different topics with proper spacing
6. Make it scannable and easy to read
FORMAT STYLE:
- Use main topic headings with proper capitalization
- Group related subtopics under main topics
- Use bullet points (β’) for lists of concepts
- Use sub-bullets (β¦) for details under main bullets
- Separate major sections with line breaks
- Keep technical terms exactly as written<|end|>
<|user|>Subject: {subject_name}
Unit: {unit_name}
Original content (poorly formatted):
{unit_content}
Task: Reformat this content to be beautifully organized and readable. Do NOT add any new information - only restructure what's already there. Make it professional and easy to scan.<|end|>
<|assistant|>"""
        return prompt

    def format_unit_content(self, unit_content: str, unit_name: str, subject_name: str = "", progress=None) -> str:
        """Format a single unit's content with the model.

        Falls back to the unmodified input on any error or when the model
        output fails validation, so processing never loses content.
        """
        try:
            prompt = self.create_formatting_prompt(unit_content, unit_name, subject_name)
            response = self.pipe(prompt)

            # The pipeline echoes the prompt; keep only the assistant turn.
            generated_text = response[0]['generated_text']
            assistant_start = generated_text.find("<|assistant|>")
            if assistant_start != -1:
                formatted_content = generated_text[assistant_start + len("<|assistant|>"):].strip()
            else:
                formatted_content = generated_text.strip()

            formatted_content = self.clean_generated_content(formatted_content)
            if not self.validate_formatted_content(unit_content, formatted_content):
                return unit_content
            return formatted_content
        except Exception as e:
            logger.error(f"Error formatting content: {str(e)}")
            return unit_content

    def validate_formatted_content(self, original: str, formatted: str) -> bool:
        """Heuristic check that the rewrite did not drop most of the text.

        Rejects output shorter than 40% of the original — a crude but cheap
        guard against the model truncating or summarising the content.
        """
        if len(formatted) < len(original) * 0.4:
            return False
        return True

    def clean_generated_content(self, content: str) -> str:
        """Strip chat special tokens and collapse blank/padded lines."""
        for token in ["<|system|>", "<|user|>", "<|assistant|>"]:
            content = content.replace(token, "")
        content = "\n".join(line.strip() for line in content.split("\n") if line.strip())
        return content

    def process_syllabus_file(self, syllabus_data: dict, progress=gr.Progress()) -> dict:
        """Format every unit in the syllabus dict, updating progress.

        Args:
            syllabus_data: Parsed syllabus JSON; mutated in place and returned.
            progress: Gradio progress tracker (injected default).

        Raises:
            gr.Error: Wrapping any unexpected processing failure.
        """
        try:
            def count_units(data):
                # Count entries whose KEY starts with "Unit" and whose value
                # is a string — the same test the processing loop applies —
                # so the progress denominator matches what is processed.
                # (The previous version searched the *value* for the
                # substring "Unit", which miscounted and could yield 0,
                # causing a ZeroDivisionError below.)
                count = 0
                if isinstance(data, dict):
                    for key, value in data.items():
                        if isinstance(value, dict):
                            count += count_units(value)
                        elif isinstance(value, str) and str(key).startswith("Unit"):
                            count += 1
                return count

            total_units = count_units(syllabus_data.get("syllabus", {}))
            processed = 0
            logger.info(f"Total units to process: {total_units}")

            # Walk branch -> semester -> subject -> content -> unit.
            for branch_name, branch_data in syllabus_data.get("syllabus", {}).items():
                if not isinstance(branch_data, dict):
                    continue
                for sem_name, sem_data in branch_data.items():
                    if not isinstance(sem_data, dict):
                        continue
                    for subject_name, subject_data in sem_data.items():
                        if not isinstance(subject_data, dict) or "content" not in subject_data:
                            continue
                        content = subject_data["content"]
                        if not isinstance(content, dict):
                            continue
                        for unit_name, unit_content in content.items():
                            if not unit_name.startswith("Unit") or not isinstance(unit_content, str):
                                continue
                            processed += 1
                            # max(..., 1) guards against an empty syllabus.
                            progress(processed / max(total_units, 1),
                                     desc=f"Processing {unit_name} in {subject_name}...")
                            formatted_content = self.format_unit_content(
                                unit_content,
                                unit_name,
                                subject_name,
                            )
                            syllabus_data["syllabus"][branch_name][sem_name][subject_name]["content"][unit_name] = formatted_content

            # Record formatting metadata; time.gmtime() makes the "Z"
            # (UTC) suffix truthful — the old code stamped local time.
            if "metadata" not in syllabus_data:
                syllabus_data["metadata"] = {}
            syllabus_data["metadata"].update({
                "lastFormatted": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                "formattingModel": "Phi-3 Mini",
                "unitsProcessed": processed,
                "version": "1.0",
            })
            return syllabus_data
        except Exception as e:
            logger.error(f"Error processing syllabus: {str(e)}")
            raise gr.Error(f"Error processing syllabus: {str(e)}")
# Initialize the formatter lazily; created on first use.
formatter = None


def setup_formatter():
    """Create the global SyllabusFormatter and load its model on demand.

    Returns:
        bool: True once the model is ready, False if loading failed.

    On a failed model load the global is reset to None so a later call can
    retry — the old code left the half-built instance in place, making
    every subsequent call return True without a usable model.
    """
    global formatter
    if formatter is None:
        formatter = SyllabusFormatter()
        if not formatter.setup_model():
            formatter = None  # allow a clean retry on the next call
            return False
    return True
def process_file(file):
    """Process the uploaded syllabus file and return the output file path.

    Args:
        file: Value produced by gr.File — depending on the Gradio version
            this is a file-like object (has ``.read()``/``.name``) or a
            plain filepath string; both are supported.

    Returns:
        str: Path to a temporary JSON file with the formatted syllabus.

    Raises:
        gr.Error: For model-setup failures, invalid JSON, or any other
            processing error (surfaced in the UI by Gradio).
    """
    try:
        # Setup formatter if needed
        if not setup_formatter():
            raise gr.Error("Failed to setup the formatting model. Please try again.")

        # Read and parse JSON. Newer Gradio hands us a filepath string
        # rather than a file object — handle both shapes.
        if isinstance(file, (str, Path)):
            content = Path(file).read_text(encoding="utf-8")
        elif hasattr(file, "read"):
            content = file.read()
        else:
            content = Path(file.name).read_text(encoding="utf-8")
        syllabus_data = json.loads(content)

        # Process syllabus
        formatted_data = formatter.process_syllabus_file(syllabus_data)

        # Save to a temporary file; ensure_ascii=False keeps any non-ASCII
        # syllabus text human-readable in the download.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.json', delete=False, encoding="utf-8"
        ) as tmp:
            json.dump(formatted_data, tmp, indent=2, ensure_ascii=False)
            return tmp.name
    except json.JSONDecodeError:
        raise gr.Error("Invalid JSON file. Please check your syllabus file format.")
    except gr.Error:
        # Don't re-wrap messages we raised deliberately (the old broad
        # handler swallowed them into a generic "Error processing file").
        raise
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}")
# Custom theme: soft indigo/blue palette with explicit light/dark body
# backgrounds. "*primary_500"-style values reference Gradio theme
# palette variables, not literal colors.
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
).set(
    body_background_fill="#fafafa",
    body_background_fill_dark="#1a1a1a",
    button_primary_background_fill="*primary_500",
    button_primary_background_fill_hover="*primary_600"
)
# Gradio interface copy and styling, rendered verbatim in the UI.
# NOTE(review): the emoji in these strings appear mojibake-mangled
# (e.g. "π") — likely an encoding issue in a previous save; confirm
# against the deployed app before "fixing".
title = "π Syllabus Formatter"
description = """
Transform your syllabus into a beautifully formatted, easy-to-read document using AI.
### Features:
- Preserves all original content
- Improves readability and organization
- Creates logical grouping and sections
- Adds professional formatting
Simply upload your JSON syllabus file and get a formatted version back!
"""
# Styling for the status/feedback panel (light and dark variants).
css = """
.feedback {
    margin-top: 20px;
    padding: 10px;
    border-radius: 8px;
    background-color: #f0f9ff;
    border: 1px solid #bae6fd;
}
.dark .feedback {
    background-color: #082f49;
    border-color: #075985;
}
"""
# Build the Gradio Blocks UI: file upload -> format button -> download,
# with a markdown feedback panel updated through the event chain.
with gr.Blocks(theme=theme, css=css) as iface:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Syllabus JSON",
                file_types=[".json"],
                file_count="single"
            )
            process_btn = gr.Button("πͺ Format Syllabus", variant="primary")
            # NOTE(review): type="file" was removed in Gradio 4
            # (replaced by "filepath"/"binary") — confirm the pinned
            # Gradio version still accepts it.
            output_file = gr.File(
                label="Download Formatted Syllabus",
                file_count="single",
                type="file",
                interactive=False
            )
    with gr.Row():
        feedback = gr.Markdown(
            value="Upload a JSON syllabus file to begin...",
            elem_classes=["feedback"]
        )

    def update_feedback(file):
        """Show an immediate 'processing' message before the slow step."""
        return "Processing your syllabus... This may take a few minutes depending on the size."

    # Event chain: instant feedback (queue=False), then the actual
    # processing, then a success message only if processing succeeded.
    process_btn.click(
        fn=update_feedback,
        inputs=[file_input],
        outputs=[feedback],
        queue=False
    ).then(
        fn=process_file,
        inputs=[file_input],
        outputs=[output_file]
    ).success(
        fn=lambda: "β¨ Syllabus formatting complete! You can now download the formatted file.",
        outputs=[feedback]
    )
    gr.Markdown("""
    ### π Notes:
    - The formatter preserves all original content while improving organization
    - Processing time depends on the size of your syllabus
    - For large files, please be patient as the AI processes each section
    Made with β€οΈ using Microsoft's Phi-3 Mini model
    """)

# Launch the app when run as a script (Spaces imports this module and
# serves `iface` itself).
if __name__ == "__main__":
    iface.launch()