Spaces: Build error
# Install necessary libraries
import subprocess
import sys

# Function to install a package if it is not already installed
def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Ensure the necessary packages are installed
install("transformers")
install("torch")
install("pandas")
install("scikit-learn")
install("gradio")
import os
import pandas as pd
import gradio as gr
from transformers import AutoModel, AutoTokenizer
import torch
from sklearn.model_selection import train_test_split
# Function to convert a list to a DataFrame
def list_to_dataframe(data_list):
    # Convert the list to a DataFrame (assuming it's a list of dicts or tuples)
    df = pd.DataFrame(data_list)
    return df
# Load your dataset from a file
def load_dataset(file_path=None):
    if file_path is None:
        file_path = '/content/Valid-part-2.xlsx'  # Default path if the file is uploaded manually to Colab
    # Check if the file exists
    if file_path and not os.path.exists(file_path):
        print(f"File not found at '{file_path}', using default list data...")
        # Fall back to a default list if the file is not found
        default_data = [
            {'text': 'Example sentence 1', 'label': 'label1'},
            {'text': 'Example sentence 2', 'label': 'label2'},
            # Add more example data as needed
        ]
        return list_to_dataframe(default_data)
    try:
        df = pd.read_excel(file_path)  # reading .xlsx requires the openpyxl package
        print("Columns in the dataset:", df.columns.tolist())
        return df
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None
# Preprocess the data
def preprocess_data(df):
    # Add your preprocessing steps here
    # For example: cleaning, tokenization, etc.
    return df
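# A hypothetical example of what the empty preprocessing step above could do
# (an assumption, not part of the original post). It expects the 'text' and
# 'label' columns used by the fallback data in load_dataset().
def preprocess_data_example(df):
    df = df.dropna(subset=['text', 'label'])         # drop incomplete rows
    df['text'] = df['text'].astype(str).str.strip()  # normalize whitespace
    df = df.drop_duplicates(subset=['text'])         # remove repeated sentences
    return df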
# Train your model
def train_model(df):
    # Split the dataset into training and testing sets
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    # Load your pre-trained model and tokenizer from Hugging Face
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    # Add your training code here
    # This may involve tokenizing the data and feeding it into the model
    return model
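# One hypothetical way to fill in the missing training step (not from the
# original post): treat the GTE model as a frozen sentence encoder, mean-pool
# its token embeddings, and fit a lightweight scikit-learn classifier on top.
def embed_texts(texts, tokenizer, model):
    inputs = tokenizer(list(texts), padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # Mask out padding tokens before averaging so they don't dilute the vectors
    mask = inputs['attention_mask'].unsqueeze(-1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)
    return (summed / mask.sum(dim=1)).numpy()

# Example usage inside train_model (assumes 'text' and 'label' columns):
#   from sklearn.linear_model import LogisticRegression
#   X_train = embed_texts(train_df['text'], tokenizer, model)
#   clf = LogisticRegression(max_iter=1000).fit(X_train, train_df['label'])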
# Define the Gradio interface function
def predict(input_text):
    # Load the model and tokenizer (reloading on every call is slow; see the
    # cached loader below for a faster pattern)
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    # Tokenize input and run the model
    inputs = tokenizer(input_text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # The interface's output component is "text", so return a string summary
    # of the embedding rather than the raw hidden-state tensor
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
    return str(embedding[:8].tolist())
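# A hedged optimization sketch (not in the original post): cache the model and
# tokenizer so predict() does not reload them on every request.
from functools import lru_cache

@lru_cache(maxsize=1)
def get_model_and_tokenizer():
    # Loaded once on the first call, then reused for every later prediction
    tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base", trust_remote_code=True)
    return tokenizer, model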
# Build the Gradio interface
def build_interface(file_path=None):
    df = load_dataset(file_path)  # Load your dataset
    if df is None:
        return None
    df = preprocess_data(df)  # Preprocess the dataset
    model = train_model(df)   # Train your model
    iface = gr.Interface(
        fn=predict,
        # gr.Textbox replaces gr.inputs.Textbox, which was removed in current Gradio releases
        inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
        outputs="text"
    )
    return iface
# Run the Gradio interface
if __name__ == "__main__":
    # You can specify a file_path here if you have a specific file to use
    file_path = None  # Change this to your specific file path if needed
    iface = build_interface(file_path=file_path)
    if iface:
        iface.launch()
    else:
        print("Failed to build the Gradio interface. Please check the dataset and model.")