Small details
- app.py +35 -26
- utils/prompts.py +78 -73
app.py
CHANGED

@@ -107,15 +107,11 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
 
 
 def get_txt_from_output(output):
-    …
-    except Exception as e:
-        gr.Error("Error when parsing notebook, try again.")
-        logging.error(f"Failed to parse code: {e}")
-        raise
+    extracted_text = extract_content_from_output(output)
+    logging.info("--> Extracted text between json block")
+    logging.info(extracted_text)
+    content = json.loads(extracted_text)
+    return content
 
 
 def extract_content_from_output(output):
@@ -266,22 +262,35 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
     yield messages
     yield messages
 
-    logging.info("--->
-    …
+    logging.info("---> Notebook markdown code output")
+    logging.info(generated_text)
+
+    retries = 0
+    retry_limit = 3
+    while retries < retry_limit:
+        try:
+            formatted_prompt = generate_mapping_prompt(generated_text)
+            prompt_messages = [{"role": "user", "content": formatted_prompt}]
+            yield messages + [
+                gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")
+            ]
+
+            output = inference_client.chat_completion(
+                messages=prompt_messages, stream=False, max_tokens=2500
+            )
+            cells_txt = output.choices[0].message.content
+            logging.info(f"---> Mapping to json output attempt {retries}")
+            logging.info(cells_txt)
+            commands = get_txt_from_output(cells_txt)
+            break
+        except Exception as e:
+            logging.warn("Error when parsing output, retrying ..")
+            retries += 1
+            if retries == retry_limit:
+                logging.error(f"Unable to parse output after {retry_limit} retries")
+                gr.Error("Unable to generate notebook. Try again please")
+                raise e
 
-    commands = get_txt_from_output(cells_txt)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
 
     commands.insert(
@@ -319,7 +328,7 @@ with gr.Blocks(fill_width=True) as demo:
         label="Hub Dataset ID",
         placeholder="Search for dataset id on Huggingface",
         search_type="dataset",
-        value="
+        value="",
     )
 
     dataset_samples = gr.Examples(
@@ -357,7 +366,7 @@ with gr.Blocks(fill_width=True) as demo:
 
     with gr.Row():
         generate_eda_btn = gr.Button("Exploratory Data Analysis")
-        generate_embedding_btn = gr.Button("
+        generate_embedding_btn = gr.Button("Embeddings")
         generate_rag_btn = gr.Button("RAG")
         generate_training_btn = gr.Button(
             "Training - Coming soon", interactive=False
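Note on the `generate_cells` change above: the single mapping call is now wrapped in a bounded retry loop, because the model occasionally returns JSON that `json.loads` rejects. A minimal standalone sketch of the same pattern, with a hypothetical `ask_model` callable standing in for `inference_client.chat_completion` (and using `logging.warning`, since `logging.warn` is a deprecated alias):

```python
import json
import logging

MAX_RETRIES = 3  # mirrors retry_limit in the diff


def parse_with_retries(ask_model, prompt):
    """Query the model until its reply parses as JSON, up to MAX_RETRIES times."""
    for attempt in range(MAX_RETRIES):
        raw = ask_model(prompt)  # hypothetical callable returning a string
        try:
            return json.loads(raw)  # success: return the parsed cell list
        except json.JSONDecodeError:
            logging.warning("Parse failed on attempt %d, retrying ...", attempt)
    # all attempts exhausted: surface the failure to the caller
    raise ValueError(f"Unable to parse model output after {MAX_RETRIES} retries")
```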
utils/prompts.py
CHANGED

@@ -3,21 +3,22 @@ import outlines
 
 @outlines.prompt
 def generate_mapping_prompt(code):
-    """
-    …
-    ## Instruction
-    Before returning the result, evaluate if the json object is well formatted, if not, fix it.
-    The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
+    """Convert the provided Python code into a list of cells formatted for a Jupyter notebook.
+    Ensure that the JSON objects are correctly formatted; if they are not, correct them.
+    Do not include an extra comma at the end of the final list element.
 
+    The output should be a list of JSON objects with the following format:
     ```json
     [
         {
-            "cell_type": string //
-            "source":
+            "cell_type": "string", // Specify "markdown" or "code".
+            "source": ["string1", "string2"] // List of text or code strings.
         }
     ]
     ```
+
+    ## Code
+    {{ code }}
     """
 
 
@@ -37,26 +38,27 @@ def generate_user_prompt(columns_info, sample_data, first_code):
 
 @outlines.prompt
 def generate_eda_system_prompt():
-    """You are an expert data analyst tasked with
-    …
-    1. Install
-    2. Load dataset as
-    3. Understand the dataset
-    4. Check for missing values
-    5. Identify
-    6.
-    7. Generate descriptive statistics
-    8. Visualize the distribution of each column
-    9.
-    10.
-    11.
-    …
-    Ensure the notebook is well-organized
-    The output should be
-    …
+    """You are an expert data analyst tasked with creating an Exploratory Data Analysis (EDA) Jupyter notebook.
+    Use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualizations. Ensure these libraries are installed as part of the notebook.
+
+    The EDA notebook should include:
+
+    1. Install and import necessary libraries.
+    2. Load the dataset as a DataFrame using the provided code.
+    3. Understand the dataset structure.
+    4. Check for missing values.
+    5. Identify data types of each column.
+    6. Detect duplicated rows.
+    7. Generate descriptive statistics.
+    8. Visualize the distribution of each column.
+    9. Explore relationships between columns.
+    10. Perform correlation analysis.
+    11. Include any additional relevant visualizations or analyses.
+
+    Ensure the notebook is well-organized with clear explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:
 
     ## Columns and Data Types
 
@@ -64,30 +66,32 @@ def generate_eda_system_prompt():
 
     ## Loading Data code
 
-    …
+    Use the provided code to load the dataset; do not use any other method.
     """
 
 
 @outlines.prompt
 def generate_embedding_system_prompt():
-    """You are an expert data scientist tasked with
-    …
-    The
+    """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
+
+    The notebook should include:
+
+    1. Install necessary libraries with !pip install.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column to generate embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide dataset information in the following format:
 
     ## Columns and Data Types
 
@@ -95,36 +99,37 @@ def generate_embedding_system_prompt():
 
     ## Loading Data code
 
-    …
-    …
+    Use the provided code to load the dataset; do not use any other method.
     """
 
 
 @outlines.prompt
 def generate_rag_system_prompt():
-    """You are an expert machine learning engineer tasked with
-    The
-    …
-    The
+    """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
+    The dataset is provided as a pandas DataFrame.
+
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
+
+    The RAG notebook should include:
+
+    1. Install necessary libraries.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column for generating embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
+    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
+    13. Send the prompt to the pipeline and display the answer.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:
 
     ## Columns and Data Types
 
@@ -132,5 +137,5 @@ def generate_rag_system_prompt():
 
     ## Loading Data code
 
-    …
+    Use the provided code to load the dataset; do not use any other method.
    """
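The embedding and RAG system prompts describe the same retrieval core: embed one text column with sentence-transformers, index the vectors with FAISS, and search with an encoded query (steps 4 through 10 in both lists). A minimal sketch of that flow, assuming a toy DataFrame `df` with a `"text"` column and `all-MiniLM-L6-v2` as an arbitrary example model (the prompts do not fix one):

```python
# Assumes: pip install pandas sentence-transformers faiss-cpu
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

# Toy stand-in for the DataFrame the generated notebook would load from the Hub.
df = pd.DataFrame({"text": ["a cat sat", "dogs bark loudly", "the cat purrs"]})

texts = df["text"].drop_duplicates().tolist()  # select column, dedupe, listify
model = SentenceTransformer("all-MiniLM-L6-v2")  # example embedding model
embeddings = model.encode(texts)  # float32 array of shape (n_rows, dim)

index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 index over the vectors
index.add(embeddings)

query = model.encode(["a sleeping cat"])  # encode a query sample
distances, ids = index.search(query, 2)  # retrieve the 2 nearest documents
print([texts[i] for i in ids[0]])
```

The RAG prompt then goes one step further: the retrieved texts are folded into the 'context' of a system message and sent, together with the user query, to a transformers pipeline built around 'HuggingFaceH4/zephyr-7b-beta', which generates the answer.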