Wedyan2023 committed on
Commit
ac47e7a
·
verified ·
1 Parent(s): d9f4633

Delete app100.py

Browse files
Files changed (1) hide show
  1. app100.py +0 -1247
app100.py DELETED
@@ -1,1247 +0,0 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import os
4
- import json
5
- import base64
6
- import random
7
- from streamlit_pdf_viewer import pdf_viewer
8
- from langchain.prompts import PromptTemplate
9
- from datetime import datetime
10
- from pathlib import Path
11
- from openai import OpenAI
12
- from dotenv import load_dotenv
13
- import warnings
14
-
15
- warnings.filterwarnings('ignore')
16
-
17
- os.getenv("OAUTH_CLIENT_ID")
18
-
19
-
20
- # Load environment variables and initialize the OpenAI client to use Hugging Face Inference API.
21
- load_dotenv()
22
- client = OpenAI(
23
- base_url="https://api-inference.huggingface.co/v1",
24
- api_key=os.environ.get('TOKEN2') # Hugging Face API token
25
- )
26
-
27
# Create the working directories the app expects. exist_ok=True keeps this
# idempotent across Streamlit reruns and avoids the check-then-create race
# of testing os.path.exists() before calling os.makedirs().
for dir_name in ('data', 'feedback'):
    os.makedirs(dir_name, exist_ok=True)
31
-
32
- # Custom CSS
33
- st.markdown("""
34
- <style>
35
- .stButton > button {
36
- width: 100%;
37
- margin-bottom: 10px;
38
- background-color: #4CAF50;
39
- color: white;
40
- border: none;
41
- padding: 10px;
42
- border-radius: 5px;
43
- }
44
- .task-button {
45
- background-color: #2196F3 !important;
46
- }
47
- .stSelectbox {
48
- margin-bottom: 20px;
49
- }
50
- .output-container {
51
- padding: 20px;
52
- border-radius: 5px;
53
- border: 1px solid #ddd;
54
- margin: 10px 0;
55
- }
56
- .status-container {
57
- padding: 10px;
58
- border-radius: 5px;
59
- margin: 10px 0;
60
- }
61
- .sidebar-info {
62
- padding: 10px;
63
- background-color: #f0f2f6;
64
- border-radius: 5px;
65
- margin: 10px 0;
66
- }
67
- .feedback-button {
68
- background-color: #ff9800 !important;
69
- }
70
- .feedback-container {
71
- padding: 15px;
72
- background-color: #f5f5f5;
73
- border-radius: 5px;
74
- margin: 15px 0;
75
- }
76
- </style>
77
- """, unsafe_allow_html=True)
78
-
79
- # Helper functions
80
def read_csv_with_encoding(file):
    """Read a CSV into a DataFrame, trying several text encodings in turn.

    Args:
        file: A path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame produced by the first encoding that decodes the input.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the input.
    """
    encodings = ('utf-8', 'latin1', 'iso-8859-1', 'cp1252')

    # A failed attempt leaves a file-like object partly consumed, so remember
    # where it started and rewind before trying the next encoding.
    can_rewind = hasattr(file, 'seek') and hasattr(file, 'tell')
    start_pos = file.tell() if can_rewind else None

    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file, encoding=encoding)
        except UnicodeDecodeError as exc:
            last_error = exc
            if can_rewind:
                file.seek(start_pos)

    # UnicodeDecodeError requires (encoding, object, start, end, reason);
    # constructing it with a single message raises TypeError instead.
    raise UnicodeDecodeError(
        'unknown', b'', 0, 1,
        'Failed to read file with any supported encoding',
    ) from last_error
88
-
89
- #def save_feedback(feedback_data):
90
- #feedback_file = 'feedback/user_feedback.csv'
91
- #feedback_df = pd.DataFrame([feedback_data])
92
-
93
- #if os.path.exists(feedback_file):
94
- #feedback_df.to_csv(feedback_file, mode='a', header=False, index=False)
95
- #else:
96
- #feedback_df.to_csv(feedback_file, index=False)
97
-
98
def reset_conversation():
    """Clear the chat history and forget the chosen task.

    Used as the ``on_click`` callback of the "New Conversation" button.
    Empties ``messages`` and ``conversation`` in the session state and
    removes ``task_choice`` when it is set. Always returns ``None``.
    """
    state = st.session_state
    state.messages = []
    state.conversation = []
    if 'task_choice' in state:
        del state.task_choice
    return None
104
- #new 24 March
105
- #user_input = st.text_input("Enter your prompt:")
106
- ###########33
107
-
108
# Seed the session-state keys the app relies on; keys that already exist
# (i.e. on every rerun after the first) are left untouched.
for state_key, make_default in (
    ("messages", list),
    ("examples_to_classify", list),
    ("system_role", str),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = make_default()
115
-
116
-
117
-
118
- # Main app title
119
- st.title("🤖🦙 Text Data Labeling and Generation App")
120
- # def embed_pdf_sidebar(pdf_path):
121
- # with open(pdf_path, "rb") as f:
122
- # base64_pdf = base64.b64encode(f.read()).decode('utf-8')
123
- # pdf_display = f"""
124
- # <iframe src="data:application/pdf;base64,{base64_pdf}"
125
- # width="100%" height="400" type="application/pdf"></iframe>
126
- # """
127
- # st.markdown(pdf_display, unsafe_allow_html=True)
128
- #
129
-
130
-
131
- # Sidebar settings
132
- with st.sidebar:
133
- st.title("⚙️ Settings")
134
- # Add PDF upload section
135
- #
136
- # if st.button("📘 Show Instructions"):
137
- # # This should be a path to a local file
138
- # pdf_path = os.path.join("Streamlit.pdf")
139
- # pdf_viewer(
140
- # pdf_path,
141
- # width="100%",
142
- # height=300,
143
- # render_text=True
144
- # )
145
- # with st.sidebar:
146
- # with st.expander("📘 View Instructions"):
147
- # pdf_viewer("Streamlit.pdf", width="100%", height=300, render_text=True)
148
-
149
- #
150
- ###4
151
- # with st.sidebar:
152
- # st.markdown("### 📘 Instructions")
153
- # st.markdown("[📄 Open Instructions PDF](/file/instructions.pdf)")
154
-
155
-
156
-
157
-
158
- #
159
- ####2
160
- # #with st.sidebar:
161
- # st.markdown("### 📘 Instructions")
162
-
163
- # # PDF served from Space's file system
164
- # pdf_url = "/file/instructions.pdf"
165
-
166
- # st.markdown(f"""
167
- # <a href="{pdf_url}" target="_blank">
168
- # <button style='padding:10px;width:100%;font-size:16px;'>📄 Open Instructions PDF</button>
169
- # </a>
170
- # """, unsafe_allow_html=True)
171
- # ###3 working code
172
- # with st.sidebar:
173
- # with open("instructions.pdf", "rb") as f:
174
- # st.sidebar.download_button(
175
- # label="📄 Download Instructions PDF",
176
- # data=f,
177
- # file_name="instructions.pdf",
178
- # mime="application/pdf"
179
- # )
180
-
181
- ###6
182
- #this last code works
183
- with st.sidebar:
184
- st.markdown("### 📘Data Generation and Labeling Instructions")
185
- #st.markdown("<h4 style='color: #4A90E2;'>📘 Instructions</h4>", unsafe_allow_html=True)
186
- with open("User instructions.pdf", "rb") as f:
187
- st.download_button(
188
- label="📄 Download Instructions PDF",
189
- data=f,
190
- #file_name="instructions.pdf",
191
- file_name="User instructions.pdf",
192
- mime="application/pdf"
193
- )
194
-
195
-
196
- #works with blu color text
197
- # with st.sidebar:
198
- # # Stylish "Instructions" label
199
- # st.markdown("<h4 style='color: #4A90E2;'>📘 Instructions</h4>", unsafe_allow_html=True)
200
-
201
- # # PDF download button
202
- # with open("instructions.pdf", "rb") as f:
203
- # st.download_button(
204
- # label="📄 Download Instructions PDF",
205
- # data=f,
206
- # file_name="instructions.pdf",
207
- # mime="application/pdf"
208
- # )
209
-
210
- ###5
211
-
212
- #with st.sidebar:
213
- # st.markdown("### 📘 Instructions")
214
-
215
- # # PDF served from Space's file system
216
- # pdf_url = "/file/instructions.pdf"
217
-
218
- # st.markdown(f"""
219
- # <a href="{pdf_url}" target="_blank">
220
- # <button style='padding:15px;width:100%;font-size:16px;'> 📄 Open Instructions PDF</button>
221
- # </a>
222
- # """, unsafe_allow_html=True)
223
-
224
-
225
-
226
- selected_model = st.selectbox(
227
- "Select Model",
228
- ["meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-3.2-3B-Instruct","meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct",
229
- "meta-llama/Llama-3.1-70B-Instruct"],
230
- key='model_select'
231
- )
232
-
233
- temperature = st.slider(
234
- "Temperature",
235
- 0.0, 1.0, 0.7,
236
- help="Controls randomness in generation"
237
- )
238
-
239
- st.button("🔄 New Conversation", on_click=reset_conversation)
240
- # st.markdown("### 📘 Instructions")
241
- # embed_pdf_sidebar("Streamlit.pdf")
242
- #Add PDF Instructions
243
- # with st.expander("📚 Instructions"):
244
- # st.write("View or download instruction guides:")
245
-
246
- # # Option 1: Using st.download_button for PDFs stored in your app
247
- # with open("file:///C:/Users/hp/Downloads/Streamlit.pdf", "rb") as file:
248
- # first_pdf = file.read()
249
- # st.download_button(
250
- # label="Download Guide 1",
251
- # data=first_pdf,
252
- # file_name="user_guide.pdf",
253
- # mime="application/pdf"
254
- # )
255
-
256
- # #with open("https://huggingface.co/spaces/Wedyan2023/COPY/blob/main/Streamlit.pdf", "rb") as file:
257
- # with open("file:///C:/Users/hp/Downloads/Streamlit.pdf", "rb") as file:
258
- # second_pdf = file.read()
259
- # st.download_button(
260
- # label="Download Guide 2",
261
- # data=second_pdf,
262
- # file_name="technical_guide.pdf",
263
- # mime="application/pdf"
264
- # )
265
-
266
-
267
-
268
- with st.container():
269
- st.markdown(f"""
270
- <div class="sidebar-info">
271
- <h4>Current Model: {selected_model}</h4>
272
- <p><em>Note: Generated content may be inaccurate or false. Check important info.</em></p>
273
- </div>
274
- """, unsafe_allow_html=True)
275
-
276
- # with st.sidebar:
277
- # st.markdown("### 📘 Instructions")
278
- # if pdf_file := st.file_uploader("Upload Instruction PDF", type="pdf"):
279
- # embed_pdf(pdf_file)
280
-
281
-
282
- feedback_url = "https://docs.google.com/forms/d/e/1FAIpQLSdZ_5mwW-pjqXHgxR0xriyVeRhqdQKgb5c-foXlYAV55Rilsg/viewform?usp=header"
283
- st.sidebar.markdown(
284
- f'<a href="{feedback_url}" target="_blank"><button style="width: 100%;">Feedback Form</button></a>',
285
- unsafe_allow_html=True
286
- )
287
-
288
- # Display conversation
289
- for message in st.session_state.messages:
290
- with st.chat_message(message["role"]):
291
- st.markdown(message["content"])
292
-
293
- # Main content
294
- if 'task_choice' not in st.session_state:
295
- col1, col2 = st.columns(2)
296
- with col1:
297
- if st.button("📝 Data Generation", key="gen_button", help="Generate new data"):
298
- st.session_state.task_choice = "Data Generation"
299
- with col2:
300
- if st.button("🏷️ Data Labeling", key="label_button", help="Label existing data"):
301
- st.session_state.task_choice = "Data Labeling"
302
-
303
- if "task_choice" in st.session_state:
304
- if st.session_state.task_choice == "Data Generation":
305
- st.header("📝 Data Generation")
306
-
307
- # 1. Domain selection
308
- domain_selection = st.selectbox("Domain", [
309
- "Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"
310
- ])
311
-
312
- # 2. Handle custom domain input
313
- custom_domain_valid = True # Assume valid until proven otherwise
314
-
315
- if domain_selection == "Custom":
316
- domain = st.text_input("Specify custom domain")
317
- if not domain.strip():
318
- st.error("Please specify a domain name.")
319
- custom_domain_valid = False
320
- else:
321
- domain = domain_selection
322
-
323
-
324
-
325
-
326
- # Classification type selection
327
- classification_type = st.selectbox(
328
- "Classification Type",
329
- ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"]
330
- )
331
-
332
-
333
-
334
-
335
-
336
- #system role before
337
-
338
- ####
339
- # Labels setup based on classification type
340
- #labels = []
341
- labels = []
342
- labels_valid = False
343
- errors = []
344
-
345
def validate_binary_labels(labels):
    """Validate the two class names of a binary classification task.

    Args:
        labels: Sequence whose first two items are the class-name strings.

    Returns:
        A list of error messages; empty when both names are non-blank and
        differ from each other (compared case-insensitively, ignoring
        surrounding whitespace).
    """
    cleaned = [name.strip().lower() for name in labels]
    first_blank = not cleaned[0]
    second_blank = not cleaned[1]

    problems = []
    if first_blank:
        problems.append("First class name is required.")
    if second_blank:
        problems.append("Second class name is required.")
    # Only report a clash when every name is actually filled in.
    if all(cleaned) and cleaned[0] == cleaned[1]:
        problems.append("Class names must be different.")
    return problems
356
-
357
- if classification_type == "Sentiment Analysis":
358
- st.write("### Sentiment Analysis Labels (Fixed)")
359
- col1, col2, col3 = st.columns(3)
360
- with col1:
361
- st.text_input("First class", "Positive", disabled=True)
362
- with col2:
363
- st.text_input("Second class", "Negative", disabled=True)
364
- with col3:
365
- st.text_input("Third class", "Neutral", disabled=True)
366
- labels = ["Positive", "Negative", "Neutral"]
367
-
368
- elif classification_type == "Binary Classification":
369
- st.write("### Binary Classification Labels")
370
- col1, col2 = st.columns(2)
371
- with col1:
372
- label_1 = st.text_input("First class", "Positive")
373
- with col2:
374
- label_2 = st.text_input("Second class", "Negative")
375
-
376
- labels = [label_1, label_2]
377
- errors = validate_binary_labels(labels)
378
-
379
- if errors:
380
- st.error("\n".join(errors))
381
- else:
382
- st.success("Binary class names are valid and unique!")
383
-
384
-
385
- # if classification_type == "Sentiment Analysis":
386
- # st.write("### Sentiment Analysis Labels (Fixed)")
387
- # col1, col2, col3 = st.columns(3)
388
- # with col1:
389
- # label_1 = st.text_input("First class", "Positive", disabled=True)
390
- # with col2:
391
- # label_2 = st.text_input("Second class", "Negative", disabled=True)
392
- # with col3:
393
- # label_3 = st.text_input("Third class", "Neutral", disabled=True)
394
- # labels = ["Positive", "Negative", "Neutral"]
395
-
396
-
397
- # elif classification_type == "Binary Classification":
398
- # st.write("### Binary Classification Labels")
399
- # col1, col2 = st.columns(2)
400
-
401
- # with col1:
402
- # label_1 = st.text_input("First class", "Positive")
403
- # with col2:
404
- # label_2 = st.text_input("Second class", "Negative")
405
-
406
- # errors = []
407
- # labels = [label_1.strip(), label_2.strip()]
408
-
409
- # # Check for empty class names
410
- # if not labels[0]:
411
- # errors.append("First class name is required.")
412
- # if not labels[1]:
413
- # errors.append("Second class name is required.")
414
-
415
- # # Check for duplicates
416
- # if labels[0].lower() == labels[1].lower():
417
- # errors.append("Class names must be different.")
418
-
419
- # # Show errors or success
420
- # if errors:
421
- # for error in errors:
422
- # st.error(error)
423
- # else:
424
- # st.success("Binary class names are valid and unique!")
425
-
426
- #########
427
-
428
- elif classification_type == "Multi-Class Classification":
429
- st.write("### Multi-Class Classification Labels")
430
-
431
- default_labels_by_domain = {
432
- "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
433
- "AG News": ["World", "Sports", "Business", "Sci/Tech"],
434
- "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
435
- "Food & Dining", "Local Experience", "Adventure Activities",
436
- "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
437
- "Luxury Tourism"],
438
- "Restaurant reviews": ["Italian", "French", "American"],
439
- "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
440
- "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
441
- "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
442
- "Books & Stationery","Toys & Games", "Sports & Fitness",
443
- "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
444
- }
445
-
446
- num_classes = st.slider("Number of classes", 3, 15, 3)
447
-
448
- # Get defaults for selected domain, or empty list
449
- defaults = default_labels_by_domain.get(domain, [])
450
-
451
- labels = []
452
- errors = []
453
- cols = st.columns(3)
454
-
455
- for i in range(num_classes):
456
- with cols[i % 3]:
457
- default_value = defaults[i] if i < len(defaults) else ""
458
- label_input = st.text_input(f"Class {i+1}", default_value)
459
- normalized_label = label_input.strip().title()
460
-
461
- if not normalized_label:
462
- errors.append(f"Class {i+1} name is required.")
463
- else:
464
- labels.append(normalized_label)
465
-
466
- # Check for duplicates (case-insensitive)
467
- if len(labels) != len(set(labels)):
468
- errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
469
-
470
- # Show validation results
471
- if errors:
472
- for error in errors:
473
- st.error(error)
474
- else:
475
- st.success("All Labels names are valid and unique!")
476
- labels_valid = not errors # Will be True only if there are no label errors
477
-
478
-
479
-
480
-
481
- ##############
482
-
483
# Generation parameters: word-count bounds for each generated example.
col1, col2 = st.columns(2)
with col1:
    min_words = st.number_input("Min words", 1, 100, 20)
with col2:
    # The default must never fall below min_value; a fixed default of 50
    # makes Streamlit raise as soon as the user picks min_words > 50.
    max_words = st.number_input("Max words", min_words, 100, max(min_words, 50))
489
-
490
- # Few-shot examples
491
- use_few_shot = st.toggle("Use few-shot examples")
492
- few_shot_examples = []
493
- if use_few_shot:
494
- num_examples = st.slider("Number of few-shot examples", 1, 10, 1)
495
- for i in range(num_examples):
496
- with st.expander(f"Example {i+1}"):
497
- content = st.text_area(f"Content", key=f"few_shot_content_{i}")
498
- label = st.selectbox(f"Label", labels, key=f"few_shot_label_{i}")
499
- if content and label:
500
- few_shot_examples.append({"content": content, "label": label})
501
-
502
- num_to_generate = st.number_input("Number of examples", 1, 200, 10)
503
- #system role after
504
- # System role customization
505
- default_system_role = f"You are a professional {classification_type} expert, your role is to generate text examples for {domain} domain. Always generate unique diverse examples and do not repeat the generated data. The generated text should be between {min_words} to {max_words} words long."
506
- system_role = st.text_area("Modify System Role (optional)",
507
- value=default_system_role,
508
- key="system_role_input")
509
- st.session_state['system_role'] = system_role if system_role else default_system_role
510
- # Labels initialization
511
- #labels = []
512
-
513
-
514
- user_prompt = st.text_area("User Prompt (optional)")
515
-
516
- # Updated prompt template including system role
517
- prompt_template = PromptTemplate(
518
- input_variables=["system_role", "classification_type", "domain", "num_examples",
519
- "min_words", "max_words", "labels", "user_prompt", "few_shot_examples"],
520
- template=(
521
- "{system_role}\n"
522
- "- Use the following parameters:\n"
523
- "- Generate {num_examples} examples\n"
524
- "- Each example should be between {min_words} to {max_words} words long\n"
525
- #"- Word range: {min_words} - {max_words} words\n "
526
- "- Use these labels: {labels}.\n"
527
- "- Generate the examples in this format: 'Example text. Label: label'\n"
528
- "- Do not include word counts or any additional information\n"
529
- "- Always use your creativity and intelligence to generate unique and diverse text data\n"
530
- "- Write unique examples every time.\n"
531
- "- DO NOT REPEAT your gnerated text. \n"
532
- "- For each Output, describe it once and move to the next.\n"
533
- "- List each Output only once, and avoid repeating details.\n"
534
- "- Additional instructions: {user_prompt}\n\n"
535
- "- Use the following examples as a reference in the generation process\n\n {few_shot_examples}. \n"
536
- "- Think step by step, generate numbered examples, and check each newly generated example to ensure it has not been generated before. If it has, modify it"
537
- #"- Think step by step, generate numbered examples and check every new generated example if it is generated before and change it."
538
-
539
- )
540
- )
541
-
542
- # Generate system prompt
543
- system_prompt = prompt_template.format(
544
- system_role=st.session_state['system_role'],
545
- classification_type=classification_type,
546
- domain=domain,
547
- num_examples=num_to_generate,
548
- min_words=min_words,
549
- max_words=max_words,
550
- labels=", ".join(labels),
551
- user_prompt=user_prompt,
552
- few_shot_examples="\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples]) if few_shot_examples else ""
553
- )
554
-
555
- # Store system prompt in session state
556
- st.session_state['system_prompt'] = system_prompt
557
-
558
- # Display system prompt
559
- st.write("System Prompt:")
560
- st.text_area("Current System Prompt", value=st.session_state['system_prompt'],
561
- height=400, disabled=True)
562
-
563
-
564
# Handle a click on "Generate Examples": validate the inputs, stream the
# model response, parse it into labelled examples and offer downloads.
if st.button("🎯 Generate Examples"):
    # Validate first and only call the model when the inputs are usable;
    # a warning alone must not let an invalid request go through.
    validation_error = None
    if domain_selection == "Custom" and not domain.strip():
        validation_error = "Custom domain name is required."
    elif len(labels) != len(set(labels)):
        validation_error = "Class names must be unique."
    elif any(not lbl.strip() for lbl in labels):
        validation_error = "All class labels must be filled in."

    if validation_error:
        st.warning(validation_error)
    else:
        with st.spinner("Generating examples..."):
            try:
                stream = client.chat.completions.create(
                    model=selected_model,
                    messages=[{"role": "system", "content": st.session_state['system_prompt']}],
                    temperature=temperature,
                    stream=True,
                    max_tokens=80000,
                    top_p=0.9,
                )
                st.session_state.messages.append({"role": "user", "content": system_prompt})
                response = st.write_stream(stream)
                st.session_state.messages.append({"role": "assistant", "content": response})

                # Always keep the latest response; storing it only when the
                # key is missing would pin the first response forever.
                st.session_state.response = response

                # Make sure the keys read below exist in the session state.
                if 'system_prompt' not in st.session_state:
                    st.session_state.system_prompt = system_prompt
                if 'generated_examples' not in st.session_state:
                    st.session_state.generated_examples = []
                if 'generated_examples_csv' not in st.session_state:
                    st.session_state.generated_examples_csv = None
                if 'generated_examples_json' not in st.session_state:
                    st.session_state.generated_examples_json = None

                # Parse lines of the form "<text> Label: <label>"; splitting
                # from the right keeps any earlier "Label:" inside the text.
                examples_list = []
                for line in response.split('\n'):
                    if not line.strip():
                        continue
                    parts = line.rsplit('Label:', 1)
                    if len(parts) != 2:
                        continue
                    text = parts[0].strip()
                    label = parts[1].strip()
                    if text and label:
                        examples_list.append({
                            'text': text,
                            'label': label,
                            'system_prompt': st.session_state.system_prompt,
                            'system_role': st.session_state.system_role,
                            'task_type': 'Data Generation',
                            'Use few-shot example?': 'Yes' if use_few_shot else 'No',
                        })

                if examples_list:
                    # Update session state with the new data.
                    st.session_state.generated_examples = examples_list

                    # Serialize once so both download buttons reuse the bytes.
                    df = pd.DataFrame(examples_list)
                    st.session_state.generated_examples_csv = df.to_csv(index=False).encode('utf-8')
                    st.session_state.generated_examples_json = json.dumps(examples_list, indent=2).encode('utf-8')

                    st.download_button(
                        "📥 Download Generated Examples (CSV)",
                        st.session_state.generated_examples_csv,
                        "generated_examples.csv",
                        "text/csv",
                        key='download-csv-persistent'
                    )

                    # Visual "or" separator between the two download buttons.
                    st.markdown("""
<div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
""", unsafe_allow_html=True)

                    st.download_button(
                        "📥 Download Generated Examples (JSON)",
                        st.session_state.generated_examples_json,
                        "generated_examples.json",
                        "application/json",
                        key='download-json-persistent'
                    )
                else:
                    # Surface the failure instead of silently showing nothing.
                    st.warning("No examples in the expected 'Example text. Label: label' format were found in the response.")

                # NOTE: no follow-up "Continue" button is rendered here. A
                # button nested inside another button's click branch can never
                # fire, because the click reruns the script with the outer
                # button unset; it also relied on an undefined `follow_up`
                # value and the removed st.experimental_rerun().

            except Exception as e:
                # Top-level UI boundary: report the failure to the user.
                st.error("An error occurred during generation.")
                st.error(f"Details: {e}")
681
-
682
-
683
- # Labeling Process
684
- elif st.session_state.task_choice == "Data Labeling":
685
- st.header("🏷️ Data Labeling")
686
- #new new new
687
- # 1. Domain selection
688
- # 1. Domain selection
689
-
690
-
691
- domain_selection = st.selectbox("Domain", ["Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"])
692
- # 2. Handle custom domain input
693
- custom_domain_valid = True # Assume valid until proven otherwise
694
-
695
- if domain_selection == "Custom":
696
- domain = st.text_input("Specify custom domain")
697
- if not domain.strip():
698
- st.error("Please specify a domain name.")
699
- custom_domain_valid = False
700
- else:
701
- domain = domain_selection
702
-
703
-
704
- # # Classification type selection
705
- # classification_type = st.selectbox(
706
- # "Classification Type",
707
- # ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"]
708
- # )
709
- #NNew edit
710
- # classification_type = st.selectbox(
711
- # "Classification Type",
712
- # #["Sentiment Analysis", "Binary Classification", "Multi-Class Classification", "Named Entity Recognition (NER)"],
713
- # ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"],
714
- # key="label_class_type"
715
- # )
716
-
717
- # Classification type selection
718
- classification_type = st.selectbox(
719
- "Classification Type",
720
- ["Sentiment Analysis", "Binary Classification", "Multi-Class Classification", "Named Entity Recognition (NER)"]
721
- )
722
- #NNew edit
723
- # Labels setup based on classification type
724
- labels = []
725
- labels_valid = False
726
- errors = []
727
-
728
- if classification_type == "Sentiment Analysis":
729
- st.write("### Sentiment Analysis Labels (Fixed)")
730
- col1, col2, col3 = st.columns(3)
731
- with col1:
732
- label_1 = st.text_input("First class", "Positive", disabled=True)
733
- with col2:
734
- label_2 = st.text_input("Second class", "Negative", disabled=True)
735
- with col3:
736
- label_3 = st.text_input("Third class", "Neutral", disabled=True)
737
- labels = ["Positive", "Negative", "Neutral"]
738
-
739
-
740
- elif classification_type == "Binary Classification":
741
- st.write("### Binary Classification Labels")
742
- col1, col2 = st.columns(2)
743
-
744
- with col1:
745
- label_1 = st.text_input("First class", "Positive")
746
- with col2:
747
- label_2 = st.text_input("Second class", "Negative")
748
-
749
- errors = []
750
- labels = [label_1.strip(), label_2.strip()]
751
-
752
-
753
- # Strip and lower-case labels for validation
754
- label_1 = labels[0].strip()
755
- label_2 = labels[1].strip()
756
-
757
- # Check for empty class names
758
- if not label_1:
759
- errors.append("First class name is required.")
760
- if not label_2:
761
- errors.append("Second class name is required.")
762
-
763
- # Check for duplicates (case insensitive)
764
- if label_1.lower() == label_2.lower() and label_1 and label_2:
765
- errors.append("Class names must be different.")
766
-
767
- # Show errors or success
768
- if errors:
769
- for error in errors:
770
- st.error(error)
771
- else:
772
- st.success("Binary class names are valid and unique!")
773
-
774
-
775
- elif classification_type == "Multi-Class Classification":
776
- st.write("### Multi-Class Classification Labels")
777
-
778
- default_labels_by_domain = {
779
- "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
780
- "AG News": ["World", "Sports", "Business", "Sci/Tech"],
781
- "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
782
- "Food & Dining", "Local Experience", "Adventure Activities",
783
- "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
784
- "Luxury Tourism"],
785
- "Restaurant reviews": ["Italian", "French", "American"],
786
- "E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
787
- "Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
788
- "Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
789
- "Books & Stationery","Toys & Games", "Sports & Fitness",
790
- "Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
791
- }
792
-
793
-
794
-
795
- # Ask user how many classes they want to define
796
- num_classes = st.slider("Select the number of classes (labels)", min_value=3, max_value=10, value=3)
797
-
798
- # Use default labels based on selected domain, if available
799
- defaults = default_labels_by_domain.get(domain, [])
800
-
801
- labels = []
802
- errors = []
803
- cols = st.columns(3) # For nicely arranged label inputs
804
-
805
- for i in range(num_classes):
806
- with cols[i % 3]: # Distribute inputs across columns
807
- default_value = defaults[i] if i < len(defaults) else ""
808
- label_input = st.text_input(f"Label {i + 1}", default_value)
809
- normalized_label = label_input.strip().title()
810
-
811
- if not normalized_label:
812
- errors.append(f"Label {i + 1} is required.")
813
- else:
814
- labels.append(normalized_label)
815
-
816
- # Check for duplicates (case-insensitive)
817
- normalized_set = {label.lower() for label in labels}
818
- if len(labels) != len(normalized_set):
819
- errors.append("Label names must be unique (case-insensitive).")
820
-
821
- # Show validation results
822
- if errors:
823
- for error in errors:
824
- st.error(error)
825
- else:
826
- st.success("All label names are valid and unique!")
827
-
828
- labels_valid = not errors # True if no validation errors
829
-
830
- elif classification_type == "Named Entity Recognition (NER)":
831
# Entity types offered for NER, written as "<TAG> - <description>".
ner_entities = [
    "PERSON - Names of people, fictional characters, historical figures",
    "ORG - Companies, institutions, agencies, teams",
    "LOC - Physical locations (mountains, oceans, etc.)",
    "GPE - Countries, cities, states, political regions",
    "DATE - Calendar dates, years, centuries",
    "TIME - Times, durations",
    "MONEY - Monetary values with currency"
]

# Every entity type is pre-selected, so the default is a copy of the options.
selected_entities = st.multiselect(
    "Select entities to recognize",
    ner_entities,
    default=list(ner_entities),
    key="ner_entity_selection"
)

# Keep only the tag that precedes the " - " separator.
labels = [choice.partition(" - ")[0] for choice in selected_entities]

if len(labels) == 0:
    st.warning("Please select at least one entity type")
    labels = ["PERSON"]  # Fallback when nothing is selected
860
-
861
-
862
-
863
-
864
-
865
- #NNew edit
866
- # elif classification_type == "Multi-Class Classification":
867
- # st.write("### Multi-Class Classification Labels")
868
-
869
- # default_labels_by_domain = {
870
- # "News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
871
- # "AG News": ["World", "Sports", "Business", "Sci/Tech"],
872
- # "Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
873
- # "Food & Dining", "Local Experience", "Adventure Activities",
874
- # "Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
875
- # "Luxury Tourism"],
876
- # "Restaurant reviews": ["Italian", "French", "American"]
877
- # }
878
- # num_classes = st.slider("Number of classes", 3, 10, 3)
879
-
880
- # # Get defaults for selected domain, or empty list
881
- # defaults = default_labels_by_domain.get(domain, [])
882
-
883
- # labels = []
884
- # errors = []
885
- # cols = st.columns(3)
886
-
887
- # for i in range(num_classes):
888
- # with cols[i % 3]:
889
- # default_value = defaults[i] if i < len(defaults) else ""
890
- # label_input = st.text_input(f"Class {i+1}", default_value)
891
- # normalized_label = label_input.strip().title()
892
-
893
- # if not normalized_label:
894
- # errors.append(f"Class {i+1} name is required.")
895
- # else:
896
- # labels.append(normalized_label)
897
-
898
- # # Check for duplicates (case-insensitive)
899
- # if len(labels) != len(set(labels)):
900
- # errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")
901
-
902
- # # Show validation results
903
- # if errors:
904
- # for error in errors:
905
- # st.error(error)
906
- # else:
907
- # st.success("All Labels names are valid and unique!")
908
- # labels_valid = not errors # Will be True only if there are no label errors
909
-
910
-
911
-
912
-
913
- # else:
914
- # num_classes = st.slider("Number of classes", 3, 23, 3, key="label_num_classes")
915
- # labels = []
916
- # cols = st.columns(3)
917
- # for i in range(num_classes):
918
- # with cols[i % 3]:
919
- # label = st.text_input(f"Class {i+1}", f"Class_{i+1}", key=f"label_class_{i}")
920
- # labels.append(label)
921
-
922
-     few_shot_examples = []  # "<content>\nLabel: <label>" strings, one per filled-in example
923
-     use_few_shot = st.toggle("Use few-shot examples for labeling")
924
-     if use_few_shot:
925
-         num_few_shot = st.slider("Number of few-shot examples", 1, 10, 1)
926
-         for shot_idx in range(num_few_shot):
927
-             with st.expander(f"Few-shot Example {shot_idx + 1}"):
928
-                 content = st.text_area("Content", key=f"label_few_shot_content_{shot_idx}")
929
-                 label = st.selectbox("Label", labels, key=f"label_few_shot_label_{shot_idx}")
930
-                 if content and label:  # skip examples that are still incomplete
931
-                     few_shot_examples.append(f"{content}\nLabel: {label}")
932
-
933
-     num_examples = st.number_input("Number of examples to classify", 1, 100, 1)
934
- 
935
-     examples_to_classify = []
936
-     if num_examples > 20:
937
-         # More than 20 examples: one shared box, one example per line, capped at num_examples
938
-         examples_text = st.text_area(
939
-             "Enter examples (one per line)",
940
-             height=300,
941
-             help="Enter each example on a new line"
942
-         )
943
-         if examples_text:
944
-             stripped_lines = (raw.strip() for raw in examples_text.split('\n'))
945
-             examples_to_classify = [entry for entry in stripped_lines if entry][:num_examples]
946
-     else:
947
-         for i in range(num_examples):
948
-             example = st.text_area(f"Example {i+1}", key=f"example_{i}")
949
-             if example:
950
-                 examples_to_classify.append(example)
951
-
952
-     # System role shown in an editable box; an emptied box falls back to the generated default
953
-     default_system_role = f"You are a professional {classification_type} expert, your role is to classify the provided text examples for {domain} domain."
954
-     system_role = st.text_area(
955
-         "Modify System Role (optional)",
956
-         value=default_system_role,
957
-         key="system_role_input",
958
-     )
959
-     st.session_state['system_role'] = system_role or default_system_role
960
- 
961
-     user_prompt = st.text_area("User prompt (optional)", key="label_instructions")
962
- 
963
-     # Prompt fragments: few-shot entries separated by blank lines, examples numbered from 1
964
-     few_shot_text = "\n\n".join(few_shot_examples)
965
-     examples_text = "\n".join(f"{position}. {ex}" for position, ex in enumerate(examples_to_classify, start=1))
966
-
967
-     # Pick the prompt template for the task: NER asks for entity lists, every other type for one label per text
968
-     if classification_type == "Named Entity Recognition (NER)":
969
-         label_prompt_template = PromptTemplate(
970
-             input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
971
-             template=(
972
-                 "{system_role}\n"
973
-                 "- For each text example provided, identify all entities of the requested types.\n"
974
-                 "- Use the following entities: {labels}.\n"
975
-                 # One record per line: the reply is parsed line by line and a row needs the text and "Entities:" on the same line
976
-                 "- Return each example followed by the entities you found in this format: 'Example text. Entities: [ENTITY_TYPE: entity text, ENTITY_TYPE: entity text, ...] or [No entities found]'\n"
977
-                 "- If no entities of the requested types are found, indicate 'No entities found' in this text.\n"
978
-                 "- Be precise about entity boundaries - don't include unnecessary words.\n"
979
-                 "- Do not provide any additional information or explanations.\n"
980
-                 "- Additional instructions:\n {user_prompt}\n\n"
981
-                 "- Use user few-shot examples as guidance if provided:\n{few_shot_examples}\n\n"
982
-                 "- Examples to analyze:\n{examples}\n\n"
983
-                 "Output:\n"
984
-             )
985
-         )
986
-     else:
987
-         label_prompt_template = PromptTemplate(
988
-             # classification_type and domain are declared as inputs although no active template line references them
989
-             input_variables=["system_role", "classification_type", "labels", "few_shot_examples", "examples","domain", "user_prompt"],
990
-             template=(
991
-                 # The persona sentence is supplied through {system_role}
992
-                 "{system_role}\n"
993
-                 "- Use the following instructions:\n"
994
-                 "- Use the following labels: {labels}.\n"
995
-                 "- Return the classified text followed by the label in this format: 'text. Label: [label]'\n"
996
-                 "- Do not provide any additional information or explanations\n"
997
-                 "- User prompt:\n {user_prompt}\n\n"
998
-                 "- Use user provided examples as guidence in the classification process:\n\n {few_shot_examples}\n"
999
-                 "- Examples to classify:\n{examples}\n\n"
1000
-                 "- Think step by step then classify the examples"
1001
-                 # This variant ends on the step-by-step instruction; it has no trailing "Output:" cue
1002
-             )
1003
-         )
1004
-
1005
-     # Preview prompt, rendered from the same fragments the "Label Data" handler sends
1006
-     # (few_shot_text and examples_text), so the text shown and stored here is the
1007
-     # prompt the model will actually receive.
1008
-     system_prompt = label_prompt_template.format(
1009
-         system_role=st.session_state['system_role'],
1010
-         classification_type=classification_type,
1011
-         domain=domain,
1012
-         examples=examples_text,
1013
-         labels=", ".join(labels),
1014
-         user_prompt=user_prompt,
1015
-         few_shot_examples=few_shot_text
1016
-     )
1017
- 
1018
-     # Stored under 'system_prompt'; rows exported by the labeling handler read this session value
1019
-     st.session_state['system_prompt'] = system_prompt
1020
-     st.write("System Prompt:")
1021
-     # Read-only display of the stored prompt
1022
-     st.text_area("System Prompt", value=st.session_state['system_prompt'], height=300, max_chars=None, key=None, help=None, disabled=True)
1035
-
1036
-
1037
-
1038
-     if st.button("🏷️ Label Data"):
1039
-         if examples_to_classify:
1040
-             with st.spinner("Labeling data..."):
1041
-                 is_ner = classification_type == "Named Entity Recognition (NER)"
1042
- 
1043
-                 # Render the prompt that is actually sent. classification_type is passed only
1044
-                 # to the non-NER template, the one that lists it in input_variables.
1045
-                 format_args = {
1046
-                     "system_role": st.session_state['system_role'],
1047
-                     "labels": ", ".join(labels),
1048
-                     "domain": domain,
1049
-                     "few_shot_examples": few_shot_text,
1050
-                     "examples": examples_text,
1051
-                     "user_prompt": user_prompt,
1052
-                 }
1053
-                 if not is_ner:
1054
-                     format_args["classification_type"] = classification_type
1055
-                 system_prompt = label_prompt_template.format(**format_args)
1056
- 
1057
-                 try:
1058
-                     stream = client.chat.completions.create(
1059
-                         model=selected_model,
1060
-                         messages=[{"role": "system", "content": system_prompt}],
1061
-                         temperature=temperature,
1062
-                         stream=True,
1063
-                         max_tokens=20000,
1064
-                         top_p=0.9,
1065
-                     )
1066
-                     # Chat history: the rendered prompt is logged as the user turn,
1067
-                     # the streamed reply as the assistant turn.
1068
-                     st.session_state.messages.append({"role": "user", "content": system_prompt})
1069
-                     response = st.write_stream(stream)
1070
-                     st.session_state.messages.append({"role": "assistant", "content": response})
1071
- 
1072
-                     # Overwrite on every run. The former "only if missing" guards kept the
1073
-                     # first run's response forever and never stored the prompt that was sent.
1074
-                     st.session_state.system_prompt = system_prompt
1075
-                     st.session_state.response = response
1076
- 
1077
-                     # These keys are only created when absent.
1078
-                     for state_key, initial_value in (
1079
-                         ("generated_examples", []),
1080
-                         ("generated_examples_csv", None),
1081
-                         ("generated_examples_json", None),
1082
-                     ):
1083
-                         if state_key not in st.session_state:
1084
-                             st.session_state[state_key] = initial_value
1085
- 
1086
-                     # Turn the reply into export rows. A row needs text AND a value on one
1087
-                     # line, separated by the last occurrence of the marker.
1088
-                     if is_ner:
1089
-                         marker, value_field, task_type = 'Entities:', 'entities', 'Named Entity Recognition (NER)'
1090
-                     else:
1091
-                         marker, value_field, task_type = 'Label:', 'label', 'Data Labeling'
1092
- 
1093
-                     labeled_examples = []
1094
-                     for line in response.split('\n'):
1095
-                         # rpartition yields an empty separator when the marker is missing
1096
-                         text, separator, value = line.rpartition(marker)
1097
-                         text, value = text.strip(), value.strip()
1098
-                         if separator and text and value:
1099
-                             labeled_examples.append({
1100
-                                 'text': text,
1101
-                                 value_field: value,
1102
-                                 'system_prompt': st.session_state.system_prompt,
1103
-                                 'system_role': st.session_state.system_role,
1104
-                                 'task_type': task_type,
1105
-                                 'Use few-shot example?': 'Yes' if use_few_shot else 'No',
1106
-                             })
1107
- 
1108
-                     # Save and provide download options
1109
-                     if labeled_examples:
1110
-                         st.session_state.labeled_examples = labeled_examples
1111
- 
1112
-                         # Same rows serialised twice: CSV via pandas, JSON via json.dumps
1113
-                         df = pd.DataFrame(labeled_examples)
1114
-                         st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')
1115
-                         st.session_state.labeled_examples_json = json.dumps(labeled_examples, indent=2).encode('utf-8')
1116
- 
1117
-                         st.download_button(
1118
-                             "📥 Download Labeled Examples (CSV)",
1119
-                             st.session_state.labeled_examples_csv,
1120
-                             "labeled_examples.csv",
1121
-                             "text/csv",
1122
-                             key='download-labeled-csv'
1123
-                         )
1124
- 
1125
-                         st.markdown("""
1126
-                             <div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
1127
-                         """, unsafe_allow_html=True)
1128
- 
1129
-                         st.download_button(
1130
-                             "📥 Download Labeled Examples (JSON)",
1131
-                             st.session_state.labeled_examples_json,
1132
-                             "labeled_examples.json",
1133
-                             "application/json",
1134
-                             key='download-labeled-json'
1135
-                         )
1136
-                         st.markdown("##### 📋 Labeled Examples Preview")
1137
-                         st.dataframe(df, use_container_width=True)
1138
- 
1139
-                     # Follow-up choice. This radio had been commented out while "Continue"
1140
-                     # still read follow_up, so the branch below could only raise NameError.
1141
-                     # NOTE(review): both widgets live inside the "Label Data" branch; confirm
1142
-                     # in the running app that they survive the rerun a click triggers.
1143
-                     st.markdown("---")
1144
-                     follow_up = st.radio(
1145
-                         "What would you like to do next?",
1146
-                         ["Label more data", "Data Generation"],
1147
-                         key="labeling_follow_up"
1148
-                     )
1149
- 
1150
-                     if st.button("Continue"):
1151
-                         if follow_up == "Label more data":
1152
-                             st.session_state.examples_to_classify = []
1153
-                             st.rerun()  # st.experimental_rerun is deprecated in favour of st.rerun
1154
-                         elif follow_up == "Data Generation":
1155
-                             # NOTE(review): was "Data Labeling" (the task already shown); confirm
1156
-                             # "Data Generation" matches the task selector's option text.
1157
-                             st.session_state.task_choice = "Data Generation"
1158
-                             st.rerun()
1159
- 
1160
-                 except Exception as e:
1161
-                     st.error("An error occurred during labeling.")
1162
-                     st.error(f"Details: {e}")
1163
-         else:
1164
-             st.warning("Please enter at least one example to classify.")
1232
-
1233
- #st.session_state.messages.append({"role": "assistant", "content": response})
1234
-
1235
-
1236
-
1237
-
1238
- # Page footer: a horizontal rule followed by a centred credit line
1239
- footer_html = """
1240
-     <div style='text-align: center'>
1241
-         <p>Made with ❤️ by Wedyan AlSakran 2025</p>
1242
-     </div>
1243
-     """
1244
- st.markdown("---")
1245
- st.markdown(
1246
-     footer_html, unsafe_allow_html=True
1247
- )