Spaces:
Running
Running
nam pham
committed on
Commit
·
64d96d3
1
Parent(s):
ad042b1
feat: download and upload file
Browse files- app.py +136 -39
- data/annotated_data.json +0 -0
app.py
CHANGED
@@ -247,9 +247,10 @@ def merge_entities(entities):
|
|
247 |
merged.append(current)
|
248 |
return merged
|
249 |
|
250 |
-
def annotate_text(
|
|
|
|
|
251 |
labels = [label.strip() for label in labels]
|
252 |
-
entities = model.predict_entities(text, labels, flat_ner=not nested_ner, threshold=threshold)
|
253 |
r = {
|
254 |
"text": text,
|
255 |
"entities": [
|
@@ -260,7 +261,9 @@ def annotate_text(model: GLiNER, text, labels: List[str], threshold: float, nest
|
|
260 |
"end": entity["end"],
|
261 |
"score": 0,
|
262 |
}
|
263 |
-
for entity in
|
|
|
|
|
264 |
],
|
265 |
}
|
266 |
r["entities"] = merge_entities(r["entities"])
|
@@ -311,25 +314,23 @@ class AutoAnnotator:
|
|
311 |
self.stat["current"] = -1 # Reset current progress
|
312 |
|
313 |
# Process texts in batches
|
314 |
-
batch_size = 32 # Adjust based on your GPU memory
|
315 |
processed_data = []
|
316 |
|
317 |
-
for i in
|
318 |
-
batch_texts = data[i:i + batch_size]
|
319 |
if isinstance(prompt, list):
|
320 |
prompt_text = random.choice(prompt)
|
321 |
else:
|
322 |
prompt_text = prompt
|
323 |
|
324 |
-
# Add prompt to
|
325 |
-
|
326 |
|
327 |
-
# Process
|
328 |
-
|
329 |
-
processed_data.
|
330 |
|
331 |
# Update progress
|
332 |
-
self.stat["current"] =
|
333 |
|
334 |
self.annotated_data = processed_data
|
335 |
return self.annotated_data
|
@@ -338,22 +339,93 @@ class AutoAnnotator:
|
|
338 |
annotator = None
|
339 |
sentences = []
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
def process_uploaded_file(file_obj):
|
342 |
if file_obj is None:
|
343 |
return "Please upload a file first!"
|
344 |
|
345 |
try:
|
346 |
# Read the uploaded file
|
347 |
-
|
348 |
-
|
349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
350 |
return f"Successfully loaded {len(sentences)} sentences from file!"
|
351 |
except Exception as e:
|
352 |
return f"Error reading file: {str(e)}"
|
353 |
|
354 |
def is_valid_repo_name(repo_name):
|
355 |
# Hugging Face repo names must not contain slashes or spaces
|
356 |
-
return bool(re.match(r'^[A-Za-z0-9_
|
357 |
|
358 |
def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
|
359 |
"""Create a new repository on Hugging Face Hub"""
|
@@ -443,7 +515,7 @@ def convert_hf_dataset_to_ner_format(dataset):
|
|
443 |
|
444 |
return converted_data
|
445 |
|
446 |
-
def load_from_huggingface(dataset_name: str, split: str = "
|
447 |
"""Load dataset from Hugging Face Hub"""
|
448 |
try:
|
449 |
dataset = load_dataset(dataset_name, split=split)
|
@@ -797,17 +869,21 @@ with gr.Blocks() as demo:
|
|
797 |
)
|
798 |
local_status = gr.Textbox(label="Local File Status", visible=False)
|
799 |
|
800 |
-
|
801 |
-
|
802 |
-
|
803 |
-
|
804 |
-
|
805 |
-
|
806 |
-
|
807 |
-
|
808 |
-
|
809 |
-
|
810 |
-
|
|
|
|
|
|
|
|
|
811 |
|
812 |
bar = gr.Slider(
|
813 |
minimum=0,
|
@@ -827,7 +903,7 @@ with gr.Blocks() as demo:
|
|
827 |
save_btn = gr.Button("Save validated dataset")
|
828 |
|
829 |
# Add Hugging Face upload section
|
830 |
-
with gr.Group():
|
831 |
gr.Markdown("### Upload to Hugging Face")
|
832 |
hf_repo_name = gr.Textbox(
|
833 |
label="Repository Name",
|
@@ -846,6 +922,29 @@ with gr.Blocks() as demo:
|
|
846 |
upload_to_hf_btn = gr.Button("Upload to Hugging Face")
|
847 |
hf_upload_status = gr.Textbox(label="Upload Status")
|
848 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
849 |
inp_box = gr.HighlightedText(value=None, interactive=True)
|
850 |
|
851 |
def toggle_local_inputs():
|
@@ -853,8 +952,7 @@ with gr.Blocks() as demo:
|
|
853 |
local_file: gr.update(visible=True),
|
854 |
file_format: gr.update(visible=True),
|
855 |
local_status: gr.update(visible=True),
|
856 |
-
|
857 |
-
dataset_split: gr.update(visible=False)
|
858 |
}
|
859 |
|
860 |
def toggle_hf_inputs():
|
@@ -862,20 +960,19 @@ with gr.Blocks() as demo:
|
|
862 |
local_file: gr.update(visible=False),
|
863 |
file_format: gr.update(visible=False),
|
864 |
local_status: gr.update(visible=False),
|
865 |
-
|
866 |
-
dataset_split: gr.update(visible=True)
|
867 |
}
|
868 |
|
869 |
load_local_btn.click(
|
870 |
fn=toggle_local_inputs,
|
871 |
inputs=None,
|
872 |
-
outputs=[local_file, file_format, local_status,
|
873 |
)
|
874 |
|
875 |
load_hf_btn.click(
|
876 |
fn=toggle_hf_inputs,
|
877 |
inputs=None,
|
878 |
-
outputs=[local_file, file_format, local_status,
|
879 |
)
|
880 |
|
881 |
def process_and_load_local(file_obj, format):
|
@@ -893,13 +990,13 @@ with gr.Blocks() as demo:
|
|
893 |
def load_hf_dataset(name, split):
|
894 |
status = load_from_huggingface(name, split)
|
895 |
if "Successfully" in status:
|
896 |
-
return load_dataset()
|
897 |
-
return [status], 0, 0
|
898 |
|
899 |
-
|
900 |
fn=load_hf_dataset,
|
901 |
inputs=[dataset_name, dataset_split],
|
902 |
-
outputs=[inp_box, bar]
|
903 |
)
|
904 |
|
905 |
apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
|
|
|
247 |
merged.append(current)
|
248 |
return merged
|
249 |
|
250 |
+
def annotate_text(
|
251 |
+
model, text, labels: List[str], threshold: float, nested_ner: bool
|
252 |
+
) -> Dict:
|
253 |
labels = [label.strip() for label in labels]
|
|
|
254 |
r = {
|
255 |
"text": text,
|
256 |
"entities": [
|
|
|
261 |
"end": entity["end"],
|
262 |
"score": 0,
|
263 |
}
|
264 |
+
for entity in model.predict_entities(
|
265 |
+
text, labels, flat_ner=not nested_ner, threshold=threshold
|
266 |
+
)
|
267 |
],
|
268 |
}
|
269 |
r["entities"] = merge_entities(r["entities"])
|
|
|
314 |
self.stat["current"] = -1 # Reset current progress
|
315 |
|
316 |
# Process texts in batches
|
|
|
317 |
processed_data = []
|
318 |
|
319 |
+
for i, text in enumerate(data):
|
|
|
320 |
if isinstance(prompt, list):
|
321 |
prompt_text = random.choice(prompt)
|
322 |
else:
|
323 |
prompt_text = prompt
|
324 |
|
325 |
+
# Add prompt to text
|
326 |
+
text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
|
327 |
|
328 |
+
# Process single text
|
329 |
+
result = annotate_text(self.model, text_with_prompt, labels, threshold, nested_ner)
|
330 |
+
processed_data.append(result)
|
331 |
|
332 |
# Update progress
|
333 |
+
self.stat["current"] = i + 1
|
334 |
|
335 |
self.annotated_data = processed_data
|
336 |
return self.annotated_data
|
|
|
339 |
annotator = None
|
340 |
sentences = []
|
341 |
|
342 |
+
def process_text_for_gliner(text: str, max_tokens: int = 384, overlap: int = 50) -> List[str]:
    """
    Split text into chunks of at most ``max_tokens`` tokens for GLiNER.

    The text is first split after sentence-ending punctuation (``.``, ``!``
    or ``?``) followed by whitespace. Whole sentences are packed greedily
    into a chunk until the next one would push it past ``max_tokens``. A
    single sentence longer than ``max_tokens`` is cut into fixed-size token
    windows, with consecutive windows sharing ``overlap`` tokens.

    Chunks are rebuilt by joining tokens with single spaces, so the original
    spacing of the input is not necessarily preserved.

    Args:
        text: The input text to process.
        max_tokens: Maximum number of tokens per chunk. Must be positive.
        overlap: Number of tokens shared between consecutive windows of an
            over-long sentence. Clamped to the range ``[0, max_tokens - 1]``.

    Returns:
        List of text chunks; empty when the text yields no tokens.

    Raises:
        ValueError: If ``max_tokens`` is not a positive integer.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    # The window must advance by at least one token per step. Without this
    # clamp, overlap >= max_tokens makes the window loop below spin forever.
    overlap = max(0, min(overlap, max_tokens - 1))
    stride = max_tokens - overlap

    chunks = []
    current_chunk = []
    current_length = 0

    # Split into sentences first so chunk edges fall on natural boundaries.
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        # Copy the tokens: current_chunk may adopt this list and extend it
        # later, which must not mutate whatever tokenize_text handed back.
        # NOTE(review): assumes tokenize_text returns an iterable of str.
        sentence_tokens = list(tokenize_text(sentence))
        sentence_length = len(sentence_tokens)

        if sentence_length > max_tokens:
            # Flush what has been accumulated so chunk order follows the text.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Slide a max_tokens-wide window over the over-long sentence.
            start = 0
            while True:
                end = min(start + max_tokens, sentence_length)
                chunks.append(" ".join(sentence_tokens[start:end]))
                if end == sentence_length:
                    break
                start += stride

        elif current_length + sentence_length > max_tokens:
            # This sentence does not fit: close the chunk and start a new one.
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
|
396 |
+
|
397 |
def process_uploaded_file(file_obj):
    """Load an uploaded file into the module-level ``sentences`` list.

    A file whose name ends in ``.csv`` is read with pandas and the non-null
    values of its ``'Nội dung'`` column are used. Any other file is read via
    ``file_obj.read()``, decoded as UTF-8 and split into non-blank, stripped
    lines. Each entry is then passed through ``process_text_for_gliner`` and
    the resulting chunks are flattened into one list.

    ``sentences`` is only replaced once the whole file has been read and
    chunked, so a failure part-way through leaves the previous value intact.

    Args:
        file_obj: The uploaded file object, or ``None`` when nothing was
            uploaded. It must expose ``.name``; non-CSV files must also
            expose ``.read()`` returning bytes.

    Returns:
        A status message string. Failures are reported in the message
        (prefixed with ``"Error reading file: "``) rather than raised.
    """
    global sentences

    if file_obj is None:
        return "Please upload a file first!"

    try:
        if file_obj.name.endswith('.csv'):
            import pandas as pd

            df = pd.read_csv(file_obj.name)
            raw_texts = df['Nội dung'].dropna().tolist()
        else:
            # Read the file content directly from the file object.
            content = file_obj.read().decode('utf-8')
            raw_texts = [line.strip() for line in content.splitlines() if line.strip()]

        # Chunk every entry and flatten the per-entry chunk lists.
        chunked = []
        for raw_text in raw_texts:
            chunked.extend(process_text_for_gliner(raw_text))
    except Exception as e:
        return f"Error reading file: {str(e)}"

    # Publish only after everything succeeded.
    sentences = chunked
    return f"Successfully loaded {len(sentences)} sentences from file!"
|
425 |
|
426 |
def is_valid_repo_name(repo_name):
    """Return True when ``repo_name`` uses only the permitted characters.

    Permitted characters are ASCII letters, digits, ``_``, ``.``, ``/`` and
    ``-``. The name must be non-empty; spaces and any other characters are
    rejected.

    Args:
        repo_name: The repository name string to check.

    Returns:
        ``True`` if the whole string consists of permitted characters,
        ``False`` otherwise.
    """
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, which would let "name\n" through.
    return bool(re.fullmatch(r'[A-Za-z0-9_./-]+', repo_name))
|
429 |
|
430 |
def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
|
431 |
"""Create a new repository on Hugging Face Hub"""
|
|
|
515 |
|
516 |
return converted_data
|
517 |
|
518 |
+
def load_from_huggingface(dataset_name: str, split: str = "all"):
|
519 |
"""Load dataset from Hugging Face Hub"""
|
520 |
try:
|
521 |
dataset = load_dataset(dataset_name, split=split)
|
|
|
869 |
)
|
870 |
local_status = gr.Textbox(label="Local File Status", visible=False)
|
871 |
|
872 |
+
with gr.Group(visible=False) as hf_inputs:
|
873 |
+
with gr.Row():
|
874 |
+
dataset_name = gr.Textbox(
|
875 |
+
label="Hugging Face Dataset Name",
|
876 |
+
placeholder="Enter dataset name (e.g., conll2003)",
|
877 |
+
scale=3
|
878 |
+
)
|
879 |
+
dataset_split = gr.Dropdown(
|
880 |
+
choices=["train", "validation", "test"],
|
881 |
+
value="train",
|
882 |
+
label="Dataset Split",
|
883 |
+
scale=2
|
884 |
+
)
|
885 |
+
load_dataset_btn = gr.Button("Load Dataset", scale=1)
|
886 |
+
hf_status = gr.Textbox(label="Dataset Loading Status")
|
887 |
|
888 |
bar = gr.Slider(
|
889 |
minimum=0,
|
|
|
903 |
save_btn = gr.Button("Save validated dataset")
|
904 |
|
905 |
# Add Hugging Face upload section
|
906 |
+
with gr.Group(visible=False) as hf_upload_group:
|
907 |
gr.Markdown("### Upload to Hugging Face")
|
908 |
hf_repo_name = gr.Textbox(
|
909 |
label="Repository Name",
|
|
|
922 |
upload_to_hf_btn = gr.Button("Upload to Hugging Face")
|
923 |
hf_upload_status = gr.Textbox(label="Upload Status")
|
924 |
|
925 |
+
with gr.Row():
|
926 |
+
show_hf_upload_btn = gr.Button("Show Upload Options")
|
927 |
+
hide_hf_upload_btn = gr.Button("Hide Upload Options", visible=False)
|
928 |
+
|
929 |
+
def toggle_hf_upload(show: bool):
    """Build the visibility updates for the upload panel and its two buttons.

    Args:
        show: ``True`` to reveal the upload group (and the "hide" button),
            ``False`` to collapse it (and reveal the "show" button).

    Returns:
        A dict mapping each affected component to its ``gr.update``.
    """
    panel_update = gr.update(visible=show)
    show_button_update = gr.update(visible=not show)
    hide_button_update = gr.update(visible=show)

    updates = {}
    updates[hf_upload_group] = panel_update
    updates[show_hf_upload_btn] = show_button_update
    updates[hide_hf_upload_btn] = hide_button_update
    return updates
|
935 |
+
|
936 |
+
show_hf_upload_btn.click(
|
937 |
+
fn=lambda: toggle_hf_upload(True),
|
938 |
+
inputs=None,
|
939 |
+
outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
|
940 |
+
)
|
941 |
+
|
942 |
+
hide_hf_upload_btn.click(
|
943 |
+
fn=lambda: toggle_hf_upload(False),
|
944 |
+
inputs=None,
|
945 |
+
outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
|
946 |
+
)
|
947 |
+
|
948 |
inp_box = gr.HighlightedText(value=None, interactive=True)
|
949 |
|
950 |
def toggle_local_inputs():
|
|
|
952 |
local_file: gr.update(visible=True),
|
953 |
file_format: gr.update(visible=True),
|
954 |
local_status: gr.update(visible=True),
|
955 |
+
hf_inputs: gr.update(visible=False)
|
|
|
956 |
}
|
957 |
|
958 |
def toggle_hf_inputs():
|
|
|
960 |
local_file: gr.update(visible=False),
|
961 |
file_format: gr.update(visible=False),
|
962 |
local_status: gr.update(visible=False),
|
963 |
+
hf_inputs: gr.update(visible=True)
|
|
|
964 |
}
|
965 |
|
966 |
load_local_btn.click(
|
967 |
fn=toggle_local_inputs,
|
968 |
inputs=None,
|
969 |
+
outputs=[local_file, file_format, local_status, hf_inputs]
|
970 |
)
|
971 |
|
972 |
load_hf_btn.click(
|
973 |
fn=toggle_hf_inputs,
|
974 |
inputs=None,
|
975 |
+
outputs=[local_file, file_format, local_status, hf_inputs]
|
976 |
)
|
977 |
|
978 |
def process_and_load_local(file_obj, format):
|
|
|
990 |
def load_hf_dataset(name, split):
    """Load a Hub dataset and return one value per wired output.

    The handler is wired to three outputs (``inp_box``, ``bar``,
    ``hf_status``), so both paths return exactly three values.

    Args:
        name: Dataset name passed to ``load_from_huggingface``.
        split: Dataset split passed to ``load_from_huggingface``.

    Returns:
        A 3-tuple ``(inp_box value, bar value, status message)``.
    """
    status = load_from_huggingface(name, split)
    if "Successfully" in status:
        # NOTE(review): assumes load_dataset() yields the inp_box and bar
        # values first (it was previously returned as-is for those two
        # outputs) - confirm against its definition. Any extra values are
        # ignored so the status always lands in the third slot.
        inp_value, bar_value, *_ = load_dataset()
        return inp_value, bar_value, status
    # Failure: show the message in the text box, reset the slider, and
    # surface the same message in the status field.
    return [status], 0, status
|
995 |
|
996 |
+
load_dataset_btn.click(
|
997 |
fn=load_hf_dataset,
|
998 |
inputs=[dataset_name, dataset_split],
|
999 |
+
outputs=[inp_box, bar, hf_status]
|
1000 |
)
|
1001 |
|
1002 |
apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
|
data/annotated_data.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|