Merlintxu committed on
Commit
92e06e0
verified
1 Parent(s): 3596d24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -29
app.py CHANGED
@@ -7,14 +7,10 @@ from docx import Document
7
  import PyPDF2
8
  from sentence_transformers import SentenceTransformer
9
  import tiktoken
10
- import os
11
 
12
- # Carga modelo de embeddings de HF
13
  model = SentenceTransformer('all-MiniLM-L6-v2')
14
- # Tokenizer para chunking
15
  tokenizer = tiktoken.get_encoding("cl100k_base")
16
 
17
- # Extrae front-matter YAML (si existe) y cuerpo
18
  def extract_front_matter_and_body(text: str):
19
  import re
20
  fm_regex = r"^---\n(.*?)\n---\n(.*)$"
@@ -27,7 +23,6 @@ def extract_front_matter_and_body(text: str):
27
  body = text
28
  return meta, body
29
 
30
- # Chunking en base a tokens
31
  def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
32
  tokens = tokenizer.encode(text)
33
  chunks = []
@@ -39,10 +34,8 @@ def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
39
  start += max_tokens - overlap
40
  return chunks
41
 
42
- # Procesa un archivo individual (md/docx/pdf)
43
  def process_file(path: str, vertical: str, language: str):
44
  ext = Path(path).suffix.lower()
45
- # Leer y extraer texto
46
  if ext in ['.md', '.markdown']:
47
  raw = Path(path).read_text(encoding='utf-8')
48
  meta, body = extract_front_matter_and_body(raw)
@@ -58,15 +51,12 @@ def process_file(path: str, vertical: str, language: str):
58
  else:
59
  return []
60
 
61
- # Metadatos por defecto + front-matter
62
  default_meta = {
63
  'vertical': vertical,
64
  'language': language,
65
  'source': Path(path).name
66
  }
67
  meta = {**default_meta, **meta}
68
-
69
- # Chunking y embeddings
70
  records = []
71
  for i, chunk in enumerate(chunk_text(body)):
72
  emb = model.encode(chunk).tolist()
@@ -75,46 +65,33 @@ def process_file(path: str, vertical: str, language: str):
75
  'chunk_index': i+1,
76
  **meta
77
  }
78
- records.append({ 'vector': emb, 'metadata': metadata })
79
  return records
80
 
81
- # Función para el botón
82
  def run_pipeline(files, vertical, language):
83
  all_records = []
84
- # Guardar temporalmente y procesar
85
- for file in files:
86
- # Gradio pasa un dict con 'name' y 'data'
87
- tmp_path = file.name
88
- os.replace(file.name, tmp_path)
89
- recs = process_file(tmp_path, vertical, language)
90
  all_records.extend(recs)
91
 
92
- # Generar JSONL
93
  out_file = f"/tmp/{uuid.uuid4().hex}.jsonl"
94
  with open(out_file, 'w', encoding='utf-8') as f:
95
  for rec in all_records:
96
- json.dump({ 'id': rec['metadata']['id'],
97
- 'vector': rec['vector'],
98
- 'metadata': rec['metadata']
99
- }, f, ensure_ascii=False)
100
  f.write("\n")
101
-
102
  return out_file
103
 
104
- # Interfaz Gradio
105
  demo = gr.Blocks()
106
  with demo:
107
  gr.Markdown("## Ingesta para Amazon S3 Vector Features")
108
  with gr.Row():
109
- uploader = gr.File(label="Sube tus documentos", file_count="multiple", type="file")
110
  vertical = gr.Textbox(label="Vertical (p.ej. SEO, eCommerce)", value="general")
111
  language = gr.Textbox(label="Idioma", value="es")
112
  btn = gr.Button("Procesar y Generar JSONL")
113
  output = gr.File(label="Descarga el JSONL")
114
 
115
- btn.click(fn=run_pipeline,
116
- inputs=[uploader, vertical, language],
117
- outputs=output)
118
 
119
  if __name__ == "__main__":
120
  demo.launch()
 
7
  import PyPDF2
8
  from sentence_transformers import SentenceTransformer
9
  import tiktoken
 
10
 
 
11
  model = SentenceTransformer('all-MiniLM-L6-v2')
 
12
  tokenizer = tiktoken.get_encoding("cl100k_base")
13
 
 
14
  def extract_front_matter_and_body(text: str):
15
  import re
16
  fm_regex = r"^---\n(.*?)\n---\n(.*)$"
 
23
  body = text
24
  return meta, body
25
 
 
26
  def chunk_text(text: str, max_tokens: int = 500, overlap: int = 50):
27
  tokens = tokenizer.encode(text)
28
  chunks = []
 
34
  start += max_tokens - overlap
35
  return chunks
36
 
 
37
  def process_file(path: str, vertical: str, language: str):
38
  ext = Path(path).suffix.lower()
 
39
  if ext in ['.md', '.markdown']:
40
  raw = Path(path).read_text(encoding='utf-8')
41
  meta, body = extract_front_matter_and_body(raw)
 
51
  else:
52
  return []
53
 
 
54
  default_meta = {
55
  'vertical': vertical,
56
  'language': language,
57
  'source': Path(path).name
58
  }
59
  meta = {**default_meta, **meta}
 
 
60
  records = []
61
  for i, chunk in enumerate(chunk_text(body)):
62
  emb = model.encode(chunk).tolist()
 
65
  'chunk_index': i+1,
66
  **meta
67
  }
68
+ records.append({'vector': emb, 'metadata': metadata})
69
  return records
70
 
 
71
  def run_pipeline(files, vertical, language):
72
  all_records = []
73
+ for file_path in files:
74
+ recs = process_file(file_path, vertical, language)
 
 
 
 
75
  all_records.extend(recs)
76
 
 
77
  out_file = f"/tmp/{uuid.uuid4().hex}.jsonl"
78
  with open(out_file, 'w', encoding='utf-8') as f:
79
  for rec in all_records:
80
+ json.dump({'id': rec['metadata']['id'], 'vector': rec['vector'], 'metadata': rec['metadata']}, f, ensure_ascii=False)
 
 
 
81
  f.write("\n")
 
82
  return out_file
83
 
 
84
  demo = gr.Blocks()
85
  with demo:
86
  gr.Markdown("## Ingesta para Amazon S3 Vector Features")
87
  with gr.Row():
88
+ uploader = gr.File(label="Sube tus documentos", file_count="multiple", type="filepath")
89
  vertical = gr.Textbox(label="Vertical (p.ej. SEO, eCommerce)", value="general")
90
  language = gr.Textbox(label="Idioma", value="es")
91
  btn = gr.Button("Procesar y Generar JSONL")
92
  output = gr.File(label="Descarga el JSONL")
93
 
94
+ btn.click(fn=run_pipeline, inputs=[uploader, vertical, language], outputs=output)
 
 
95
 
96
  if __name__ == "__main__":
97
  demo.launch()