Marcos Morales committed on
Commit 9710d1d · 1 Parent(s): c69ce57

modified: app.py
modified: requirements.txt
new file: src/metadata_llm.py
modified: src/preprocess.py

Files changed (4)
  1. app.py +3 -1
  2. requirements.txt +2 -0
  3. src/metadata_llm.py +53 -0
  4. src/preprocess.py +37 -26
app.py CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
 from src.reader import read_file
 from src.chunker import chunk_text
 from src.embeddings import embed_texts
+from src.metadata_llm import extract_metadata
 
 def run_pipeline(files, vertical, language):
     records = []
@@ -14,10 +15,11 @@ def run_pipeline(files, vertical, language):
         chunks = chunk_text(body)
         vectors = embed_texts(chunks)
         for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
+            meta_llm = extract_metadata(chunk)
             records.append({
                 "id": f"{Path(file_path).stem}-chunk-{i:04d}",
                 "vector": vec,
-                "metadata": {**base_meta, "chunk_index": i}
+                "metadata": {**base_meta, "chunk_index": i, **meta_llm}
             })
     out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
     with open(out_path, "w", encoding="utf-8") as f:
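For reference, one line of the JSONL written by run_pipeline would now look roughly like this (pretty-printed here; all values are illustrative, and the summary/topics/language/vertical keys appear only when extract_metadata returns parseable JSON):

    {"id": "informe-chunk-0001",
     "vector": [0.021, -0.113, ...],
     "metadata": {"vertical": "general", "language": "es", "source": "informe.pdf",
                  "chunk_index": 1, "summary": "...", "topics": ["seo", "llm"]}}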
requirements.txt CHANGED
@@ -13,3 +13,5 @@ gradio>=4.32
 
 # Optional: environment variables
 python-dotenv>=1.0
+
+transformers>=4.41
src/metadata_llm.py ADDED
@@ -0,0 +1,53 @@
+"""Auto-metadata via a HuggingFace LLM."""
+from transformers import pipeline
+import os
+
+# Defaults to Gemma 2B instruct; you can change it.
+LLM_MODEL = os.getenv("LLM_METADATA_MODEL", "google/gemma-1.1-2b-it")
+
+# Only load the pipeline once (lazy).
+_llm = None
+
+def get_llm():
+    global _llm
+    if _llm is None:
+        _llm = pipeline(
+            "text-generation",
+            model=LLM_MODEL,
+            device_map="auto",
+            max_new_tokens=256,
+            trust_remote_code=True
+        )
+    return _llm
+
+def build_prompt(text: str) -> str:
+    # You can customize the prompt here
+    return (
+        "Analiza el siguiente texto y responde en JSON con las claves:\n"
+        "\"summary\": resumen en una frase;\n"
+        "\"topics\": lista de hasta 5 palabras clave o temas principales;\n"
+        "\"language\": idioma detectado ('es', 'en', etc.);\n"
+        "\"vertical\": vertical temática (SEO-LLM, eCommerce, etc).\n\n"
+        "TEXTO:\n" + text.strip() + "\n\nJSON:"
+    )
+
+def extract_metadata(text: str) -> dict:
+    llm = get_llm()
+    prompt = build_prompt(text)
+    output = llm(prompt)[0]['generated_text']
+    # Extract only the JSON block from the result
+    import re, json
+    m = re.search(r'\{.*\}', output, re.DOTALL)
+    if not m:
+        return {}
+    try:
+        result = json.loads(m.group(0))
+        return result
+    except Exception:
+        return {}
+
+# --- Quick test CLI
+if __name__ == "__main__":
+    chunk = "El SEO para LLMs consiste en optimizar contenido pensando en cómo los grandes modelos de lenguaje recuperan información relevante, mejorando la estructura, claridad y contexto del texto para aumentar la calidad de las respuestas generadas por IA."
+    meta = extract_metadata(chunk)
+    print(meta)
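Since extract_metadata returns an empty dict whenever no JSON block can be parsed from the generation, callers can merge its result into chunk metadata unconditionally. A minimal usage sketch (the keys shown follow the prompt; actual values depend on the model, and base_meta stands for the caller's existing metadata dict):

    from src.metadata_llm import extract_metadata

    meta = extract_metadata("Texto de ejemplo sobre SEO para LLMs.")
    # e.g. {"summary": "...", "topics": ["seo", "llm"], "language": "es", "vertical": "SEO-LLM"}
    enriched = {**base_meta, **meta}  # safe even when meta == {}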
src/preprocess.py CHANGED
@@ -1,4 +1,4 @@
-"""CLI: read → chunk → embed → JSONL."""
+"""CLI entry-point: read files → chunk → embed → enrich metadata → JSONL."""
 from pathlib import Path
 from typing import Dict, List
 import json
@@ -6,36 +6,47 @@ import click
 from .reader import read_file
 from .chunker import chunk_text
 from .embeddings import embed_texts
+from .metadata_llm import extract_metadata
 
 @click.command()
-@click.option("--input-dir", type=click.Path(exists=True, file_okay=False),
-              required=True, help="Folder with documents.")
-@click.option("--output", type=click.Path(), required=True,
-              help="Output JSONL path.")
-@click.option("--vertical", default="general", help="Vertical.")
-@click.option("--language", default="es", help="Language.")
+@click.option('--input-dir', type=click.Path(exists=True, file_okay=False), required=True, help='Directory with docs.')
+@click.option('--output', type=click.Path(), required=True, help='JSONL output path.')
+@click.option('--vertical', default='general', help='Vertical tag.')
+@click.option('--language', default='es', help='Language tag.')
 def main(input_dir: str, output: str, vertical: str, language: str):
-    recs: List[Dict] = []
-    for p in Path(input_dir).iterdir():
-        if not p.is_file():
+    records: List[Dict] = []
+    for path in Path(input_dir).iterdir():
+        if not path.is_file():
             continue
-        meta, body = read_file(p)
-        base_meta = {"vertical": vertical, "language": language, "source": p.name, **meta}
+        meta, body = read_file(path)
+        merged_meta = {
+            'vertical': vertical,
+            'language': language,
+            'source': path.name,
+            **meta
+        }
         chunks = chunk_text(body)
-        vecs = embed_texts(chunks)
-        for i, (chunk, vec) in enumerate(zip(chunks, vecs), 1):
-            recs.append({
-                "id": f"{p.stem}-chunk-{i:04d}",
-                "vector": vec,
-                "metadata": {**base_meta, "chunk_index": i}
-            })
-    out = Path(output)
-    out.parent.mkdir(parents=True, exist_ok=True)
-    with out.open("w", encoding="utf-8") as f:
-        for r in recs:
+        embeddings = embed_texts(chunks)
+        for i, (chunk, vec) in enumerate(zip(chunks, embeddings), 1):
+            meta_llm = extract_metadata(chunk)
+            rec = {
+                'id': f"{path.stem}-chunk-{i:04d}",
+                'vector': vec,
+                'metadata': {
+                    **merged_meta,
+                    'chunk_index': i,
+                    **meta_llm  # summary, topics, vertical, language
+                }
+            }
+            records.append(rec)
+
+    out_path = Path(output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open('w', encoding='utf-8') as f:
+        for r in records:
             json.dump(r, f, ensure_ascii=False)
-            f.write("\n")
-    click.echo(f"Wrote {len(recs)} records → {out}")
+            f.write('\n')
+    click.echo(f"Wrote {len(records)} records → {out_path}")
 
-if __name__ == "__main__":
+if __name__ == '__main__':
     main()
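Assuming the package layout above (reader, chunker, embeddings and metadata_llm living under src/), the refactored CLI would be invoked along these lines, with paths and tag values as placeholders:

    python -m src.preprocess --input-dir docs/ --output out/records.jsonl --vertical general --language es

Each chunk now costs one LLM generation, so indexing time grows linearly with chunk count; the lazy get_llm() singleton at least avoids reloading the model between calls.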