Marcos Morales committed
Commit 9710d1d · Parent(s): c69ce57

modified: app.py
modified: requirements.txt
new file: src/metadata_llm.py
modified: src/preprocess.py

- app.py +3 -1
- requirements.txt +2 -0
- src/metadata_llm.py +53 -0
- src/preprocess.py +37 -26
app.py
CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
 from src.reader import read_file
 from src.chunker import chunk_text
 from src.embeddings import embed_texts
+from src.metadata_llm import extract_metadata
 
 def run_pipeline(files, vertical, language):
     records = []
@@ -14,10 +15,11 @@ def run_pipeline(files, vertical, language):
         chunks = chunk_text(body)
         vectors = embed_texts(chunks)
         for i, (chunk, vec) in enumerate(zip(chunks, vectors), 1):
+            meta_llm = extract_metadata(chunk)
             records.append({
                 "id": f"{Path(file_path).stem}-chunk-{i:04d}",
                 "vector": vec,
-                "metadata": {**base_meta, "chunk_index": i}
+                "metadata": {**base_meta, "chunk_index": i, **meta_llm}
             })
     out_path = f"/tmp/{uuid.uuid4().hex}.jsonl"
     with open(out_path, "w", encoding="utf-8") as f:
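A note on the merge order in the new "metadata" dict: in Python, later unpackings win, so any key returned by extract_metadata (e.g. "language" or "vertical") overrides the same key from base_meta. A minimal sketch of that behavior, with illustrative values only:

    base_meta = {"vertical": "general", "language": "es"}
    meta_llm = {"language": "en", "summary": "one-line summary"}
    merged = {**base_meta, "chunk_index": 1, **meta_llm}
    assert merged["language"] == "en"  # meta_llm overrides base_meta

If the CLI flags should take precedence over the model's guesses instead, the unpack order would need to be swapped.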
requirements.txt
CHANGED
@@ -13,3 +13,5 @@ gradio>=4.32
 
 # Optional: environment variables
 python-dotenv>=1.0
+
+transformers>=4.41
src/metadata_llm.py
ADDED
@@ -0,0 +1,53 @@
+"""Auto-metadata via a HuggingFace LLM."""
+from transformers import pipeline
+import os
+
+# Defaults to Gemma 1.1 2B instruct; override via the env var if needed.
+LLM_MODEL = os.getenv("LLM_METADATA_MODEL", "google/gemma-1.1-2b-it")
+
+# Load the pipeline only once (lazily).
+_llm = None
+
+def get_llm():
+    global _llm
+    if _llm is None:
+        _llm = pipeline(
+            "text-generation",
+            model=LLM_MODEL,
+            device_map="auto",
+            max_new_tokens=256,
+            trust_remote_code=True
+        )
+    return _llm
+
+def build_prompt(text: str) -> str:
+    # The prompt can be customized here
+    return (
+        "Analiza el siguiente texto y responde en JSON con las claves:\n"
+        "\"summary\": resumen en una frase;\n"
+        "\"topics\": lista de hasta 5 palabras clave o temas principales;\n"
+        "\"language\": idioma detectado ('es', 'en', etc.);\n"
+        "\"vertical\": vertical temática (SEO-LLM, eCommerce, etc).\n\n"
+        "TEXTO:\n" + text.strip() + "\n\nJSON:"
+    )
+
+def extract_metadata(text: str) -> dict:
+    llm = get_llm()
+    prompt = build_prompt(text)
+    output = llm(prompt)[0]['generated_text']
+    # Extract only the JSON block from the output
+    import re, json
+    m = re.search(r'\{.*\}', output, re.DOTALL)
+    if not m:
+        return {}
+    try:
+        result = json.loads(m.group(0))
+        return result
+    except Exception:
+        return {}
+
+# --- Quick smoke-test CLI
+if __name__ == "__main__":
+    chunk = "El SEO para LLMs consiste en optimizar contenido pensando en cómo los grandes modelos de lenguaje recuperan información relevante, mejorando la estructura, claridad y contexto del texto para aumentar la calidad de las respuestas generadas por IA."
+    meta = extract_metadata(chunk)
+    print(meta)
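The JSON-extraction step can be sanity-checked without downloading the model by running the same regex + json.loads over a hand-written completion (the sample string below is made up for illustration):

    import json, re

    fake_output = 'TEXTO:\n...\n\nJSON: {"summary": "SEO para LLMs", "topics": ["seo", "llm"], "language": "es", "vertical": "SEO-LLM"}'
    m = re.search(r'\{.*\}', fake_output, re.DOTALL)
    print(json.loads(m.group(0)) if m else {})

One caveat: the greedy \{.*\} with re.DOTALL spans from the first "{" to the last "}", so a chunk that itself contains braces can break parsing; a non-greedy pattern or json.JSONDecoder().raw_decode would be stricter.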
src/preprocess.py
CHANGED
@@ -1,4 +1,4 @@
-"""CLI:
+"""CLI entry‑point: read files → chunk → embed → enrich metadata → JSONL."""
 from pathlib import Path
 from typing import Dict, List
 import json
@@ -6,36 +6,47 @@ import click
 from .reader import read_file
 from .chunker import chunk_text
 from .embeddings import embed_texts
+from .metadata_llm import extract_metadata
 
 @click.command()
-@click.option(
-
-@click.option(
-
-@click.option("--vertical", default="general", help="Vertical.")
-@click.option("--language", default="es", help="Idioma.")
+@click.option('--input-dir', type=click.Path(exists=True, file_okay=False), required=True, help='Directory with docs.')
+@click.option('--output', type=click.Path(), required=True, help='JSONL output path.')
+@click.option('--vertical', default='general', help='Vertical tag.')
+@click.option('--language', default='es', help='Language tag.')
 def main(input_dir: str, output: str, vertical: str, language: str):
-
-    for
-        if not
+    records: List[Dict] = []
+    for path in Path(input_dir).iterdir():
+        if not path.is_file():
             continue
-        meta, body = read_file(
-
+        meta, body = read_file(path)
+        merged_meta = {
+            'vertical': vertical,
+            'language': language,
+            'source': path.name,
+            **meta
+        }
         chunks = chunk_text(body)
-
-        for i, (chunk, vec) in enumerate(zip(chunks,
-
-
-            "
-
-
-
-
-
-
+        embeddings = embed_texts(chunks)
+        for i, (chunk, vec) in enumerate(zip(chunks, embeddings), 1):
+            meta_llm = extract_metadata(chunk)
+            rec = {
+                'id': f"{path.stem}-chunk-{i:04d}",
+                'vector': vec,
+                'metadata': {
+                    **merged_meta,
+                    'chunk_index': i,
+                    **meta_llm  # summary, topics, vertical, language
+                }
+            }
+            records.append(rec)
+
+    out_path = Path(output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open('w', encoding='utf-8') as f:
+        for r in records:
             json.dump(r, f, ensure_ascii=False)
-        f.write(
-    click.echo(f"Wrote {len(
+            f.write('\n')
+    click.echo(f"Wrote {len(records)} records → {out_path}")
 
-if __name__ ==
+if __name__ == '__main__':
     main()
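With these changes, a typical invocation of the CLI (assuming the project root is on PYTHONPATH so the src package resolves) might look like:

    python -m src.preprocess --input-dir docs --output data/records.jsonl --vertical SEO-LLM --language es

Each chunk record then carries the LLM-derived summary/topics alongside vertical, language, source, and chunk_index, at the cost of one text-generation call per chunk.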