DHEIVER committed on
Commit
7453f77
·
verified ·
1 Parent(s): a70ebc7

Create metrology_rag.py

Browse files
Files changed (1) hide show
  1. metrology_rag.py +430 -0
metrology_rag.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gradio_client import Client
2
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
3
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
+ from sentence_transformers import SentenceTransformer
5
+ from rank_bm25 import BM25Okapi
6
+ import faiss
7
+ import re
8
+ import os
9
+ import sys
10
+ import time
11
+ import json
12
+ import numpy as np
13
+ import logging
14
+ from typing import List, Dict, Tuple, Optional
15
+ from PyPDF2 import PdfReader
16
+ from colorama import Fore, Style
17
+ from datetime import datetime
18
+ from sklearn.metrics.pairwise import cosine_similarity
19
+
20
class MetrologyRAGSystem:
    """Retrieval-augmented question answering over metrology PDF documents.

    The system loads PDFs from a folder, splits them into text chunks, indexes
    the chunks both densely (sentence embeddings in a FAISS HNSW index) and
    lexically (BM25), fuses both rankings, and sends a structured prompt to a
    remote model through ``gradio_client``. It can also run a regex-based
    compliance check over a single PDF report.
    """

    def __init__(self, config: Optional[Dict] = None):
        """Build the system from ``config`` merged over the built-in defaults.

        Args:
            config: Optional overrides for the keys defined in
                ``_load_default_config``.
        """
        self.config = self._load_default_config(config)
        # Create the logger first so that later failures can be reported.
        self._init_logger()
        self.embedder = SentenceTransformer(self.config['embedding_model'])
        self.client = Client(self.config['api_endpoint'])
        self.documents = []
        self.faiss_index = None
        self.bm25 = None

    def _load_default_config(self, config: Optional[Dict]) -> Dict:
        """Return the default configuration updated with ``config`` (may be None)."""
        default_config = {
            'embedding_model': 'all-MiniLM-L6-v2',
            'chunk_size': 1600,
            'chunk_overlap': 450,
            'top_k': 7,
            'max_retries': 5,
            'hybrid_ratio': 0.6,
            'allowed_file_types': ['.pdf'],
            'api_endpoint': "yuntian-deng/ChatGPT",
            'required_norms': ['ISO/IEC 17025', 'ABNT NBR ISO 9001'],
            'min_confidence': 0.78,
            'temperature': 0.3
        }
        return {**default_config, **(config or {})}

    def _init_logger(self):
        """Configure the shared 'MetrologyRAG' logger (file + console)."""
        self.logger = logging.getLogger('MetrologyRAG')
        self.logger.setLevel(logging.INFO)

        # logging.getLogger returns the same object for the same name, so
        # handlers must only be attached once; otherwise every additional
        # instance would duplicate each log line.
        if self.logger.handlers:
            return

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # Messages contain accented characters; pin the encoding explicitly.
        file_handler = logging.FileHandler('metrology_audit.log', encoding='utf-8')
        file_handler.setFormatter(formatter)

        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)

        self.logger.addHandler(file_handler)
        self.logger.addHandler(stream_handler)

    def initialize_system(self, pdf_folder: str):
        """Validate ``pdf_folder``, load its PDFs and build both indexes.

        Any failure is logged as critical and terminates the process with
        exit status 1.
        """
        try:
            self._validate_data_source(pdf_folder)
            start_time = time.time()
            self._load_documents(pdf_folder)
            self._create_vector_index()
            self.logger.info(f"Sistema inicializado em {time.time()-start_time:.2f}s | Documentos: {len(self.documents)}")
        except Exception as e:
            self.logger.critical(f"Falha na inicialização: {str(e)}")
            sys.exit(1)

    def _validate_data_source(self, folder_path: str):
        """Ensure ``folder_path`` is a directory holding at least one allowed file.

        Raises:
            FileNotFoundError: The path does not exist.
            NotADirectoryError: The path exists but is not a directory.
            ValueError: No file has an extension in ``allowed_file_types``.
        """
        if not os.path.exists(folder_path):
            raise FileNotFoundError(f"Diretório inexistente: {folder_path}")
        if not os.path.isdir(folder_path):
            # os.listdir below would fail with a less helpful message.
            raise NotADirectoryError(f"Diretório inexistente: {folder_path}")

        allowed = self.config['allowed_file_types']
        valid_files = [f for f in os.listdir(folder_path)
                       if os.path.splitext(f)[1].lower() in allowed]

        if not valid_files:
            raise ValueError("Nenhum documento PDF válido encontrado")

    def _load_documents(self, folder_path: str):
        """Load every PDF page in ``folder_path`` and store cleaned text chunks.

        Pages that fail to preprocess or split are logged and skipped; a
        failure of the loader itself is logged and re-raised.
        """
        try:
            loader = PyPDFDirectoryLoader(folder_path)
            pages = loader.load()

            # The separators are regular expressions, so the splitter must be
            # told so; otherwise they are escaped and matched literally.
            # Preprocessing collapses all whitespace (newlines included) into
            # single spaces, hence the bullet separators carry no newline.
            # " " and "" are last-resort separators so that chunk_size is
            # honoured even when no structural separator is present.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.config['chunk_size'],
                chunk_overlap=self.config['chunk_overlap'],
                separators=["• ", "■ ", r"(?<=\. )", "; ", "► ", r"\|", " ", ""],
                is_separator_regex=True
            )

            clean_docs = []
            for i, page in enumerate(pages):
                try:
                    text = self._preprocess_technical_text(page.page_content)
                    chunks = text_splitter.split_text(text)
                    clean_docs.extend(chunk for chunk in chunks if chunk.strip())
                except Exception as e:
                    self.logger.error(f"Erro no documento {i+1}: {str(e)}")
                    continue

            self.documents = clean_docs
            self.logger.info(f"Documentos técnicos carregados: {len(self.documents)} segmentos")

        except Exception as e:
            self.logger.error(f"Falha no carregamento: {str(e)}")
            raise

    def _preprocess_technical_text(self, text: str) -> str:
        """Normalise units, symbols and whitespace in extracted PDF text.

        The rules run in order; the order matters (bracket removal happens
        before whitespace collapsing so that no double spaces are left behind).
        """
        replacements = [
            # Drop bracketed fragments such as "[1]" or "[ref]".
            (r'\[.*?\]', ''),
            # "um" is only a micrometre when it follows a number; on its own
            # it is the Portuguese article and must be left untouched.
            (r'(\d)\s*um\b', r'\1 µm'),
            # "graus C" / "graus Celsius" -> "°C" (whole word only, so that
            # "graus Celsius" does not become "°Celsius").
            (r'(?i)\bgraus?\s*(?:celsius|c)\b', '°C'),
            # Separate a number from the unit glued to it: "10mm" -> "10 mm".
            (r'(\d)([A-Za-z°µ])', r'\1 \2'),
            (r'±\s*(\d)', r'±\1'),
            (r'kN/m²', 'kPa'),
            # NOTE(review): this also rewrites hyphenated identifiers and
            # dates (e.g. "9001-2015"); confirm that is intended.
            (r'(\d+)\s*-\s*(\d+)', r'\1 a \2'),
            (r'\s+', ' '),
        ]

        for pattern, replacement in replacements:
            text = re.sub(pattern, replacement, text)

        return text.strip()

    def _create_vector_index(self):
        """Build the FAISS (dense) and BM25 (lexical) indexes over the chunks.

        Raises:
            ValueError: There are no loaded chunks to index.
        """
        try:
            if not self.documents:
                # encode([]) has no second dimension and BM25 divides by the
                # corpus size, so fail early with an explicit message.
                raise ValueError("Nenhum segmento de documento para indexar")

            dense_vectors = np.asarray(self.embedder.encode(self.documents), dtype='float32')
            self.faiss_index = faiss.IndexHNSWFlat(dense_vectors.shape[1], 32)
            self.faiss_index.add(dense_vectors)

            tokenized_docs = [self._technical_tokenizer(doc) for doc in self.documents]
            self.bm25 = BM25Okapi(tokenized_docs)

            self.logger.info("Índices vetoriais criados com sucesso")

        except Exception as e:
            self.logger.error(f"Erro na criação de índices: {str(e)}")
            raise

    def _technical_tokenizer(self, text: str) -> List[str]:
        """Split ``text`` into lower-cased tokens for BM25.

        Decimal numbers (with "." or "," as separator, optional exponent) are
        kept as a single token; the decimal alternative must come first,
        otherwise the generic word alternative consumes the integer part.
        """
        tokens = re.findall(
            r'\d+[.,]\d+(?:[eE][+-]?\d+)?|'
            r'\b[\wµ°±]+(?:[/-]\d+)?\b|'
            r'[A-Z]{2,}(?:\s+\d+[A-Z]*)?|'
            r'[;:±≤≥]',
            text
        )
        return [t.lower() for t in tokens if t]

    def retrieve_context(self, query: str) -> List[str]:
        """Return up to ``top_k`` chunks relevant to ``query``.

        Dense and BM25 rankings are merged with reciprocal rank fusion.
        Returns an empty list when retrieval fails or nothing is indexed.
        """
        try:
            if self.faiss_index is None or self.bm25 is None or not self.documents:
                raise RuntimeError("Índices não inicializados")

            boosted_query = self._boost_query(query)
            # Never ask for more neighbours than there are documents.
            k = min(50, len(self.documents))

            query_embedding = np.asarray(self.embedder.encode([boosted_query]), dtype='float32')
            _, dense_ids = self.faiss_index.search(query_embedding, k)

            tokenized_query = self._technical_tokenizer(boosted_query)
            bm25_scores = self.bm25.get_scores(tokenized_query)
            ranked = np.argsort(bm25_scores)[::-1][:k]
            # Documents without any lexical match must not earn rank credit.
            bm25_ids = [int(i) for i in ranked if bm25_scores[i] > 0]

            combined_scores = self._reciprocal_rank_fusion(dense_ids[0], bm25_ids)

            return [self.documents[i] for i in combined_scores[:self.config['top_k']]]

        except Exception as e:
            self.logger.error(f"Falha na recuperação: {str(e)}")
            return []

    def _boost_query(self, query: str) -> str:
        """Append a fixed list of metrology terms to ``query``."""
        terms = [
            'incerteza de medição',
            'calibração rastreável',
            'certificado de calibração',
            'padrão de referência',
            'ISO/IEC 17025'
        ]
        return f"{query} {' '.join(terms)}"

    def _reciprocal_rank_fusion(self, dense_ids: List[int], bm25_ids: List[int]) -> List[int]:
        """Merge two ranked id lists; return ids sorted by fused score (best first).

        Each id earns ``1 / (position + 60)`` per list it appears in. Ids
        outside ``[0, len(self.documents))`` are ignored: FAISS pads missing
        neighbours with -1, which would otherwise index the last document.
        """
        total = len(self.documents)
        combined_scores = {}
        for ranking in (dense_ids, bm25_ids):
            for position, raw_idx in enumerate(ranking):
                idx = int(raw_idx)
                if not 0 <= idx < total:
                    continue
                combined_scores[idx] = combined_scores.get(idx, 0) + 1 / (position + 60)

        sorted_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
        return [idx for idx, _ in sorted_scores]

    def generate_technical_response(self, query: str) -> str:
        """Answer ``query`` using retrieved context and the remote model.

        On any failure the error is logged and the keyword-based fallback
        text is returned instead.
        """
        try:
            context = self.retrieve_context(query)
            if not context:
                raise ValueError("Contexto insuficiente")

            prompt = self._build_structured_prompt(query, context)

            if not self._validate_prompt(prompt):
                raise ValueError("Prompt inválido")

            response = self._call_llm_with_retry(prompt)
            return self._postprocess_response(response, context)

        except Exception as e:
            self.logger.error(f"Falha na geração: {str(e)}")
            return self._fallback_procedure(query)

    def _build_structured_prompt(self, query: str, context: List[str]) -> str:
        """Compose the prompt from guidelines, the first 3 context chunks and the query."""
        detected_norms = self._detect_norms(context)
        detected_equipment = self._detect_equipment(context)

        context_entries = []
        for i, text in enumerate(context[:3]):
            # Only a 250-character excerpt of each chunk goes into the prompt.
            cleaned_text = text[:250].replace('\n', ' ')
            context_entries.append(f'[Doc {i+1}] {cleaned_text}...')

        context_str = '\n'.join(context_entries)

        template = (
            f"## Diretrizes Técnicas ISO/IEC 17025:2017 ##\n"
            f"1. Formato obrigatório:\n"
            f"   - Seção 1: Fundamentação Normativa ({', '.join(detected_norms)})\n"
            f"   - Seção 2: Procedimento de Medição\n"
            f"   - Seção 3: Análise de Incertezas (k=2)\n"
            f"   - Seção 4: Condições Ambientais\n\n"
            f"2. Dados obrigatórios:\n"
            f"   - Tolerâncias: ± valores com unidades\n"
            f"   - Equipamentos: {', '.join(detected_equipment)}\n"
            f"   - Normas: {', '.join(detected_norms)}\n\n"
            f"## Contexto Técnico ##\n"
            f"{context_str}\n\n"
            f"## Consulta ##\n"
            f"{query}\n\n"
            f"## Resposta Estruturada ##"
        )
        return template

    def _detect_norms(self, context: List[str]) -> List[str]:
        """Return up to 3 distinct norm references found in ``context``.

        References keep their number (e.g. "ISO/IEC 17025") and are returned
        in order of first appearance. Falls back to ``required_norms`` from
        the configuration when nothing is found.
        """
        # Non-capturing groups: with a capturing group re.findall would return
        # only the prefix and silently drop the norm number.
        pattern = r'\b(?:ABNT NBR(?:\s+ISO(?:/IEC)?)?|ISO/IEC|OIML R)\s+\d+(?:\.\d+)*'
        norms = {}
        for text in context:
            for match in re.findall(pattern, text):
                # dict keys give order-preserving de-duplication.
                norms.setdefault(' '.join(match.split()), None)
        return list(norms)[:3] or list(self.config['required_norms'])

    def _detect_equipment(self, context: List[str]) -> List[str]:
        """Return up to 5 distinct equipment mentions, in order of first appearance."""
        pattern = r'\b([A-Z][a-z]*\s+)?(\d+[A-Z]+\b|Micrômetro|Paquímetro|Manômetro|Multímetro)'
        equipment = {}
        for text in context:
            for prefix, name in re.findall(pattern, text):
                equipment.setdefault(' '.join(f"{prefix}{name}".split()), None)
        return list(equipment)[:5]

    def _validate_prompt(self, prompt: str) -> bool:
        """Return True when the weighted pattern checks on ``prompt`` total at least 3."""
        checks = [
            (r'ISO/IEC 17025', 2),
            # Whitespace around "±" is optional because preprocessing removes
            # the space that follows the sign.
            (r'\d+\s*±\s*\d+', 1),
            (r'k=\d', 1),
            (r'°C', 1)
        ]
        score = sum(weight for pattern, weight in checks if re.search(pattern, prompt))
        return score >= 3

    def _call_llm_with_retry(self, prompt: str) -> str:
        """Send ``prompt`` to the remote endpoint, retrying with exponential backoff.

        Raises:
            TimeoutError: Every attempt failed; the last error is chained as
                the cause.
        """
        max_retries = self.config['max_retries']
        last_error = None
        for attempt in range(max_retries):
            try:
                result = self.client.predict(
                    inputs=prompt,
                    top_p=0.9,
                    temperature=self.config['temperature'],
                    chat_counter=0,
                    chatbot=[],
                    api_name="/predict"
                )
                return self._clean_api_response(result)
            except Exception as e:
                last_error = e
                self.logger.warning(f"Tentativa {attempt+1} falhou: {str(e)}")
                # Waiting after the final attempt would only delay the error.
                if attempt < max_retries - 1:
                    time.sleep(2**attempt)
        raise TimeoutError("Falha após múltiplas tentativas") from last_error

    def _clean_api_response(self, response) -> str:
        """Flatten the API result to a string and strip markdown emphasis/fences."""
        if isinstance(response, (list, tuple)):
            text = ' '.join(str(item) for item in response if item)
        else:
            text = str(response)
        return text.replace('**', '').replace('```', '').strip()

    def _postprocess_response(self, response: str, context: List[str]) -> str:
        """Clean the model answer, append references and apply console formatting."""
        processed = response.replace('Resposta Estruturada', '').strip()
        processed = self._enhance_technical_terms(processed)
        processed = self._add_references(processed, context)
        return self._format_response(processed)

    def _enhance_technical_terms(self, text: str) -> str:
        """Expand bare technical terms and space numbers from their units.

        A term that is already followed by its qualifier is left alone, and
        the original capitalisation of the matched word is preserved.
        """
        replacements = [
            (r'\b(incerteza)\b(?!\s+de\s+medição)', r'\1 de medição'),
            (r'\b(calibração)\b(?!\s+rastreável)', r'\1 rastreável'),
            (r'\b(norma)\b(?!\s+técnica)', r'\1 técnica'),
            (r'(\d)([a-zA-Zµ°])', r'\1 \2'),
        ]
        for pattern, repl in replacements:
            text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
        return text

    def _add_references(self, text: str, context: List[str]) -> str:
        """Append a reference list built from the first 3 context chunks.

        Each entry is labelled with the same ``[Doc N]`` number used in the
        prompt and shows the first 30 characters of the chunk. ``text`` is
        returned unchanged when there is no usable context.
        """
        refs = []
        for i, doc in enumerate(context[:3]):
            snippet = ' '.join(doc.split())[:30].rstrip()
            if snippet:
                refs.append(f"- [Doc {i+1}] {snippet}...")
        if not refs:
            return text
        return f"{text}\n\n## Referências Técnicas ##\n" + "\n".join(refs)

    def _format_response(self, text: str) -> str:
        """Wrap ``text`` in a coloured banner and highlight list numbers and tolerances."""
        border = "="*80
        header = f"{Fore.GREEN}▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n RESPOSTA TÉCNICA CERTIFICADA\n▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓{Style.RESET_ALL}"

        formatted = re.sub(r'^(\d+\.)\s+(.+)$',
                           f'{Fore.CYAN}\\1 {Style.RESET_ALL}\\2',
                           text, flags=re.M)

        # Accept "±0.5", "± 0.5" and decimal commas alike.
        formatted = re.sub(r'(±\s*\d+(?:[.,]\d+)?)',
                           f'{Fore.YELLOW}\\1{Style.RESET_ALL}',
                           formatted)

        return f"\n{border}\n{header}\n{border}\n{formatted}\n{border}"

    def _fallback_procedure(self, query: str) -> str:
        """Return excerpts of up to 3 chunks containing upper-case key terms of ``query``."""
        try:
            key_terms = re.findall(r'\b[A-Z]{3,}\b|\b\d+[A-Z]+\b', query)
            relevant = [doc for doc in self.documents if any(term in doc for term in key_terms)][:3]

            return (
                f"{Fore.YELLOW}INFORMAÇÃO TÉCNICA PARCIAL:{Style.RESET_ALL}\n" +
                "\n".join([f"• {doc[:300]}..." for doc in relevant]) +
                f"\n\n{Fore.RED}AVISO: Resposta não validada - consulte documentos originais{Style.RESET_ALL}"
            )
        except Exception:
            # Last line of defence: never raise from the fallback path, but do
            # not swallow KeyboardInterrupt/SystemExit as a bare except would.
            return f"{Fore.RED}Erro crítico - sistema necessita re-inicialização{Style.RESET_ALL}"

    def generate_report(self, query: str, response: str, filename: str = "relatorio_tecnico.md"):
        """Write ``query`` and ``response`` to a Markdown report at ``filename``.

        Failures are logged and not propagated.
        """
        try:
            timestamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            # Responses are coloured for the terminal; ANSI escape sequences
            # are meaningless noise inside a Markdown file.
            plain_response = re.sub(r'\x1b\[[0-9;]*m', '', response)
            report = (
                f"# RELATÓRIO TÉCNICO - METROLOGIA\n\n"
                f"**Data:** {timestamp}\n"
                f"**Consulta:** {query}\n\n"
                "## Resposta Técnica\n"
                f"{plain_response}\n\n"
                "**Assinatura Digital:** [Sistema Certificado v2.1]"
            )

            with open(filename, 'w', encoding='utf-8') as f:
                f.write(report)

            self.logger.info(f"Relatório gerado: {filename}")
        except Exception as e:
            self.logger.error(f"Falha ao gerar relatório: {str(e)}")

    def analyze_metrology_report(self, pdf_path: str) -> str:
        """Run the compliance check on ``pdf_path`` and return a formatted opinion.

        On failure the error is logged and the fallback text is returned.
        """
        try:
            text = self._extract_pdf_text(pdf_path)
            compliance = self._check_compliance(text)
            analysis = self._generate_analysis_report(text, compliance)
            return self._format_compliance_report(analysis, compliance)
        except Exception as e:
            self.logger.error(f"Falha na análise: {str(e)}")
            return self._fallback_procedure("Análise de relatório")

    def _extract_pdf_text(self, path: str) -> str:
        """Return the text of all non-empty pages of the PDF, joined by newlines."""
        reader = PdfReader(path)
        # Extract each page once; extraction is the expensive step.
        page_texts = (page.extract_text() for page in reader.pages)
        return '\n'.join(page_text for page_text in page_texts if page_text)

    def _check_compliance(self, text: str) -> Dict:
        """Check ``text`` against the compliance patterns.

        Returns:
            A dict mapping each check name to ``{'status', 'critical'}``,
            where status is 'OK', 'FALHA' (required and missing) or 'N/A'
            (optional and missing).
        """
        checks = {
            'rastreabilidade': {'patterns': [r'rastreab[ií]lidade.*INMETRO'], 'required': True},
            'incerteza': {'patterns': [r'incerteza expandida.*≤?\s*\d+'], 'required': True},
            'ambiente': {'patterns': [r'temperatura.*23\s*±\s*2\s*°C'], 'required': False},
            'normas': {'patterns': [r'ISO/IEC\s+17025'], 'required': True}
        }

        results = {}
        for key, config in checks.items():
            # Case-insensitive: the terms frequently start a sentence or
            # appear in upper-case headings.
            found = any(re.search(p, text, flags=re.IGNORECASE) for p in config['patterns'])
            results[key] = {
                'status': 'OK' if found else 'FALHA' if config['required'] else 'N/A',
                'critical': config['required'] and not found
            }
        return results

    def _generate_analysis_report(self, text: str, compliance: Dict) -> str:
        """Ask the remote model for a technical opinion on the compliance results."""
        critical = sum(1 for v in compliance.values() if v['critical'])
        status = "NÃO CONFORME" if critical else "CONFORME"

        # Only the first 2000 characters of the document are sent.
        prompt = "\n".join([
            "## Análise de Conformidade Metrológica ##",
            f"Documento analisado: {text[:2000]}...",
            "",
            "Resultados:",
            json.dumps(compliance, indent=2),
            "",
            "## Parecer Técnico ##",
            "Emitir parecer considerando:",
            f"- Status: {status}",
            f"- Itens críticos: {critical}",
            "- Recomendações de adequação",
        ])

        return self._call_llm_with_retry(prompt)

    def _format_compliance_report(self, text: str, compliance: Dict) -> str:
        """Prefix ``text`` with a coloured status banner and a per-check summary."""
        status = "APROVADO" if not any(v['critical'] for v in compliance.values()) else "REPROVADO"
        color = Fore.GREEN if status == "APROVADO" else Fore.RED

        header = (
            "\n"
            f"{color}▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓\n"
            f" PARECER TÉCNICO - STATUS: {status}\n"
            f"▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓{Style.RESET_ALL}\n"
            "\n"
        )
        summary = "## Resumo de Conformidade ##\n"
        for k, v in compliance.items():
            summary += f"• {k.upper()}: {v['status']}\n"

        return header + summary + "\n" + text
421
+
422
def main_menu():
    """Print the main menu and return the option typed by the user.

    Returns:
        The raw string read from standard input (not validated here).
    """
    title = Fore.BLUE + "\n🔧 Sistema de Metrologia Inteligente v2.1" + Style.RESET_ALL
    heading = Fore.CYAN + "Menu Principal:" + Style.RESET_ALL
    options = (
        "1. Inicializar sistema com documentos PDF",
        "2. Consulta técnica",
        "3. Analisar relatório PDF",
        "4. Gerar relatório completo",
        "5. Sair",
    )

    # One print call per line, in display order.
    for line in (title, heading, *options):
        print(line)

    prompt = Fore.YELLOW + "> Selecione uma opção: " + Style.RESET_ALL
    return input(prompt)