rahimizadeh committed
Commit 3a5abb3 · verified · 1 Parent(s): 4ff5454

Upload 3 files

Files changed (3)
  1. modules/analysis.py +52 -0
  2. modules/parser.py +51 -0
  3. modules/vectorizer.py +21 -0
modules/analysis.py ADDED
@@ -0,0 +1,52 @@
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFacePipeline
+ from transformers import pipeline
+ from modules import parser, vectorizer
+
+ def run_analysis(uploaded_files, text_input, query, quick_action, temperature, start_time, end_time):
+     """
+     Main logic that runs when the user clicks 'Analyze Logs'.
+     It combines file and text inputs, applies embeddings,
+     and performs question answering using a language model.
+     """
+     logs_text = ""
+
+     # Combine all uploaded files into one text string
+     if uploaded_files:
+         logs_text += parser.parse_uploaded_files(uploaded_files)
+
+     # Add manually pasted text logs
+     if text_input:
+         logs_text += "\n" + text_input
+
+     # Show an error if no log input was provided
+     if not logs_text.strip():
+         return "❌ No logs provided.", None, None, None
+
+     # Use either the free-form query or a quick action
+     query_text = query if query else quick_action
+     if not query_text:
+         return "❌ No query provided.", None, None, None
+
+     # Chunk the logs and embed them
+     docs = vectorizer.prepare_documents(logs_text)
+     vectordb = vectorizer.create_vectorstore(docs)
+
+     # Load a small Hugging Face text-generation pipeline (GPT-2); do_sample=True so temperature takes effect
+     pipe = pipeline("text-generation", model="gpt2", max_length=512, do_sample=True, temperature=temperature)
+     llm = HuggingFacePipeline(pipeline=pipe)
+
+     # Create a LangChain retrieval-based QA chain
+     qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())
+
+     # Run the query against the embedded document chunks
+     result = qa.run(query_text)
+
+     # -------- Mocked example chart and alert outputs --------
+     bar_data = {"Hour": ["14:00", "15:00"], "Count": [8, 4]}
+     pie_data = {"Event Type": ["Blocked", "Scan"], "Count": [8, 4]}
+     alerts = [("CRITICAL", "8 blocked SSH attempts from 192.168.1.5"),
+               ("WARNING", "4 port scanning alerts from 10.0.0.8")]
+
+     # Return structured outputs to the Gradio UI
+     return result, bar_data, pie_data, alerts
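For context, a minimal sketch of how run_analysis could be wired into a Gradio UI. This is not part of the commit: the component choices and labels are assumptions, gr.JSON stands in for real chart widgets, and start_time/end_time are passed as None since the function does not use them yet.

import gradio as gr
from modules import analysis

with gr.Blocks() as demo:
    files = gr.File(file_count="multiple", label="Log files (.pdf / .log / .txt)")
    text_in = gr.Textbox(label="Or paste logs here", lines=6)
    query = gr.Textbox(label="Free-form question")
    quick = gr.Dropdown(["Summarize critical alerts", "List top source IPs"], label="Quick action")
    temp = gr.Slider(0.1, 1.0, value=0.7, label="Temperature")
    answer = gr.Textbox(label="Answer")
    bar = gr.JSON(label="Events per hour")
    pie = gr.JSON(label="Event types")
    alerts = gr.JSON(label="Alerts")

    # start_time/end_time are placeholders; run_analysis ignores them for now
    gr.Button("Analyze Logs").click(
        lambda f, t, q, action, tp: analysis.run_analysis(f, t, q, action, tp, None, None),
        inputs=[files, text_in, query, quick, temp],
        outputs=[answer, bar, pie, alerts],
    )

demo.launch()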
modules/parser.py ADDED
@@ -0,0 +1,51 @@
+ import os
+ import re
+ import fitz  # PyMuPDF for reading PDFs
+ from typing import List
+
+ def extract_text_from_pdf(file_path: str) -> str:
+     """
+     Extracts plain text from a PDF file using PyMuPDF.
+     """
+     text = ""
+     with fitz.open(file_path) as doc:  # context manager closes the PDF when done
+         for page in doc:
+             text += page.get_text()  # Extract text from each page
+     return text
+
+ def read_log_file(file_path: str) -> str:
+     """
+     Reads a .log or .txt file and returns its content as text.
+     """
+     with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+         return f.read()
+
+ def detect_log_format(text: str) -> str:
+     """
+     Detects the log format using basic pattern matching.
+     Returns one of: 'syslog', 'json', 'cef', or 'unknown'.
+     """
+     if re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", text):
+         return "syslog"
+     elif re.search(r"\{.*\}", text):
+         return "json"
+     elif "CEF:" in text:
+         return "cef"
+     else:
+         return "unknown"
+
+ def parse_uploaded_files(files: List) -> str:
+     """
+     Accepts a list of uploaded file objects, extracts content from each,
+     and returns all logs as one combined string.
+     """
+     all_logs = ""
+     for file_obj in files:
+         file_path = file_obj.name  # Gradio file objects expose their temp path via .name
+
+         if file_path.endswith('.pdf'):
+             all_logs += extract_text_from_pdf(file_path) + "\n"
+         elif file_path.endswith(('.log', '.txt')):
+             all_logs += read_log_file(file_path) + "\n"
+
+     return all_logs
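A quick standalone check of the parser helpers. Not part of the commit: the sample paths are hypothetical, and SimpleNamespace stands in for Gradio's uploaded-file objects, which expose their temp path via .name.

from types import SimpleNamespace
from modules import parser

# Fake "uploaded files" pointing at local sample logs (hypothetical paths)
files = [SimpleNamespace(name="samples/firewall.log"),
         SimpleNamespace(name="samples/incident_report.pdf")]

logs = parser.parse_uploaded_files(files)
print(parser.detect_log_format(logs))  # 'syslog', 'json', 'cef', or 'unknown'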
modules/vectorizer.py ADDED
@@ -0,0 +1,21 @@
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.schema import Document
+
+ def prepare_documents(text: str, chunk_size=1000, chunk_overlap=200):
+     """
+     Splits the combined log text into smaller chunks using LangChain's splitter,
+     so they can be processed and embedded efficiently.
+     """
+     docs = [Document(page_content=text)]  # Wrap raw text in a Document
+     splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     return splitter.split_documents(docs)
+
+ def create_vectorstore(documents, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+     """
+     Uses Hugging Face Transformers to embed the document chunks,
+     and stores them in a FAISS vector database for fast retrieval.
+     """
+     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+     return FAISS.from_documents(documents, embeddings)
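And a minimal end-to-end check of the vectorizer. Not part of the commit: the log line is synthetic, and similarity_search is the standard LangChain FAISS query method. The sample text is joined with blank lines because CharacterTextSplitter splits on "\n\n" by default.

from modules import vectorizer

# Synthetic log text, repeated so the splitter produces several chunks
text = "\n\n".join(
    ["2024-05-01 14:02:11 sshd[101]: Failed password for root from 192.168.1.5"] * 200
)

docs = vectorizer.prepare_documents(text, chunk_size=500, chunk_overlap=50)
vectordb = vectorizer.create_vectorstore(docs)

for hit in vectordb.similarity_search("failed SSH logins", k=2):
    print(hit.page_content[:80])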