rahimizadeh committed
Commit 3a5abb3 · verified · 1 Parent(s): 4ff5454

Upload 3 files

Files changed (3)
  1. modules/analysis.py +52 -0
  2. modules/parser.py +51 -0
  3. modules/vectorizer.py +21 -0
modules/analysis.py ADDED
@@ -0,0 +1,52 @@
+ from langchain.chains import RetrievalQA
+ from langchain.llms import HuggingFacePipeline
+ from transformers import pipeline
+ from modules import parser, vectorizer
+
+ def run_analysis(uploaded_files, text_input, query, quick_action, temperature, start_time, end_time):
+     """
+     Main logic that runs when the user clicks 'Analyze Logs'.
+     It combines file and text inputs, applies embeddings,
+     and performs question answering using a language model.
+     """
+     logs_text = ""
+
+     # Combine all uploaded files into one text string
+     if uploaded_files:
+         logs_text += parser.parse_uploaded_files(uploaded_files)
+
+     # Add manually pasted text logs
+     if text_input:
+         logs_text += "\n" + text_input
+
+     # Show an error if no log input was provided
+     if not logs_text.strip():
+         return "❌ No logs provided.", None, None, None
+
+     # Use either the free-form query or a quick action
+     query_text = query if query else quick_action
+     if not query_text:
+         return "❌ No query provided.", None, None, None
+
+     # Chunk the logs and embed them
+     docs = vectorizer.prepare_documents(logs_text)
+     vectordb = vectorizer.create_vectorstore(docs)
+
+     # Load a small Hugging Face text-generation pipeline (GPT-2); do_sample=True so temperature takes effect
+     pipe = pipeline("text-generation", model="gpt2", max_length=512, do_sample=True, temperature=temperature)
+     llm = HuggingFacePipeline(pipeline=pipe)
+
+     # Create a LangChain retrieval-based QA chain
+     qa = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())
+
+     # Run the query against the embedded document chunks
+     result = qa.run(query_text)
+
+     # -------- Mocked example chart and alert outputs --------
+     bar_data = {"Hour": ["14:00", "15:00"], "Count": [8, 4]}
+     pie_data = {"Event Type": ["Blocked", "Scan"], "Count": [8, 4]}
+     alerts = [("CRITICAL", "8 blocked SSH attempts from 192.168.1.5"),
+               ("WARNING", "4 port scanning alerts from 10.0.0.8")]
+
+     # Return structured outputs to the Gradio UI
+     return result, bar_data, pie_data, alerts
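For context, a minimal sketch of how run_analysis could be wired into a Gradio UI. This is not part of the commit: the component choices and labels are assumptions, gr.JSON stands in for real chart widgets, and start_time/end_time are passed as None since the function does not use them yet.

import gradio as gr
from modules import analysis

with gr.Blocks() as demo:
    files = gr.File(file_count="multiple", label="Log files (.pdf / .log / .txt)")
    text_in = gr.Textbox(label="Or paste logs here", lines=6)
    query = gr.Textbox(label="Free-form question")
    quick = gr.Dropdown(["Summarize critical alerts", "List top source IPs"], label="Quick action")
    temp = gr.Slider(0.1, 1.0, value=0.7, label="Temperature")
    answer = gr.Textbox(label="Answer")
    bar = gr.JSON(label="Events per hour")
    pie = gr.JSON(label="Event types")
    alerts = gr.JSON(label="Alerts")

    # start_time/end_time are placeholders; run_analysis ignores them for now
    gr.Button("Analyze Logs").click(
        lambda f, t, q, action, tp: analysis.run_analysis(f, t, q, action, tp, None, None),
        inputs=[files, text_in, query, quick, temp],
        outputs=[answer, bar, pie, alerts],
    )

demo.launch()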
modules/parser.py ADDED
@@ -0,0 +1,51 @@
+ import os
+ import re
+ import fitz  # PyMuPDF for reading PDFs
+ from typing import List
+
+ def extract_text_from_pdf(file_path: str) -> str:
+     """
+     Extracts plain text from a PDF file using PyMuPDF.
+     """
+     text = ""
+     with fitz.open(file_path) as doc:  # context manager closes the PDF when done
+         for page in doc:
+             text += page.get_text()  # Extract text from each page
+     return text
+
+ def read_log_file(file_path: str) -> str:
+     """
+     Reads a .log or .txt file and returns its content as text.
+     """
+     with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+         return f.read()
+
+ def detect_log_format(text: str) -> str:
+     """
+     Detects the log format using basic pattern matching.
+     Returns one of: 'syslog', 'json', 'cef', or 'unknown'.
+     """
+     if re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", text):
+         return "syslog"
+     elif re.search(r"\{.*\}", text):
+         return "json"
+     elif "CEF:" in text:
+         return "cef"
+     else:
+         return "unknown"
+
+ def parse_uploaded_files(files: List) -> str:
+     """
+     Accepts a list of uploaded file objects, extracts content from each,
+     and returns all logs as one combined string.
+     """
+     all_logs = ""
+     for file_obj in files:
+         file_path = file_obj.name  # Gradio file objects expose their temp path via .name
+
+         if file_path.endswith('.pdf'):
+             all_logs += extract_text_from_pdf(file_path) + "\n"
+         elif file_path.endswith(('.log', '.txt')):
+             all_logs += read_log_file(file_path) + "\n"
+
+     return all_logs
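A quick standalone check of the parser helpers. Not part of the commit: the sample paths are hypothetical, and SimpleNamespace stands in for Gradio's uploaded-file objects, which expose their temp path via .name.

from types import SimpleNamespace
from modules import parser

# Fake "uploaded files" pointing at local sample logs (hypothetical paths)
files = [SimpleNamespace(name="samples/firewall.log"),
         SimpleNamespace(name="samples/incident_report.pdf")]

logs = parser.parse_uploaded_files(files)
print(parser.detect_log_format(logs))  # 'syslog', 'json', 'cef', or 'unknown'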
modules/vectorizer.py ADDED
@@ -0,0 +1,21 @@
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.schema import Document
+
+ def prepare_documents(text: str, chunk_size=1000, chunk_overlap=200):
+     """
+     Splits the combined log text into smaller chunks using LangChain's splitter,
+     so they can be processed and embedded efficiently.
+     """
+     docs = [Document(page_content=text)]  # Wrap raw text in a Document
+     splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     return splitter.split_documents(docs)
+
+ def create_vectorstore(documents, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+     """
+     Uses Hugging Face Transformers to embed the document chunks,
+     and stores them in a FAISS vector database for fast retrieval.
+     """
+     embeddings = HuggingFaceEmbeddings(model_name=model_name)
+     return FAISS.from_documents(documents, embeddings)
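And a minimal end-to-end check of the vectorizer. Not part of the commit: the log line is synthetic, and similarity_search is the standard LangChain FAISS query method. The sample text is joined with blank lines because CharacterTextSplitter splits on "\n\n" by default.

from modules import vectorizer

# Synthetic log text, repeated so the splitter produces several chunks
text = "\n\n".join(
    ["2024-05-01 14:02:11 sshd[101]: Failed password for root from 192.168.1.5"] * 200
)

docs = vectorizer.prepare_documents(text, chunk_size=500, chunk_overlap=50)
vectordb = vectorizer.create_vectorstore(docs)

for hit in vectordb.similarity_search("failed SSH logins", k=2):
    print(hit.page_content[:80])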