Spaces: Sleeping
tomas.helmfridsson committed · f9a8906 · Parent(s): bdb1db1
update 42
app.py CHANGED
@@ -1,181 +1,148 @@
-
-import logging
-
 import gradio as gr
 from transformers import pipeline
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface.embeddings import HuggingFaceEmbeddings
-from langchain_huggingface.llms import HuggingFacePipeline
-from langchain.chains import RetrievalQA
 from langchain.text_splitter import RecursiveCharacterTextSplitter

-# ── CONFIGURATION
 DOCS_DIR = "document"
 INDEX_DIR = "faiss_index"
-CHUNK_SIZE = 500
-CHUNK_OVERLAP = 50
 EMB_MODEL = "KBLab/sentence-bert-swedish-cased"
 LLM_MODEL = "tiiuae/falcon-rw-1b"
-
 DEFAULT_TEMP = 0.3
-K = 10

-# ── LOGGING
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s %(levelname)s %(message)s"
-)
 logger = logging.getLogger(__name__)

-# ── 1)
 emb = HuggingFaceEmbeddings(model_name=EMB_MODEL)
 if os.path.isdir(INDEX_DIR):
-    logger.info(f"📂 Loading
     vs = FAISS.load_local(INDEX_DIR, emb)
 else:
-    logger.info("⚙️ Building
-    splitter = RecursiveCharacterTextSplitter(
-
-
-    all_docs, files = [], []
     for fn in os.listdir(DOCS_DIR):
         if fn.lower().endswith(".pdf"):
-
-            loader = PyPDFLoader(path)
-            pages = loader.load()
             chunks = splitter.split_documents(pages)
             for c in chunks:
                 c.metadata["source"] = fn
-
             files.append(fn)
-    vs = FAISS.from_documents(
     vs.save_local(INDEX_DIR)
-    logger.info(f"✅ Saved index
-
-# ── 2) Initialize LLM ───────────────────────────────────────
-logger.info("🚀 Initializing LLM pipeline …")
-pipe = pipeline(
-    "text-generation",
-    model=LLM_MODEL,
-    device=-1,
-    max_new_tokens=MAX_NEW_TOKENS
-)
-llm = HuggingFacePipeline(
-    pipeline=pipe,
-    model_kwargs={
-        "temperature": DEFAULT_TEMP,
-        "max_new_tokens": MAX_NEW_TOKENS
-    }
-)
-logger.info("✅ LLM pipeline initialized")
-
-# ── 3) Build RetrievalQA ────────────────────────────────────
 retriever = vs.as_retriever(search_kwargs={"k": K})
-
-
-
-
-
-
-
-
-
-
-
-

 def test_retrieval(query: str) -> str:
-    logger.debug(f"test_retrieval() called with query={query!r}")
     docs = retriever.get_relevant_documents(query)
     if not docs:
         return "🚫 Inga träffar"
     out = []
-    for i, d in enumerate(docs,
-        src
-        snippet = d.page_content.replace("\n", " ")[:
         out.append(f"{i}. ({src}) …{snippet}…")
     return "\n\n".join(out)

-
-
-    temperature: float,
-    history: list[dict]
-) -> tuple[list[dict], list[dict]]:
-    """
-    query: the user's question
-    temperature: sampling parameter for the LLM
-    history: previous chat messages as a list of {"role","content"}
-    return: (updated_history, updated_history)
-    """
-    logger.info(f"chat_fn() called with query={query!r}, temp={temperature}, history_len={len(history) if history else 0}")
     history = history or []

-
-
-
-
         return history, history

-
-
-
-    history.append({"role": "assistant", "content": msg})
-    return history, history

-

-
-
-
-
-

-    # Adjust temperature
-    try:
-        llm.pipeline.model_kwargs["temperature"] = temperature
-    except Exception as e:
-        logger.exception("Could not set temperature")
-
     try:
-
     except Exception as e:
-        logger.exception("Error during
-
-
-    # Attach the source
-    src = docs[0].metadata.get("source", "okänd") if docs else "okänd"
-    content = f"**(Dokument: {src})**\n\n{svar}"
-    history.append({"role": "assistant", "content": content})
-    logger.info(f"chat_fn done, answer length={len(svar)} chars, history total={len(history)}")

     return history, history

-# ── 5) Build Gradio
 with gr.Blocks() as demo:
-    gr.Markdown("#
-    gr.Markdown(f"

     with gr.Row():
-
-
-
-        test_btn = gr.Button("🔍 Testa Retrieval")
-        test_out = gr.Textbox(label="Retrieval-snippet")

     with gr.Row():
-
-        temp
-        send

-
-

-
-    test_btn.click(
-    send.click(

 if __name__ == "__main__":
-    # share=True if you want to share the link publicly
     demo.launch(share=True)
-
+# ── app.py ──────────────────────────────────────────────────────────
+import os, logging, math, textwrap
 import gradio as gr
 from transformers import pipeline
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_community.vectorstores import FAISS
 from langchain_huggingface.embeddings import HuggingFaceEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter

+# ── CONFIGURATION ──────────────────────────────────────────
 DOCS_DIR = "document"
 INDEX_DIR = "faiss_index"
 EMB_MODEL = "KBLab/sentence-bert-swedish-cased"
 LLM_MODEL = "tiiuae/falcon-rw-1b"
+
+CHUNK_SIZE = 500
+CHUNK_OVERLAP = 50
+MAX_NEW_TOKENS = 128   # answer length
+CTX_TOKEN_MAX = 900    # to stay below the model limit of 1,024 tokens
+K = 10                 # how many chunks we retrieve
 DEFAULT_TEMP = 0.3
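+# Rough sizing: with CHUNK_SIZE = 500 and CHUNK_OVERLAP = 50 the splitter
+# advances about 450 characters per chunk, so a 10,000-character PDF yields
+# roughly 22 chunks (an illustrative estimate, not an exact count).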

+# ── LOGGING ─────────────────────────────────────────────
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)

+# ── 1) Index (build or load) ────────────────────────────
 emb = HuggingFaceEmbeddings(model_name=EMB_MODEL)
+
 if os.path.isdir(INDEX_DIR):
+    logger.info(f"📂 Loading FAISS index from `{INDEX_DIR}`")
     vs = FAISS.load_local(INDEX_DIR, emb)
 else:
+    logger.info("⚙️ Building FAISS index from PDF files …")
+    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
+                                              chunk_overlap=CHUNK_OVERLAP)
+    docs, files = [], []
     for fn in os.listdir(DOCS_DIR):
         if fn.lower().endswith(".pdf"):
+            pages = PyPDFLoader(os.path.join(DOCS_DIR, fn)).load()
             chunks = splitter.split_documents(pages)
             for c in chunks:
                 c.metadata["source"] = fn
+            docs.extend(chunks)
             files.append(fn)
+    vs = FAISS.from_documents(docs, emb)
     vs.save_local(INDEX_DIR)
+    logger.info(f"✅ Saved index ({len(files)} PDFs, {len(docs)} chunks)")
+
 retriever = vs.as_retriever(search_kwargs={"k": K})
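+# Note: depending on the installed langchain_community version, load_local may
+# also require allow_dangerous_deserialization=True to read the pickled index:
+#   vs = FAISS.load_local(INDEX_DIR, emb, allow_dangerous_deserialization=True)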
+
+# ── 2) LLM pipeline ─────────────────────────────────────
+logger.info("🚀 Initializing text-generation pipeline …")
+gen_pipe = pipeline("text-generation",
+                    model=LLM_MODEL,
+                    device=-1,
+                    max_new_tokens=MAX_NEW_TOKENS)
+
+logger.info("✅ LLM ready")
+
+# ── 3) Helper functions ─────────────────────────────────
+def truncate_tokens(text: str, max_tokens: int = CTX_TOKEN_MAX) -> str:
+    """Very rough token approximation (1 token ≈ 4 characters)."""
+    approx_tokens = len(text) // 4
+    if approx_tokens <= max_tokens:
+        return text
+    slice_len = max_tokens * 4
+    return text[:slice_len]

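+# Example: truncate_tokens("x" * 8000, 900) keeps the first 3600 characters,
+# i.e. about 900 tokens under the 4-characters-per-token approximation above.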
 def test_retrieval(query: str) -> str:
     docs = retriever.get_relevant_documents(query)
     if not docs:
         return "🚫 Inga träffar"
     out = []
+    for i, d in enumerate(docs, 1):
+        src = d.metadata.get("source", "okänd")
+        snippet = d.page_content.replace("\n", " ")[:160]
         out.append(f"{i}. ({src}) …{snippet}…")
     return "\n\n".join(out)

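+# Output is a numbered list, one entry per retrieved chunk, e.g.:
+#   1. (handbok.pdf) …first 160 characters of the chunk…
+# ("handbok.pdf" is a placeholder; real names come from the files in DOCS_DIR)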
+# ── 4) Chat function (exactly 3 params, 2 returns) ──────
+def chat_fn(query: str, temperature: float, history: list[dict]):
     history = history or []
+    history.append({"role": "user", "content": query})

+    # Fetch the K most relevant chunks
+    docs = retriever.get_relevant_documents(query)
+    if not docs:
+        ans = "🚫 Hittade inget relevant innehåll i dokumenten."
+        history.append({"role": "assistant", "content": ans})
         return history, history

+    # Build the context and trim it
+    context = "\n\n---\n\n".join(d.page_content for d in docs)
+    context = truncate_tokens(context, CTX_TOKEN_MAX)

+    prompt = textwrap.dedent(f"""
+        Du är en hjälpsam assistent som svarar på svenska.
+        Kontext (ur PDF-dokument):

+        {context}
+
+        Fråga: {query}
+        Svar (svenska):""").strip()
+
+    logger.info(f"📏 Prompt length ≈ {len(prompt)} chars, temp={temperature}")

     try:
+        resp = gen_pipe(prompt,
+                        temperature=float(temperature),
+                        max_new_tokens=MAX_NEW_TOKENS,
+                        pad_token_id=2,
+                        eos_token_id=2,
+                        do_sample=True,
+                        return_full_text=False)[0]["generated_text"]
     except Exception as e:
+        logger.exception("Error during generation")
+        resp = f"❌ Fel: {e}"

+    src_hint = docs[0].metadata.get("source", "okänd")
+    history.append({"role": "assistant",
+                    "content": f"**(Källa: {src_hint})**\n\n{resp}"})
     return history, history
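+# The (history, history) return feeds both the Chatbot and the State below:
+#   hist, state = chat_fn("Vad står det om krav?", 0.3, [])
+#   hist[-1]["content"]  # assistant answer prefixed with "**(Källa: …)**"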

+# ── 5) Build the Gradio UI ──────────────────────────────
 with gr.Blocks() as demo:
+    gr.Markdown("# 📚 svensk RAG-chat\nStäll frågor till dina PDF-filer")
+    gr.Markdown(f"**PDF-filer i index:** {', '.join(os.listdir(DOCS_DIR)) or 'inga'}")

     with gr.Row():
+        test_in = gr.Textbox(label="Snabb-retrieval (ingen AI)", lines=1)
+        test_btn = gr.Button("🔍 Testa")
+        test_out = gr.Textbox(label="Chunkar")

     with gr.Row():
+        q_in = gr.Textbox(placeholder="Ex: Vad står det om krav?", label="Fråga")
+        temp = gr.Slider(0, 1, value=DEFAULT_TEMP, step=0.05, label="Temperatur")
+        send = gr.Button("📨 Skicka")

+    chat = gr.Chatbot(type="messages", label="Chat")
+    chat_hist = gr.State([])

+    # Event wiring
+    test_btn.click(test_retrieval, inputs=[test_in], outputs=[test_out])
+    send.click(chat_fn, inputs=[q_in, temp, chat_hist], outputs=[chat, chat_hist])

 if __name__ == "__main__":
     demo.launch(share=True)
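A quick headless sanity check of the new version (a sketch; it assumes the document/ folder holds at least one PDF and that the sample query matches something in it):

```python
# Importing app builds the index and the Blocks UI but does not launch it,
# since demo.launch() sits behind the __main__ guard.
from app import test_retrieval, chat_fn

print(test_retrieval("krav"))                 # raw retrieval, no LLM involved
hist, _ = chat_fn("Vad står det om krav?", 0.3, [])
print(hist[-1]["content"])                    # answer prefixed with the source hint
```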