Spaces:
Running
Running
Commit
·
64fd9b7
1
Parent(s):
39d9e01
Initial commit: FastAPI RAG API
Browse files- .gitignore +7 -1
- app/__init__.py +0 -0
- app/api.py +76 -0
- app/rag/__init__.py +0 -0
- app/rag_system.py +93 -0
- app/schemas.py +25 -0
- app/store.py +12 -0
- app/utils.py +9 -0
- requirements.txt +9 -0
- saa +21 -0
- tests/test_requests.http +9 -0
.gitignore
CHANGED
@@ -1,6 +1,12 @@
|
|
1 |
-
venv/
|
2 |
.venv/
|
3 |
__pycache__/
|
4 |
*.pyc
|
5 |
*.pyo
|
6 |
*.pyd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
venv/
|
2 |
.venv/
|
3 |
__pycache__/
|
4 |
*.pyc
|
5 |
*.pyo
|
6 |
*.pyd
|
7 |
+
*.log
|
8 |
+
.env
|
9 |
+
data/uploads/
|
10 |
+
data/index/
|
11 |
+
.git
|
12 |
+
.gitignore
|
app/__init__.py
ADDED
File without changes
|
app/api.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app/api.py
|
2 |
+
from fastapi import FastAPI, UploadFile, File, Form
|
3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
4 |
+
from fastapi.responses import JSONResponse
|
5 |
+
from pathlib import Path
|
6 |
+
import shutil
|
7 |
+
import traceback
|
8 |
+
|
9 |
+
from .rag_system import SimpleRAG, UPLOAD_DIR
|
10 |
+
from .schemas import AskRequest, AskResponse, UploadResponse, HistoryResponse, HistoryItem
|
11 |
+
from .store import add_history, get_history
|
12 |
+
from .utils import ensure_session, http400
|
13 |
+
|
14 |
+
app = FastAPI(title="RAG API", version="1.0.0")

# CORS for a browser-based UI (e.g. Streamlit).
# A wildcard origin must not be combined with credentialed requests: the
# FastAPI/Starlette docs require explicit origins when allow_credentials=True.
# The endpoints in this module carry the session id in the request itself
# (body or query string), so credentials are disabled while origins stay open.
# In production, replace "*" with the real UI domain(s).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single RAG engine shared by all request handlers in this module.
rag = SimpleRAG()
|
26 |
+
|
27 |
+
@app.get("/health")
def health():
    """Liveness probe: report that the service is up."""
    status_payload = {"status": "ok"}
    return status_payload
|
30 |
+
|
31 |
+
@app.post("/upload_pdf", response_model=UploadResponse)
async def upload_pdf(file: UploadFile = File(...)):
    """Store an uploaded PDF under UPLOAD_DIR and add its chunks to the RAG index.

    Args:
        file: The multipart upload; only names ending in ".pdf" are accepted.

    Returns:
        UploadResponse with the stored file name and the number of chunks added,
        or a JSON 500 response if saving or indexing fails.

    Raises:
        HTTPException: 400 (via http400) when the upload is not a PDF.
    """
    # The file name comes from the client: keep only its last path component
    # (treating backslashes as separators too) so it cannot escape UPLOAD_DIR.
    raw_name = file.filename or ""
    safe_name = Path(raw_name.replace("\\", "/")).name

    # Validate outside the try block so the 400 raised by http400 is not caught
    # by the generic handler below and reported as a 500.
    if not safe_name.lower().endswith(".pdf"):
        http400("Yalnız PDF faylları qəbul olunur.")

    try:
        dest = UPLOAD_DIR / safe_name
        with dest.open("wb") as f:
            shutil.copyfileobj(file.file, f)
        chunks_added = rag.add_pdf(dest)
        return UploadResponse(filename=safe_name, chunks_added=chunks_added)
    except Exception as e:
        traceback.print_exc()
        return JSONResponse(status_code=500, content={"detail": f"Server xətası: {str(e)}"})
|
44 |
+
|
45 |
+
@app.post("/ask_question", response_model=AskResponse)
async def ask_question(payload: AskRequest):
    """Answer a question from the indexed chunks and record both turns in history.

    Returns an AskResponse with the answer, the retrieved contexts and the
    session id (generated when the request did not supply one), or a JSON 500
    response if anything fails.
    """
    try:
        sid = ensure_session(payload.session_id)
        add_history(sid, "user", payload.question)

        hits = rag.search(payload.question, k=payload.top_k)
        contexts = [text for text, _score in hits]

        # Prefer a synthesize_answer method on the rag object when one exists.
        answer = None
        if hasattr(rag, "synthesize_answer"):
            answer = rag.synthesize_answer(payload.question, contexts)
        if answer is None:
            # Fall back to the module-level function, which is not imported at the top.
            from .rag_system import synthesize_answer
            answer = synthesize_answer(payload.question, contexts)

        add_history(sid, "assistant", answer)
        return AskResponse(answer=answer, contexts=contexts, session_id=sid)
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse(status_code=500, content={"detail": f"Server xətası: {str(exc)}"})
|
62 |
+
|
63 |
+
@app.get("/get_history", response_model=HistoryResponse)
async def get_history_endpoint(session_id: str):
    """Return the stored chat history for ``session_id``.

    Responds with a JSON 500 payload if the history cannot be built.
    """
    try:
        items = []
        for entry in get_history(session_id):
            items.append(HistoryItem(**entry))
        return HistoryResponse(session_id=session_id, history=items)
    except Exception as exc:
        traceback.print_exc()
        return JSONResponse(status_code=500, content={"detail": f"Server xətası: {str(exc)}"})
|
72 |
+
from fastapi.responses import RedirectResponse
|
73 |
+
|
74 |
+
@app.get("/")
def root():
    """Redirect the bare root URL to the /docs page."""
    docs_url = "/docs"
    return RedirectResponse(url=docs_url)
|
app/rag/__init__.py
ADDED
File without changes
|
app/rag_system.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app/rag_system.py
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import List, Tuple
|
4 |
+
import os
|
5 |
+
import faiss
|
6 |
+
import numpy as np
|
7 |
+
from sentence_transformers import SentenceTransformer
|
8 |
+
from pypdf import PdfReader
|
9 |
+
|
10 |
+
# Storage layout: a "data" directory two levels above this file, holding the
# uploaded PDFs and the persisted index files.
DATA_DIR = Path(__file__).resolve().parents[1] / "data"
UPLOAD_DIR = DATA_DIR.joinpath("uploads")
INDEX_DIR = DATA_DIR.joinpath("index")
# Create both directories at import time so later writes cannot fail on a missing folder.
INDEX_DIR.mkdir(parents=True, exist_ok=True)
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

# Embedding model name; overridable through the EMBED_MODEL environment variable.
MODEL_NAME = os.environ.get("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
17 |
+
|
18 |
+
class SimpleRAG:
    """Minimal retrieval store: sentence embeddings in a FAISS inner-product index.

    Chunk texts are kept in ``self.chunks`` in insertion order, so row ``i`` of
    the index corresponds to ``self.chunks[i]``. Both are written to disk after
    every successful ``add_pdf`` call.
    """

    def __init__(self, index_path: Path = INDEX_DIR / "faiss.index", meta_path: Path = INDEX_DIR / "meta.npy"):
        """Load the embedding model and any previously persisted index and chunks.

        Args:
            index_path: File the FAISS index is read from and written to.
            meta_path: ``.npy`` file holding the chunk texts.
        """
        self.model = SentenceTransformer(MODEL_NAME)
        self.index_path = index_path
        self.meta_path = meta_path
        self.index = None
        self.chunks: List[str] = []
        self._load()

    def _load(self):
        """Restore chunks and index from disk, starting empty if they disagree."""
        dim = self.model.get_sentence_embedding_dimension()

        if self.meta_path.exists():
            # NOTE(security): allow_pickle=True unpickles the file contents, so
            # meta_path must only ever point at a file written by _persist.
            self.chunks = np.load(self.meta_path, allow_pickle=True).tolist()

        if self.index_path.exists():
            self.index = faiss.read_index(str(self.index_path))
        else:
            self.index = faiss.IndexFlatIP(dim)

        # Index rows and chunk texts are matched by position. If the stored index
        # was built with another embedding size, or the two files hold a different
        # number of entries, search results would point at the wrong text, so
        # start over with an empty index AND an empty chunk list.
        if self.index.d != dim or self.index.ntotal != len(self.chunks):
            self.index = faiss.IndexFlatIP(dim)
            self.chunks = []

    def _persist(self):
        """Write the index and the chunk texts to their files."""
        faiss.write_index(self.index, str(self.index_path))
        np.save(self.meta_path, np.array(self.chunks, dtype=object))

    @staticmethod
    def _pdf_to_texts(pdf_path: Path, chunk_size: int = 800) -> List[str]:
        """Extract the text of a PDF and cut it into fixed-size character chunks.

        Pages without extractable text are skipped; chunks never span two pages.

        Args:
            pdf_path: PDF file to read.
            chunk_size: Maximum number of characters per chunk.

        Returns:
            The chunks in page order.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")

        reader = PdfReader(str(pdf_path))
        chunks: List[str] = []
        for page in reader.pages:
            text = page.extract_text() or ""
            if not text.strip():
                continue
            for start in range(0, len(text), chunk_size):
                chunks.append(text[start:start + chunk_size])
        return chunks

    def add_pdf(self, pdf_path: Path) -> int:
        """Embed every chunk of a PDF, add them to the index and persist.

        Returns:
            The number of chunks added (0 when the PDF has no extractable text).
        """
        texts = self._pdf_to_texts(pdf_path)
        if not texts:
            return 0
        # Embeddings are normalized before being added to the inner-product index.
        emb = self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
        self.index.add(emb)
        self.chunks.extend(texts)
        self._persist()
        return len(texts)

    def search(self, query: str, k: int = 5) -> List[Tuple[str, float]]:
        """Return up to ``k`` stored chunks most similar to ``query``.

        Args:
            query: Free-text query.
            k: Maximum number of results; non-positive values yield no results.

        Returns:
            ``(chunk_text, score)`` pairs in the order FAISS returns them.
        """
        if k <= 0 or not self.chunks or self.index is None or self.index.ntotal == 0:
            return []

        # Never ask for more neighbours than the index holds.
        k = min(k, self.index.ntotal)
        q = self.model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
        D, I = self.index.search(q, k)

        results: List[Tuple[str, float]] = []
        for idx, score in zip(I[0], D[0]):
            # Skip ids outside the chunk list (e.g. negative "no result" ids).
            if 0 <= idx < len(self.chunks):
                results.append((self.chunks[int(idx)], float(score)))
        return results
|
83 |
+
|
84 |
+
# Simple answer builder (no LLM: it just quotes the retrieved context).
def synthesize_answer(question: str, contexts: List[str], max_contexts: int = 3) -> str:
    """Build a demo answer by quoting the leading retrieved contexts.

    Args:
        question: The user's question, echoed at the top of the answer.
        contexts: Retrieved text chunks, in the order they should be quoted.
        max_contexts: How many leading contexts to quote; negative values are
            treated as 0.

    Returns:
        A fixed "no context" message when ``contexts`` is empty, otherwise the
        question followed by the quoted contexts separated by ``---`` lines.
    """
    if not contexts:
        return "Kontekst tapılmadı. Sualı daha dəqiq verin və ya PDF yükləyin."

    selected = contexts[:max(max_contexts, 0)]
    joined = "\n---\n".join(selected)
    return (
        f"Sual: {question}\n\n"
        f"Cavab (kontekstdən çıxarış):\n{joined}\n\n"
        "(Qeyd: Demo rejimi — LLM inteqrasiyası üçün / later: OpenAI/Groq və s.)"
    )
|
app/schemas.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app/schemas.py
|
2 |
+
from pydantic import BaseModel, Field
|
3 |
+
from typing import Optional, List
|
4 |
+
|
5 |
+
class AskRequest(BaseModel):
    """Request body of the /ask_question endpoint."""

    # The question text; at least two characters.
    question: str = Field(..., min_length=2)
    # Existing chat session to continue; optional.
    session_id: Optional[str] = None
    # Number of chunks to retrieve. Must be at least 1 so that an invalid value
    # is rejected by request validation instead of failing later in the search.
    top_k: int = Field(5, ge=1)
|
9 |
+
|
10 |
+
class AskResponse(BaseModel):
    """Response body of the /ask_question endpoint."""

    # Generated answer text.
    answer: str
    # Retrieved chunks the answer was built from.
    contexts: list[str]
    # Session the exchange was recorded under.
    session_id: str
|
14 |
+
|
15 |
+
class UploadResponse(BaseModel):
    """Response body of the /upload_pdf endpoint."""

    # Name of the uploaded file.
    filename: str
    # Number of text chunks added to the index for this file.
    chunks_added: int
|
18 |
+
|
19 |
+
class HistoryItem(BaseModel):
    """One message of a chat session."""

    # Author of the message (the API records "user" and "assistant").
    role: str
    # Message text.
    content: str
|
22 |
+
|
23 |
+
class HistoryResponse(BaseModel):
    """Response body of the /get_history endpoint."""

    # Session the history belongs to.
    session_id: str
    # Messages of that session, in the order they were supplied.
    history: list[HistoryItem]
|
app/store.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app/store.py
|
2 |
+
from collections import defaultdict
|
3 |
+
from typing import List, Dict
|
4 |
+
|
5 |
+
# In-memory chat history keyed by session id
# (for production, Redis/Postgres or similar is recommended).
_history: Dict[str, List[dict]] = defaultdict(list)


def add_history(session_id: str, role: str, content: str) -> None:
    """Append one message to the history of ``session_id``."""
    _history[session_id].append({"role": role, "content": content})


def get_history(session_id: str) -> List[dict]:
    """Return the messages of ``session_id`` as independent copies.

    Copies are returned so callers cannot alter the stored history by mutating
    the result. Unknown sessions yield an empty list.
    """
    # .get (rather than indexing) avoids creating an entry for unknown sessions.
    return [dict(item) for item in _history.get(session_id, [])]
|
app/utils.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app/utils.py
|
2 |
+
import uuid
|
3 |
+
from fastapi import HTTPException
|
4 |
+
|
5 |
+
def ensure_session(session_id: str | None) -> str:
|
6 |
+
return session_id or str(uuid.uuid4())
|
7 |
+
|
8 |
+
def http400(msg: str):
    """Raise an HTTPException with status code 400 and ``msg`` as its detail."""
    error = HTTPException(status_code=400, detail=msg)
    raise error
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi
|
2 |
+
uvicorn[standard]
|
3 |
+
python-multipart
|
4 |
+
pydantic==2.*
|
5 |
+
pydantic-settings
|
6 |
+
sentence-transformers
|
7 |
+
faiss-cpu
|
8 |
+
pypdf
|
9 |
+
python-dotenv
|
saa
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
diff.astextplain.textconv=astextplain
|
2 |
+
filter.lfs.clean=git-lfs clean -- %f
|
3 |
+
filter.lfs.smudge=git-lfs smudge -- %f
|
4 |
+
filter.lfs.process=git-lfs filter-process
|
5 |
+
filter.lfs.required=true
|
6 |
+
http.sslbackend=schannel
|
7 |
+
core.autocrlf=true
|
8 |
+
core.fscache=true
|
9 |
+
core.symlinks=false
|
10 |
+
pull.rebase=false
|
11 |
+
credential.helper=manager
|
12 |
+
credential.https://dev.azure.com.usehttppath=true
|
13 |
+
init.defaultbranch=master
|
14 |
+
user.name=HamidOmarov
|
15 | |
16 |
+
core.repositoryformatversion=0
|
17 |
+
core.filemode=false
|
18 |
+
core.bare=false
|
19 |
+
core.logallrefupdates=true
|
20 |
+
core.symlinks=false
|
21 |
+
core.ignorecase=true
|
tests/test_requests.http
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# PDF yüklə
|
2 |
+
curl -X POST "http://127.0.0.1:8000/upload_pdf" \
|
3 |
+
-H "accept: application/json" -H "Content-Type: multipart/form-data" \
|
4 |
+
-F "file=@/path/to/file.pdf"
|
5 |
+
|
6 |
+
# Sual ver
|
7 |
+
curl -X POST "http://127.0.0.1:8000/ask_question" \
|
8 |
+
-H "Content-Type: application/json" \
|
9 |
+
-d '{"question":"Bu sənəd nədən bəhs edir?","top_k":5}'
|