Spaces:
Running
Running
Commit
·
43e97e3
1
Parent(s):
8e6b116
started RAG
Browse files- .gitignore +1 -0
- __pycache__/main.cpython-311.pyc +0 -0
- __pycache__/upload.cpython-311.pyc +0 -0
- client.py +11 -0
- main.py +23 -4
- rag/extractor.py +11 -0
- rag/repository.py +0 -0
- rag/transform.py +28 -0
- requirements.txt +5 -1
- upload.py +14 -0
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
usage.csv
|
2 |
temp_95d8748770ec4615942cc78d473f5cee.csv
|
|
|
|
1 |
usage.csv
|
2 |
temp_95d8748770ec4615942cc78d473f5cee.csv
|
3 |
+
uploads/*
|
__pycache__/main.cpython-311.pyc
CHANGED
Binary files a/__pycache__/main.cpython-311.pyc and b/__pycache__/main.cpython-311.pyc differ
|
|
__pycache__/upload.cpython-311.pyc
ADDED
Binary file (1.5 kB). View file
|
|
client.py
CHANGED
@@ -5,6 +5,17 @@ import streamlit as st
|
|
5 |
# stui = StreamlitUI(api_url="http://localhost:8000") # FastAPI backend URL
|
6 |
st.title("FastAPI ChatBot")
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
if "messages" not in st.session_state:
|
9 |
st.session_state.messages = []
|
10 |
|
|
|
5 |
# stui = StreamlitUI(api_url="http://localhost:8000") # FastAPI backend URL
|
6 |
st.title("FastAPI ChatBot")
|
7 |
|
# --- File-upload panel: send a PDF to the FastAPI backend ---
st.write("Upload a file to FastAPI")
file = st.file_uploader("Choose a file", type=["pdf"])

if st.button("Submit"):
    # Guard clause: nothing selected yet.
    if file is None:
        st.write("No file uploaded.")
    else:
        # requests expects {field_name: (filename, fileobj, content_type)}.
        payload = {"file": (file.name, file, file.type)}
        response = requests.post("http://localhost:8000/upload", files=payload)
        st.write(response.text)
19 |
if "messages" not in st.session_state:
|
20 |
st.session_state.messages = []
|
21 |
|
main.py
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
# main.py
|
2 |
-
from fastapi import FastAPI,status,Response,Request,Depends
|
3 |
from fastapi.responses import StreamingResponse,FileResponse
|
4 |
from models import load_text_model,generate_text,load_audio_model,generate_audio,load_image_model, generate_image
|
5 |
from schemas import VoicePresets
|
6 |
from utils import audio_array_to_buffer,img_to_bytes
|
7 |
from contextlib import asynccontextmanager
|
8 |
-
from typing import AsyncIterator,Callable,Awaitable
|
9 |
from uuid import uuid4
|
10 |
import time
|
11 |
from datetime import datetime, timezone
|
@@ -13,13 +13,14 @@ import csv
|
|
13 |
from dependencies import get_urls_content
|
14 |
from schemas import TextModelResponse,TextModelRequest
|
15 |
import shutil, uuid
|
|
|
16 |
|
17 |
models = {}
|
18 |
|
19 |
@asynccontextmanager
|
20 |
async def lifespan(_: FastAPI) -> AsyncIterator[None]:
|
21 |
# models["text2image"] = load_image_model()
|
22 |
-
models["text"]=load_text_model()
|
23 |
yield
|
24 |
models.clear()
|
25 |
|
@@ -112,4 +113,22 @@ def serve_text_to_image_model_controller(prompt: str):
|
|
112 |
# pipe = load_image_model()
|
113 |
# output = generate_image(pipe, prompt)
|
114 |
output = generate_image(models["text2image"], prompt)
|
115 |
-
return Response(content=img_to_bytes(output), media_type="image/png")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# main.py
|
2 |
+
from fastapi import FastAPI,status,Response,Request,Depends,HTTPException,UploadFile, File
|
3 |
from fastapi.responses import StreamingResponse,FileResponse
|
4 |
from models import load_text_model,generate_text,load_audio_model,generate_audio,load_image_model, generate_image
|
5 |
from schemas import VoicePresets
|
6 |
from utils import audio_array_to_buffer,img_to_bytes
|
7 |
from contextlib import asynccontextmanager
|
8 |
+
from typing import AsyncIterator,Callable,Awaitable,Annotated
|
9 |
from uuid import uuid4
|
10 |
import time
|
11 |
from datetime import datetime, timezone
|
|
|
13 |
from dependencies import get_urls_content
|
14 |
from schemas import TextModelResponse,TextModelRequest
|
15 |
import shutil, uuid
|
16 |
+
from upload import save_file
|
17 |
|
18 |
# Registry of loaded ML models, keyed by modality (e.g. "text", "text2image").
models = {}

@asynccontextmanager
async def lifespan(_: FastAPI) -> AsyncIterator[None]:
    """App lifespan hook: populate the model registry on startup and
    release everything on shutdown."""
    # models["text2image"] = load_image_model()
    # models["text"]=load_text_model()
    # NOTE(review): both loaders above are commented out, so `models` stays
    # empty and any endpoint that reads models["text"] / models["text2image"]
    # will raise KeyError at request time — confirm this is intentional
    # while the RAG work is in progress.
    yield
    models.clear()
|
26 |
|
|
|
113 |
# pipe = load_image_model()
|
114 |
# output = generate_image(pipe, prompt)
|
115 |
output = generate_image(models["text2image"], prompt)
|
116 |
+
return Response(content=img_to_bytes(output), media_type="image/png")
|
117 |
+
|
118 |
+
@app.post("/upload")
async def file_upload_controller(
    file: Annotated[UploadFile, File(description="Uploaded PDF documents")]
):
    """Accept a PDF upload, persist it via ``save_file``, and report success.

    Raises:
        HTTPException: 400 when the upload is not ``application/pdf``;
            500 when writing the file to disk fails.
    """
    # Reject anything that is not a PDF before touching the disk.
    if file.content_type != "application/pdf":
        raise HTTPException(
            # Fixed F541: the original used an f-string with no placeholders.
            detail="Only uploading PDF documents are supported",
            status_code=status.HTTP_400_BAD_REQUEST,
        )
    try:
        await save_file(file)
    except Exception as e:
        # Boundary handler: any storage failure becomes a 500 for the client.
        # Chain the original exception so the traceback keeps the root cause.
        raise HTTPException(
            detail=f"An error occurred while saving file - Error: {e}",
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        ) from e
    return {"filename": file.filename, "message": "File uploaded successfully"}
|
rag/extractor.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

from pypdf import PdfReader


def pdf_text_extractor(filepath: str) -> None:
    """Extract all page text from the PDF at *filepath* and write it to a
    sibling ``.txt`` file (same directory, same stem).

    Pages with no extractable text are skipped; extracted pages are
    separated by blank lines.
    """
    content = ""
    pdf_reader = PdfReader(filepath, strict=True)
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:
            content += f"{page_text}\n\n"
    # Bug fix: the original used filepath.replace("pdf", "txt"), which
    # rewrites EVERY occurrence of "pdf" in the path (e.g. "my_pdf_notes.pdf"
    # became "my_txt_notes.txt"). Swap only the final extension instead.
    txt_path = f"{os.path.splitext(filepath)[0]}.txt"
    with open(txt_path, "w", encoding="utf-8") as file:
        file.write(content)
|
rag/repository.py
ADDED
File without changes
|
rag/transform.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from typing import Any, AsyncGenerator
|
3 |
+
|
4 |
+
import aiofiles
|
5 |
+
from transformers import AutoModel
|
6 |
+
|
7 |
+
# Read size for the async chunked loader below.
DEFAULT_CHUNK_SIZE = 1024 * 1024 * 50  # 50 megabytes

# NOTE(review): this runs at import time — the first import of this module
# initializes (and on a cold cache, downloads) the Jina embedding model,
# which is slow and needs network access; consider lazy initialization.
# trust_remote_code=True executes code shipped with the model repo — only
# acceptable because the source is the pinned jinaai repository.
embedder = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en", trust_remote_code=True
)
|
12 |
+
|
13 |
+
async def load(filepath: str) -> AsyncGenerator[str, Any]:
    """Asynchronously stream the text file at *filepath*, yielding it in
    chunks of at most ``DEFAULT_CHUNK_SIZE`` characters."""
    async with aiofiles.open(filepath, "r", encoding="utf-8") as handle:
        while True:
            piece = await handle.read(DEFAULT_CHUNK_SIZE)
            if not piece:
                # Empty read means end of file.
                break
            yield piece
|
17 |
+
|
18 |
+
def clean(text: str) -> str:
    """Normalize extracted text: collapse all whitespace runs to single
    spaces and tidy stray punctuation artifacts left by PDF extraction."""
    # Newlines become spaces, then any whitespace run collapses to one space.
    flattened = re.sub(r"\s+", " ", text.replace("\n", " "))
    # Remove ". ," artifacts, then collapse doubled / spaced-out periods.
    no_dot_comma = re.sub(r"\. ,", "", flattened)
    deduped = no_dot_comma.replace("..", ".").replace(". .", ".")
    # (The original ended with a second "\n" -> " " pass; it is a no-op here
    # because every newline was already removed above.)
    return deduped.strip()
|
26 |
+
|
27 |
+
def embed(text: str) -> list[float]:
    """Encode *text* with the module-level Jina embedder and return the
    embedding as a plain Python list of floats."""
    vector = embedder.encode(text)
    return vector.tolist()
|
requirements.txt
CHANGED
@@ -10,4 +10,8 @@ loguru
|
|
10 |
beautifulsoup4
|
11 |
lxml
|
12 |
aiohttp
|
13 |
-
tiktoken
|
|
|
|
|
|
|
|
|
|
10 |
beautifulsoup4
|
11 |
lxml
|
12 |
aiohttp
|
13 |
+
tiktoken
|
14 |
+
aiofiles
|
15 |
+
python-multipart
|
16 |
+
qdrant_client
|
17 |
+
pypdf
|
upload.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os

import aiofiles
from aiofiles.os import makedirs
from fastapi import UploadFile

# Stream uploads in 50 MB pieces so large files never sit fully in memory.
DEFAULT_CHUNK_SIZE = 1024 * 1024 * 50  # 50 megabytes


async def save_file(file: UploadFile) -> str:
    """Persist *file* under ./uploads and return the path it was written to.

    Raises whatever ``aiofiles`` raises on I/O failure; the caller is
    expected to translate that into an HTTP error.
    """
    await makedirs("uploads", exist_ok=True)
    # Security fix: file.filename is fully client-controlled. The original
    # joined it directly, so a crafted name like "../../etc/cron.d/x" could
    # escape the uploads directory. basename() strips all directory parts;
    # fall back to a fixed name when the client sent no filename at all.
    safe_name = os.path.basename(file.filename or "upload")
    filepath = os.path.join("uploads", safe_name)
    async with aiofiles.open(filepath, "wb") as f:
        while chunk := await file.read(DEFAULT_CHUNK_SIZE):
            await f.write(chunk)
    return filepath
|