ahmed-eisa committed on
Commit
43e97e3
·
1 Parent(s): 8e6b116

started RAG

Browse files
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  usage.csv
2
  temp_95d8748770ec4615942cc78d473f5cee.csv
 
 
1
  usage.csv
2
  temp_95d8748770ec4615942cc78d473f5cee.csv
3
+ uploads/*
__pycache__/main.cpython-311.pyc CHANGED
Binary files a/__pycache__/main.cpython-311.pyc and b/__pycache__/main.cpython-311.pyc differ
 
__pycache__/upload.cpython-311.pyc ADDED
Binary file (1.5 kB). View file
 
client.py CHANGED
@@ -5,6 +5,17 @@ import streamlit as st
5
  # stui = StreamlitUI(api_url="http://localhost:8000") # FastAPI backend URL
6
  st.title("FastAPI ChatBot")
7
 
 
 
 
 
 
 
 
 
 
 
 
8
  if "messages" not in st.session_state:
9
  st.session_state.messages = []
10
 
 
5
# stui = StreamlitUI(api_url="http://localhost:8000") # FastAPI backend URL
st.title("FastAPI ChatBot")

st.write("Upload a file to FastAPI")
file = st.file_uploader("Choose a file", type=["pdf"])

if st.button("Submit"):
    if file is not None:
        # Forward the upload to the FastAPI backend as multipart/form-data.
        files = {"file": (file.name, file, file.type)}
        # Explicit timeout: requests.post blocks forever by default if the
        # backend hangs, which would freeze this Streamlit script run.
        response = requests.post(
            "http://localhost:8000/upload", files=files, timeout=60
        )
        st.write(response.text)
    else:
        st.write("No file uploaded.")

# Initialize the chat history once per browser session.
if "messages" not in st.session_state:
    st.session_state.messages = []
21
 
main.py CHANGED
@@ -1,11 +1,11 @@
1
  # main.py
2
- from fastapi import FastAPI,status,Response,Request,Depends
3
  from fastapi.responses import StreamingResponse,FileResponse
4
  from models import load_text_model,generate_text,load_audio_model,generate_audio,load_image_model, generate_image
5
  from schemas import VoicePresets
6
  from utils import audio_array_to_buffer,img_to_bytes
7
  from contextlib import asynccontextmanager
8
- from typing import AsyncIterator,Callable,Awaitable
9
  from uuid import uuid4
10
  import time
11
  from datetime import datetime, timezone
@@ -13,13 +13,14 @@ import csv
13
  from dependencies import get_urls_content
14
  from schemas import TextModelResponse,TextModelRequest
15
  import shutil, uuid
 
16
 
17
  models = {}
18
 
19
  @asynccontextmanager
20
  async def lifespan(_: FastAPI) -> AsyncIterator[None]:
21
  # models["text2image"] = load_image_model()
22
- models["text"]=load_text_model()
23
  yield
24
  models.clear()
25
 
@@ -112,4 +113,22 @@ def serve_text_to_image_model_controller(prompt: str):
112
  # pipe = load_image_model()
113
  # output = generate_image(pipe, prompt)
114
  output = generate_image(models["text2image"], prompt)
115
- return Response(content=img_to_bytes(output), media_type="image/png")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # main.py
2
+ from fastapi import FastAPI,status,Response,Request,Depends,HTTPException,UploadFile, File
3
  from fastapi.responses import StreamingResponse,FileResponse
4
  from models import load_text_model,generate_text,load_audio_model,generate_audio,load_image_model, generate_image
5
  from schemas import VoicePresets
6
  from utils import audio_array_to_buffer,img_to_bytes
7
  from contextlib import asynccontextmanager
8
+ from typing import AsyncIterator,Callable,Awaitable,Annotated
9
  from uuid import uuid4
10
  import time
11
  from datetime import datetime, timezone
 
13
  from dependencies import get_urls_content
14
  from schemas import TextModelResponse,TextModelRequest
15
  import shutil, uuid
16
+ from upload import save_file
17
 
18
# Registry of ML models shared by the route handlers; populated at startup.
models = {}

@asynccontextmanager
async def lifespan(_: FastAPI) -> AsyncIterator[None]:
    """App lifespan hook: load models on startup, release them on shutdown.

    Model loading is currently disabled (both loads are commented out), so
    the registry stays empty until a handler or a future change populates it.
    """
    # models["text2image"] = load_image_model()
    # models["text"]=load_text_model()
    yield
    # Drop all model references on shutdown so their memory can be reclaimed.
    models.clear()
26
 
 
113
  # pipe = load_image_model()
114
  # output = generate_image(pipe, prompt)
115
  output = generate_image(models["text2image"], prompt)
116
+ return Response(content=img_to_bytes(output), media_type="image/png")
117
+
118
@app.post("/upload")
async def file_upload_controller(
    file: Annotated[UploadFile, File(description="Uploaded PDF documents")]
):
    """Accept a single PDF upload and persist it via save_file().

    Returns:
        JSON payload with the original filename and a success message.

    Raises:
        HTTPException 400: the upload's content type is not application/pdf.
        HTTPException 500: writing the file to disk failed.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(
            # Plain string: the original used an f-string with no placeholders.
            detail="Only uploading PDF documents are supported",
            status_code=status.HTTP_400_BAD_REQUEST,
        )
    try:
        await save_file(file)
    except Exception as e:
        # Chain the original error (`from e`) so tracebacks show the root cause.
        raise HTTPException(
            detail=f"An error occurred while saving file - Error: {e}",
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        ) from e
    return {"filename": file.filename, "message": "File uploaded successfully"}
rag/extractor.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+
3
def pdf_text_extractor(filepath: str) -> None:
    """Extract all page text from a PDF and write it to a sibling .txt file.

    Args:
        filepath: Path to the PDF file to read.

    Raises:
        pypdf.errors.PdfReadError: If the PDF is malformed (strict=True).
    """
    content = ""
    pdf_reader = PdfReader(filepath, strict=True)
    for page in pdf_reader.pages:
        page_text = page.extract_text()
        if page_text:
            content += f"{page_text}\n\n"
    # Swap only the ".pdf" suffix for ".txt". The previous
    # filepath.replace("pdf", "txt") rewrote the FIRST occurrence of "pdf"
    # anywhere in the path (e.g. "uploads/pdf_notes.pdf" -> "uploads/txt_notes.pdf").
    txt_path = filepath.removesuffix(".pdf") + ".txt"
    with open(txt_path, "w", encoding="utf-8") as file:
        file.write(content)
rag/repository.py ADDED
File without changes
rag/transform.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Any, AsyncGenerator
3
+
4
+ import aiofiles
5
+ from transformers import AutoModel
6
+
7
+ DEFAULT_CHUNK_SIZE = 1024 * 1024 * 50 # 50 megabytes
8
+
9
+ embedder = AutoModel.from_pretrained(
10
+ "jinaai/jina-embeddings-v2-base-en", trust_remote_code=True
11
+ )
12
+
13
async def load(filepath: str) -> AsyncGenerator[str, Any]:
    """Stream a UTF-8 text file's contents in DEFAULT_CHUNK_SIZE pieces."""
    async with aiofiles.open(filepath, "r", encoding="utf-8") as handle:
        while True:
            piece = await handle.read(DEFAULT_CHUNK_SIZE)
            if not piece:
                # Empty read means EOF.
                break
            yield piece
17
+
18
def clean(text: str) -> str:
    """Normalize extracted PDF text for embedding.

    Collapses all whitespace (including newlines) to single spaces, removes
    stray ". ," artifacts, and collapses doubled periods left over from PDF
    extraction, then strips leading/trailing spaces.
    """
    # One \s+ pass subsumes the original's separate "\n" replacements: after
    # this substitution no newline can remain, so the original's trailing
    # .replace("\n", " ") was a no-op and is dropped here.
    t = re.sub(r"\s+", " ", text)
    t = re.sub(r"\. ,", "", t)
    t = t.replace("..", ".")
    t = t.replace(". .", ".")
    return t.strip()
26
+
27
def embed(text: str) -> list[float]:
    """Encode *text* into a dense embedding vector as a plain Python list."""
    vector = embedder.encode(text)
    return vector.tolist()
requirements.txt CHANGED
@@ -10,4 +10,8 @@ loguru
10
  beautifulsoup4
11
  lxml
12
  aiohttp
13
- tiktoken
 
 
 
 
 
10
  beautifulsoup4
11
  lxml
12
  aiohttp
13
+ tiktoken
14
+ aiofiles
15
+ python-multipart
16
+ qdrant_client
17
+ pypdf
upload.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import aiofiles
3
+ from aiofiles.os import makedirs
4
+ from fastapi import UploadFile
5
+
6
+ DEFAULT_CHUNK_SIZE = 1024 * 1024 * 50 # 50 megabytes
7
+
8
async def save_file(file: UploadFile) -> str:
    """Persist an uploaded file under ./uploads and return its path.

    The upload is streamed to disk in DEFAULT_CHUNK_SIZE pieces so large
    files never have to fit in memory.

    Args:
        file: The FastAPI upload to persist.

    Returns:
        The relative path the file was written to.
    """
    await makedirs("uploads", exist_ok=True)
    # basename() strips any directory components a malicious client could
    # smuggle into the filename (e.g. "../../etc/cron.d/job"); without it,
    # os.path.join would happily write outside the uploads directory.
    # The "upload" fallback also avoids a crash when filename is None.
    filename = os.path.basename(file.filename or "upload")
    filepath = os.path.join("uploads", filename)
    async with aiofiles.open(filepath, "wb") as f:
        while chunk := await file.read(DEFAULT_CHUNK_SIZE):
            await f.write(chunk)
    return filepath