Codegeass321 committed on
Commit
0068013
Β·
1 Parent(s): a7005fc
Files changed (5) hide show
  1. README.md +23 -4
  2. api.py +81 -6
  3. app.py +17 -8
  4. requirements.txt +2 -1
  5. utils.py +58 -29
README.md CHANGED
@@ -9,26 +9,45 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- # Backend (FastAPI)
13
 
14
  ## Structure
15
  - `api.py` β€” Main FastAPI app
 
16
  - `utils.py` β€” Helper functions
17
  - `requirements.txt` β€” Python dependencies
18
  - `.env.example` β€” Example environment variables
19
 
 
 
 
 
 
 
 
 
 
 
20
  ## Running Locally
21
  ```sh
22
  pip install -r requirements.txt
23
- uvicorn api:app --reload --host 0.0.0.0 --port 8000
24
  ```
25
 
26
- ## Deploying to Render
27
- - Push this folder to a GitHub repo
 
 
 
 
 
 
 
28
  - Use the following start command on Render:
29
  ```
30
  uvicorn api:app --host 0.0.0.0 --port 10000
31
  ```
 
32
  - Add your environment variables in the Render dashboard
33
 
34
  ---
 
9
  pinned: false
10
  ---
11
 
12
+ # Backend (FastAPI with Gradio UI)
13
 
14
  ## Structure
15
  - `api.py` β€” Main FastAPI app
16
+ - `app.py` β€” Gradio wrapper for FastAPI
17
  - `utils.py` β€” Helper functions
18
  - `requirements.txt` β€” Python dependencies
19
  - `.env.example` β€” Example environment variables
20
 
21
+ ## Port Configuration
22
+ - **FastAPI**: Port 8000 (internal)
23
+ - **Gradio**: Port 7860 (default for Hugging Face Spaces)
24
+
25
+ ## API Endpoints
26
+ - **Upload Documents**: `/proxy/8000/upload`
27
+ - **Ask Questions**: `/proxy/8000/ask`
28
+ - **Check Status**: `/proxy/8000/status`
29
+ - **API Documentation**: `/proxy/8000/docs`
30
+
31
  ## Running Locally
32
  ```sh
33
  pip install -r requirements.txt
34
+ python app.py
35
  ```
36
 
37
+ ## Deploying to Hugging Face Spaces
38
+ 1. Create a new Space with Gradio SDK
39
+ 2. Upload this repository to the Space
40
+ 3. Set the following environment variables in the Space settings:
41
+ - `GOOGLE_API_KEY` - Your Google Gemini API key
42
+ - `SUPABASE_URL` - Your Supabase URL
43
+ - `SUPABASE_KEY` - Your Supabase API key
44
+
45
+ ## For Render (Legacy)
46
  - Use the following start command on Render:
47
  ```
48
  uvicorn api:app --host 0.0.0.0 --port 10000
49
  ```
50
+
51
  - Add your environment variables in the Render dashboard
52
 
53
  ---
api.py CHANGED
@@ -31,6 +31,7 @@ origins = [
31
  "https://huggingface.co", # Hugging Face Spaces domain
32
  "https://codegeass321-chatdocxai.hf.space", # Old HF space
33
  "https://codegeass321-backendserver.hf.space", # New HF space
 
34
  ]
35
 
36
  app.add_middleware(
@@ -60,9 +61,15 @@ async def options_upload():
60
  @app.post("/upload")
61
  async def upload(files: List[UploadFile] = File(...)):
62
  headers = {
63
- "Access-Control-Allow-Origin": "*"
 
 
64
  }
65
  try:
 
 
 
 
66
  if not files:
67
  return JSONResponse(
68
  content={"status": "error", "message": "No files uploaded."},
@@ -77,21 +84,59 @@ async def upload(files: List[UploadFile] = File(...)):
77
  print("Memory cleared.")
78
 
79
  print("Starting document processing...")
80
- raw_docs = load_documents_gradio(files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  print("Documents loaded. Splitting documents...")
82
- chunks = split_documents(raw_docs)
 
 
 
 
 
 
 
 
 
 
83
  print("Documents split. Building vector store...")
84
- store["value"] = build_vectorstore(chunks)
85
- print("Vector store built successfully.")
 
 
 
 
 
 
 
 
86
 
87
  return JSONResponse(
88
  content={"status": "success", "message": "Document processed successfully! You can now ask questions."},
89
  headers=headers
90
  )
91
  except Exception as e:
 
 
92
  print(f"An error occurred during upload: {e}")
 
93
  return JSONResponse(
94
- content={"status": "error", "message": f"An internal server error occurred: {e}"},
95
  status_code=500,
96
  headers=headers
97
  )
@@ -131,3 +176,33 @@ async def ask(
131
  prompt = build_prompt(top_chunks, query)
132
  answer = ask_gemini(prompt, client)
133
  return {"status": "success", "answer": answer.strip(), "transcribed": transcribed}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "https://huggingface.co", # Hugging Face Spaces domain
32
  "https://codegeass321-chatdocxai.hf.space", # Old HF space
33
  "https://codegeass321-backendserver.hf.space", # New HF space
34
+ "*", # Allow requests from the proxy (same origin)
35
  ]
36
 
37
  app.add_middleware(
 
61
  @app.post("/upload")
62
  async def upload(files: List[UploadFile] = File(...)):
63
  headers = {
64
+ "Access-Control-Allow-Origin": "*",
65
+ "Access-Control-Allow-Methods": "POST, OPTIONS",
66
+ "Access-Control-Allow-Headers": "Content-Type, Authorization",
67
  }
68
  try:
69
+ print(f"Upload request received with {len(files)} files")
70
+ for i, file in enumerate(files):
71
+ print(f"File {i+1}: {file.filename}, content_type: {file.content_type}")
72
+
73
  if not files:
74
  return JSONResponse(
75
  content={"status": "error", "message": "No files uploaded."},
 
84
  print("Memory cleared.")
85
 
86
  print("Starting document processing...")
87
+ try:
88
+ raw_docs = load_documents_gradio(files)
89
+ print(f"Documents loaded: {len(raw_docs)} documents")
90
+ except Exception as doc_error:
91
+ print(f"Error loading documents: {doc_error}")
92
+ return JSONResponse(
93
+ content={"status": "error", "message": f"Error loading documents: {str(doc_error)}"},
94
+ status_code=500,
95
+ headers=headers
96
+ )
97
+
98
+ if not raw_docs:
99
+ return JSONResponse(
100
+ content={"status": "error", "message": "No content could be extracted from the uploaded files."},
101
+ status_code=400,
102
+ headers=headers
103
+ )
104
+
105
  print("Documents loaded. Splitting documents...")
106
+ try:
107
+ chunks = split_documents(raw_docs)
108
+ print(f"Documents split into {len(chunks)} chunks")
109
+ except Exception as split_error:
110
+ print(f"Error splitting documents: {split_error}")
111
+ return JSONResponse(
112
+ content={"status": "error", "message": f"Error splitting documents: {str(split_error)}"},
113
+ status_code=500,
114
+ headers=headers
115
+ )
116
+
117
  print("Documents split. Building vector store...")
118
+ try:
119
+ store["value"] = build_vectorstore(chunks)
120
+ print("Vector store built successfully.")
121
+ except Exception as vector_error:
122
+ print(f"Error building vector store: {vector_error}")
123
+ return JSONResponse(
124
+ content={"status": "error", "message": f"Error building vector store: {str(vector_error)}"},
125
+ status_code=500,
126
+ headers=headers
127
+ )
128
 
129
  return JSONResponse(
130
  content={"status": "success", "message": "Document processed successfully! You can now ask questions."},
131
  headers=headers
132
  )
133
  except Exception as e:
134
+ import traceback
135
+ error_trace = traceback.format_exc()
136
  print(f"An error occurred during upload: {e}")
137
+ print(f"Traceback: {error_trace}")
138
  return JSONResponse(
139
+ content={"status": "error", "message": f"An internal server error occurred: {str(e)}"},
140
  status_code=500,
141
  headers=headers
142
  )
 
176
  prompt = build_prompt(top_chunks, query)
177
  answer = ask_gemini(prompt, client)
178
  return {"status": "success", "answer": answer.strip(), "transcribed": transcribed}
179
+
180
@app.get("/status")
async def status():
    """Health-check endpoint: reports server liveness, process resource usage,
    and which configuration environment variables are visible.

    Note: reads `store` and `os` from module scope; `psutil` is imported lazily
    so the endpoint only needs it when actually called.
    """
    import platform
    import sys
    import psutil

    # Inspect the current process for memory/CPU statistics.
    proc = psutil.Process(os.getpid())
    rss_bytes = proc.memory_info().rss
    mb = 1024 * 1024

    system_info = {
        "platform": platform.platform(),
        "python_version": sys.version,
        "memory_usage_mb": rss_bytes / mb,
        "cpu_percent": proc.cpu_percent(),
        "available_memory_mb": psutil.virtual_memory().available / mb,
    }
    # Environment variables surfaced for debugging Space deployments.
    env_vars = {name: os.environ.get(name) for name in ("PORT", "SPACE_ID", "SYSTEM")}

    return {
        "status": "ok",
        "message": "Server is running",
        "google_api_key_set": bool(os.environ.get("GOOGLE_API_KEY")),
        "vectorstore_loaded": store.get("value") is not None,
        "system_info": system_info,
        "env_vars": env_vars,
    }
app.py CHANGED
@@ -3,16 +3,24 @@ from api import app
3
  import uvicorn
4
  import threading
5
  import time
 
 
 
 
 
 
 
6
 
7
  # Create a simple Gradio interface
8
  def create_interface():
9
  with gr.Blocks(title="ChatDocxAI Backend") as interface:
10
  gr.Markdown("# ChatDocxAI Backend")
11
- gr.Markdown("""
12
  This is the backend server for ChatDocxAI. It provides the following endpoints:
13
 
14
- - `/upload` - Upload documents
15
- - `/ask` - Ask questions about uploaded documents
 
16
 
17
  The frontend should be configured to communicate with this backend.
18
  """)
@@ -25,13 +33,13 @@ def create_interface():
25
  with gr.Row():
26
  with gr.Column():
27
  gr.Markdown("## API Documentation")
28
- doc_link = gr.HTML(f"<a href='/docs' target='_blank'>View FastAPI Docs</a>")
29
 
30
  return interface
31
 
32
- # Function to start FastAPI in a separate thread
33
  def start_fastapi():
34
- uvicorn.run(app, host="0.0.0.0", port=7860)
35
 
36
  # Start FastAPI in a separate thread
37
  fastapi_thread = threading.Thread(target=start_fastapi)
@@ -44,6 +52,7 @@ time.sleep(2)
44
  # Create and launch the Gradio interface
45
  interface = create_interface()
46
 
47
- # Launch the interface with Gradio defaults (no specific port)
48
  if __name__ == "__main__":
49
- interface.launch()
 
 
3
  import uvicorn
4
  import threading
5
  import time
6
+ import os
7
+ from fastapi import FastAPI
8
+ from fastapi.middleware.cors import CORSMiddleware
9
+ from fastapi.responses import RedirectResponse
10
+
11
+ # FastAPI port - used internally
12
+ FASTAPI_PORT = 8000
13
 
14
  # Create a simple Gradio interface
15
  def create_interface():
16
  with gr.Blocks(title="ChatDocxAI Backend") as interface:
17
  gr.Markdown("# ChatDocxAI Backend")
18
+ gr.Markdown(f"""
19
  This is the backend server for ChatDocxAI. It provides the following endpoints:
20
 
21
+ - `/proxy/8000/upload` - Upload documents
22
+ - `/proxy/8000/ask` - Ask questions about uploaded documents
23
+ - `/proxy/8000/status` - Check API status
24
 
25
  The frontend should be configured to communicate with this backend.
26
  """)
 
33
  with gr.Row():
34
  with gr.Column():
35
  gr.Markdown("## API Documentation")
36
+ doc_link = gr.HTML(f"<a href='/proxy/8000/docs' target='_blank'>View FastAPI Docs</a>")
37
 
38
  return interface
39
 
40
# Serve the FastAPI application on the internal port (8000).
def start_fastapi():
    """Blocking call: run uvicorn with `app` bound to 0.0.0.0:FASTAPI_PORT."""
    uvicorn.run(app, host="0.0.0.0", port=FASTAPI_PORT)
43
 
44
  # Start FastAPI in a separate thread
45
  fastapi_thread = threading.Thread(target=start_fastapi)
 
52
  # Create and launch the Gradio interface
53
  interface = create_interface()
54
 
55
+ # Launch the Gradio interface on the default Hugging Face Spaces port (7860)
56
  if __name__ == "__main__":
57
+ # Use the port specified by Hugging Face Spaces
58
+ interface.launch(server_name="0.0.0.0")
requirements.txt CHANGED
@@ -18,4 +18,5 @@ unstructured[excel]
18
  unstructured[xml]
19
  torch
20
  torchaudio
21
- transformers
 
 
18
  unstructured[xml]
19
  torch
20
  torchaudio
21
+ transformers
22
+ psutil
utils.py CHANGED
@@ -38,36 +38,65 @@ def authenticate():
38
 
39
  def load_documents_gradio(uploaded_files):
40
  docs = []
 
41
  for file in uploaded_files:
42
- # For FastAPI UploadFile, save to a temp file
43
- if hasattr(file, "filename") and hasattr(file, "file"):
44
- import tempfile
45
- suffix = os.path.splitext(file.filename)[1]
46
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
47
- tmp.write(file.file.read())
48
- tmp_path = tmp.name
49
- file_path = tmp_path
50
- else:
51
- file_path = file.name # For Gradio or other file types
52
- # Detect type and load accordingly
53
- if file_path.lower().endswith('.pdf'):
54
- docs.extend(UnstructuredPDFLoader(file_path).load())
55
- elif file_path.lower().endswith('.txt'):
56
- docs.extend(TextLoader(file_path).load())
57
- elif file_path.lower().endswith('.csv'):
58
- docs.extend(CSVLoader(file_path).load())
59
- elif file_path.lower().endswith('.json'):
60
- docs.extend(JSONLoader(file_path).load())
61
- elif file_path.lower().endswith('.pptx'):
62
- docs.extend(UnstructuredPowerPointLoader(file_path).load())
63
- elif file_path.lower().endswith('.xlsx'):
64
- docs.extend(UnstructuredExcelLoader(file_path).load())
65
- elif file_path.lower().endswith('.xml'):
66
- docs.extend(UnstructuredXMLLoader(file_path).load())
67
- elif file_path.lower().endswith('.docx'):
68
- docs.extend(UnstructuredWordDocumentLoader(file_path).load())
69
- else:
70
- print(f'Unsupported File Type: {file_path}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  return docs
72
 
73
 
 
38
 
39
def load_documents_gradio(uploaded_files):
    """Load uploaded files into documents via the type-specific loaders.

    Accepts either FastAPI ``UploadFile`` objects (anything exposing
    ``.filename`` and ``.file``) or Gradio-style file objects exposing
    ``.name``.  Each FastAPI upload is copied to a temporary file so the
    path-based loaders can read it; the temp copy is deleted afterwards
    (fix: previously these ``delete=False`` temp files were never removed
    and leaked one file per upload).

    An unsupported extension is skipped with a log message, and a failure
    on one file does not abort processing of the remaining files.

    Args:
        uploaded_files: iterable of FastAPI or Gradio file objects.

    Returns:
        list: the loaded documents (possibly empty).
    """
    docs = []
    print(f"Processing {len(uploaded_files)} files")
    for file in uploaded_files:
        tmp_path = None  # set only when we create a temp copy below
        try:
            # For FastAPI UploadFile, save to a temp file
            if hasattr(file, "filename") and hasattr(file, "file"):
                import tempfile
                suffix = os.path.splitext(file.filename)[1].lower()
                print(f"Processing file: {file.filename} with suffix {suffix}")

                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                    content = file.file.read()
                    print(f"Read {len(content)} bytes from {file.filename}")
                    tmp.write(content)
                    tmp_path = tmp.name

                # Rewind the file cursor for potential further reads
                file.file.seek(0)
                file_path = tmp_path
            else:
                file_path = file.name  # For Gradio or other file types
                print(f"Non-FastAPI file: {file_path}")

            # Detect type and load accordingly
            lower = file_path.lower()
            if lower.endswith('.pdf'):
                print(f"Loading PDF: {file_path}")
                docs.extend(UnstructuredPDFLoader(file_path).load())
            elif lower.endswith('.txt'):
                print(f"Loading TXT: {file_path}")
                docs.extend(TextLoader(file_path).load())
            elif lower.endswith('.csv'):
                print(f"Loading CSV: {file_path}")
                docs.extend(CSVLoader(file_path).load())
            elif lower.endswith('.json'):
                print(f"Loading JSON: {file_path}")
                # jq_schema='.' loads the whole JSON document as-is
                docs.extend(JSONLoader(file_path, jq_schema='.', text_content=False).load())
            elif lower.endswith('.pptx'):
                print(f"Loading PPTX: {file_path}")
                docs.extend(UnstructuredPowerPointLoader(file_path).load())
            elif lower.endswith(('.xlsx', '.xls')):
                print(f"Loading Excel: {file_path}")
                docs.extend(UnstructuredExcelLoader(file_path).load())
            elif lower.endswith('.xml'):
                print(f"Loading XML: {file_path}")
                docs.extend(UnstructuredXMLLoader(file_path).load())
            elif lower.endswith(('.docx', '.doc')):
                print(f"Loading Word: {file_path}")
                docs.extend(UnstructuredWordDocumentLoader(file_path).load())
            else:
                print(f'Unsupported File Type: {file_path}')

            print(f"Successfully processed {file_path}")
        except Exception as e:
            import traceback
            print(f"Error processing file {getattr(file, 'filename', file)}: {e}")
            print(traceback.format_exc())
            # Continue with next file instead of failing completely
            continue
        finally:
            # Fix: remove the temporary copy so uploads don't leak disk space.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass

    print(f"Total documents loaded: {len(docs)}")
    return docs
101
 
102