Spaces:
Sleeping
Sleeping
Update app_main.py
Browse files
- app_main.py +20 -1
app_main.py
CHANGED
@@ -10,10 +10,22 @@ import pytesseract
|
|
10 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
11 |
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
|
12 |
from werkzeug.utils import secure_filename
|
13 |
-
import tempfile
|
14 |
|
15 |
app = Flask(__name__)
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
|
18 |
poppler_path=r"C:\poppler-23.11.0\Library\bin"
|
19 |
|
@@ -131,7 +143,9 @@ def index():
|
|
131 |
@app.route('/process_pdf', methods=['POST'])
|
132 |
def process_pdf():
|
133 |
try:
|
|
|
134 |
if 'pdf_file' not in request.files:
|
|
|
135 |
return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400
|
136 |
|
137 |
pdf_file = request.files['pdf_file']
|
@@ -144,16 +158,21 @@ def process_pdf():
|
|
144 |
saved_pdf_path = os.path.join(temp_dir, filename)
|
145 |
pdf_file.save(saved_pdf_path)
|
146 |
|
|
|
|
|
147 |
# Extract & process
|
148 |
json_path = None
|
149 |
output_path, result = extract_images_from_pdf(saved_pdf_path, json_path)
|
150 |
|
|
|
|
|
151 |
return jsonify({
|
152 |
"message": "✅ PDF processed successfully",
|
153 |
"output_json": output_path,
|
154 |
"sprites": result
|
155 |
})
|
156 |
except Exception as e:
|
|
|
157 |
return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
|
158 |
|
159 |
if __name__ == '__main__':
|
|
|
10 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
11 |
from langchain_community.document_loaders.image_captions import ImageCaptionLoader
|
12 |
from werkzeug.utils import secure_filename
|
13 |
+
import tempfile, logging
|
14 |
|
15 |
app = Flask(__name__)
|
16 |
|
17 |
+
# Configure logging
|
18 |
+
logging.basicConfig(
|
19 |
+
level=logging.DEBUG, # Use INFO or ERROR in production
|
20 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
21 |
+
handlers=[
|
22 |
+
logging.FileHandler("app.log"),
|
23 |
+
logging.StreamHandler()
|
24 |
+
]
|
25 |
+
)
|
26 |
+
|
27 |
+
logger = logging.getLogger(__name__)
|
28 |
+
|
29 |
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # NOTE(review): hard-coded Windows install path for the Tesseract binary; presumably must differ on a non-Windows host — confirm deployment target
|
30 |
poppler_path=r"C:\poppler-23.11.0\Library\bin"  # NOTE(review): hard-coded Windows path to the Poppler binaries; confirm it matches the deployment host
|
31 |
|
|
|
143 |
@app.route('/process_pdf', methods=['POST'])
|
144 |
def process_pdf():
|
145 |
try:
|
146 |
+
logger.info("Received request to process PDF.")
|
147 |
if 'pdf_file' not in request.files:
|
148 |
+
logger.warning("No PDF file found in request.")
|
149 |
return jsonify({"error": "Missing PDF file in form-data with key 'pdf_file'"}), 400
|
150 |
|
151 |
pdf_file = request.files['pdf_file']
|
|
|
158 |
saved_pdf_path = os.path.join(temp_dir, filename)
|
159 |
pdf_file.save(saved_pdf_path)
|
160 |
|
161 |
+
# Lazy %-style args: the message is only formatted if the record is emitted.
logger.info("Saved uploaded PDF to: %s", saved_pdf_path)
|
162 |
+
|
163 |
# Extract & process
|
164 |
json_path = None
|
165 |
output_path, result = extract_images_from_pdf(saved_pdf_path, json_path)
|
166 |
|
167 |
+
# Completion log: the "received request" message is already emitted at the
# start of the handler, so repeating it here would hide that extraction finished.
logger.info("PDF processed successfully.")
|
168 |
+
|
169 |
return jsonify({
|
170 |
"message": "✅ PDF processed successfully",
|
171 |
"output_json": output_path,
|
172 |
"sprites": result
|
173 |
})
|
174 |
except Exception as e:
|
175 |
+
logger.exception("❌ Failed to process PDF")
|
176 |
return jsonify({"error": f"❌ Failed to process PDF: {str(e)}"}), 500
|
177 |
|
178 |
if __name__ == '__main__':
|