Spaces:

muryshev
/

cb-api

Sleeping

App Files Files Community

muryshev committed on Mar 11, 2024

Commit

b7484d7

1 Parent(s): 58d107c

init

Browse files

Files changed (4) hide show

.gitignore +1 -0
Dockerfile +27 -0
app.py +54 -0
requirements.txt +4 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.bat

Dockerfile ADDED Viewed

	@@ -0,0 +1,27 @@

+# Slim Python base image; Tesseract and Poppler are NOT part of it and are installed below via apt
+FROM python:3.9-slim
+# Set environment variables (key=value form; the space-separated form is legacy syntax)
+ENV PYTHONUNBUFFERED=1
+ENV FLASK_APP=app.py
+# NOTE(review): APP_HOME is not referenced by any other file in this commit - confirm whether it is still needed
+ENV APP_HOME=/app
+# Install Tesseract, its Russian language data, and the Poppler utilities
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    tesseract-ocr \
+    tesseract-ocr-rus poppler-utils && \
+    rm -rf /var/lib/apt/lists/*
+# Create the working directory together with the tmp directory that app.py saves uploads into (./tmp relative to WORKDIR)
+RUN mkdir -p /var/www/tmp
+ENV HOME=/var/www
+WORKDIR /var/www
+COPY . /var/www
+RUN pip install --no-cache-dir -r requirements.txt
+EXPOSE 7860
+# Run the Flask application (exec form: flask is started directly instead of through /bin/sh, so it receives stop signals)
+CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]

app.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import os
+import uuid
+from flask import Flask, request, jsonify
+import pytesseract
+from pdf2image import convert_from_bytes
+from flask_cors import CORS
+os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'
+app = Flask(__name__)
+CORS(app)
+UPLOAD_FOLDER = './tmp'
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+# Endpoint for uploading PDF and extracting text
+@app.route('/upload', methods=['POST'])
+def upload_file():
+    # Check if the post request has the file part
+    if 'file' not in request.files:
+        return jsonify({'error': 'No file part'})
+    file = request.files['file']
+    # Check if the file is a PDF
+    if file.filename == '':
+        return jsonify({'error': 'No selected file'})
+    if file and file.filename.endswith('.pdf'):
+        # Convert PDF to images
+        # images = convert_from_bytes(file.read())
+        filename = str(uuid.uuid4()) + '.pdf'
+        # Save the file to the temporary upload directory
+        file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
+        # Construct and return the path where the file is saved
+        temp_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        text = ''
+        # for img in images:
+        #     # Perform OCR on each page
+        #     text += pytesseract.image_to_string(img, lang='rus')
+        # присрать сюда вызов библиотеки Андрея с temp_path
+        os.remove(temp_path)
+        return jsonify({'text': text})
+    else:
+        return jsonify({'error': 'File must be a PDF'})
+if __name__ == '__main__':
+    app.run(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+flask
+flask-cors
+pytesseract
+pdf2image