init
Browse files- .gitignore +1 -0
- Dockerfile +27 -0
- app.py +54 -0
- requirements.txt +4 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.bat
|
Dockerfile
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Slim Python base image; Tesseract is NOT bundled and is installed below
FROM python:3.9-slim

# Set environment variables (key=value form; the space-separated form is legacy)
ENV PYTHONUNBUFFERED=1
ENV FLASK_APP=app.py
ENV APP_HOME=/app

# Install Tesseract with Russian language data, plus poppler-utils
RUN apt-get update && apt-get install --no-install-recommends -y \
        tesseract-ocr \
        tesseract-ocr-rus \
        poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Create the working directory and the ./tmp upload directory app.py writes to
RUN mkdir -p /var/www/tmp
ENV HOME=/var/www
WORKDIR /var/www

# Install Python dependencies before copying the sources so this layer stays
# cached until requirements.txt itself changes
COPY requirements.txt /var/www/
RUN pip install --no-cache-dir -r requirements.txt

COPY . /var/www

EXPOSE 7860

# Run the Flask application (exec form: flask is started directly rather than
# through /bin/sh, so it receives container stop signals itself)
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
app.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import uuid
|
3 |
+
from flask import Flask, request, jsonify
|
4 |
+
import pytesseract
|
5 |
+
from pdf2image import convert_from_bytes
|
6 |
+
from flask_cors import CORS
|
7 |
+
|
8 |
+
# Tell Tesseract where its language data lives.
os.environ.update(TESSDATA_PREFIX='/usr/share/tesseract-ocr/5/tessdata')

# Directory (relative to the process working directory) that holds uploads
# while they are being processed.
UPLOAD_FOLDER = './tmp'

# Flask application with CORS enabled using flask_cors defaults.
app = Flask(__name__)
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)
CORS(app)
|
14 |
+
|
15 |
+
# Endpoint for uploading PDF and extracting text
|
16 |
+
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a PDF upload and return the text extracted from it as JSON.

    The PDF is read from the ``file`` field of the POSTed form data, written
    to a uniquely named temporary file under ``app.config['UPLOAD_FOLDER']``,
    and removed again before the response is returned.

    Returns:
        A JSON response: ``{'text': <str>}`` on success, or
        ``{'error': <message>}`` when the ``file`` part is missing, no file
        was selected, or the filename does not end in ``.pdf``.

    NOTE(review): error responses keep the default 200 status so the existing
    response contract is unchanged; consider returning 400 for them.
    """
    # Check if the post request has the file part
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'})

    file = request.files['file']

    # An empty filename means no file was selected
    if file.filename == '':
        return jsonify({'error': 'No selected file'})

    # Accept the extension regardless of case (e.g. "scan.PDF")
    if not file or not file.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'File must be a PDF'})

    # Store the upload under a random name: the client-supplied filename is
    # never used on disk and uploads do not overwrite each other.
    upload_dir = app.config['UPLOAD_FOLDER']
    os.makedirs(upload_dir, exist_ok=True)
    temp_path = os.path.join(upload_dir, str(uuid.uuid4()) + '.pdf')

    text = ''
    try:
        file.save(temp_path)

        # TODO: call Andrey's library here with temp_path to fill `text`.
        # Earlier draft, kept for reference:
        #     images = convert_from_bytes(file.read())
        #     for img in images:
        #         text += pytesseract.image_to_string(img, lang='rus')
    finally:
        # Always remove the temporary file, even if saving/processing raised
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            # save() failed before the file was created; nothing to clean up
            pass

    return jsonify({'text': text})
|
52 |
+
|
53 |
+
if __name__ == '__main__':
    # Debug mode enables the Werkzeug interactive debugger, which allows
    # arbitrary code execution, so it must not be hard-coded on. Opt in
    # explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
flask
|
2 |
+
flask-cors
|
3 |
+
pytesseract
|
4 |
+
pdf2image
|