muryshev committed on
Commit
b7484d7
·
1 Parent(s): 58d107c
Files changed (4) hide show
  1. .gitignore +1 -0
  2. Dockerfile +27 -0
  3. app.py +54 -0
  4. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.bat
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python 3.9 base image; Tesseract is not part of it and is installed from apt below
FROM python:3.9-slim

# PYTHONUNBUFFERED: do not buffer stdout/stderr, so logs show up immediately.
# FLASK_APP: module that `flask run` loads.
# NOTE(review): APP_HOME is not referenced anywhere else in this Dockerfile or in app.py;
# it is kept in case something outside this commit reads it.
ENV PYTHONUNBUFFERED=1 \
    FLASK_APP=app.py \
    APP_HOME=/app

# Install Tesseract with Russian language data, plus poppler-utils
# (presumably required by pdf2image for PDF rendering - confirm)
RUN apt-get update && apt-get install --no-install-recommends -y \
        tesseract-ocr \
        tesseract-ocr-rus \
        poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Create the working directory together with the tmp/ folder app.py saves uploads into
RUN mkdir -p /var/www/tmp
ENV HOME=/var/www
WORKDIR /var/www

# Install dependencies before copying the sources so this layer stays cached
# when only application code changes
COPY requirements.txt /var/www/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY . /var/www

EXPOSE 7860

# Run the Flask application; exec form makes flask the container's main process
# so it receives stop signals directly instead of through /bin/sh
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ from flask import Flask, request, jsonify
4
+ import pytesseract
5
+ from pdf2image import convert_from_bytes
6
+ from flask_cors import CORS
7
+
8
# Tell Tesseract where its language data lives.
# NOTE(review): presumably the location used by the apt-installed tesseract-ocr
# package inside the Docker image - confirm when the base image changes.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'

# Folder for uploaded PDFs while a request is being handled; the path is
# relative to the process working directory.
UPLOAD_FOLDER = './tmp'

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Enable cross-origin requests for every route of the app.
CORS(app)
14
+
15
# Endpoint for uploading a PDF and extracting its text
@app.route('/upload', methods=['POST'])
def upload_file():
    """Accept a PDF upload and return the text extracted from it as JSON.

    Expects a multipart POST carrying the PDF in the ``file`` form field. The
    upload is written under a random name into ``app.config['UPLOAD_FOLDER']``
    for the duration of the request and is always removed afterwards.

    Returns:
        ``{'text': ...}`` on success. When the ``file`` part is missing, has
        no filename, or does not end in ``.pdf``, ``{'error': ...}`` is
        returned with HTTP status 400.
    """
    # Check if the post request has the file part
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    uploaded = request.files['file']

    # A form submitted without choosing a file yields an empty (or missing) filename
    if not uploaded.filename:
        return jsonify({'error': 'No selected file'}), 400

    # Accept the extension in any letter case (".PDF" as well as ".pdf")
    if not uploaded.filename.lower().endswith('.pdf'):
        return jsonify({'error': 'File must be a PDF'}), 400

    # The stored name is a fresh UUID, so the client-supplied filename never
    # reaches the filesystem and concurrent uploads cannot collide.
    upload_dir = app.config['UPLOAD_FOLDER']
    os.makedirs(upload_dir, exist_ok=True)  # the folder may not exist outside the Docker image
    temp_path = os.path.join(upload_dir, str(uuid.uuid4()) + '.pdf')

    text = ''
    try:
        uploaded.save(temp_path)

        # TODO: call Andrey's library here with temp_path and put its output into `text`.
        # The earlier in-memory draft converted the upload with convert_from_bytes()
        # and ran pytesseract.image_to_string(img, lang='rus') on each page.
    finally:
        # Remove the temporary file even when saving or text extraction fails
        try:
            os.remove(temp_path)
        except FileNotFoundError:
            # save() failed before the file was created; nothing to clean up
            pass

    return jsonify({'text': text})
52
+
53
# Entry point for running the module directly (`python app.py`); the Dockerfile
# starts the app with `flask run` instead, so this branch is not used there.
# NOTE(review): debug=True turns on Flask's debug mode - keep this path for
# local development only.
if __name__ == '__main__':
    app.run(debug=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ flask
2
+ flask-cors
3
+ pytesseract
4
+ pdf2image