muryshev commited on
Commit
071a451
·
1 Parent(s): 01f0b80
Files changed (2) hide show
  1. app.py +3 -3
  2. lib/ocr_2.py +303 -0
app.py CHANGED
@@ -5,7 +5,7 @@ from flask import Flask, request, jsonify, Response
5
  import pytesseract
6
  from pdf2image import convert_from_bytes
7
  from flask_cors import CORS
8
- from lib import ocr_1
9
  from lib import llm_3 as llm
10
 
11
  os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'
@@ -46,11 +46,11 @@ def upload_file():
46
  # text += pytesseract.image_to_string(img, lang='rus')
47
 
48
 
49
- docs_info = ocr_1.processSingleFile(temp_path)
50
 
51
 
52
  os.remove(temp_path)
53
- return json.dumps(docs_info, sort_keys=False)
54
  else:
55
  return jsonify({'error': 'File must be a PDF'})
56
 
 
5
  import pytesseract
6
  from pdf2image import convert_from_bytes
7
  from flask_cors import CORS
8
+ from lib import ocr_1 as ocr
9
  from lib import llm_3 as llm
10
 
11
  os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'
 
46
  # text += pytesseract.image_to_string(img, lang='rus')
47
 
48
 
49
+ docs_info = ocr.processSingleFile(temp_path)
50
 
51
 
52
  os.remove(temp_path)
53
+ return Response(json.dumps(docs_info, sort_keys=False, ensure_ascii=False), content_type='application/json; charset=utf-8')
54
  else:
55
  return jsonify({'error': 'File must be a PDF'})
56
 
lib/ocr_2.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image, ImageFilter
2
+ import cv2
3
+ import pytesseract
4
+ from pytesseract import Output
5
+ from os import listdir, getcwd
6
+ from os.path import isfile, join
7
+ import numpy as np
8
+ import json
9
+ import matplotlib.pyplot as plt
10
+ from pdf2image import convert_from_path
11
+ from matplotlib import pyplot as plt
12
+ import re
13
+ import requests
14
+ import json
15
+
16
def getResponse(prompt, timeout=300):
    """Send *prompt* to the remote completion endpoint and return the reply text.

    The prompt is wrapped in ``[INST]``/``[/INST]`` markers and POSTed as a
    JSON document of the form ``{"prompt": ...}``.

    Args:
        prompt: Instruction text to send.
        timeout: Seconds to wait for the server before giving up. It is
            passed straight to ``requests``; pass ``None`` to wait
            indefinitely (the behaviour before this parameter existed).

    Returns:
        The response body decoded as UTF-8. The HTTP status code is not
        checked and the body is not parsed here.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed, including ``Timeout`` when *timeout* elapses.
    """
    url = "https://muryshev-mixtral-api.hf.space/completion"
    payload = json.dumps({"prompt": '[INST]' + prompt + '[/INST]'})
    headers = {'Content-Type': 'application/json'}

    # Without a timeout a stalled endpoint would block the caller forever.
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    return response.content.decode('utf-8')
30
+
31
def getOrgAddr(application):
    """Ask the LLM which organisation and address *application* refers to.

    A fixed Russian instruction is prepended to *application* and sent through
    ``getResponse``. From the reply only the lines containing ``'Адрес:'`` or
    ``'Организация:'`` are kept: each is stripped, duplicates are dropped and
    the order of first appearance is preserved.

    Args:
        application: Text to analyse (appended to the instruction as is).

    Returns:
        The kept lines joined with ``'\\n'``; an empty string when the reply
        contains no such line.
    """
    # NOTE(review): the instruction begins with a stray apostrophe (the
    # literal was opened with four quotes). It is reproduced unchanged because
    # it is part of the text the model receives - confirm whether it is wanted.
    instruction = (
        "'Отвечайте всегда ТОЛЬКО НА РУССКОМ языке. Я предоставляю тебе \"материал\". Идентифицируй организацию и адрес организации в зависимости от указанного филиала.\n"
        "Используй такой формат: \"Организация: *название*; Адрес: *адрес*;\".\n"
        "Ты не комментируешь, не объясняешь, не выражаешь мысли, вообще ничего больше не говоришь.\n"
        "Материал: "
    )

    reply = getResponse(instruction + application)

    markers = ('Адрес:', 'Организация:')
    # A dict is used as an insertion-ordered set of the matching lines.
    kept = {}
    for raw_line in reply.split('\n'):
        line = raw_line.strip()
        if any(marker in line for marker in markers):
            kept.setdefault(line, None)

    return '\n'.join(kept)
61
+
62
def processFiles(pdfs, verbose=False):
    """OCR the given PDF files and split each one into logical documents.

    Every page is rendered to an image and recognised twice with Tesseract
    (``lang='rus'``): once in full and once restricted to a fixed crop in
    which an address box is looked for. A page whose crop yields a non-empty
    mask starts a new document; pages without one are attached to the
    document in progress. A document whose first page has an address box
    mentioning the central bank (see ``CBnames`` below) is typed
    ``'Обращение'``, any other ``'Сопроводительное письмо'``; for the former
    the organisation/address attributes are requested through ``getOrgAddr``.

    Args:
        pdfs: Iterable of PDF file paths.
        verbose: When true, print progress details and show the detected
            address masks with matplotlib.

    Returns:
        A list with one entry per input file. Each entry is a list of dicts
        with the keys ``'Тип документа'``, ``'Атрибуты'`` and
        ``'Текст документа'``. Files without pages give an empty list.
    """
    # Function-scope import: the module only imports isfile/join from os.path.
    import os

    pdfs = list(pdfs)
    pages_per_file = [convert_from_path(path) for path in pdfs]

    # Flatten file -> page into one page list, remembering which flat indices
    # belong to which file.
    page_images = []
    docfilenames = []
    pagenames = []
    fileindices = []
    for path, file_pages in zip(pdfs, pages_per_file):
        stem = os.path.splitext(str(path))[0]
        docfilenames.append(stem)
        pageindices = []
        for page_no, page in enumerate(file_pages):
            pageindices.append(len(page_images))
            page_images.append(page)
            pagenames.append(stem + '_page_' + str(page_no))
        fileindices.append(pageindices)

    if not page_images:
        # Nothing to recognise; keep the "one entry per file" shape.
        return [[] for _ in pdfs]

    # Grayscale crop of the region that is searched for an address box.
    cropped_images = [
        page.convert("L").crop((
            750, 150,   # left top point
            1654, 850   # right bottom point
        ))
        for page in page_images
    ]

    texts = [pytesseract.image_to_string(image, lang='rus') for image in cropped_images]
    fulltexts = [pytesseract.image_to_string(image, lang='rus') for image in page_images]

    # Build a coarse mask per crop: downscale x4, binarise, smooth, clean
    # bands, then smooth and erode again.
    crop_width, crop_height = cropped_images[0].size
    small_size = (crop_width // 4, crop_height // 4)
    masks = []
    addressexists = []
    for crop in cropped_images:
        coarse = crop.resize(small_size).point(
            lambda x: 0 if x < 220 else 255
        ).filter(
            ImageFilter.MedianFilter(5)
        ).filter(
            ImageFilter.MinFilter(15)
        )
        cleaned = _cleanAddressMask(np.array(coarse))
        mask = Image.fromarray(cleaned).filter(
            ImageFilter.MedianFilter(13)
        ).filter(
            ImageFilter.MinFilter(25)
        )
        masks.append(mask)
        # If there is no address box the mask is empty (no black pixel left).
        addressexists.append(bool((~np.array(mask.convert('1'))).sum()))

    # Names of the central bank that may be used in the address box.
    CBnames = [
        'цб рф',
        'центральный банк',
        'центрального банка',
        'банк россии',
        'банка россии',
    ]

    # Pages whose address box is addressed to the central bank.
    toCB = [
        has_address and any(name in text.lower() for name in CBnames)
        for has_address, text in zip(addressexists, texts)
    ]

    # Build the 3-level structure: file -> doc -> page.
    docindices = []
    doctypes = []
    for pageindices in fileindices:
        docs, types = _groupPagesIntoDocs(pageindices, addressexists, toCB)
        docindices.append(docs)
        doctypes.append(types)

    if verbose:
        orig_size = cropped_images[0].size
        for i, mask in enumerate(masks):
            img = np.array(mask.convert('L').resize(orig_size))
            out = np.array(cropped_images[i])

            bw = cv2.inRange(img, 0, 12)
            contours, hierarchy = cv2.findContours(bw, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
            outlined = cv2.drawContours(out, contours, -1, (0, 255, 0), 5, cv2.LINE_AA, hierarchy, 1)

            print()
            print(pagenames[i])
            print('Address exists :', addressexists[i])
            print('To CB :', toCB[i])
            plt.imshow(Image.fromarray(outlined))
            plt.show()

    # Assemble the result: file -> doc (type, attributes, text).
    docs_info = []
    for file_no, file_docs in enumerate(docindices):
        if verbose:
            print('File =', docfilenames[file_no])

        docs = []
        for doc_no, doc_pages in enumerate(file_docs):
            doctype = 'Сопроводительное письмо'
            if doctypes[file_no][doc_no]:
                doctype = 'Обращение'

            if verbose:
                print('Doc =', doc_no, 'Type =', doctype)

            # Only the first page of a document can carry the address box.
            first_page = doc_pages[0]
            orginfo = ''
            if toCB[first_page]:
                orginfo = getOrgAddr(texts[first_page])
                if verbose:
                    print(orginfo)
                    print()

            page_texts = []
            for index in doc_pages:
                page_texts.append(fulltexts[index])
                if verbose:
                    print('Page =', pagenames[index])
                    print(fulltexts[index])
                    print('--- end of page ---')
                    print()

            text = ''.join(page_texts)
            text = re.sub(r'\n +', r'\n', text)
            text = re.sub(r'\n+', r'\n', text)

            docs.append({
                'Тип документа': doctype,
                'Атрибуты': orginfo,
                'Текст документа': text,
            })

        docs_info.append(docs)

    return docs_info


def _cleanAddressMask(mask, str_size=7, vertical_threshold=200):
    """Return a copy of a 2-D mask array with non-address bands set to white."""
    cleaned = mask.copy()
    height, width = cleaned.shape[:2]

    # Horizontal bands: only complete bands of str_size rows are examined.
    for top in range(0, (height // str_size) * str_size, str_size):
        band_mean = int(cleaned[top:top + str_size, :].mean())
        if band_mean < 49 or band_mean > 160:
            cleaned[top:top + str_size, :] = 255

    # Vertical strips: the last strip may be narrower than str_size. Stepping
    # over the real width never yields an empty strip, whose mean would be NaN.
    for left in range(0, width, str_size):
        if int(cleaned[:, left:left + str_size].mean()) > vertical_threshold:
            cleaned[:, left:left + str_size] = 255

    return cleaned


def _groupPagesIntoDocs(pageindices, addressexists, toCB):
    """Split one file's page indices into documents; return (docs, types)."""
    docs = []
    types = []
    pages = []
    doctype = False
    for index in pageindices:
        if addressexists[index]:
            # A page with an address box closes the current document.
            if pages:
                docs.append(pages)
                types.append(doctype)
            pages = []
            doctype = toCB[index]
        pages.append(index)

    if pages:
        docs.append(pages)
        types.append(doctype)

    return docs, types
288
+
289
def processSingleFile(file):
    """Process one PDF and return the list of documents found in it.

    Thin wrapper around ``processFiles``: *file* is passed as a one-element
    list and the single per-file result is unwrapped.
    """
    per_file_results = processFiles([file])
    return per_file_results[0]
291
+
292
+ # file = 'test.pdf'
293
+ # docs_info = processSingleFile(file)
294
+
295
+ # docs_info =
296
+ # [
297
+ # {
298
+ # 'Имя поля' : 'Текст поля',
299
+ # ...
300
+ # },
301
+ # ...
302
+ # ]
303
+ # то есть это массив документов, содержащихся в файле, для каждого документа задан словарь 'Имя поля' : 'Текст поля' (сейчас там 2 поля для каждого документа)