import collections # 统计库 import re from magic_pdf.libs.commons import fitz # pyMuPDF库 #--------------------------------------- Tool Functions --------------------------------------# # 正则化,输入文本,输出只保留a-z,A-Z,0-9 def remove_special_chars(s: str) -> str: pattern = r"[^a-zA-Z0-9]" res = re.sub(pattern, "", s) return res def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool: # 判断rect1和rect2是否一模一样 return L1 == L2 and U1 == U2 and R1 == R2 and D1 == D2 def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool: # 判断rect1包含了rect2 return (L1 <= L2 <= R2 <= R1) and (U1 <= U2 <= D2 <= D1) def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> bool: # 判断rect1与rect2是否存在重叠(只有一条边重叠,也算重叠) return max(L1, L2) <= min(R1, R2) and max(U1, U2) <= min(D1, D2) def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float): # 计算两个rect,重叠面积各占2个rect面积的比例 if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2): return 0, 0 square_1 = (R1 - L1) * (D1 - U1) square_2 = (R2 - L2) * (D2 - U2) if square_1 == 0 or square_2 == 0: return 0, 0 square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2)) return square_overlap / square_1, square_overlap / square_2 def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float): # 计算两个line,重叠区间各占2个line长度的比例 if max(L1, L2) > min(R1, R2): return 0, 0 if L1 == R1 or L2 == R2: return 0, 0 overlap_line = min(R1, R2) - max(L1, L2) return overlap_line / (R1 - L1), overlap_line / (R2 - L2) # 判断rect其实是一条line def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool: width = R - L height = D - U if width <= 3 or height <= 3: return True if width / height >= 30 or height / width >= 30: return True def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=[]): """ :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 :param page :fitz读取的当前页的内容 :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict """ #### 通过fitz获取page信息 ## 超越边界 DPI = 72 # use this resolution pix = page.get_pixmap(dpi=DPI) pageL = 0 pageR = int(pix.w) pageU = 0 pageD = int(pix.h) #----------------- 保存每一个文本块的LURD ------------------# textLine_blocks = [] blocks = page.get_text( "dict", flags=fitz.TEXTFLAGS_TEXT, #clip=clip, )["blocks"] for i in range(len(blocks)): bbox = blocks[i]['bbox'] # print(bbox) for tt in blocks[i]['lines']: # 当前line cur_line_bbox = None # 当前line,最右侧的section的bbox for xf in tt['spans']: L, U, R, D = xf['bbox'] L, R = min(L, R), max(L, R) U, D = min(U, D), max(U, D) textLine_blocks.append((L, U, R, D)) textLine_blocks.sort(key = lambda LURD: (LURD[1], LURD[0])) #---------------------------------------------- 保存img --------------------------------------------------# raw_imgs = page.get_images() # 获取所有的图片 imgs = [] img_names = [] # 保存图片的名字,方便在md中插入引用 img_bboxs = [] # 保存图片的location信息。 img_visited = [] # 记忆化,记录该图片是否在md中已经插入过了 img_ID = 0 ## 获取、保存每张img的location信息(x1, y1, x2, y2, UL, DR坐标) for i in range(len(raw_imgs)): # 如果图片在junklist中则跳过 if raw_imgs[i][0] in junk_img_bojids: continue else: try: tt = page.get_image_rects(raw_imgs[i][0], transform = True) rec = tt[0][0] L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3]) L, R = min(L, R), max(L, R) U, D = min(U, D), max(U, D) if not(pageL <= L < R <= pageR and pageU <= U < D <= pageD): continue if pageL == L and R == pageR: continue if pageU == U and D == pageD: continue # pix1 = page.get_Pixmap(clip=(L,U,R,D)) new_img_name = "{}_{}.png".format(page_ID, i) # 图片name # pix1.save(res_dir_path + '/' + new_img_name) # 把图片存出在新建的文件夹,并命名 img_names.append(new_img_name) img_bboxs.append((L, U, R, D)) img_visited.append(False) imgs.append(raw_imgs[i]) except: continue #-------- 如果img之间有重叠。说明获取的img大小有问题,位置也不一定对。就扔掉--------# imgs_ok = [True for _ in range(len(imgs))] for i in range(len(imgs)): L1, U1, R1, D1 = img_bboxs[i] for j in range(i + 1, len(imgs)): L2, U2, R2, D2 = img_bboxs[j] ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) s1 = abs(R1 - L1) * abs(D1 - U1) s2 = abs(R2 - L2) * abs(D2 - U2) if ratio_1 > 0 and ratio_2 > 0: if ratio_1 == 1 and ratio_2 > 0.8: imgs_ok[i] = False elif ratio_1 > 0.8 and ratio_2 == 1: imgs_ok[j] = False elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4: imgs_ok[i] = False imgs_ok[j] = False elif s1 / s2 > 5 and ratio_2 > 0.5: imgs_ok[j] = False elif s2 / s1 > 5 and ratio_1 > 0.5: imgs_ok[i] = False imgs = [imgs[i] for i in range(len(imgs)) if imgs_ok[i] == True] img_names = [img_names[i] for i in range(len(imgs)) if imgs_ok[i] == True] img_bboxs = [img_bboxs[i] for i in range(len(imgs)) if imgs_ok[i] == True] img_visited = [img_visited[i] for i in range(len(imgs)) if imgs_ok[i] == True] #*******************************************************************************# #---------------------------------------- 通过fitz提取svg的信息 -----------------------------------------# # svgs = page.get_drawings() #------------ preprocess, check一些大框,看是否是合理的 ----------# ## 去重。有时候会遇到rect1和rect2是完全一样的情形。 svg_rect_visited = set() available_svgIdx = [] for i in range(len(svgs)): L, U, R, D = svgs[i]['rect'].irect L, R = min(L, R), max(L, R) U, D = min(U, D), max(U, D) tt = (L, U, R, D) if tt not in svg_rect_visited: svg_rect_visited.add(tt) available_svgIdx.append(i) svgs = [svgs[i] for i in available_svgIdx] # 去重后,有效的svgs svg_childs = [[] for _ in range(len(svgs))] svg_parents = [[] for _ in range(len(svgs))] svg_overlaps = [[] for _ in range(len(svgs))] #svg_overlaps[i]是一个list,存的是与svg_i有重叠的svg的index。e.g., svg_overlaps[0] = [1, 2, 7, 9] svg_visited = [False for _ in range(len(svgs))] svg_exceedPage = [0 for _ in range(len(svgs))] # 是否超越边界(artbox),很大,但一般是一个svg的底。 for i in range(len(svgs)): L, U, R, D = svgs[i]['rect'].irect ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD) if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20): if ratio_2 >= 0.7: svg_exceedPage[i] += 4 else: if L <= pageL: svg_exceedPage[i] += 1 if pageR <= R: svg_exceedPage[i] += 1 if U <= pageU: svg_exceedPage[i] += 1 if pageD <= D: svg_exceedPage[i] += 1 #### 如果有≥2个的超边界的框,就不要手写规则判断svg了。很难写对。 if len([x for x in svg_exceedPage if x >= 1]) >= 2: svgs = [] svg_childs = [] svg_parents = [] svg_overlaps = [] svg_visited = [] svg_exceedPage = [] #---------------------------- build graph ----------------------------# for i, p in enumerate(svgs): L1, U1, R1, D1 = svgs[i]["rect"].irect for j in range(len(svgs)): if i == j: continue L2, U2, R2, D2 = svgs[j]["rect"].irect ## 包含 if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: svg_childs[i].append(j) svg_parents[j].append(i) else: ## 交叉 if check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: svg_overlaps[i].append(j) #---------------- 确定最终的svg。连通块儿的外围 -------------------# eps_ERROR = 5 # 给识别出的svg,四周留白(为了防止pyMuPDF的rect不准) svg_ID = 0 svg_final_names = [] svg_final_bboxs = [] svg_final_visited = [] # 为下面,text识别左准备。作用同img_visited svg_idxs = [i for i in range(len(svgs))] svg_idxs.sort(key = lambda i: -(svgs[i]['rect'].irect[2] - svgs[i]['rect'].irect[0]) * (svgs[i]['rect'].irect[3] - svgs[i]['rect'].irect[1])) # 按照面积,从大到小排序 for i in svg_idxs: if svg_visited[i] == True: continue svg_visited[i] = True L, U, R, D = svgs[i]['rect'].irect width = R - L height = D - U if check_rect_isLine(L, U, R, D) == True: svg_visited[i] = False continue # if i == 4: # print(i, L, U, R, D) # print(svg_parents[i]) cur_block_element_cnt = 0 # 当前要判定为svg的区域中,有多少elements,最外围的最大svg框除外。 if len(svg_parents[i]) == 0: ## 是个普通框的情形 cur_block_element_cnt += len(svg_childs[i]) if svg_exceedPage[i] == 0: ## 误差。可能已经包含在某个框里面了 neglect_flag = False for pL, pU, pR, pD in svg_final_bboxs: if pL <= L <= R <= pR and pU <= U <= D <= pD: neglect_flag = True break if neglect_flag == True: continue ## 搜索连通域, bfs+记忆化 q = collections.deque() for j in svg_overlaps[i]: q.append(j) while q: j = q.popleft() svg_visited[j] = True L2, U2, R2, D2 = svgs[j]['rect'].irect # width2 = R2 - L2 # height2 = D2 - U2 # if width2 <= 2 or height2 <= 2 or (height2 / width2) >= 30 or (width2 / height2) >= 30: # continue L = min(L, L2) R = max(R, R2) U = min(U, U2) D = max(D, D2) cur_block_element_cnt += 1 cur_block_element_cnt += len(svg_childs[j]) for k in svg_overlaps[j]: if svg_visited[k] == False and svg_exceedPage[k] == 0: svg_visited[k] = True q.append(k) elif svg_exceedPage[i] <= 2: ## 误差。可能已经包含在某个svg_final_bbox框里面了 neglect_flag = False for sL, sU, sR, sD in svg_final_bboxs: if sL <= L <= R <= sR and sU <= U <= D <= sD: neglect_flag = True break if neglect_flag == True: continue L, U, R, D = pageR, pageD, pageL, pageU ## 所有孩子元素的最大边界 for j in svg_childs[i]: if svg_visited[j] == True: continue if svg_exceedPage[j] >= 1: continue svg_visited[j] = True #### 这个位置考虑一下 L2, U2, R2, D2 = svgs[j]['rect'].irect L = min(L, L2) R = max(R, R2) U = min(U, U2) D = max(D, D2) cur_block_element_cnt += 1 # 如果是条line,就不用保存了 if check_rect_isLine(L, U, R, D) == True: continue # 如果当前的svg,连2个elements都没有,就不用保存了 if cur_block_element_cnt < 3: continue ## 当前svg,框住了多少文本框。如果框多了,可能就是错了 contain_textLineBlock_cnt = 0 for L2, U2, R2, D2 in textLine_blocks: if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2) == True: contain_textLineBlock_cnt += 1 if contain_textLineBlock_cnt >= 10: continue # L -= eps_ERROR * 2 # U -= eps_ERROR # R += eps_ERROR * 2 # D += eps_ERROR # # cur_svg = page.get_pixmap(matrix=fitz.Identity, dpi=None, colorspace=fitz.csRGB, clip=(U,L,R,D), alpha=False, annots=True) # cur_svg = page.get_pixmap(clip=(L,U,R,D)) new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID) # 图片name # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名 svg_final_names.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用 svg_final_bboxs.append((L, U, R, D)) svg_final_visited.append(False) svg_ID += 1 ## 识别出的svg,可能有 包含,相邻的情形。需要进一步合并 svg_idxs = [i for i in range(len(svg_final_bboxs))] svg_idxs.sort(key = lambda i: (svg_final_bboxs[i][1], svg_final_bboxs[i][0])) # (U, L) svg_final_names_2 = [] svg_final_bboxs_2 = [] svg_final_visited_2 = [] # 为下面,text识别左准备。作用同img_visited svg_ID_2 = 0 for i in range(len(svg_final_bboxs)): L1, U1, R1, D1 = svg_final_bboxs[i] for j in range(i + 1, len(svg_final_bboxs)): L2, U2, R2, D2 = svg_final_bboxs[j] # 如果 rect1包含了rect2 if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: svg_final_visited[j] = True continue # 水平并列 ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2) if ratio_1 >= 0.7 and ratio_2 >= 0.7: if abs(L2 - R1) >= 20: continue LL = min(L1, L2) UU = min(U1, U2) RR = max(R1, R2) DD = max(D1, D2) svg_final_bboxs[i] = (LL, UU, RR, DD) svg_final_visited[j] = True continue # 竖直并列 ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R2, L2, R2) if ratio_1 >= 0.7 and ratio_2 >= 0.7: if abs(U2 - D1) >= 20: continue LL = min(L1, L2) UU = min(U1, U2) RR = max(R1, R2) DD = max(D1, D2) svg_final_bboxs[i] = (LL, UU, RR, DD) svg_final_visited[j] = True for i in range(len(svg_final_bboxs)): if svg_final_visited[i] == False: L, U, R, D = svg_final_bboxs[i] svg_final_bboxs_2.append((L, U, R, D)) L -= eps_ERROR * 2 U -= eps_ERROR R += eps_ERROR * 2 D += eps_ERROR # cur_svg = page.get_pixmap(clip=(L,U,R,D)) new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID_2) # 图片name # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名 svg_final_names_2.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用 svg_final_bboxs_2.append((L, U, R, D)) svg_final_visited_2.append(False) svg_ID_2 += 1 ## svg收尾。识别为drawing,但是在上面没有拼成一张图的。 # 有收尾才comprehensive # xxxx # xxxx # xxxx # xxxx #--------- 通过json_from_DocXchain来获取,figure, table, equation的bbox ---------# figure_bbox_from_DocXChain = [] figure_from_DocXChain_visited = [] # 记忆化 figure_bbox_from_DocXChain_overlappedRatio = [] figure_only_from_DocXChain_bboxs = [] # 存储 figure_only_from_DocXChain_names = [] figure_only_from_DocXChain_visited = [] figure_only_ID = 0 xf_json = json_from_DocXchain_obj width_from_json = xf_json['page_info']['width'] height_from_json = xf_json['page_info']['height'] LR_scaleRatio = width_from_json / (pageR - pageL) UD_scaleRatio = height_from_json / (pageD - pageU) for xf in xf_json['layout_dets']: # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'} L = xf['poly'][0] / LR_scaleRatio U = xf['poly'][1] / UD_scaleRatio R = xf['poly'][2] / LR_scaleRatio D = xf['poly'][5] / UD_scaleRatio # L += pageL # 有的页面,artBox偏移了。不在(0,0) # R += pageL # U += pageU # D += pageU L, R = min(L, R), max(L, R) U, D = min(U, D), max(U, D) # figure if xf["category_id"] == 1 and xf['score'] >= 0.3: figure_bbox_from_DocXChain.append((L, U, R, D)) figure_from_DocXChain_visited.append(False) figure_bbox_from_DocXChain_overlappedRatio.append(0.0) #---------------------- 比对上面识别出来的img,svg 与DocXChain给的figure -----------------------# ## 比对imgs for i, b1 in enumerate(figure_bbox_from_DocXChain): # print('--------- DocXChain的图片', b1) L1, U1, R1, D1 = b1 for b2 in img_bboxs: # print('-------- igms得到的图', b2) L2, U2, R2, D2 = b2 s1 = abs(R1 - L1) * abs(D1 - U1) s2 = abs(R2 - L2) * abs(D2 - U2) # 相同 if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: figure_from_DocXChain_visited[i] = True # 包含 elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: if s2 / s1 > 0.8: figure_from_DocXChain_visited[i] = True elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: if s1 / s2 > 0.8: figure_from_DocXChain_visited[i] = True else: # 重叠了相当一部分 # print('进入第3部分') ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1/s2>0.8) or (ratio_2 >= 0.8 and s2/s1>0.8): figure_from_DocXChain_visited[i] = True else: figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1 # print('图片的重叠率是{}'.format(ratio_1)) ## 比对svgs svg_final_bboxs_2_badIdxs = [] for i, b1 in enumerate(figure_bbox_from_DocXChain): L1, U1, R1, D1 = b1 for j, b2 in enumerate(svg_final_bboxs_2): L2, U2, R2, D2 = b2 s1 = abs(R1 - L1) * abs(D1 - U1) s2 = abs(R2 - L2) * abs(D2 - U2) # 相同 if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: figure_from_DocXChain_visited[i] = True # 包含 elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: figure_from_DocXChain_visited[i] = True elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: if s1 / s2 > 0.7: figure_from_DocXChain_visited[i] = True else: svg_final_bboxs_2_badIdxs.append(j) # svg丢弃。用DocXChain的结果。 else: # 重叠了相当一部分 ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6): figure_from_DocXChain_visited[i] = True else: figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1 # 丢掉错误的svg svg_final_bboxs_2 = [svg_final_bboxs_2[i] for i in range(len(svg_final_bboxs_2)) if i not in set(svg_final_bboxs_2_badIdxs)] for i in range(len(figure_from_DocXChain_visited)): if figure_bbox_from_DocXChain_overlappedRatio[i] >= 0.7: figure_from_DocXChain_visited[i] = True # DocXChain识别出来的figure,但是没被保存的。 for i in range(len(figure_from_DocXChain_visited)): if figure_from_DocXChain_visited[i] == False: figure_from_DocXChain_visited[i] = True cur_bbox = figure_bbox_from_DocXChain[i] # cur_figure = page.get_pixmap(clip=cur_bbox) new_figure_name = "figure_only_{}_{}.png".format(page_ID, figure_only_ID) # 图片name # cur_figure.save(res_dir_path + '/' + new_figure_name) # 把图片存出在新建的文件夹,并命名 figure_only_from_DocXChain_names.append(new_figure_name) # 把图片的名字存在list中,方便在md中插入引用 figure_only_from_DocXChain_bboxs.append(cur_bbox) figure_only_from_DocXChain_visited.append(False) figure_only_ID += 1 img_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) svg_final_bboxs_2.sort(key = lambda LURD: (LURD[1], LURD[0])) figure_only_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs #--------------------------- 最后统一去重 -----------------------------------# curPage_all_fig_bboxs.sort(key = lambda LURD: ( (LURD[2]-LURD[0])*(LURD[3]-LURD[1]) , LURD[0], LURD[1]) ) #### 先考虑包含关系的小块 final_duplicate = set() for i in range(len(curPage_all_fig_bboxs)): L1, U1, R1, D1 = curPage_all_fig_bboxs[i] for j in range(len(curPage_all_fig_bboxs)): if i == j: continue L2, U2, R2, D2 = curPage_all_fig_bboxs[j] s1 = abs(R1 - L1) * abs(D1 - U1) s2 = abs(R2 - L2) * abs(D2 - U2) if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: final_duplicate.add((L1, U1, R1, D1)) else: ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) if ratio_1 >= 0.8 and ratio_2 <= 0.6: final_duplicate.add((L1, U1, R1, D1)) curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate] #### 再考虑重叠关系的块 final_duplicate = set() final_synthetic_bboxs = [] for i in range(len(curPage_all_fig_bboxs)): L1, U1, R1, D1 = curPage_all_fig_bboxs[i] for j in range(len(curPage_all_fig_bboxs)): if i == j: continue L2, U2, R2, D2 = curPage_all_fig_bboxs[j] s1 = abs(R1 - L1) * abs(D1 - U1) s2 = abs(R2 - L2) * abs(D2 - U2) ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) union_ok = False if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): union_ok = True if (ratio_1 > 0.2 and s2 / s1 > 5): union_ok = True if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): union_ok = True if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): union_ok = True if union_ok == True: final_duplicate.add((L1, U1, R1, D1)) final_duplicate.add((L2, U2, R2, D2)) L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2) final_synthetic_bboxs.append((L3, U3, R3, D3)) # print('---------- curPage_all_fig_bboxs ---------') # print(curPage_all_fig_bboxs) curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate] final_synthetic_bboxs = list(set(final_synthetic_bboxs)) ## 再再考虑重叠关系。极端情况下会迭代式地2进1 new_images = [] droped_img_idx = [] image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs] for i in range(0, len(image_bboxes)): for j in range(i+1, len(image_bboxes)): if j not in droped_img_idx: L2, U2, R2, D2 = image_bboxes[j] s1 = abs(R1 - L1) * abs(D1 - U1) s2 = abs(R2 - L2) * abs(D2 - U2) ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) union_ok = False if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): union_ok = True if (ratio_1 > 0.2 and s2 / s1 > 5): union_ok = True if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): union_ok = True if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): union_ok = True if union_ok == True: # 合并 image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3]) droped_img_idx.append(j) for i in range(0, len(image_bboxes)): if i not in droped_img_idx: new_images.append(image_bboxes[i]) # find_union_FLAG = True # while find_union_FLAG == True: # find_union_FLAG = False # final_duplicate = set() # tmp = [] # for i in range(len(final_synthetic_bboxs)): # L1, U1, R1, D1 = final_synthetic_bboxs[i] # for j in range(len(final_synthetic_bboxs)): # if i == j: # continue # L2, U2, R2, D2 = final_synthetic_bboxs[j] # s1 = abs(R1 - L1) * abs(D1 - U1) # s2 = abs(R2 - L2) * abs(D2 - U2) # ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) # union_ok = False # if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): # union_ok = True # if (ratio_1 > 0.2 and s2 / s1 > 5): # union_ok = True # if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): # union_ok = True # if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): # union_ok = True # if union_ok == True: # find_union_FLAG = True # final_duplicate.add((L1, U1, R1, D1)) # final_duplicate.add((L2, U2, R2, D2)) # L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2) # tmp.append((L3, U3, R3, D3)) # if find_union_FLAG == True: # tmp = list(set(tmp)) # final_synthetic_bboxs = tmp[:] # curPage_all_fig_bboxs += final_synthetic_bboxs # print('--------- final synthetic') # print(final_synthetic_bboxs) #**************************************************************************# images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs] images = images1 + new_images return images