import math
import warnings

from loguru import logger

from magic_pdf.libs.boxbase import (
    find_bottom_nearest_text_bbox,
    find_top_nearest_text_bbox,
)
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType

TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def mk_nlp_markdown_1(para_dict: dict):
    """Concatenate the paragraph text of every page into one markdown string.

    Deprecated: calling this function emits a ``DeprecationWarning``.

    Args:
        para_dict: mapping whose values are per-page dicts. Each page dict may
            carry a ``"para_blocks"`` list; every block has a ``"paras"`` dict
            whose values provide ``"para_text"``, ``"is_para_title"`` and
            ``"para_title_level"``.

    Returns:
        All paragraph texts, in iteration order, joined by blank lines. Title
        paragraphs are prefixed with ``para_title_level`` ``#`` characters.
    """
    # NOTE: the previous ``@DeprecationWarning`` decorator replaced the function
    # with an exception instance, which made it impossible to call at all.
    warnings.warn(
        "mk_nlp_markdown_1 is deprecated",
        DeprecationWarning,
        stacklevel=2,
    )
    content_lst = []
    for page_info in para_dict.values():
        para_blocks = page_info.get("para_blocks")
        if not para_blocks:
            # Pages without paragraph blocks contribute nothing.
            continue
        for block in para_blocks:
            for p in block["paras"].values():
                para_text = p["para_text"]
                if p["is_para_title"]:
                    # Only build the heading prefix for actual titles, so a
                    # non-title paragraph never needs a usable title level.
                    md_title_prefix = "#" * p['para_title_level']
                    content_lst.append(f"{md_title_prefix} {para_text}")
                else:
                    content_lst.append(para_text)
    return "\n\n".join(content_lst)


def __find_index(paragraph, target):
    """Return the index of the first occurrence of ``target`` in ``paragraph``.

    Returns ``None`` when ``target`` does not occur.
    """
    index = paragraph.find(target)
    return index if index != -1 else None


def __insert_string(paragraph, target, postion):
    """Return ``paragraph`` with ``target`` inserted at offset ``postion``."""
    return paragraph[:postion] + target + paragraph[postion:]


def __insert_after(content, image_content, target):
    """Insert ``image_content`` right after the first ``target`` in ``content``.

    The inserted text is surrounded by blank lines. When ``target`` is not
    found, an error is logged and ``content`` is returned unchanged.
    """
    index = content.find(target)
    if index == -1:
        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
        return content
    split_at = index + len(target)
    return content[:split_at] + "\n\n" + image_content + "\n\n" + content[split_at:]


def __insert_before(content, image_content, target):
    """Insert ``image_content`` right before the first ``target`` in ``content``.

    The inserted text is surrounded by blank lines. When ``target`` is not
    found, an error is logged and ``content`` is returned unchanged.
    """
    index = content.find(target)
    if index == -1:
        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
        return content
    return content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
@DeprecationWarning def mk_mm_markdown_1(para_dict: dict): """拼装多模态markdown""" content_lst = [] for _, page_info in para_dict.items(): page_lst = [] # 一个page内的段落列表 para_blocks = page_info.get("para_blocks") pymu_raw_blocks = page_info.get("preproc_blocks") all_page_images = [] all_page_images.extend(page_info.get("images",[])) all_page_images.extend(page_info.get("image_backup", []) ) all_page_images.extend(page_info.get("tables",[])) all_page_images.extend(page_info.get("table_backup",[]) ) if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景 for img in all_page_images: page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序 page_md = "\n\n".join(page_lst) else: for block in para_blocks: item = block["paras"] for _, p in item.items(): para_text = p["para_text"] is_title = p["is_para_title"] title_level = p['para_title_level'] md_title_prefix = "#"*title_level if is_title: page_lst.append(f"{md_title_prefix} {para_text}") else: page_lst.append(para_text) """拼装成一个页面的文本""" page_md = "\n\n".join(page_lst) """插入图片""" for img in all_page_images: imgbox = img['bbox'] img_content = f"![]({img['image_path']})" # 先看在哪个block内 for block in pymu_raw_blocks: bbox = block['bbox'] if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内 for l in block['lines']: line_box = l['bbox'] if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的,插入line前面 line_txt = "".join([s['text'] for s in l['spans']]) page_md = __insert_before(page_md, img_content, line_txt) break break else:# 在行与行之间 # 找到图片x0,y0与line的x0,y0最近的line min_distance = 100000 min_line = None for l in block['lines']: line_box = l['bbox'] distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2) if distance < min_distance: min_distance = distance min_line = l if min_line: line_txt = "".join([s['text'] for s in min_line['spans']]) img_h = imgbox[3] - imgbox[1] if min_distance